• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1;*!
2;* \copy
3;*     Copyright (c)  2009-2013, Cisco Systems
4;*     All rights reserved.
5;*
6;*     Redistribution and use in source and binary forms, with or without
7;*     modification, are permitted provided that the following conditions
8;*     are met:
9;*
10;*        * Redistributions of source code must retain the above copyright
11;*          notice, this list of conditions and the following disclaimer.
12;*
13;*        * Redistributions in binary form must reproduce the above copyright
14;*          notice, this list of conditions and the following disclaimer in
15;*          the documentation and/or other materials provided with the
16;*          distribution.
17;*
18;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29;*     POSSIBILITY OF SUCH DAMAGE.
30;*
31;*
32;*  intra_pred.asm
33;*
34;*  Abstract
35;*      sse2 and mmx function for intra predict operations(decoder)
36;*
37;*  History
38;*      18/09/2009 Created
39;*      19/11/2010 Added
40;*                  WelsDecoderI16x16LumaPredDcTop_sse2, WelsDecoderI16x16LumaPredDcNA_sse2,
41;*                  WelsDecoderIChromaPredDcLeft_mmx, WelsDecoderIChromaPredDcTop_sse2
42;*                  and WelsDecoderIChromaPredDcNA_mmx
43;*
44;*
45;*************************************************************************/
46
47%include "asm_inc.asm"
48;*******************************************************************************
49; Local Data (Read Only)
50;*******************************************************************************
51
; Read-only constant tables used by the prediction routines below.
; With X86_32_PICASM defined they are emitted into .text, otherwise into .rodata.
52%ifdef X86_32_PICASM
53SECTION .text align=16
54%else
55SECTION .rodata align=16
56%endif
57
58align 16
; per-column multipliers of b for columns 0..7 in the 16x16 plane store loop
59sse2_plane_inc_minus dw -7, -6, -5, -4, -3, -2, -1, 0
60align 16
; weights 1..8: H/V gradient weights and multipliers of b for columns 8..15
61sse2_plane_inc dw 1, 2, 3, 4, 5, 6, 7, 8
62align 16
; weights 8..1 applied to the mirrored half (top[-1..6] / left[-1..6]) of the gradient
63sse2_plane_dec dw 8, 7, 6, 5, 4, 3, 2, 1
64
65; for chroma plane mode
; the two 4-word tables below are read with 64-bit movq loads only
66sse2_plane_inc_c dw 1, 2, 3, 4
67sse2_plane_dec_c dw 4, 3, 2, 1
68align 16
; per-column multipliers of b for the eight chroma columns
69sse2_plane_mul_b_c dw -3, -2, -1, 0, 1, 2, 3, 4
70
71align 16
; sixteen 0x01 bytes: used as a multiplier to replicate a byte (pmuludq/pmullw)
; and as a mask for the lowest bit of each byte (pand)
72mmx_01bytes:        times 16    db 1
73
74align 16
; not referenced in the portion of the file visible here
75mmx_0x02: dw 0x02, 0x00, 0x00, 0x00
76
77align 16
; not referenced in the portion of the file visible here
78sse2_dc_0x80: times 16 db 0x80
79align 16
; not referenced in the portion of the file visible here
80sse2_wd_0x02: times 8 dw 0x02
81
82;*******************************************************************************
83; macros
84;*******************************************************************************
; SSE2_PRED_H_4X4_TWO_LINE: build two rows of a 4x4 horizontal prediction.
;   %1 = xmm result, %2 and %3 = xmm scratch,
;   %4 = pointer to the first of the two rows, %5 = stride register.
; The byte at [%4-1] ends up replicated in the low dword of %1 and the byte at
; [%4+%5-1] in the next dword. %4 is not modified.
85;xmm0, xmm1, xmm2, eax, ecx
86;lower 64 bits of xmm0 save the result
87%macro SSE2_PRED_H_4X4_TWO_LINE 5
88    movd        %1, [%4-1]          ; byte 0 = pixel left of the first row
89    movdqa      %3, %1
90    punpcklbw   %1, %3              ; every byte doubled
91    movdqa      %3, %1
92    punpcklbw   %1, %3              ; every byte x4: low dword = left pixel x4
93
94    ;add            %4, %5
95    movd        %2, [%4+%5-1]       ; same for the second row
96    movdqa      %3, %2
97    punpcklbw   %2, %3
98    movdqa      %3, %2
99    punpcklbw   %2, %3
100    punpckldq   %1, %2              ; low 64 bits = [row1 x4 | row0 x4]
101%endmacro
102
103
; LOAD_COLUMN: gather one byte column from eight consecutive rows.
;   %1 = result register, %2 %3 %4 = scratch registers,
;   %5 = pointer register (advanced by 8 rows on exit), %6 = stride register.
; Each row contributes a 4-byte movd load starting at %5; after the interleaves
; the upper 8 bytes of %1 hold byte 3 of rows 0..7, in row order.
; The lower 8 bytes of %1 hold the other loaded bytes and are not meaningful.
104%macro LOAD_COLUMN 6
105    movd    %1, [%5]                ; row 0
106    movd    %2, [%5+%6]             ; row 1
107    punpcklbw %1,   %2              ; interleave rows 0/1 bytewise
108    lea     %5, [%5+2*%6]
109    movd    %3, [%5]                ; row 2
110    movd    %2, [%5+%6]             ; row 3
111    punpcklbw %3,   %2
112    punpcklwd %1,   %3              ; dword 3 of %1 = byte 3 of rows 0..3
113    lea     %5, [%5+2*%6]
114    movd    %4, [%5]                ; row 4
115    movd    %2, [%5+%6]             ; row 5
116    punpcklbw %4,   %2
117    lea     %5, [%5+2*%6]
118    movd    %3, [%5]                ; row 6
119    movd    %2, [%5+%6]             ; row 7
120    lea     %5, [%5+2*%6]           ; %5 now points 8 rows below its entry value
121    punpcklbw %3,   %2
122    punpcklwd %4,   %3              ; dword 3 of %4 = byte 3 of rows 4..7
123    punpckhdq %1,   %4              ; upper 8 bytes of %1 = byte 3 of rows 0..7
124%endmacro
125
; SUMW_HORIZON: horizontal sum of the eight 16-bit words in %1.
;   %1 = input words d0..d7; on exit dword 0 holds the sum
;   %2 = scratch
;   %3 = register interleaved as the upper word of each widened dword; its
;        content only affects bits 16 and above of the result, so the low
;        16 bits of dword 0 are the word sum modulo 2^16 regardless of %3.
126%macro SUMW_HORIZON 3
127    movhlps     %2, %1          ; x2 = xx xx xx xx d7 d6 d5 d4
128    paddw       %1, %2          ; x1 = xx xx xx xx d37 d26 d15 d04
129    punpcklwd   %1, %3          ; x1 =  d37  d26 d15 d04
130    movhlps     %2, %1          ; x2 = xxxx xxxx d37 d26
131    paddd       %1, %2          ; x1 = xxxx xxxx d1357 d0246
132    pshuflw     %2, %1, 0x4e    ; x2 = xxxx xxxx d0246 d1357
133    paddd       %1, %2          ; x1 = xxxx xxxx xxxx  d01234567
134%endmacro
135
; COPY_16_TIMES: broadcast the byte at [%1-1] to all 16 bytes of %2.
;   %1 = pointer register (not modified), %2 = xmm result.
; The load is an aligned movdqa of the 16 bytes ending just before %1, so
; %1 must be 16-byte aligned.
136%macro COPY_16_TIMES 2
137    movdqa      %2, [%1-16]
138    psrldq      %2, 15                      ; keep only the last byte, in byte 0
139    pmuludq     %2, [pic(mmx_01bytes)]      ; byte * 0x01010101 -> 4 copies in dword 0
140    pshufd      %2, %2, 0                   ; replicate dword 0 to all four dwords
141%endmacro
142
; COPY_16_TIMESS: same as COPY_16_TIMES but for the row at offset %3.
;   %1 = pointer register, %2 = xmm result, %3 = offset (stride) register.
; Broadcasts the byte at [%1+%3-1]; [%1+%3-16] is read with an aligned movdqa.
143%macro COPY_16_TIMESS 3
144    movdqa      %2, [%1+%3-16]
145    psrldq      %2, 15                      ; keep only the last byte, in byte 0
146    pmuludq     %2, [pic(mmx_01bytes)]      ; byte * 0x01010101 -> 4 copies in dword 0
147    pshufd      %2, %2, 0                   ; replicate dword 0 to all four dwords
148%endmacro
149
; LOAD_COLUMN_C: four-row variant of LOAD_COLUMN (used with mm registers by the
; chroma plane routine).
;   %1 = result register, %2 %3 = scratch, %4 = unused by this macro,
;   %5 = pointer register (advanced by 4 rows on exit), %6 = stride register.
; On exit the upper 4 bytes of a 64-bit %1 hold byte 3 of rows 0..3, in row order.
150%macro LOAD_COLUMN_C 6
151    movd    %1, [%5]                ; row 0
152    movd    %2, [%5+%6]             ; row 1
153    punpcklbw %1,%2
154    lea     %5, [%5+2*%6]
155    movd    %3, [%5]                ; row 2
156    movd    %2, [%5+%6]             ; row 3
157    punpcklbw %3,   %2
158    punpckhwd %1,   %3              ; upper dword = byte 3 of rows 0..3
159    lea     %5, [%5+2*%6]           ; %5 now points 4 rows below its entry value
160%endmacro
161
; LOAD_2_LEFT_AND_ADD: advance r0 by two rows and accumulate the two bytes at
; column -1 of those rows into r2.
; Uses fixed registers: r0 = row pointer (modified), r1 = stride,
; r2 = running sum (modified), r3 = scratch (clobbered).
162%macro LOAD_2_LEFT_AND_ADD 0
163    lea         r0, [r0+2*r1]
164    movzx       r3, byte [r0-0x01]
165    add         r2, r3
166    movzx       r3, byte [r0+r1-0x01]
167    add         r2, r3
168%endmacro
169
170;*******************************************************************************
171; Code
172;*******************************************************************************
173
174SECTION .text
175
176
;*******************************************************************************
;   void WelsDecoderI4x4LumaPredH_sse2(uint8_t *pPred, const int32_t kiStride)
;
;   Horizontal prediction of a 4x4 luma block: each of the four rows is filled
;   with the byte found immediately to the left of that row.
;   All four left-neighbour bytes are read before any row is written.
;
;   pPred must align to 16
;*******************************************************************************

; Load the byte at address %2 and replicate it into the low dword of %1.
;   %1 = xmm destination, %2 = address expression; r2 is used as scratch.
%macro I4X4_PRED_H_SPLAT_LEFT 2
    movzx       r2, byte [%2]
    movd        %1,   r2d
    pmuludq     %1,   [pic(mmx_01bytes)]    ; byte * 0x01010101 -> four copies
%endmacro

WELS_EXTERN WelsDecoderI4x4LumaPredH_sse2
    %assign push_num 0
    INIT_X86_32_PIC r3
    LOAD_2_PARA
    SIGN_EXTENSION r1, r1d

    ; gather and broadcast the four left neighbours
    I4X4_PRED_H_SPLAT_LEFT xmm0, r0-1           ; row 0
    I4X4_PRED_H_SPLAT_LEFT xmm1, r0+r1-1        ; row 1

    lea         r0, [r0+r1]                     ; step down one row
    I4X4_PRED_H_SPLAT_LEFT xmm2, r0+r1-1        ; row 2
    I4X4_PRED_H_SPLAT_LEFT xmm3, r0+2*r1-1      ; row 3

    ; back to row 0, then write the four rows
    sub         r0,    r1
    movd        [r0], xmm0
    movd        [r0+r1], xmm1
    lea         r0, [r0+2*r1]
    movd        [r0], xmm2
    movd        [r0+r1], xmm3

    DEINIT_X86_32_PIC
    ret
214
215;*******************************************************************************
216; void WelsDecoderI16x16LumaPredPlane_sse2(uint8_t *pPred, const int32_t kiStride);
217;*******************************************************************************
; Plane prediction of a 16x16 luma block, written in place at pPred.
; As computed by the code below (top[] = row above the block, left[] = column
; to its left, index -1 = the top-left corner pixel):
;   H = sum_{i=0..7} (i+1) * (top[8+i]  - top[6-i])
;   V = sum_{i=0..7} (i+1) * (left[8+i] - left[6-i])
;   b = (5*H + 32) >> 6,   c = (5*V + 32) >> 6
;   a = (left[15] + top[15]) << 4
;   pred[y][x] = sat_u8((a + 16 + b*(x-7) + c*(y-7)) >> 5)   (16-bit word arithmetic)
; Register roles:
;   r4 = saved pPred, r3 = a (then the row-0 seed s), r2 = scratch / row counter
;   xmm1 = b x8, xmm4 = c x8, xmm0 = seed of the current row,
;   xmm5 / xmm6 = weight tables
; NOTE(review): assumes LOAD_2_PARA (asm_inc.asm, not visible here) leaves pPred
; in r0 and kiStride in r1, matching the prototype above -- confirm.
218WELS_EXTERN WelsDecoderI16x16LumaPredPlane_sse2
219    push r3
220    push r4
221    %assign push_num 2
222    INIT_X86_32_PIC r5
223    LOAD_2_PARA
224    PUSH_XMM 8
225    SIGN_EXTENSION r1, r1d
226    mov r4, r0 ; save r0 in r4
227    sub     r0, 1
228    sub     r0, r1                  ; r0 -> top-left neighbour (row -1, column -1)
229
230    ;for H
231    pxor    xmm7,   xmm7
232    movq    xmm0,   [r0]            ; top[-1..6]
233    movdqa  xmm5,   [pic(sse2_plane_dec)]
234    punpcklbw xmm0, xmm7            ; widen to words
235    pmullw  xmm0,   xmm5            ; top[-1]*8, top[0]*7, ..., top[6]*1
236    movq    xmm1,   [r0 + 9]        ; top[8..15]
237    movdqa  xmm6,   [pic(sse2_plane_inc)]
238    punpcklbw xmm1, xmm7
239    pmullw  xmm1,   xmm6            ; top[8]*1, ..., top[15]*8
240    psubw   xmm1,   xmm0
241
    ; xmm2 is not initialised before this first SUMW_HORIZON; only the low
    ; 16 bits of the sum are consumed below (movsx from r2w), and those do
    ; not depend on xmm2.
242    SUMW_HORIZON    xmm1,xmm0,xmm2
243    movd    r2d,    xmm1        ; H += (i + 1) * (top[8 + i] - top[6 - i]);
244    movsx   r2, r2w
245    imul    r2, 5
246    add     r2, 32
247    sar     r2, 6           ; b = (5 * H + 32) >> 6;
248    SSE2_Copy8Times xmm1, r2d   ; xmm1 = b,b,b,b,b,b,b,b
249
250    movzx   r3, BYTE [r0+16]        ; r3 = top[15]
251    sub r0, 3                       ; LOAD_COLUMN reads 4 bytes per row; the left neighbour becomes byte 3
252    LOAD_COLUMN     xmm0, xmm2, xmm3, xmm4, r0, r1  ; upper half of xmm0 = left[-1..6]; r0 advanced 8 rows
253
254    add     r0, 3                   ; back to column -1, now at row 7
255    movzx   r2, BYTE [r0+8*r1]      ; r2 = left[15]
256    add     r3, r2
257    shl     r3, 4           ;   a = (left[15*kiStride] + top[15]) << 4;
258
259    sub r0, 3
260    add     r0, r1                  ; skip row 7: start the second column load at row 8
261    LOAD_COLUMN     xmm7, xmm2, xmm3, xmm4, r0, r1  ; upper half of xmm7 = left[8..15]
262    pxor    xmm4,   xmm4
263    punpckhbw xmm0, xmm4            ; widen left[-1..6] to words
264    pmullw  xmm0,   xmm5            ; * 8,7,...,1
265    punpckhbw xmm7, xmm4            ; widen left[8..15] to words
266    pmullw  xmm7,   xmm6            ; * 1,2,...,8
267    psubw   xmm7,   xmm0
268
269    ; Indicate that xmm2 is fully initialized. Its actual value doesn't
270    ; matter in SUMW_HORIZON below, but after being used in LOAD_COLUMN above,
271    ; valgrind thinks that xmm2 contains uninitalized data (if the columns outside
272    ; of the left are uninitialized, such as in DecUT_IntraPrediction), which taints
273    ; r2d below, even if actually isn't based on the uninitialized data.
274    pxor xmm2, xmm2
275
276    SUMW_HORIZON   xmm7,xmm0,xmm2
277    movd    r2d,   xmm7         ; V
278    movsx   r2, r2w
279
280    imul    r2, 5
281    add     r2, 32
282    sar     r2, 6               ; c = (5 * V + 32) >> 6;
283    SSE2_Copy8Times xmm4, r2d       ; xmm4 = c,c,c,c,c,c,c,c
284
285    mov r0, r4                      ; restore pPred
286    add     r3, 16
287    imul    r2, -7
288    add     r3, r2      ; s = a + 16 + (-7)*c
289    SSE2_Copy8Times xmm0, r3d       ; xmm0 = s,s,s,s,s,s,s,s
290
291    xor     r2, r2                  ; row counter
292    movdqa  xmm5,   [pic(sse2_plane_inc_minus)]
293
    ; One iteration per row: columns 0..7 use weights -7..0 (xmm5), columns
    ; 8..15 use weights 1..8 (xmm6); xmm0 grows by c after every row.
294get_i16x16_luma_pred_plane_sse2_1:
295    movdqa  xmm2,   xmm1
296    pmullw  xmm2,   xmm5
297    paddw   xmm2,   xmm0
298    psraw   xmm2,   5
299    movdqa  xmm3,   xmm1
300    pmullw  xmm3,   xmm6
301    paddw   xmm3,   xmm0
302    psraw   xmm3,   5
303    packuswb xmm2,  xmm3            ; saturate the 16 words to unsigned bytes
304    movdqa  [r0],   xmm2            ; aligned 16-byte store of the row
305    paddw   xmm0,   xmm4
306    add     r0, r1
307    inc     r2
308    cmp     r2, 16
309    jnz get_i16x16_luma_pred_plane_sse2_1
310
311    POP_XMM
312    DEINIT_X86_32_PIC
313    pop r4
314    pop r3
315    ret
316
317
318
;*******************************************************************************
; void WelsDecoderI16x16LumaPredH_sse2(uint8_t *pPred, const int32_t kiStride);
;
; Horizontal prediction of a 16x16 luma block: each of the 16 rows is filled
; with the byte immediately to the left of that row. Rows are processed in
; pairs; every load and store is an aligned 16-byte access.
;*******************************************************************************

; Fill the two rows at %1 and %1+%2 with their respective left neighbours.
;   %1 = row pointer (not modified), %2 = stride register; xmm0 is clobbered.
%macro I16X16_PRED_H_FILL_TWO_ROWS 2
    COPY_16_TIMES  %1,  xmm0
    movdqa  [%1],       xmm0
    COPY_16_TIMESS %1,  xmm0,   %2
    movdqa  [%1+%2],    xmm0
%endmacro

; Step %1 down by two rows, then fill the two rows found there.
%macro SSE2_PRED_H_16X16_TWO_LINE_DEC 2
    lea     %1, [%1+%2*2]

    I16X16_PRED_H_FILL_TWO_ROWS %1, %2
%endmacro

WELS_EXTERN WelsDecoderI16x16LumaPredH_sse2
    %assign push_num 0
    INIT_X86_32_PIC_NOPRESERVE r2
    LOAD_2_PARA
    SIGN_EXTENSION r1, r1d

    ; rows 0 and 1
    I16X16_PRED_H_FILL_TWO_ROWS r0, r1

    ; rows 2..15, two at a time
%rep 7
    SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
%endrep

    DEINIT_X86_32_PIC
    ret
353
;*******************************************************************************
; void WelsDecoderI16x16LumaPredV_sse2(uint8_t *pPred, const int32_t kiStride);
;
; Vertical prediction of a 16x16 luma block: the 16 bytes of the row directly
; above the block are copied into each of its 16 rows. All accesses are
; aligned 16-byte movdqa loads/stores.
;*******************************************************************************
WELS_EXTERN WelsDecoderI16x16LumaPredV_sse2
    %assign push_num 0
    LOAD_2_PARA
    SIGN_EXTENSION r1, r1d

    sub     r0, r1                  ; r0 -> row above the block
    movdqa  xmm0, [r0]              ; the 16 top neighbours

    movdqa  [r0+r1], xmm0           ; row 0
    lea     r0, [r0+2*r1]           ; r0 -> row 1

    ; rows 1..14: two stores, then step down two rows
%rep 7
    movdqa  [r0],     xmm0
    movdqa  [r0+r1], xmm0
    lea     r0, [r0+2*r1]
%endrep

    movdqa  [r0],     xmm0          ; row 15

    ret
391
392;*******************************************************************************
393; void WelsDecoderIChromaPredPlane_sse2(uint8_t *pPred, const int32_t kiStride);
394;*******************************************************************************
; Plane prediction of an 8x8 chroma block, written in place at pPred.
; As computed by the code below (top[] = row above the block, left[] = column
; to its left, index -1 = the top-left corner pixel):
;   H = sum_{i=0..3} (i+1) * (top[4+i]  - top[2-i])
;   V = sum_{i=0..3} (i+1) * (left[4+i] - left[2-i])
;   b = (17*H + 16) >> 5,   c = (17*V + 16) >> 5
;   a = (left[7] + top[7]) << 4
;   pred[y][x] = sat_u8((a + 16 + b*(x-3) + c*(y-3)) >> 5)   (16-bit word arithmetic)
; The gradients are formed in MMX registers and moved to XMM with movq2dq;
; WELSEMMS is issued before returning.
; NOTE(review): assumes LOAD_2_PARA (asm_inc.asm, not visible here) leaves pPred
; in r0 and kiStride in r1, matching the prototype above -- confirm.
395WELS_EXTERN WelsDecoderIChromaPredPlane_sse2
396    push r3
397    push r4
398    %assign push_num 2
399    INIT_X86_32_PIC r5
400    LOAD_2_PARA
401    PUSH_XMM 8
402    SIGN_EXTENSION r1, r1d
403    mov r4, r0                      ; keep pPred for the store loop
404    sub     r0, 1
405    sub     r0, r1                  ; r0 -> top-left neighbour (row -1, column -1)
406
407    pxor    mm7,    mm7
408    movq    mm0,    [r0]            ; low 4 bytes = top[-1..2]
409    movq    mm5,    [pic(sse2_plane_dec_c)]
410    punpcklbw mm0,  mm7             ; widen the low 4 bytes to words
411    pmullw  mm0,    mm5             ; top[-1]*4, top[0]*3, top[1]*2, top[2]*1
412    movq    mm1,    [r0 + 5]        ; low 4 bytes = top[4..7]
413    movq    mm6,    [pic(sse2_plane_inc_c)]
414    punpcklbw mm1,  mm7
415    pmullw  mm1,    mm6             ; top[4]*1, ..., top[7]*4
416    psubw   mm1,    mm0
417
418    movq2dq xmm1,   mm1             ; upper 64 bits of xmm1 are zeroed
419    pxor    xmm2,   xmm2
420    SUMW_HORIZON    xmm1,xmm0,xmm2
421    movd    r2d,    xmm1            ; H (low 16 bits)
422    movsx   r2, r2w
423    imul    r2, 17
424    add     r2, 16
425    sar     r2, 5           ; b = (17 * H + 16) >> 5;
426    SSE2_Copy8Times xmm1, r2d   ; xmm1 = b,b,b,b,b,b,b,b
427
428    movzx   r3, BYTE [r0+8]         ; r3 = top[7]
429    sub r0, 3                       ; LOAD_COLUMN_C reads 4 bytes per row; the left neighbour becomes byte 3
430    LOAD_COLUMN_C   mm0, mm2, mm3, mm4, r0, r1  ; upper half of mm0 = left[-1..2]; r0 advanced 4 rows
431
432    add     r0, 3                   ; back to column -1, now at row 3
433    movzx   r2, BYTE [r0+4*r1]      ; r2 = left[7]
434    add     r3, r2
435    shl     r3, 4           ; a = (left[7*kiStride] + top[7]) << 4;
436
437    sub r0, 3
438    add     r0, r1                  ; skip row 3: start the second column load at row 4
439    LOAD_COLUMN_C   mm7, mm2, mm3, mm4, r0, r1  ; upper half of mm7 = left[4..7]
440    pxor    mm4,    mm4
441    punpckhbw mm0,  mm4             ; widen left[-1..2] to words
442    pmullw  mm0,    mm5             ; * 4,3,2,1
443    punpckhbw mm7,  mm4             ; widen left[4..7] to words
444    pmullw  mm7,    mm6             ; * 1,2,3,4
445    psubw   mm7,    mm0
446
447    movq2dq xmm7,   mm7
448    pxor    xmm2,   xmm2
449    SUMW_HORIZON    xmm7,xmm0,xmm2
450    movd    r2d,    xmm7            ; V
451    movsx   r2, r2w
452
453    imul    r2, 17
454    add     r2, 16
455    sar     r2, 5               ; c = (17 * V + 16) >> 5;
456    SSE2_Copy8Times xmm4, r2d       ; xmm4 = c,c,c,c,c,c,c,c
457
458    mov     r0, r4                  ; restore pPred
459    add     r3, 16
460    imul    r2, -3
461    add     r3, r2              ; s = a + 16 + (-3)*c
462    SSE2_Copy8Times xmm0, r3d       ; xmm0 = s,s,s,s,s,s,s,s
463
464    xor     r2, r2                  ; row counter
465    movdqa  xmm5,   [pic(sse2_plane_mul_b_c)]
466
    ; One iteration per row: the eight columns use weights -3..4 (xmm5);
    ; xmm0 grows by c after every row.
467get_i_chroma_pred_plane_sse2_1:
468    movdqa  xmm2,   xmm1
469    pmullw  xmm2,   xmm5
470    paddw   xmm2,   xmm0
471    psraw   xmm2,   5
472    packuswb xmm2,  xmm2            ; saturate to unsigned bytes
473    movq    [r0],   xmm2            ; store the 8 pixels of the row
474    paddw   xmm0,   xmm4
475    add     r0, r1
476    inc     r2
477    cmp     r2, 8
478    jnz get_i_chroma_pred_plane_sse2_1
479
480    POP_XMM
481    DEINIT_X86_32_PIC
482    pop r4
483    pop r3
484    WELSEMMS
485    ret
486
487;*******************************************************************************
488;   0 |1 |2 |3 |4 |
489;   6 |7 |8 |9 |10|
490;   11|12|13|14|15|
491;   16|17|18|19|20|
492;   21|22|23|24|25|
493;   7 is the start pixel of current 4x4 block
494;   pPred[7] = ([6]+[0]*2+[1]+2)/4
495;
496;   void WelsDecoderI4x4LumaPredDDR_mmx(uint8_t *pPred, const int32_t kiStride)
497;
498;*******************************************************************************
; Diagonal-down-right prediction of a 4x4 luma block (numbering as in the
; diagram above; pPred points at 7).
; The neighbours are packed into one register as, from lowest byte upwards,
; 21,16,11,6,0,1,2,3. Each output byte is then (a + 2*b + c + 2) >> 2 of three
; consecutive neighbours, and the four rows are four overlapping 4-byte windows
; of that result. Byte positions in the comments below are 1-based.
; The filter is built from pavgb without overflow: pavgb(a,c) minus the low bit
; of (a xor c) gives floor((a+c)/2), and a second pavgb with b completes it.
; The left neighbours are fetched with 8-byte loads ending at column -1, so the
; 8 bytes to the left of each row must be readable.
499WELS_EXTERN WelsDecoderI4x4LumaPredDDR_mmx
500    %assign push_num 0
501    INIT_X86_32_PIC r3
502    LOAD_2_PARA
503    SIGN_EXTENSION r1, r1d
504    mov r2, r0
505
506    movq        mm1,[r2+r1-8]       ;get value of 11,decreasing 8 is trying to improve the performance of movq mm1[8] = 11
507    movq        mm2,[r2-8]          ;get value of 6 mm2[8] = 6
508    sub     r2, r1          ;move r2 to the row above the current block (position of 1)
509    punpckhbw   mm2,[r2-8]          ;mm2[8](high 8th byte of mm2) = [0](value of 0), mm2[7]= [6]
510    movd        mm3,[r2]            ;get value 1, mm3[1] = [1],mm3[2]=[2],mm3[3]=[3]
511    punpckhwd   mm1,mm2             ;mm1[8]=[0],mm1[7]=[6],mm1[6]=[11]
512    psllq       mm3,18h             ;mm3[4]=[1]
513    psrlq       mm1,28h             ;mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
514    por         mm3,mm1             ;mm3[6]=[3],mm3[5]=[2],mm3[4]=[1],mm3[3]=[0],mm3[2]=[6],mm3[1]=[11]
515    movq        mm1,mm3             ;mm1[6]=[3],mm1[5]=[2],mm1[4]=[1],mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
516    lea         r2,[r2+r1*2-8h]     ;r2 = 8 bytes before 12, so an 8-byte load at r2+k*r1 ends on a left neighbour
517    movq        mm4,[r2+r1]     ;get value of 16, mm4[8]=[16]
518    psllq       mm3,8               ;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=0
519    psrlq       mm4,38h             ;mm4[1]=[16]
520    por         mm3,mm4             ;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=[16]
521    movq        mm2,mm3             ;mm2[7]=[3],mm2[6]=[2],mm2[5]=[1],mm2[4]=[0],mm2[3]=[6],mm2[2]=[11],mm2[1]=[16]
522    movq        mm4,[r2+r1*2]       ;mm4[8]=[21]
523    psllq       mm3,8               ;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=0
524    psrlq       mm4,38h             ;mm4[1]=[21]
525    por         mm3,mm4             ;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=[21]
526    movq        mm4,mm3             ;mm4[8]=[3],mm4[7]=[2],mm4[6]=[1],mm4[5]=[0],mm4[4]=[6],mm4[3]=[11],mm4[2]=[16],mm4[1]=[21]
527    pavgb       mm3,mm1             ;mm3=([11]+[21]+1)/2
528    pxor        mm1,mm4             ;find odd value in the lowest bit of each byte
529    pand        mm1,[pic(mmx_01bytes)]   ;set the odd bit
530    psubusb     mm3,mm1             ;decrease 1 from odd bytes
531    pavgb       mm2,mm3             ;mm2=(([11]+[21]+1)/2+1+[16])/2
532
    ; rows are written bottom-up: row 3 takes bytes 1..4 of mm2, and each
    ; higher row takes the window one byte further up
533    lea         r0,[r0+r1]
534    movd        [r0+2*r1],mm2       ; row 3
535    sub         r0,r1
536    psrlq       mm2,8
537    movd        [r0+2*r1],mm2       ; row 2
538    psrlq       mm2,8
539    movd        [r0+r1],mm2         ; row 1
540    psrlq       mm2,8
541    movd        [r0],mm2            ; row 0
542    DEINIT_X86_32_PIC
543    WELSEMMS
544    ret
545
546
;*******************************************************************************
;   void WelsDecoderIChromaPredH_mmx(uint8_t *pPred, const int32_t kiStride)
;
;   Horizontal prediction of an 8x8 chroma block: each of the 8 rows is filled
;   with the byte immediately to the left of that row.
;*******************************************************************************

; Broadcast the byte at [%3-1] to the eight bytes stored at [%4].
;   %1 = mm work register, %2 = not used, %3 = source row pointer,
;   %4 = destination address expression.
; The source is fetched with an 8-byte load ending at column -1.
%macro MMX_PRED_H_8X8_ONE_LINE 4
    movq        %1,     [%3-8]
    psrlq       %1,     56                      ; left neighbour now in byte 0

    pmullw      %1,     [pic(mmx_01bytes)]      ; word 0 = byte * 0x0101
    pshufw      %1,     %1, 0                   ; replicate word 0 to all four words
    movq        [%4],   %1
%endmacro

; Same as MMX_PRED_H_8X8_ONE_LINE, but the source row is one stride (r1)
; below %3.
%macro MMX_PRED_H_8X8_ONE_LINEE 4
    movq        %1,     [%3+r1-8]
    psrlq       %1,     56                      ; left neighbour now in byte 0

    pmullw      %1,     [pic(mmx_01bytes)]      ; word 0 = byte * 0x0101
    pshufw      %1,     %1, 0                   ; replicate word 0 to all four words
    movq        [%4],   %1
%endmacro

WELS_EXTERN WelsDecoderIChromaPredH_mmx
    %assign push_num 0
    INIT_X86_32_PIC r3
    LOAD_2_PARA
    SIGN_EXTENSION r1, r1d
    mov r2, r0                                  ; r2 walks the source rows, r0 the destination rows

    ; rows 0 and 1
    MMX_PRED_H_8X8_ONE_LINE  mm0, mm1, r2, r0
    MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1

    ; rows 2..7: r2 is stepped before the even row, r0 before the odd row
%rep 3
    lea         r2, [r2+r1*2]
    MMX_PRED_H_8X8_ONE_LINE  mm0, mm1, r2, r0+2*r1

    lea         r0, [r0+2*r1]
    MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1
%endrep

    DEINIT_X86_32_PIC
    WELSEMMS
    ret
606
607
;*******************************************************************************
;   void WelsDecoderIChromaPredV_mmx(uint8_t *pPred, const int32_t kiStride)
;
;   Vertical prediction of an 8x8 chroma block: the 8 bytes of the row directly
;   above the block are copied into each of its 8 rows.
;*******************************************************************************
WELS_EXTERN WelsDecoderIChromaPredV_mmx
    %assign push_num 0
    LOAD_2_PARA
    SIGN_EXTENSION r1, r1d

    sub         r0,     r1              ; r0 -> row above the block
    movq        mm0,        [r0]        ; the 8 top neighbours

    ; rows 0..5: write two rows below r0, then step r0 down two rows
%rep 3
    movq        [r0+r1],      mm0
    movq        [r0+2*r1],    mm0
    lea         r0, [r0+2*r1]
%endrep

    ; rows 6 and 7
    movq        [r0+r1],      mm0
    movq        [r0+2*r1],    mm0

    WELSEMMS
    ret
634
635
636;*******************************************************************************
637;   lt|t0|t1|t2|t3|
638;   l0|
639;   l1|
640;   l2|
641;   l3|
642;   t3 is never used
643;   destination:
644;   |a |b |c |d |
645;   |e |f |a |b |
646;   |g |h |e |f |
647;   |i |j |g |h |
648
649;   a = (1 + lt + l0)>>1
650;   e = (1 + l0 + l1)>>1
651;   g = (1 + l1 + l2)>>1
652;   i = (1 + l2 + l3)>>1
653
654;   d = (2 + t0 + (t1<<1) + t2)>>2
655;   c = (2 + lt + (t0<<1) + t1)>>2
656;   b = (2 + l0 + (lt<<1) + t0)>>2
657
658;   f = (2 + l1 + (l0<<1) + lt)>>2
659;   h = (2 + l2 + (l1<<1) + l0)>>2
660;   j = (2 + l3 + (l2<<1) + l1)>>2
661;   [b a f e h g j i] + [d c b a] --> mov to memory
662;
663;   void WelsDecoderI4x4LumaPredHD_mmx(uint8_t *pPred, const int32_t kiStride)
664;*******************************************************************************
; Horizontal-down prediction of a 4x4 luma block.
; The neighbours are packed as [t2 t1 t0 lt l0 l1 l2 l3] (highest byte first).
; Two-tap averages (a,e,g,i) and three-tap (x + 2*y + z + 2) >> 2 values
; (b,c,d,f,h,j) are then computed bytewise and interleaved into the row layout
; shown above. The three-tap value is formed without overflow as
; pavgb(y, floor((x+z)/2)), where the floor is pavgb(x,z) minus the low bit of
; (x xor z).
; The operands are disjoint wherever pxor is used to merge registers, so pxor
; acts as a bitwise OR there.
665WELS_EXTERN WelsDecoderI4x4LumaPredHD_mmx
666    %assign push_num 0
667    INIT_X86_32_PIC r3
668    LOAD_2_PARA
669    SIGN_EXTENSION r1, r1d
670    mov r2, r0
671    sub         r2, r1                  ; r2 -> row above the block
672    movd        mm0, [r2-1]            ; mm0 = [xx xx xx xx t2 t1 t0 lt]
673    psllq       mm0, 20h                ; mm0 = [t2 t1 t0 lt xx xx xx xx]
674
675    movd        mm1, [r2+2*r1-4]        ; 4 bytes ending at l1
676    punpcklbw   mm1, [r2+r1-4]        ; mm1[7] = l0, mm1[6] = l1
677    lea         r2, [r2+2*r1]
678    movd        mm2, [r2+2*r1-4]        ; 4 bytes ending at l3
679    punpcklbw   mm2, [r2+r1-4]        ; mm2[7] = l2, mm2[6] = l3
680    punpckhwd   mm2, mm1                ; mm2 = [l0 l1 l2 l3 xx xx xx xx]
681    psrlq       mm2, 20h
682    pxor        mm0, mm2                ; mm0 = [t2 t1 t0 lt l0 l1 l2 l3]
683
684    movq        mm1, mm0
685    psrlq       mm1, 10h                ; mm1 = [xx xx t2 t1 t0 lt l0 l1]
686    movq        mm2, mm0
687    psrlq       mm2, 8h                 ; mm2 = [xx t2 t1 t0 lt l0 l1 l2]
688    movq        mm3, mm2
689    movq        mm4, mm1
690    pavgb       mm1, mm0                ; rounded-up average of the two outer taps
691
692    pxor        mm4, mm0                ; find odd value in the lowest bit of each byte
693    pand        mm4, [pic(mmx_01bytes)] ; set the odd bit
694    psubusb     mm1, mm4                ; decrease 1 from odd bytes
695
696    pavgb       mm2, mm1                ; mm2 = [xx xx d  c  b  f  h  j]
697
698    movq        mm4, mm0
699    pavgb       mm3, mm4                ; mm3 = [xx xx xx xx a  e  g  i]
700    punpcklbw   mm3, mm2                ; mm3 = [b  a  f  e  h  g  j  i]
701
702    psrlq       mm2, 20h
703    psllq       mm2, 30h                ; mm2 = [d  c  0  0  0  0  0  0]
704    movq        mm4, mm3
705    psrlq       mm4, 10h                ; mm4 = [0  0  b  a  f  e  h  g]
706    pxor        mm2, mm4                ; mm2 = [d  c  b  a  xx xx xx xx]
707    psrlq       mm2, 20h                ; mm2 = [xx xx xx xx  d  c  b  a]
708
709    movd        [r0], mm2               ; row 0 = a b c d
710    lea         r0, [r0+r1]
711    movd        [r0+2*r1], mm3          ; row 3 = i j g h
712    sub         r0, r1
713    psrlq       mm3, 10h
714    movd        [r0+2*r1], mm3          ; row 2 = g h e f
715    psrlq       mm3, 10h
716    movd        [r0+r1], mm3            ; row 1 = e f a b
717    DEINIT_X86_32_PIC
718    WELSEMMS
719    ret
720
721
722
723;*******************************************************************************
724;   lt|t0|t1|t2|t3|
725;   l0|
726;   l1|
727;   l2|
728;   l3|
729;   only l0..l3 are used; lt and t0..t3 are never read
730;   destination:
731;   |a |b |c |d |
732;   |c |d |e |f |
733;   |e |f |g |g |
734;   |g |g |g |g |
735
736;   a = (1 + l0 + l1)>>1
737;   c = (1 + l1 + l2)>>1
738;   e = (1 + l2 + l3)>>1
739;   g = l3
740
741;   b = (2 + l0 + (l1<<1) + l2)>>2
742;   d = (2 + l1 + (l2<<1) + l3)>>2
743;   f = (2 + l2 + (l3<<1) + l3)>>2
744
745;   [g g f e d c b a] + [g g g g] --> mov to memory
746;
747;   void WelsDecoderI4x4LumaPredHU_mmx(uint8_t *pPred, const int32_t kiStride)
748;*******************************************************************************
; Horizontal-up prediction of a 4x4 luma block.
; Only the four left neighbours l0..l3 are read. They are packed into the upper
; bytes of mm0 with l3 duplicated on top. Two-tap averages (a,c,e) and three-tap
; (x + 2*y + z + 2) >> 2 values (b,d,f) are then computed bytewise and
; interleaved; g is l3 itself.
; The three-tap value is formed without overflow as pavgb(y, floor((x+z)/2)),
; where the floor is pavgb(x,z) minus the low bit of (x xor z).
749WELS_EXTERN WelsDecoderI4x4LumaPredHU_mmx
750    %assign push_num 0
751    INIT_X86_32_PIC r3
752    LOAD_2_PARA
753    SIGN_EXTENSION r1, r1d
754    mov r2, r0
755
756    movd        mm0, [r2-4]            ; mm0[3] = l0
757    punpcklbw   mm0, [r2+r1-4]        ; mm0[7] = l1, mm0[6] = l0
758    lea         r2, [r2+2*r1]
759    movd        mm2, [r2-4]            ; mm2[3] = l2
760    movd        mm4, [r2+r1-4]        ; mm4[3] = l3
761    punpcklbw   mm2, mm4
762    punpckhwd   mm0, mm2                ; mm0 = [l3 l2 l1 l0 xx xx xx xx]
763
764    psrlq       mm4, 18h
765    psllq       mm4, 38h                ; mm4 = [l3 xx xx xx xx xx xx xx]
766    psrlq       mm0, 8h
767    pxor        mm0, mm4                ; mm0 = [l3 l3 l2 l1 l0 xx xx xx]
768
769    movq        mm1, mm0
770    psllq       mm1, 8h                 ; mm1 = [l3 l2 l1 l0 xx xx xx xx]
771    movq        mm3, mm1                ; mm3 = [l3 l2 l1 l0 xx xx xx xx]
772    pavgb       mm1, mm0                ; mm1 = [g  e  c  a  xx xx xx xx]
773
774    movq        mm2, mm0
775    psllq       mm2, 10h                ; mm2 = [l2 l1 l0 xx xx xx xx xx]
776    movq        mm5, mm2
777    pavgb       mm2, mm0                ; rounded-up average of the two outer taps
778
779    pxor        mm5, mm0                ; find odd value in the lowest bit of each byte
780    pand        mm5, [pic(mmx_01bytes)] ; set the odd bit
781    psubusb     mm2, mm5                ; decrease 1 from odd bytes
782
783    pavgb       mm2, mm3                ; mm2 = [f  d  b  xx xx xx xx xx]
784
785    psrlq       mm2, 8h
786    pxor        mm2, mm4                ; mm2 = [g  f  d  b  xx xx xx xx]
787
788    punpckhbw   mm1, mm2                ; mm1 = [g  g  f  e  d  c  b  a]
789    punpckhbw   mm4, mm4                ; mm4 = [g  g  xx xx xx xx xx xx]
790    punpckhbw   mm4, mm4                ; mm4 = [g  g  g  g  xx xx xx xx]
791
792    psrlq       mm4, 20h
793    lea         r0, [r0+r1]
794    movd        [r0+2*r1], mm4          ; row 3 = g g g g
795
796    sub         r0, r1
797    movd        [r0], mm1               ; row 0 = a b c d
798    psrlq       mm1, 10h
799    movd        [r0+r1], mm1            ; row 1 = c d e f
800    psrlq       mm1, 10h
801    movd        [r0+2*r1], mm1          ; row 2 = e f g g
802    DEINIT_X86_32_PIC
803    WELSEMMS
804    ret
805
806
807
808;*******************************************************************************
809;   lt|t0|t1|t2|t3|
810;   l0|
811;   l1|
812;   l2|
813;   l3|
814;   l3 is never used
815;   destination:
816;   |a |b |c |d |
817;   |e |f |g |h |
818;   |i |a |b |c |
819;   |j |e |f |g |
820
821;   a = (1 + lt + t0)>>1
822;   b = (1 + t0 + t1)>>1
823;   c = (1 + t1 + t2)>>1
824;   d = (1 + t2 + t3)>>1
825
826;   e = (2 + l0 + (lt<<1) + t0)>>2
827;   f = (2 + lt + (t0<<1) + t1)>>2
828;   g = (2 + t0 + (t1<<1) + t2)>>2
829
830;   h = (2 + t1 + (t2<<1) + t3)>>2
831;   i = (2 + lt + (l0<<1) + l1)>>2
832;   j = (2 + l0 + (l1<<1) + l2)>>2
833;
834;   void WelsDecoderI4x4LumaPredVR_mmx(uint8_t *pPred, const int32_t kiStride)
835;*******************************************************************************
; Vertical-right prediction of a 4x4 luma block.
; The neighbours are packed as [t3 t2 t1 t0 lt l0 l1 l2] (highest byte first).
; Two-tap averages (a..d) and three-tap (x + 2*y + z + 2) >> 2 values (e..j)
; are computed bytewise. Rows 2 and 3 reuse rows 0 and 1 shifted right by one
; pixel, with i and j inserted in column 0.
; The three-tap value is formed without overflow as pavgb(y, floor((x+z)/2)),
; where the floor is pavgb(x,z) minus the low bit of (x xor z).
; The top row is fetched with an 8-byte movq starting at lt, and l2 with an
; 8-byte movq ending at column -1, so those extra bytes must be readable.
836WELS_EXTERN WelsDecoderI4x4LumaPredVR_mmx
837    %assign push_num 0
838    INIT_X86_32_PIC r3
839    LOAD_2_PARA
840    SIGN_EXTENSION r1, r1d
841    mov r2, r0
842    sub         r2, r1                  ; r2 -> row above the block
843    movq        mm0, [r2-1]            ; mm0 = [xx xx xx t3 t2 t1 t0 lt]
844    psllq       mm0, 18h                ; mm0 = [t3 t2 t1 t0 lt xx xx xx]
845
846    movd        mm1, [r2+2*r1-4]        ; 4 bytes ending at l1
847    punpcklbw   mm1, [r2+r1-4]        ; mm1[7] = l0, mm1[6] = l1
848    lea         r2, [r2+2*r1]
849    movq        mm2, [r2+r1-8]        ; mm2[7] = l2
850    punpckhwd   mm2, mm1                ; mm2 = [l0 l1 l2 xx xx xx xx xx]
851    psrlq       mm2, 28h
852    pxor        mm0, mm2                ; mm0 = [t3 t2 t1 t0 lt l0 l1 l2]
853
854    movq        mm1, mm0
855    psllq       mm1, 8h                 ; mm1 = [t2 t1 t0 lt l0 l1 l2 xx]
856    pavgb       mm1, mm0                ; mm1 = [d  c  b  a  xx xx xx xx]
857
858    movq        mm2, mm0
859    psllq       mm2, 10h                ; mm2 = [t1 t0 lt l0 l1 l2 xx xx]
860    movq        mm3, mm2
861    pavgb       mm2, mm0                ; rounded-up average of the two outer taps
862
863    pxor        mm3, mm0                ; find odd value in the lowest bit of each byte
864    pand        mm3, [pic(mmx_01bytes)] ; set the odd bit
865    psubusb     mm2, mm3                ; decrease 1 from odd bytes
866
867    movq        mm3, mm0
868    psllq       mm3, 8h                 ; mm3 = [t2 t1 t0 lt l0 l1 l2 xx]
869    pavgb       mm3, mm2                ; mm3 = [h  g  f  e  i  j  xx xx]
870    movq        mm2, mm3
871
872    psrlq       mm1, 20h                ; mm1 = [xx xx xx xx d  c  b  a]
873    movd        [r0], mm1               ; row 0 = a b c d
874
875    psrlq       mm2, 20h                ; mm2 = [xx xx xx xx h  g  f  e]
876    movd        [r0+r1], mm2            ; row 1 = e f g h
877
878    movq        mm4, mm3
879    psllq       mm4, 20h
880    psrlq       mm4, 38h                ; mm4 = [xx xx xx xx xx xx xx i]
881
882    movq        mm5, mm3
883    psllq       mm5, 28h
884    psrlq       mm5, 38h                ; mm5 = [xx xx xx xx xx xx xx j]
885
886    psllq       mm1, 8h
887    pxor        mm4, mm1                ; mm4 = [xx xx xx xx c  b  a  i]
888    movd        [r0+2*r1], mm4          ; row 2 = i a b c
889
890    psllq       mm2, 8h
891    pxor        mm5, mm2                ; mm5 = [xx xx xx xx g  f  e  j]
892    lea         r0, [r0+2*r1]
893    movd        [r0+r1], mm5            ; row 3 = j e f g
894    DEINIT_X86_32_PIC
895    WELSEMMS
896    ret
897
898;*******************************************************************************
899;   lt|t0|t1|t2|t3|t4|t5|t6|t7
900;   l0|
901;   l1|
902;   l2|
903;   l3|
904;   lt,l0,l1,l2,l3 are never used
905;   destination:
906;   |a |b |c |d |
907;   |b |c |d |e |
908;   |c |d |e |f |
909;   |d |e |f |g |
910
911;   a = (2 + t0 + t2 + (t1<<1))>>2
912;   b = (2 + t1 + t3 + (t2<<1))>>2
913;   c = (2 + t2 + t4 + (t3<<1))>>2
914;   d = (2 + t3 + t5 + (t4<<1))>>2
915
916;   e = (2 + t4 + t6 + (t5<<1))>>2
917;   f = (2 + t5 + t7 + (t6<<1))>>2
918;   g = (2 + t6 + t7 + (t7<<1))>>2
919
920;   [g f e d c b a] --> mov to memory
921;
922;   void WelsDecoderI4x4LumaPredDDL_mmx(uint8_t *pPred, const int32_t kiStride)
923;*******************************************************************************
; Diagonal-down-left prediction of a 4x4 luma block.
; Only the eight pixels t0..t7 of the row above the block are read, with one
; 8-byte movq. Each output byte is (x + 2*y + z + 2) >> 2 of three consecutive
; top pixels, with t7 repeated past the end. The four rows are four overlapping
; 4-byte windows of that result.
; The three-tap value is formed without overflow as pavgb(y, floor((x+z)/2)),
; where the floor is pavgb(x,z) minus the low bit of (x xor z).
924WELS_EXTERN WelsDecoderI4x4LumaPredDDL_mmx
925    %assign push_num 0
926    INIT_X86_32_PIC r3
927    LOAD_2_PARA
928    SIGN_EXTENSION r1, r1d
929    mov r2, r0
930    sub         r2, r1                  ; r2 -> row above the block
931    movq        mm0, [r2]              ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
932    movq        mm1, mm0
933    movq        mm2, mm0
934
935    movq        mm3, mm0
936    psrlq       mm3, 38h
937    psllq       mm3, 38h                ; mm3 = [t7 xx xx xx xx xx xx xx]
938
939    psllq       mm1, 8h                 ; mm1 = [t6 t5 t4 t3 t2 t1 t0 xx]
940    psrlq       mm2, 8h
941    pxor        mm2, mm3                ; mm2 = [t7 t7 t6 t5 t4 t3 t2 t1]
942
943    movq        mm3, mm1
944    pavgb       mm1, mm2                ; rounded-up average of the two outer taps
945    pxor        mm3, mm2                ; find odd value in the lowest bit of each byte
946    pand        mm3, [pic(mmx_01bytes)] ; set the odd bit
947    psubusb     mm1, mm3                ; decrease 1 from odd bytes
948
949    pavgb       mm0, mm1                ; mm0 = [g f e d c b a xx]
950
951    psrlq       mm0, 8h
952    movd        [r0], mm0               ; row 0 = a b c d
953    psrlq       mm0, 8h
954    movd        [r0+r1], mm0            ; row 1 = b c d e
955    psrlq       mm0, 8h
956    movd        [r0+2*r1], mm0          ; row 2 = c d e f
957    psrlq       mm0, 8h
958    lea         r0, [r0+2*r1]
959    movd        [r0+r1], mm0            ; row 3 = d e f g
960    DEINIT_X86_32_PIC
961    WELSEMMS
962    ret
963
964
;*******************************************************************************
;   Intra 4x4 luma prediction, vertical-left mode.
;
;   Neighbourhood of the block:
;   lt|t0|t1|t2|t3|t4|t5|t6|t7
;   l0|
;   l1|
;   l2|
;   l3|
;   Only the row above is read, and only t0..t6 reach the output;
;   lt and l0..l3 are never used.
;
;   destination:
;   |a |b |c |d |
;   |e |f |g |h |
;   |b |c |d |i |
;   |f |g |h |j |
;
;   a = (1 + t0 + t1)>>1
;   b = (1 + t1 + t2)>>1
;   c = (1 + t2 + t3)>>1
;   d = (1 + t3 + t4)>>1
;   i = (1 + t4 + t5)>>1
;
;   e = (2 + t0 + (t1<<1) + t2)>>2
;   f = (2 + t1 + (t2<<1) + t3)>>2
;   g = (2 + t2 + (t3<<1) + t4)>>2
;   h = (2 + t3 + (t4<<1) + t5)>>2
;   j = (2 + t4 + (t5<<1) + t6)>>2
;
;   [i d c b a] feeds rows 0 and 2, [j h g f e] feeds rows 1 and 3.
;
;   void WelsDecoderI4x4LumaPredVL_mmx(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderI4x4LumaPredVL_mmx
    %assign push_num 0
    INIT_X86_32_PIC r3
    LOAD_2_PARA
    SIGN_EXTENSION r1, r1d

    mov         r2, r0
    sub         r2, r1                  ; r2 = pPred - kiStride, the row above
    movq        mm0, [r2]               ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
    lea         r2, [r1+2*r1]           ; r2 = 3*kiStride, offset of the last row

    movq        mm1, mm0
    psrlq       mm1, 8                  ; mm1 = [00 t7 t6 t5 t4 t3 t2 t1]
    movq        mm2, mm0
    psrlq       mm2, 16                 ; mm2 = [00 00 t7 t6 t5 t4 t3 t2]

    ; two-tap average of horizontally adjacent top pixels
    movq        mm3, mm0
    pavgb       mm3, mm1                ; mm3 = [xx xx xx i  d  c  b  a]

    ; three-tap filter: floor-average the outer taps first, then average
    ; (rounding up) with the centre tap held in mm1.
    movq        mm4, mm2
    pxor        mm4, mm0                ; lowest bit set where the outer sum is odd
    pand        mm4, [pic(mmx_01bytes)] ; keep only that bit of each byte
    pavgb       mm2, mm0
    psubusb     mm2, mm4                ; undo pavgb's round-up
    pavgb       mm2, mm1                ; mm2 = [xx xx xx j  h  g  f  e]

    movd        [r0], mm3               ; row 0 = a b c d
    psrlq       mm3, 8
    movd        [r0+2*r1], mm3          ; row 2 = b c d i

    movd        [r0+r1], mm2            ; row 1 = e f g h
    psrlq       mm2, 8
    movd        [r0+r2], mm2            ; row 3 = f g h j
    DEINIT_X86_32_PIC
    WELSEMMS
    ret
1031
;*******************************************************************************
;   Intra 8x8 chroma DC prediction with both the top row and the left column
;   read as neighbours.
;
;   The block is filled as four 4x4 quadrants, each with its own DC value:
;     top-left     = (sum(t0..t3) + sum(left rows 0..3) + 4) >> 3
;     top-right    = (sum(t4..t7) + 2) >> 2
;     bottom-left  = (sum(left rows 4..7) + 2) >> 2
;     bottom-right = (sum(t4..t7) + sum(left rows 4..7) + 4) >> 3
;
;   Despite the _sse2 suffix, all vector work is done in MMX registers, so the
;   routine ends with WELSEMMS.
;
;   void WelsDecoderIChromaPredDc_sse2(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderIChromaPredDc_sse2
    push    r3
    push    r4
    %assign push_num 2
    INIT_X86_32_PIC r5
    LOAD_2_PARA
    SIGN_EXTENSION r1, r1d
    mov         r4, r0                  ; r4 = pPred, kept for the stores

    sub         r0, r1                  ; r0 = pPred - kiStride, the row above
    movq        mm0, [r0]               ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]

    ; split the top row into its two halves and sum each half
    pxor        mm4, mm4
    movq        mm3, mm0
    psrlq       mm0, 32                 ; keep t4..t7
    psllq       mm3, 32
    psrlq       mm3, 32                 ; keep t0..t3
    psadbw      mm0, mm4                ; mm0 = t4+t5+t6+t7
    psadbw      mm3, mm4                ; mm3 = t0+t1+t2+t3

    ; left column, rows 0..3 (byte at offset -1 of each row)
    movzx       r2, byte [r0+r1-1]
    lea         r0, [r0+2*r1]
    movzx       r3, byte [r0-1]
    add         r2, r3
    movzx       r3, byte [r0+r1-1]
    add         r2, r3
    lea         r0, [r0+2*r1]
    movzx       r3, byte [r0-1]
    add         r2, r3
    movd        mm1, r2d
    paddq       mm3, mm1                ; mm3 = top-left sum (8 samples)

    ; left column, rows 4..7
    movzx       r2, byte [r0+r1-1]
    lea         r0, [r0+2*r1]
    movzx       r3, byte [r0-1]
    add         r2, r3
    movzx       r3, byte [r0+r1-1]
    add         r2, r3
    lea         r0, [r0+2*r1]
    movzx       r3, byte [r0-1]
    add         r2, r3
    movd        mm2, r2d                ; mm2 = bottom-left sum (4 samples)

    movq        mm1, mm0
    paddq       mm1, mm2                ; mm1 = bottom-right sum (8 samples)

    ; rounding: four-sample sums get the constant once and shift by 2,
    ; eight-sample sums get it twice and shift by 3
    movq        mm4, [pic(mmx_0x02)]
    paddq       mm0, mm4
    psrlq       mm0, 2                  ; top-right DC
    paddq       mm2, mm4
    psrlq       mm2, 2                  ; bottom-left DC

    paddq       mm4, mm4                ; doubled rounding term
    paddq       mm3, mm4
    psrlq       mm3, 3                  ; top-left DC
    paddq       mm1, mm4
    psrlq       mm1, 3                  ; bottom-right DC

    ; spread each DC over a dword by multiplying with mmx_01bytes (defined
    ; outside this view; presumably 0x01 in every byte), then pair them up
    movq        mm5, [pic(mmx_01bytes)]
    pmuludq     mm0, mm5
    pmuludq     mm3, mm5
    psllq       mm0, 32
    pxor        mm0, mm3                ; mm0 = upper rows: [top-right | top-left]

    pmuludq     mm2, mm5
    pmuludq     mm1, mm5
    psllq       mm1, 32
    pxor        mm1, mm2                ; mm1 = lower rows: [bottom-right | bottom-left]

    lea         r2, [r1+2*r1]           ; r2 = 3*kiStride
    movq        [r4],      mm0
    movq        [r4+r1],   mm0
    movq        [r4+2*r1], mm0
    movq        [r4+r2],   mm0

    lea         r4, [r4+4*r1]
    movq        [r4],      mm1
    movq        [r4+r1],   mm1
    movq        [r4+2*r1], mm1
    movq        [r4+r2],   mm1

    DEINIT_X86_32_PIC
    pop r4
    pop r3
    WELSEMMS
    ret
1126
1127
1128
;*******************************************************************************
;   Intra 16x16 luma DC prediction with both the top row and the left column
;   read as neighbours.
;
;   DC = (sum of the 16 top pixels + sum of the 16 left pixels + 16) >> 5,
;   written to every byte of the 16 rows starting at pPred.
;
;   The top row is loaded and every row is stored with movdqa, so pPred and
;   kiStride must keep each row 16-byte aligned.
;
;   void WelsDecoderI16x16LumaPredDc_sse2(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderI16x16LumaPredDc_sse2
    push    r3
    push    r4
    %assign push_num 2
    INIT_X86_32_PIC r5
    LOAD_2_PARA
    SIGN_EXTENSION r1, r1d
    mov         r4, r0                  ; r4 = pPred, kept for the stores
    sub         r0, r1                  ; r0 = pPred - kiStride, the row above

    ; sum of the 16 top pixels -> low lane of xmm0
    pxor        xmm1, xmm1
    movdqa      xmm0, [r0]              ; read one row
    psadbw      xmm0, xmm1              ; one partial sum per 8-byte half
    movdqa      xmm1, xmm0
    psrldq      xmm1, 8                 ; upper partial sum moved down
    movq        xmm0, xmm0              ; clear the upper qword, keep the lower partial sum
    paddw       xmm0, xmm1

    ; sum of the 16 left pixels -> r2
    movzx       r2, byte [r0+r1-1]      ; left of row 0
    movzx       r3, byte [r0+2*r1-1]    ; left of row 1
    add         r2, r3
    lea         r0, [r0+r1]             ; r0 = pPred
%rep 7
    LOAD_2_LEFT_AND_ADD                 ; macro defined outside this view; invoked 7 times as before for the remaining 14 rows
%endrep

    add         r2, 16                  ; rounding term for >> 5
    movd        xmm1, r2d
    paddw       xmm0, xmm1
    psrld       xmm0, 5                 ; low dword = DC
    pmuludq     xmm0, [pic(mmx_01bytes)] ; spread the DC byte across the low dword
    pshufd      xmm0, xmm0, 0           ; and across the whole register

    lea         r2, [r1+2*r1]           ; r2 = 3*kiStride (the left sum is no longer needed)
%rep 3
    movdqa      [r4],      xmm0
    movdqa      [r4+r1],   xmm0
    movdqa      [r4+2*r1], xmm0
    movdqa      [r4+r2],   xmm0
    lea         r4, [r4+4*r1]
%endrep
    movdqa      [r4],      xmm0
    movdqa      [r4+r1],   xmm0
    movdqa      [r4+2*r1], xmm0
    movdqa      [r4+r2],   xmm0

    DEINIT_X86_32_PIC
    pop r4
    pop r3

    ret
1205
1206;*******************************************************************************
1207; for intra prediction as follows, 11/19/2010
1208;*******************************************************************************
1209
;*******************************************************************************
;   Intra 16x16 luma DC prediction when only the top row is read.
;
;   DC = (sum of the 16 top pixels + 8) >> 4, written to every byte of the
;   16 rows starting at pPred.
;
;   The top row is loaded and every row is stored with movdqa, so pPred and
;   kiStride must keep each row 16-byte aligned.
;
;   void WelsDecoderI16x16LumaPredDcTop_sse2(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderI16x16LumaPredDcTop_sse2
    %assign push_num 0
    LOAD_2_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    mov         r2, r0
    sub         r2, r1
    movdqa      xmm0, [r2]              ; pPred-kiStride, top line
    pxor        xmm7, xmm7
    psadbw      xmm0, xmm7              ; one partial sum per 8-byte half
    movdqa      xmm1, xmm0
    psrldq      xmm1, 8
    paddw       xmm0, xmm1              ; low word = sum of all 16 top pixels
    movd        r2d, xmm0               ; overwrites r2 completely (a 32-bit write
                                        ; zero-extends in 64-bit mode), so no
                                        ; prior clearing of r2 is needed

    add         r2, 8                   ; rounding term
    sar         r2, 4                   ; r2 = DC
    SSE2_Copy16Times xmm1, r2d          ; xmm1 = fill vector built from the DC (macro defined outside this view)
    lea         r2, [2*r1+r1]           ; 3*kiStride

%rep 3
    movdqa      [r0],      xmm1
    movdqa      [r0+r1],   xmm1
    movdqa      [r0+2*r1], xmm1
    movdqa      [r0+r2],   xmm1
    lea         r0, [r0+4*r1]
%endrep
    movdqa      [r0],      xmm1
    movdqa      [r0+r1],   xmm1
    movdqa      [r0+2*r1], xmm1
    movdqa      [r0+r2],   xmm1

    POP_XMM
    ret
1281
;*******************************************************************************
;   Intra 16x16 luma DC prediction when no neighbour is read.
;
;   Every one of the 16 rows starting at pPred is filled with the 16-byte
;   constant sse2_dc_0x80 (defined outside this view; by its name, 0x80 in
;   every byte). Rows are written with movdqa and must be 16-byte aligned.
;
;   void WelsDecoderI16x16LumaPredDcNA_sse2(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderI16x16LumaPredDcNA_sse2
    %assign push_num 0
    INIT_X86_32_PIC r3
    LOAD_2_PARA
    SIGN_EXTENSION r1, r1d

    movdqa      xmm0, [pic(sse2_dc_0x80)]
    lea         r2, [r1+2*r1]           ; 3*kiStride

%rep 3
    movdqa      [r0],      xmm0
    movdqa      [r0+r1],   xmm0
    movdqa      [r0+2*r1], xmm0
    movdqa      [r0+r2],   xmm0
    lea         r0, [r0+4*r1]
%endrep
    movdqa      [r0],      xmm0
    movdqa      [r0+r1],   xmm0
    movdqa      [r0+2*r1], xmm0
    movdqa      [r0+r2],   xmm0

    DEINIT_X86_32_PIC
    ret
1316
;*******************************************************************************
;   Intra 8x8 chroma DC prediction when only the left column is read.
;
;   Rows 0..3 are filled with (sum of left pixels of rows 0..3 + 2) >> 2,
;   rows 4..7 with (sum of left pixels of rows 4..7 + 2) >> 2.
;   The left pixel of a row is the byte at offset -1 from that row's start.
;
;   void WelsDecoderIChromaPredDcLeft_mmx(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderIChromaPredDcLeft_mmx
    push r3
    push r4
    %assign push_num 2
    LOAD_2_PARA
    SIGN_EXTENSION r1, r1d
    mov         r4, r0                  ; r4 = pPred, kept for the stores

    ; upper half: left pixels of rows 0..3
    dec         r0                      ; r0 -> left neighbour of row 0
    movzx       r2, byte [r0]           ; movzx writes the whole register
    movzx       r3, byte [r0+r1]
    add         r2, r3
    lea         r0, [r0+2*r1]
    movzx       r3, byte [r0]
    add         r2, r3
    movzx       r3, byte [r0+r1]
    add         r2, r3
    add         r2, 02h
    sar         r2, 02h                 ; r2 = upper DC
    mov         r3, r2
    sal         r3, 8
    or          r2, r3                  ; DC duplicated into both bytes of a word
    movd        mm0, r2d
    pshufw      mm0, mm0, 00h           ; mm0 = upper DC in all 8 bytes

    ; lower half: left pixels of rows 4..7
    lea         r0, [r0+2*r1]
    movzx       r2, byte [r0]
    movzx       r3, byte [r0+r1]
    add         r2, r3
    lea         r0, [r0+2*r1]
    movzx       r3, byte [r0]
    add         r2, r3
    movzx       r3, byte [r0+r1]
    add         r2, r3
    add         r2, 02h
    sar         r2, 02h                 ; r2 = lower DC
    mov         r3, r2
    sal         r3, 8
    or          r2, r3
    movd        mm1, r2d
    pshufw      mm1, mm1, 00h           ; mm1 = lower DC in all 8 bytes

    lea         r2, [2*r1+r1]           ; 3*kiStride
    movq        [r4],      mm0
    movq        [r4+r1],   mm0
    movq        [r4+2*r1], mm0
    movq        [r4+r2],   mm0
    lea         r4, [r4+4*r1]
    movq        [r4],      mm1
    movq        [r4+r1],   mm1
    movq        [r4+2*r1], mm1
    movq        [r4+r2],   mm1

    pop r4
    pop r3
    WELSEMMS                            ; same MMX-state cleanup macro the other MMX routines in this file use
    ret
1387
;*******************************************************************************
;   Intra 8x8 chroma DC prediction when only the top row is read.
;
;   With t0..t7 the eight pixels above the block:
;     columns 0..3 of every row = (t0 + t1 + t2 + t3 + 2) >> 2
;     columns 4..7 of every row = (t4 + t5 + t6 + t7 + 2) >> 2
;
;   xmm6 and xmm7 are used, hence PUSH_XMM 8 / POP_XMM around the body.
;
;   void WelsDecoderIChromaPredDcTop_sse2(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderIChromaPredDcTop_sse2
    %assign push_num 0
    LOAD_2_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    mov         r2, r0
    sub         r2, r1                  ; r2 = pPred - kiStride
    movq        xmm0, [r2]              ; top: 8x1 pixels
    pxor        xmm7, xmm7
    punpcklbw   xmm0, xmm7              ; widen to 8 words w0..w7

    ; horizontal sum inside each group of four words
    pshufd      xmm1, xmm0, 0B1h        ; swap the word pairs of each half: w5 w4 w7 w6 w1 w0 w3 w2
    paddw       xmm0, xmm1              ; w5+7 w4+6 w5+7 w4+6 w1+3 w0+2 w1+3 w0+2
    pshuflw     xmm1, xmm0, 0B1h        ; swap neighbouring words, low half
    pshufhw     xmm1, xmm1, 0B1h        ; swap neighbouring words, high half
    paddw       xmm0, xmm1              ; sum1 sum1 sum1 sum1 sum0 sum0 sum0 sum0

%ifdef X86_32_PICASM
    ; build the word constant 2 in registers instead of loading it from memory
    pcmpeqw     xmm6, xmm6
    psrlw       xmm6, 15
    psllw       xmm6, 1
%else
    movdqa      xmm6, [sse2_wd_0x02]
%endif
    paddw       xmm0, xmm6              ; rounding term
    psraw       xmm0, 02h
    packuswb    xmm0, xmm7              ; low 8 bytes = dc1 x4 | dc0 x4

    lea         r2, [2*r1+r1]           ; 3*kiStride
    movq        [r0],      xmm0
    movq        [r0+r1],   xmm0
    movq        [r0+2*r1], xmm0
    movq        [r0+r2],   xmm0
    lea         r0, [r0+4*r1]
    movq        [r0],      xmm0
    movq        [r0+r1],   xmm0
    movq        [r0+2*r1], xmm0
    movq        [r0+r2],   xmm0
    POP_XMM
    ret
1432
;*******************************************************************************
;   Intra 8x8 chroma DC prediction when no neighbour is read.
;
;   Each of the 8 rows starting at pPred is filled with the first 8 bytes of
;   sse2_dc_0x80 (defined outside this view; by its name, 0x80 in every byte).
;
;   void WelsDecoderIChromaPredDcNA_mmx(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderIChromaPredDcNA_mmx
    %assign push_num 0
    INIT_X86_32_PIC r3
    LOAD_2_PARA
    SIGN_EXTENSION r1, r1d
    lea         r2, [2*r1+r1]           ; 3*kiStride
    movq        mm0, [pic(sse2_dc_0x80)]

    movq        [r0],      mm0
    movq        [r0+r1],   mm0
    movq        [r0+2*r1], mm0
    movq        [r0+r2],   mm0
    lea         r0, [r0+4*r1]
    movq        [r0],      mm0
    movq        [r0+r1],   mm0
    movq        [r0+2*r1], mm0
    movq        [r0+r2],   mm0

    DEINIT_X86_32_PIC
    WELSEMMS                            ; same MMX-state cleanup macro the other MMX routines in this file use
    ret
1456
1457