• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1;*!
2;* \copy
3;*     Copyright (c)  2009-2013, Cisco Systems
4;*     All rights reserved.
5;*
6;*     Redistribution and use in source and binary forms, with or without
7;*     modification, are permitted provided that the following conditions
8;*     are met:
9;*
10;*        * Redistributions of source code must retain the above copyright
11;*          notice, this list of conditions and the following disclaimer.
12;*
13;*        * Redistributions in binary form must reproduce the above copyright
14;*          notice, this list of conditions and the following disclaimer in
15;*          the documentation and/or other materials provided with the
16;*          distribution.
17;*
18;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29;*     POSSIBILITY OF SUCH DAMAGE.
30;*
31;*
32;*  intra_pred.asm
33;*
34;*  Abstract
35;*      sse2 function for intra predict operations
36;*
37;*  History
38;*      18/09/2009 Created
39;*
40;*
41;*************************************************************************/
42%include "asm_inc.asm"
43
44;***********************************************************************
45; Local Data (Read Only)
46;***********************************************************************
47
; Constant tables shared by the predictors below.
; With X86_32_PICASM defined they are emitted into .text (presumably so that
; pic() can reach them relative to the code); otherwise they go to .rodata.
%ifdef X86_32_PICASM
SECTION .text align=16
%else
SECTION .rodata align=16
%endif

; Word weights for the 16x16 luma plane predictor.
align 16
sse2_plane_inc_minus:   dw -7, -6, -5, -4, -3, -2, -1, 0    ; (x - 7) for x = 0..7
align 16
sse2_plane_inc:         dw 1, 2, 3, 4, 5, 6, 7, 8
align 16
sse2_plane_dec:         dw 8, 7, 6, 5, 4, 3, 2, 1

; Word weights for the chroma plane predictor.  The two 4-word tables are
; read with 64-bit movq; sse2_plane_mul_b_c is read with movdqa and is
; therefore re-aligned to 16 bytes.
sse2_plane_inc_c:       dw 1, 2, 3, 4
sse2_plane_dec_c:       dw 4, 3, 2, 1
align 16
sse2_plane_mul_b_c:     dw -3, -2, -1, 0, 1, 2, 3, 4        ; (x - 3) for x = 0..7

; Sixteen bytes of 0x01: used as a 0x01010101 / 0x0101 multiplier to
; replicate a byte, and as a per-byte low-bit mask.
align 16
mmx_01bytes:            times 16 db 1

; Not referenced in the visible part of this file.
align 16
mmx_0x02:               dw 0x02, 0x00, 0x00, 0x00
72
73
74;***********************************************************************
75; macros
76;***********************************************************************
; SSE_DB_1_2REG dst, tmp
;   Sets every byte of dst to 1.
;   tmp is clobbered (left with all bits set).
%macro SSE_DB_1_2REG 2
    pxor        %1, %1              ; dst = 0
    pcmpeqw     %2, %2              ; tmp = all ones, i.e. -1 in every byte
    psubb       %1, %2              ; dst = 0 - (-1) = 1 per byte
%endmacro
84
; SSE2_PRED_H_4X4_TWO_LINE out, row1, tmp, ptr, stride
;   Builds two rows of a 4x4 horizontal prediction.
;   On exit the low 64 bits of `out` hold:
;       bytes 0..3 = byte [ptr-1]        repeated four times
;       bytes 4..7 = byte [ptr+stride-1] repeated four times
;   `row1` and `tmp` are clobbered.  `ptr` is not advanced.
%macro SSE2_PRED_H_4X4_TWO_LINE 5
    ; first row: duplicate bytes twice so byte 0 fills the low dword
    movd        %1, [%4-1]
    movdqa      %3, %1
    punpcklbw   %1, %3
    movdqa      %3, %1
    punpcklbw   %1, %3

    ; second row, same treatment, addressed as ptr+stride
    movd        %2, [%4+%5-1]
    movdqa      %3, %2
    punpcklbw   %2, %3
    movdqa      %3, %2
    punpcklbw   %2, %3

    punpckldq   %1, %2              ; out.lo64 = [row0 x4 | row1 x4]
%endmacro
102
; SUMW_HORIZON1 acc, tmp
;   Folds the eight words of `acc` together with unsigned saturating adds
;   (paddusw); the total of all eight lands in word 0 of `acc`.  The other
;   words of `acc` are left holding partial sums.  `tmp` is clobbered.
%macro SUMW_HORIZON1 2
    ; fold upper four words onto lower four
    movdqa      %2, %1
    psrldq      %2, 8
    paddusw     %1, %2
    ; fold words 2..3 onto 0..1
    movdqa      %2, %1
    psrldq      %2, 4
    paddusw     %1, %2
    ; fold word 1 onto word 0
    movdqa      %2, %1
    psrldq      %2, 2
    paddusw     %1, %2
%endmacro
114
; LOAD_COLUMN out, t0, t1, t2, ptr, stride
;   Reads one dword from each of eight consecutive rows starting at `ptr`
;   and transposes them so that, on exit,
;       out bytes  8..15 = byte 3 of rows 0..7   (the column at ptr+3)
;       out bytes  0..7  = byte 2 of rows 0..7
;   `t0`, `t1`, `t2` are clobbered.  `ptr` is advanced by 8*stride.
%macro LOAD_COLUMN 6
    ; rows 0,1
    movd        %1, [%5]
    movd        %2, [%5+%6]
    punpcklbw   %1, %2
    lea         %5, [%5+2*%6]
    ; rows 2,3
    movd        %3, [%5]
    movd        %2, [%5+%6]
    punpcklbw   %3, %2
    punpcklwd   %1, %3              ; out = 4x4 byte transpose of rows 0..3
    lea         %5, [%5+2*%6]
    ; rows 4,5
    movd        %4, [%5]
    movd        %2, [%5+%6]
    punpcklbw   %4, %2
    lea         %5, [%5+2*%6]
    ; rows 6,7
    movd        %3, [%5]
    movd        %2, [%5+%6]
    punpcklbw   %3, %2
    punpcklwd   %4, %3              ; t2 = 4x4 byte transpose of rows 4..7
    lea         %5, [%5+2*%6]
    punpckhdq   %1, %4              ; keep columns 2 and 3 of all eight rows
%endmacro
136
; SUMW_HORIZON acc, tmp, hi
;   Horizontal sum of the eight words d0..d7 of `acc`.
;   The four pair-sums are widened to dwords by interleaving with the low
;   four words of `hi` (pass a zeroed register for a zero-extended sum) and
;   then added; the total lands in the low dword of `acc`.
;   Whatever `hi` contains, the low 16 bits of that dword equal
;   (d0+...+d7) mod 2^16, because `hi` only feeds bits 16 and above.
;   `tmp` is clobbered.
%macro SUMW_HORIZON 3
    movhlps     %2, %1              ; tmp.lo = d7 d6 d5 d4
    paddw       %1, %2              ; acc.lo = d37 d26 d15 d04 (words)
    punpcklwd   %1, %3              ; widen the four pair-sums to dwords
    movhlps     %2, %1              ; tmp.lo = d37 d26 (dwords)
    paddd       %1, %2              ; acc.lo = d1357 d0246
    pshuflw     %2, %1, 0x4e        ; swap the two low dwords
    paddd       %1, %2              ; low dword = d01234567
%endmacro
146
147
; COPY_16_TIMES ptr, dst
;   Broadcasts the byte at [ptr-1] into all 16 bytes of `dst`.
;   The byte is fetched with an aligned 16-byte load of [ptr-16], so
;   ptr-16 must be 16-byte aligned.  Uses pic(), so the enclosing function
;   must have it defined.
%macro COPY_16_TIMES 2
    movdqa      %2, [%1-16]
    psrldq      %2, 15                      ; keep only the last byte, in byte 0
    pmuludq     %2, [pic(mmx_01bytes)]      ; * 0x01010101 -> four copies in dword 0
    pshufd      %2, %2, 0                   ; replicate dword 0 to all four dwords
%endmacro
154
; COPY_16_TIMESS ptr, dst, offset
;   Same as COPY_16_TIMES but for the byte at [ptr+offset-1]: broadcasts it
;   into all 16 bytes of `dst`.  ptr+offset-16 must be 16-byte aligned
;   because it is read with movdqa.  Uses pic().
%macro COPY_16_TIMESS 3
    movdqa      %2, [%1+%3-16]
    psrldq      %2, 15                      ; keep only the last byte, in byte 0
    pmuludq     %2, [pic(mmx_01bytes)]      ; * 0x01010101 -> four copies in dword 0
    pshufd      %2, %2, 0                   ; replicate dword 0 to all four dwords
%endmacro
161
; LOAD_COLUMN_C out, t0, t1, t2, ptr, stride
;   Four-row variant of LOAD_COLUMN.  As invoked in this file (with MMX
;   registers) it leaves
;       out bytes 4..7 = byte 3 of rows 0..3   (the column at ptr+3)
;       out bytes 0..3 = byte 2 of rows 0..3
;   `t0` and `t1` are clobbered; `t2` is accepted but not used.
;   `ptr` is advanced by 4*stride.
%macro LOAD_COLUMN_C 6
    ; rows 0,1
    movd        %1, [%5]
    movd        %2, [%5+%6]
    punpcklbw   %1, %2
    lea         %5, [%5+2*%6]
    ; rows 2,3
    movd        %3, [%5]
    movd        %2, [%5+%6]
    punpcklbw   %3, %2
    punpckhwd   %1, %3              ; keep columns 2 and 3 of the four rows
    lea         %5, [%5+2*%6]
%endmacro
173
; LOAD_2_LEFT_AND_ADD
;   Takes no arguments; works on fixed registers:
;       r1 += 2*r2
;       r3 += byte [r1-1] + byte [r1+r2-1]
;   r4 is clobbered.
%macro LOAD_2_LEFT_AND_ADD 0
    lea         r1, [r1+2*r2]           ; step down two rows
    movzx       r4, byte [r1-1]         ; left neighbour of the new row
    add         r3, r4
    movzx       r4, byte [r1+r2-1]      ; left neighbour of the row after it
    add         r3, r4
%endmacro
181
182;***********************************************************************
183; Code
184;***********************************************************************
185
186SECTION .text
187
;***********************************************************************
;   void WelsI4x4LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
;
;   4x4 horizontal luma prediction: output row n is the byte at
;   pRef[n*stride - 1] repeated four times.
;   The 16 result bytes are stored with movdqa, so pred must be
;   16-byte aligned.
;   Below, r0/r1/r2 are used as pred/pRef/stride (loaded by LOAD_3_PARA).
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredH_sse2
    push        r3
    %assign push_num 1
    INIT_X86_32_PIC r4
    LOAD_3_PARA
    SIGN_EXTENSION r2, r2d

    ; row 0: byte * 0x01010101 puts four copies in the low dword
    movzx       r3, byte [r1-1]
    movd        xmm0, r3d
    pmuludq     xmm0, [pic(mmx_01bytes)]

    ; row 1
    movzx       r3, byte [r1+r2-1]
    movd        xmm1, r3d
    pmuludq     xmm1, [pic(mmx_01bytes)]

    unpcklps    xmm0, xmm1              ; xmm0 = [row0, row1, 0, 0]

    ; row 2
    lea         r1, [r1+2*r2]
    movzx       r3, byte [r1-1]
    movd        xmm2, r3d
    pmuludq     xmm2, [pic(mmx_01bytes)]

    ; row 3
    movzx       r3, byte [r1+r2-1]
    movd        xmm3, r3d
    pmuludq     xmm3, [pic(mmx_01bytes)]

    unpcklps    xmm2, xmm3              ; xmm2 = [row2, row3, 0, 0]
    unpcklpd    xmm0, xmm2              ; xmm0 = [row0, row1, row2, row3]

    movdqa      [r0], xmm0
    DEINIT_X86_32_PIC
    pop         r3
    ret
225
;***********************************************************************
; void WelsI16x16LumaPredPlane_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
;
;   16x16 plane prediction.  With top[] the row above the block and left[]
;   the column to its left (index -1 is the shared top-left pixel):
;       H = sum_{i=0..7} (i + 1) * (top[8 + i]  - top[6 - i])
;       V = sum_{i=0..7} (i + 1) * (left[8 + i] - left[6 - i])
;       b = (5 * H + 32) >> 6
;       c = (5 * V + 32) >> 6
;       a = (left[15] + top[15]) << 4
;       pred[y][x] = clip8((a + 16 + b * (x - 7) + c * (y - 7)) >> 5)
;   Rows are written contiguously, 16 bytes apart, with movdqa, so pred
;   must be 16-byte aligned.
;***********************************************************************
WELS_EXTERN WelsI16x16LumaPredPlane_sse2
    push        r3
    push        r4
    %assign push_num 2
    INIT_X86_32_PIC r5
    LOAD_3_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r2, r2d
    sub         r1, 1
    sub         r1, r2                      ; r1 -> top-left neighbour (top[-1])

    ; ---- H: weighted difference along the top row ----
    pxor        xmm7, xmm7
    movq        xmm0, [r1]                  ; top[-1 .. 6]
    movdqa      xmm5, [pic(sse2_plane_dec)]
    punpcklbw   xmm0, xmm7
    pmullw      xmm0, xmm5                  ; * 8,7,...,1
    movq        xmm1, [r1+9]                ; top[8 .. 15]
    movdqa      xmm6, [pic(sse2_plane_inc)]
    punpcklbw   xmm1, xmm7
    pmullw      xmm1, xmm6                  ; * 1,2,...,8
    psubw       xmm1, xmm0

    ; xmm2 is not cleared here; only the low 16 bits of the sum are used
    ; (movsx below), and those do not depend on the third operand.
    SUMW_HORIZON xmm1, xmm0, xmm2
    movd        r3d, xmm1                   ; H (low 16 bits)
    movsx       r3, r3w
    imul        r3, 5
    add         r3, 32
    sar         r3, 6                       ; b = (5 * H + 32) >> 6
    SSE2_Copy8Times xmm1, r3d               ; xmm1 = b in every word

    ; ---- gather the left column, rows -1..6 ----
    movzx       r4, byte [r1+16]            ; top[15]
    sub         r1, 3                       ; so that the wanted pixel is byte 3 of each dword
    LOAD_COLUMN xmm0, xmm2, xmm3, xmm4, r1, r2

    add         r1, 3
    movzx       r3, byte [r1+8*r2]          ; left[15]
    add         r4, r3
    shl         r4, 4                       ; a = (left[15] + top[15]) << 4

    ; ---- gather the left column, rows 8..15 (row 7 is skipped) ----
    sub         r1, 3
    add         r1, r2
    LOAD_COLUMN xmm7, xmm2, xmm3, xmm4, r1, r2

    ; ---- V: weighted difference along the left column ----
    pxor        xmm4, xmm4
    punpckhbw   xmm0, xmm4
    pmullw      xmm0, xmm5                  ; left[-1..6] * 8,7,...,1
    punpckhbw   xmm7, xmm4
    pmullw      xmm7, xmm6                  ; left[8..15] * 1,2,...,8
    psubw       xmm7, xmm0

    SUMW_HORIZON xmm7, xmm0, xmm2           ; xmm2 content irrelevant, as above
    movd        r3d, xmm7                   ; V (low 16 bits)
    movsx       r3, r3w
    imul        r3, 5
    add         r3, 32
    sar         r3, 6                       ; c = (5 * V + 32) >> 6
    SSE2_Copy8Times xmm4, r3d               ; xmm4 = c in every word

    add         r4, 16
    imul        r3, -7
    add         r3, r4                      ; s = a + 16 - 7 * c  (row 0 offset)
    SSE2_Copy8Times xmm0, r3d               ; xmm0 = s in every word

    xor         r3, r3                      ; row counter
    movdqa      xmm5, [pic(sse2_plane_inc_minus)]

    ; One iteration per output row; xmm0 grows by c each time.
get_i16x16_luma_pred_plane_sse2_1:
    movdqa      xmm2, xmm1
    pmullw      xmm2, xmm5                  ; b * (x - 7), x = 0..7
    paddw       xmm2, xmm0
    psraw       xmm2, 5
    movdqa      xmm3, xmm1
    pmullw      xmm3, xmm6                  ; b * (x - 7), x = 8..15
    paddw       xmm3, xmm0
    psraw       xmm3, 5
    packuswb    xmm2, xmm3                  ; saturate to 0..255
    movdqa      [r0], xmm2
    paddw       xmm0, xmm4                  ; next row: + c
    add         r0, 16
    inc         r3
    cmp         r3, 16
    jnz         get_i16x16_luma_pred_plane_sse2_1

    POP_XMM
    DEINIT_X86_32_PIC
    pop         r4
    pop         r3
    ret
316
;***********************************************************************
; void WelsIChromaPredPlane_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
;
;   8x8 plane prediction.  With top[] the row above the block and left[]
;   the column to its left (index -1 is the shared top-left pixel):
;       H = sum_{i=0..3} (i + 1) * (top[4 + i]  - top[2 - i])
;       V = sum_{i=0..3} (i + 1) * (left[4 + i] - left[2 - i])
;       b = (17 * H + 16) >> 5
;       c = (17 * V + 16) >> 5
;       a = (left[7] + top[7]) << 4
;       pred[y][x] = clip8((a + 16 + b * (x - 3) + c * (y - 3)) >> 5)
;   Rows are written contiguously, 8 bytes apart.  MMX registers are used
;   for the neighbour sums, so WELSEMMS is issued before returning.
;***********************************************************************
WELS_EXTERN WelsIChromaPredPlane_sse2
    push        r3
    push        r4
    %assign push_num 2
    INIT_X86_32_PIC r5
    LOAD_3_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r2, r2d
    sub         r1, 1
    sub         r1, r2                      ; r1 -> top-left neighbour (top[-1])

    ; ---- H: weighted difference along the top row ----
    pxor        mm7, mm7
    movq        mm0, [r1]                   ; top[-1 ..]
    movq        mm5, [pic(sse2_plane_dec_c)]
    punpcklbw   mm0, mm7                    ; top[-1..2] as words
    pmullw      mm0, mm5                    ; * 4,3,2,1
    movq        mm1, [r1+5]                 ; top[4 ..]
    movq        mm6, [pic(sse2_plane_inc_c)]
    punpcklbw   mm1, mm7                    ; top[4..7] as words
    pmullw      mm1, mm6                    ; * 1,2,3,4
    psubw       mm1, mm0

    movq2dq     xmm1, mm1
    pxor        xmm2, xmm2
    SUMW_HORIZON xmm1, xmm0, xmm2
    movd        r3d, xmm1                   ; H (low 16 bits)
    movsx       r3, r3w
    imul        r3, 17
    add         r3, 16
    sar         r3, 5                       ; b = (17 * H + 16) >> 5
    SSE2_Copy8Times xmm1, r3d               ; xmm1 = b in every word

    ; ---- gather the left column, rows -1..2 ----
    movzx       r3, byte [r1+8]             ; top[7]
    sub         r1, 3                       ; so that the wanted pixel is byte 3 of each dword
    LOAD_COLUMN_C mm0, mm2, mm3, mm4, r1, r2

    add         r1, 3
    movzx       r4, byte [r1+4*r2]          ; left[7]
    add         r4, r3
    shl         r4, 4                       ; a = (left[7] + top[7]) << 4

    ; ---- gather the left column, rows 4..7 (row 3 is skipped) ----
    sub         r1, 3
    add         r1, r2
    LOAD_COLUMN_C mm7, mm2, mm3, mm4, r1, r2

    ; ---- V: weighted difference along the left column ----
    pxor        mm4, mm4
    punpckhbw   mm0, mm4
    pmullw      mm0, mm5                    ; left[-1..2] * 4,3,2,1
    punpckhbw   mm7, mm4
    pmullw      mm7, mm6                    ; left[4..7] * 1,2,3,4
    psubw       mm7, mm0

    movq2dq     xmm7, mm7
    pxor        xmm2, xmm2
    SUMW_HORIZON xmm7, xmm0, xmm2
    movd        r3d, xmm7                   ; V (low 16 bits)
    movsx       r3, r3w
    imul        r3, 17
    add         r3, 16
    sar         r3, 5                       ; c = (17 * V + 16) >> 5
    SSE2_Copy8Times xmm4, r3d               ; xmm4 = c in every word

    add         r4, 16
    imul        r3, -3
    add         r3, r4                      ; s = a + 16 - 3 * c  (row 0 offset)
    SSE2_Copy8Times xmm0, r3d               ; xmm0 = s in every word

    xor         r3, r3                      ; row counter
    movdqa      xmm5, [pic(sse2_plane_mul_b_c)]

    ; One iteration per output row; xmm0 grows by c each time.
get_i_chroma_pred_plane_sse2_1:
    movdqa      xmm2, xmm1
    pmullw      xmm2, xmm5                  ; b * (x - 3), x = 0..7
    paddw       xmm2, xmm0
    psraw       xmm2, 5
    packuswb    xmm2, xmm2                  ; saturate to 0..255
    movq        [r0], xmm2
    paddw       xmm0, xmm4                  ; next row: + c
    add         r0, 8
    inc         r3
    cmp         r3, 8
    jnz         get_i_chroma_pred_plane_sse2_1

    POP_XMM
    DEINIT_X86_32_PIC
    pop         r4
    pop         r3
    WELSEMMS
    ret
407
;***********************************************************************
;   0 |1 |2 |3 |4 |
;   6 |7 |8 |9 |10|
;   11|12|13|14|15|
;   16|17|18|19|20|
;   21|22|23|24|25|
;   7 is the first pixel of the current 4x4 block (pRef points at it);
;   0 is the top-left neighbour, 1..4 the top row, 6/11/16/21 the left column.
;   pred[7] = ([6] + [0]*2 + [1] + 2) / 4
;
;   void WelsI4x4LumaPredDDR_mmx(uint8_t *pred, uint8_t *pRef, int32_t stride)
;
;   Diagonal-down-right prediction.  The nine neighbours are packed into
;   one register in the order 21,16,11,6,0,1,2,3 (low byte first), the
;   3-tap filter (p + 2q + r + 2) >> 2 is applied to every byte at once,
;   and each output row is a 4-byte window of the filtered register.
;   Byte lists in the comments below are written low byte first.
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredDDR_mmx
    %assign push_num 0
    INIT_X86_32_PIC r3
    LOAD_3_PARA
    SIGN_EXTENSION r2, r2d

    ; Neighbours are fetched with 8-byte loads that END on the wanted
    ; pixel, so the pixel sits in byte 7 of the register.
    movq        mm1, [r1+r2-8]          ; byte 7 = [11]
    movq        mm2, [r1-8]             ; byte 7 = [6]
    sub         r1, r2                  ; r1 -> [1] (row above the block)
    punpckhbw   mm2, [r1-8]             ; byte 7 = [0], byte 6 = [6]
    movd        mm3, [r1]               ; bytes 0..3 = [1],[2],[3],[4]
    punpckhwd   mm1, mm2                ; bytes 5..7 = [11],[6],[0]
    psllq       mm3, 24                 ; bytes 3..6 = [1],[2],[3],[4]
    psrlq       mm1, 40                 ; bytes 0..2 = [11],[6],[0]
    por         mm3, mm1                ; mm3 = 11,6,0,1,2,3,4,-
    movq        mm1, mm3                ; mm1 = 11,6,0,1,2,3,4,-   (right-hand tap)

    lea         r1, [r1+2*r2-8]         ; r1 -> 8 bytes ending at [11]
    movq        mm4, [r1+r2]            ; byte 7 = [16]
    psllq       mm3, 8
    psrlq       mm4, 56                 ; byte 0 = [16]
    por         mm3, mm4                ; mm3 = 16,11,6,0,1,2,3,4
    movq        mm2, mm3                ; mm2 = 16,11,6,0,1,2,3,4  (centre tap)

    movq        mm4, [r1+2*r2]          ; byte 7 = [21]
    psllq       mm3, 8
    psrlq       mm4, 56                 ; byte 0 = [21]
    por         mm3, mm4                ; mm3 = 21,16,11,6,0,1,2,3 (left-hand tap)
    movq        mm4, mm3

    ; (left + right) >> 1 without rounding: pavgb rounds up, so subtract 1
    ; wherever left and right differ in their lowest bit.
    pavgb       mm3, mm1
    pxor        mm1, mm4
    pand        mm1, [pic(mmx_01bytes)]
    psubusb     mm3, mm1
    pavgb       mm2, mm3                ; (left + 2*centre + right + 2) >> 2

    ; Rows are overlapping 4-byte windows, bottom row first.
    movd        [r0+12], mm2
    psrlq       mm2, 8
    movd        [r0+8], mm2
    psrlq       mm2, 8
    movd        [r0+4], mm2
    psrlq       mm2, 8
    movd        [r0], mm2
    DEINIT_X86_32_PIC
    WELSEMMS
    ret
462
;***********************************************************************
;   0 |1 |2 |3 |4 |
;   5 |6 |7 |8 |9 |
;   10|11|12|13|14|
;   15|16|17|18|19|
;   20|21|22|23|24|
;   6 is the first pixel of the current 4x4 block (pRef points at it).
;   pred[6] = ([1]+[2]+[3]+[4]+[5]+[10]+[15]+[20]+4)/8
;
;   void WelsI4x4LumaPredDc_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
;
;   DC prediction: all 16 output bytes are the rounded mean of the four
;   pixels above and the four pixels to the left of the block.
;   The result is stored with movdqa, so pred must be 16-byte aligned.
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredDc_sse2
    push        r3
    push        r4
    %assign push_num 2
    LOAD_3_PARA
    SIGN_EXTENSION r2, r2d

    movzx       r4, byte [r1-1]             ; [5]  (left of row 0)
    sub         r1, r2                      ; r1 -> [1] (row above the block)
    movd        xmm0, [r1]                  ; [1],[2],[3],[4]
    pxor        xmm1, xmm1
    psadbw      xmm0, xmm1                  ; sum of the four top pixels
    ; No pre-clearing of r3 is needed: the 32-bit move defines the whole
    ; register (r3d is taken to be the 32-bit view of r3, as it is used
    ; throughout this file; in 64-bit mode the upper half is zeroed).
    movd        r3d, xmm0
    add         r3, r4
    movzx       r4, byte [r1+2*r2-1]        ; [10] (left of row 1)
    add         r3, r4

    lea         r1, [r1+2*r2-1]             ; r1 -> [10]
    movzx       r4, byte [r1+r2]            ; [15] (left of row 2)
    add         r3, r4
    movzx       r4, byte [r1+2*r2]          ; [20] (left of row 3)
    add         r3, r4

    add         r3, 4
    sar         r3, 3                       ; (sum + 4) / 8
    imul        r3, 0x01010101              ; replicate the byte across a dword

    movd        xmm0, r3d
    pshufd      xmm0, xmm0, 0               ; ... and across all 16 bytes
    movdqa      [r0], xmm0
    pop         r4
    pop         r3
    ret
508
509;***********************************************************************
510;   void WelsIChromaPredH_mmx(uint8_t *pred, uint8_t *pRef, int32_t stride)
511;   copy 8 pixel of 8 line from left
512;***********************************************************************
; MMX_PRED_H_8X8_ONE_LINE reg, unused, src, dst
;   Writes eight copies of the byte at [src-1] to [dst].
;   `reg` (an MMX register) is clobbered; the second argument is accepted
;   but not used.  Uses pic().
%macro MMX_PRED_H_8X8_ONE_LINE 4
    movq        %1, [%3-8]                  ; 8 bytes ending on the left neighbour
    psrlq       %1, 56                      ; left neighbour alone, in byte 0
    pmullw      %1, [pic(mmx_01bytes)]      ; * 0x0101 -> two copies in word 0
    pshufw      %1, %1, 0                   ; word 0 to all four words
    movq        [%4], %1
%endmacro
522
; MMX_PRED_H_8X8_ONE_LINEE reg, unused, src, dst
;   Same as MMX_PRED_H_8X8_ONE_LINE but for the row one stride further
;   down: writes eight copies of the byte at [src+r2-1] to [dst].
;   Note that r2 is referenced directly, not passed as an argument.
;   `reg` (an MMX register) is clobbered; the second argument is accepted
;   but not used.  Uses pic().
%macro MMX_PRED_H_8X8_ONE_LINEE 4
    movq        %1, [%3+r2-8]               ; 8 bytes ending on the left neighbour
    psrlq       %1, 56                      ; left neighbour alone, in byte 0
    pmullw      %1, [pic(mmx_01bytes)]      ; * 0x0101 -> two copies in word 0
    pshufw      %1, %1, 0                   ; word 0 to all four words
    movq        [%4], %1
%endmacro
532
; 8x8 horizontal chroma prediction: output row n (8 bytes, at pred + 8*n)
; is the byte at pRef[n*stride - 1] repeated eight times.
; r0/r1/r2 are used as pred/pRef/stride (loaded by LOAD_3_PARA); r1 is
; advanced two rows at a time and the odd row of each pair is reached
; through the r2-relative macro variant.
WELS_EXTERN WelsIChromaPredH_mmx
    %assign push_num 0
    INIT_X86_32_PIC r3
    LOAD_3_PARA
    SIGN_EXTENSION r2, r2d

    ; rows 0,1
    MMX_PRED_H_8X8_ONE_LINE     mm0, mm1, r1, r0
    MMX_PRED_H_8X8_ONE_LINEE    mm0, mm1, r1, r0+8

    ; rows 2,3
    lea         r1, [r1+2*r2]
    MMX_PRED_H_8X8_ONE_LINE     mm0, mm1, r1, r0+16
    MMX_PRED_H_8X8_ONE_LINEE    mm0, mm1, r1, r0+24

    ; rows 4,5
    lea         r1, [r1+2*r2]
    MMX_PRED_H_8X8_ONE_LINE     mm0, mm1, r1, r0+32
    MMX_PRED_H_8X8_ONE_LINEE    mm0, mm1, r1, r0+40

    ; rows 6,7
    lea         r1, [r1+2*r2]
    MMX_PRED_H_8X8_ONE_LINE     mm0, mm1, r1, r0+48
    MMX_PRED_H_8X8_ONE_LINEE    mm0, mm1, r1, r0+56

    DEINIT_X86_32_PIC
    WELSEMMS
    ret
565
;***********************************************************************
;   void WelsI4x4LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
;
;   4x4 vertical luma prediction: every output row is a copy of the four
;   pixels directly above the block (pRef - stride).
;   The 16 result bytes are stored with movdqa, so pred must be
;   16-byte aligned.
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredV_sse2
    %assign push_num 0
    LOAD_3_PARA
    SIGN_EXTENSION r2, r2d
    sub         r1, r2                  ; r1 -> row above the block
    movd        xmm0, [r1]              ; the four top pixels
    pshufd      xmm0, xmm0, 0           ; same dword for all four rows
    movdqa      [r0], xmm0
    ret
579
;***********************************************************************
;   void WelsIChromaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
;
;   8x8 vertical chroma prediction: every output row is a copy of the
;   eight pixels directly above the block (pRef - stride).
;   The 64 result bytes are stored with four movdqa, so pred must be
;   16-byte aligned.
;***********************************************************************
WELS_EXTERN WelsIChromaPredV_sse2
    %assign push_num 0
    LOAD_3_PARA
    SIGN_EXTENSION r2, r2d
    sub         r1, r2                  ; r1 -> row above the block
    movq        xmm0, [r1]              ; the eight top pixels (upper half zeroed)
    punpcklqdq  xmm0, xmm0              ; duplicate them into the upper half: two rows
    movdqa      [r0], xmm0              ; rows 0,1
    movdqa      [r0+16], xmm0           ; rows 2,3
    movdqa      [r0+32], xmm0           ; rows 4,5
    movdqa      [r0+48], xmm0           ; rows 6,7
    ret
597
;***********************************************************************
;   lt|t0|t1|t2|t3|
;   l0|
;   l1|
;   l2|
;   l3|
;   t3 is never used
;   destination:
;   |a |b |c |d |
;   |e |f |a |b |
;   |g |h |e |f |
;   |i |j |g |h |
;
;   a = (1 + lt + l0)>>1
;   e = (1 + l0 + l1)>>1
;   g = (1 + l1 + l2)>>1
;   i = (1 + l2 + l3)>>1
;
;   d = (2 + t0 + (t1<<1) + t2)>>2
;   c = (2 + lt + (t0<<1) + t1)>>2
;   b = (2 + l0 + (lt<<1) + t0)>>2
;
;   f = (2 + l1 + (l0<<1) + lt)>>2
;   h = (2 + l2 + (l1<<1) + l0)>>2
;   j = (2 + l3 + (l2<<1) + l1)>>2
;   [b a f e h g j i] + [d c b a] --> mov to memory
;
;   void WelsI4x4LumaPredHD_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
;
;   Register pictures in the comments are written high byte first.
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredHD_mmx
    %assign push_num 0
    INIT_X86_32_PIC r3
    LOAD_3_PARA
    SIGN_EXTENSION r2, r2d

    ; ---- pack all eight neighbours into mm0 ----
    sub         r1, r2                  ; r1 -> t0 (row above the block)
    movd        mm0, [r1-1]             ; mm0 = [xx xx xx xx t2 t1 t0 lt]
    psllq       mm0, 32                 ; mm0 = [t2 t1 t0 lt xx xx xx xx]

    movd        mm1, [r1+2*r2-4]        ; dword ending on l1
    punpcklbw   mm1, [r1+r2-4]          ; mm1[7] = l0, mm1[6] = l1
    lea         r1, [r1+2*r2]
    movd        mm2, [r1+2*r2-4]        ; dword ending on l3
    punpcklbw   mm2, [r1+r2-4]          ; mm2[7] = l2, mm2[6] = l3
    punpckhwd   mm2, mm1                ; mm2 = [l0 l1 l2 l3 xx xx xx xx]
    psrlq       mm2, 32
    pxor        mm0, mm2                ; mm0 = [t2 t1 t0 lt l0 l1 l2 l3]

    ; ---- shifted copies for the 2-tap and 3-tap filters ----
    movq        mm1, mm0
    psrlq       mm1, 16                 ; mm1 = [xx xx t2 t1 t0 lt l0 l1]
    movq        mm2, mm0
    psrlq       mm2, 8                  ; mm2 = [xx t2 t1 t0 lt l0 l1 l2]
    movq        mm3, mm2
    movq        mm4, mm1

    ; (outer + outer) >> 1 without rounding: pavgb rounds up, so subtract
    ; 1 wherever the two inputs differ in their lowest bit.
    pavgb       mm1, mm0
    pxor        mm4, mm0
    pand        mm4, [pic(mmx_01bytes)]
    psubusb     mm1, mm4

    pavgb       mm2, mm1                ; mm2 = [xx xx d  c  b  f  h  j]

    movq        mm4, mm0
    pavgb       mm3, mm4                ; mm3 = [xx xx xx xx a  e  g  i]
    punpcklbw   mm3, mm2                ; mm3 = [b  a  f  e  h  g  j  i]

    ; ---- build row 0 = a b c d ----
    psrlq       mm2, 32
    psllq       mm2, 48                 ; mm2 = [d  c  0  0  0  0  0  0]
    movq        mm4, mm3
    psrlq       mm4, 16                 ; mm4 = [0  0  b  a  f  e  h  g]
    pxor        mm2, mm4                ; mm2 = [d  c  b  a  xx xx xx xx]
    psrlq       mm2, 32                 ; mm2 = [xx xx xx xx d  c  b  a]

    ; ---- store: rows 3,2,1 are 4-byte windows of mm3 ----
    movd        [r0], mm2               ; row 0 = a b c d
    movd        [r0+12], mm3            ; row 3 = i j g h
    psrlq       mm3, 16
    movd        [r0+8], mm3             ; row 2 = g h e f
    psrlq       mm3, 16
    movd        [r0+4], mm3             ; row 1 = e f a b
    DEINIT_X86_32_PIC
    WELSEMMS
    ret
679
;***********************************************************************
;   lt|t0|t1|t2|t3|
;   l0|
;   l1|
;   l2|
;   l3|
;   only l0..l3 are read; lt and t0..t3 are never used
;   destination:
;   |a |b |c |d |
;   |c |d |e |f |
;   |e |f |g |g |
;   |g |g |g |g |
;
;   a = (1 + l0 + l1)>>1
;   c = (1 + l1 + l2)>>1
;   e = (1 + l2 + l3)>>1
;   g = l3
;
;   b = (2 + l0 + (l1<<1) + l2)>>2
;   d = (2 + l1 + (l2<<1) + l3)>>2
;   f = (2 + l2 + (l3<<1) + l3)>>2
;
;   [g g f e d c b a] + [g g g g] --> mov to memory
;
;   void WelsI4x4LumaPredHU_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
;
;   Register pictures in the comments are written high byte first.
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredHU_mmx
    %assign push_num 0
    INIT_X86_32_PIC r3
    LOAD_3_PARA
    SIGN_EXTENSION r2, r2d

    ; ---- pack the left column into the top of mm0 ----
    movd        mm0, [r1-4]             ; mm0[3] = l0
    punpcklbw   mm0, [r1+r2-4]          ; mm0[7] = l1, mm0[6] = l0
    lea         r1, [r1+2*r2]
    movd        mm2, [r1-4]             ; mm2[3] = l2
    movd        mm4, [r1+r2-4]          ; mm4[3] = l3
    punpcklbw   mm2, mm4                ; mm2[7] = l3, mm2[6] = l2
    punpckhwd   mm0, mm2                ; mm0 = [l3 l2 l1 l0 xx xx xx xx]

    psrlq       mm4, 24
    psllq       mm4, 56                 ; mm4 = [l3 0  0  0  0  0  0  0]
    psrlq       mm0, 8
    pxor        mm0, mm4                ; mm0 = [l3 l3 l2 l1 l0 xx xx xx]

    ; ---- 2-tap averages ----
    movq        mm1, mm0
    psllq       mm1, 8                  ; mm1 = [l3 l2 l1 l0 xx xx xx xx]
    movq        mm3, mm1                ; mm3 = same, centre tap for later
    pavgb       mm1, mm0                ; mm1 = [g  e  c  a  xx xx xx xx]

    ; ---- 3-tap filter ----
    movq        mm2, mm0
    psllq       mm2, 16                 ; mm2 = [l2 l1 l0 xx xx xx xx xx]
    movq        mm5, mm2
    pavgb       mm2, mm0

    ; undo pavgb's round-up where the two outer taps differ in bit 0
    pxor        mm5, mm0
    pand        mm5, [pic(mmx_01bytes)]
    psubusb     mm2, mm5

    pavgb       mm2, mm3                ; mm2 = [f  d  b  xx xx xx xx xx]

    psrlq       mm2, 8
    pxor        mm2, mm4                ; mm2 = [g  f  d  b  xx xx xx xx]

    ; ---- interleave and store ----
    punpckhbw   mm1, mm2                ; mm1 = [g  g  f  e  d  c  b  a]
    punpckhbw   mm4, mm4                ; mm4 = [g  g  xx xx xx xx xx xx]
    punpckhbw   mm4, mm4                ; mm4 = [g  g  g  g  xx xx xx xx]

    psrlq       mm4, 32
    movd        [r0+12], mm4            ; row 3 = g g g g

    movd        [r0], mm1               ; row 0 = a b c d
    psrlq       mm1, 16
    movd        [r0+4], mm1             ; row 1 = c d e f
    psrlq       mm1, 16
    movd        [r0+8], mm1             ; row 2 = e f g g
    DEINIT_X86_32_PIC
    WELSEMMS
    ret
758
759
760
;***********************************************************************
;   lt|t0|t1|t2|t3|
;   l0|
;   l1|
;   l2|
;   l3|
;   l3 is never used
;   destination:
;   |a |b |c |d |
;   |e |f |g |h |
;   |i |a |b |c |
;   |j |e |f |g |
;
;   a = (1 + lt + t0)>>1
;   b = (1 + t0 + t1)>>1
;   c = (1 + t1 + t2)>>1
;   d = (1 + t2 + t3)>>1
;
;   e = (2 + l0 + (lt<<1) + t0)>>2
;   f = (2 + lt + (t0<<1) + t1)>>2
;   g = (2 + t0 + (t1<<1) + t2)>>2
;
;   h = (2 + t1 + (t2<<1) + t3)>>2
;   i = (2 + lt + (l0<<1) + l1)>>2
;   j = (2 + l0 + (l1<<1) + l2)>>2
;
;   void WelsI4x4LumaPredVR_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
;
;   Register pictures in the comments are written high byte first.
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredVR_mmx
    %assign push_num 0
    INIT_X86_32_PIC r3
    LOAD_3_PARA
    SIGN_EXTENSION r2, r2d

    ; ---- pack the eight neighbours into mm0 ----
    sub         r1, r2                  ; r1 -> t0 (row above the block)
    movq        mm0, [r1-1]             ; mm0 = [xx xx xx t3 t2 t1 t0 lt]
    psllq       mm0, 24                 ; mm0 = [t3 t2 t1 t0 lt 0  0  0 ]

    movd        mm1, [r1+2*r2-4]        ; dword ending on l1
    punpcklbw   mm1, [r1+r2-4]          ; mm1[7] = l0, mm1[6] = l1
    lea         r1, [r1+2*r2]
    movq        mm2, [r1+r2-8]          ; mm2[7] = l2
    punpckhwd   mm2, mm1                ; mm2 = [l0 l1 l2 xx xx xx xx xx]
    psrlq       mm2, 40
    pxor        mm0, mm2                ; mm0 = [t3 t2 t1 t0 lt l0 l1 l2]

    ; ---- 2-tap averages: a..d ----
    movq        mm1, mm0
    psllq       mm1, 8                  ; mm1 = [t2 t1 t0 lt l0 l1 l2 0 ]
    pavgb       mm1, mm0                ; mm1 = [d  c  b  a  xx xx xx xx]

    ; ---- 3-tap filter: e..j ----
    movq        mm2, mm0
    psllq       mm2, 16                 ; mm2 = [t1 t0 lt l0 l1 l2 0  0 ]
    movq        mm3, mm2
    pavgb       mm2, mm0

    ; undo pavgb's round-up where the two outer taps differ in bit 0
    pxor        mm3, mm0
    pand        mm3, [pic(mmx_01bytes)]
    psubusb     mm2, mm3

    movq        mm3, mm0
    psllq       mm3, 8                  ; mm3 = [t2 t1 t0 lt l0 l1 l2 0 ]  (centre tap)
    pavgb       mm3, mm2                ; mm3 = [h  g  f  e  i  j  xx xx]
    movq        mm2, mm3

    ; ---- rows 0 and 1 ----
    psrlq       mm1, 32                 ; mm1 = [0  0  0  0  d  c  b  a ]
    movd        [r0], mm1               ; row 0 = a b c d

    psrlq       mm2, 32                 ; mm2 = [0  0  0  0  h  g  f  e ]
    movd        [r0+4], mm2             ; row 1 = e f g h

    ; ---- isolate i and j ----
    movq        mm4, mm3
    psllq       mm4, 32
    psrlq       mm4, 56                 ; mm4 = [0  0  0  0  0  0  0  i ]

    movq        mm5, mm3
    psllq       mm5, 40
    psrlq       mm5, 56                 ; mm5 = [0  0  0  0  0  0  0  j ]

    ; ---- rows 2 and 3: rows 0/1 shifted right one pixel, i/j shifted in ----
    psllq       mm1, 8
    pxor        mm4, mm1                ; low dword = [c  b  a  i ]
    movd        [r0+8], mm4             ; row 2 = i a b c

    psllq       mm2, 8
    pxor        mm5, mm2                ; low dword = [g  f  e  j ]
    movd        [r0+12], mm5            ; row 3 = j e f g
    DEINIT_X86_32_PIC
    WELSEMMS
    ret
848
;***********************************************************************
;   lt|t0|t1|t2|t3|t4|t5|t6|t7
;   l0|
;   l1|
;   l2|
;   l3|
;   only t0..t7 are read; lt and l0..l3 are never used
;   destination:
;   |a |b |c |d |
;   |b |c |d |e |
;   |c |d |e |f |
;   |d |e |f |g |
;
;   a = (2 + t0 + t2 + (t1<<1))>>2
;   b = (2 + t1 + t3 + (t2<<1))>>2
;   c = (2 + t2 + t4 + (t3<<1))>>2
;   d = (2 + t3 + t5 + (t4<<1))>>2
;
;   e = (2 + t4 + t6 + (t5<<1))>>2
;   f = (2 + t5 + t7 + (t6<<1))>>2
;   g = (2 + t6 + t7 + (t7<<1))>>2
;
;   [g f e d c b a] --> mov to memory
;
;   void WelsI4x4LumaPredDDL_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
;
;   Register pictures in the comments are written high byte first.
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredDDL_mmx
    %assign push_num 0
    INIT_X86_32_PIC r3
    LOAD_3_PARA
    SIGN_EXTENSION r2, r2d
    sub         r1, r2                  ; r1 -> t0 (row above the block)
    movq        mm0, [r1]               ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]  (centre tap)
    movq        mm1, mm0
    movq        mm2, mm0

    movq        mm3, mm0
    psrlq       mm3, 56
    psllq       mm3, 56                 ; mm3 = [t7 0  0  0  0  0  0  0 ]

    psllq       mm1, 8                  ; mm1 = [t6 t5 t4 t3 t2 t1 t0 0 ]  (left tap)
    psrlq       mm2, 8
    pxor        mm2, mm3                ; mm2 = [t7 t7 t6 t5 t4 t3 t2 t1]  (right tap, t7 repeated)

    ; (left + right) >> 1 without rounding: pavgb rounds up, so subtract 1
    ; wherever the two taps differ in their lowest bit.
    movq        mm3, mm1
    pavgb       mm1, mm2
    pxor        mm3, mm2
    pand        mm3, [pic(mmx_01bytes)]
    psubusb     mm1, mm3

    pavgb       mm0, mm1                ; mm0 = [g  f  e  d  c  b  a  xx]

    ; Each row is the previous one shifted by one pixel.
    psrlq       mm0, 8
    movd        [r0], mm0               ; row 0 = a b c d
    psrlq       mm0, 8
    movd        [r0+4], mm0             ; row 1 = b c d e
    psrlq       mm0, 8
    movd        [r0+8], mm0             ; row 2 = c d e f
    psrlq       mm0, 8
    movd        [r0+12], mm0            ; row 3 = d e f g
    DEINIT_X86_32_PIC
    WELSEMMS
    ret
912
913
;***********************************************************************
;   lt|t0|t1|t2|t3|t4|t5|t6|t7
;   l0|
;   l1|
;   l2|
;   l3|
;   only the top row is read; lt and l0..l3 are never used
;   destination:
;   |a |b |c |d |
;   |e |f |g |h |
;   |b |c |d |i |
;   |f |g |h |j |
;
;   a = (1 + t0 + t1)>>1
;   b = (1 + t1 + t2)>>1
;   c = (1 + t2 + t3)>>1
;   d = (1 + t3 + t4)>>1
;   i = (1 + t4 + t5)>>1
;
;   e = (2 + t0 + (t1<<1) + t2)>>2
;   f = (2 + t1 + (t2<<1) + t3)>>2
;   g = (2 + t2 + (t3<<1) + t4)>>2
;   h = (2 + t3 + (t4<<1) + t5)>>2
;   j = (2 + t4 + (t5<<1) + t6)>>2
;
;   [i d c b a] + [j h g f e] --> mov to memory
;
;   void WelsI4x4LumaPredVL_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
;
;   Register pictures in the comments are written high byte first.
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredVL_mmx
    %assign push_num 0
    INIT_X86_32_PIC r3
    LOAD_3_PARA
    SIGN_EXTENSION r2, r2d
    sub         r1, r2                  ; r1 -> t0 (row above the block)
    movq        mm0, [r1]               ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
    movq        mm1, mm0
    movq        mm2, mm0

    psrlq       mm1, 8                  ; mm1 = [0  t7 t6 t5 t4 t3 t2 t1]
    psrlq       mm2, 16                 ; mm2 = [0  0  t7 t6 t5 t4 t3 t2]

    ; ---- 2-tap averages ----
    movq        mm3, mm1
    pavgb       mm3, mm0                ; mm3 = [xx xx xx i  d  c  b  a ]

    ; ---- 3-tap filter ----
    ; (outer + outer) >> 1 without rounding: pavgb rounds up, so subtract
    ; 1 wherever the two outer taps differ in their lowest bit.
    movq        mm4, mm2
    pavgb       mm2, mm0
    pxor        mm4, mm0
    pand        mm4, [pic(mmx_01bytes)]
    psubusb     mm2, mm4

    pavgb       mm2, mm1                ; mm2 = [xx xx xx j  h  g  f  e ]

    ; ---- store: rows 2/3 are rows 0/1 shifted by one pixel ----
    movd        [r0], mm3               ; row 0 = a b c d
    psrlq       mm3, 8
    movd        [r0+8], mm3             ; row 2 = b c d i

    movd        [r0+4], mm2             ; row 1 = e f g h
    psrlq       mm2, 8
    movd        [r0+12], mm2            ; row 3 = f g h j
    DEINIT_X86_32_PIC
    WELSEMMS
    ret
977
;***********************************************************************
;   void WelsIChromaPredDc_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
;
;   8x8 chroma intra DC prediction.
;
;   Inputs read:
;       top[0..7]  : 8 bytes at pRef - stride
;       left[0..7] : the byte at pRef + row*stride - 1 for row = 0..7
;
;   One DC value per 4x4 quadrant:
;       top-left     = (top[0..3] + left[0..3] + 4) >> 3
;       top-right    = (top[4..7]              + 2) >> 2
;       bottom-left  = (left[4..7]             + 2) >> 2
;       bottom-right = (top[4..7] + left[4..7] + 4) >> 3
;   NOTE(review): the rounding terms assume mmx_0x02 is the 64-bit value 2
;   and the byte replication assumes mmx_01bytes is 0x01 in every byte;
;   both constants are defined outside this block.
;
;   Output: 8 rows of 8 bytes stored contiguously at pred (64 bytes).
;   r3/r4 are used as scratch and restored before returning.
;***********************************************************************
WELS_EXTERN WelsIChromaPredDc_sse2
    push        r3
    push        r4
    %assign push_num 2
    INIT_X86_32_PIC r5
    LOAD_3_PARA
    SIGN_EXTENSION r2, r2d

    sub         r1, r2                      ; r1 -> row above the block
    movq        mm0, [r1]                   ; mm0 = top[7..0]

    ; ---- top row: split into two 4-sample sums -----------------------
    pxor        mm4, mm4
    movq        mm3, mm0
    psrlq       mm0, 32                     ; mm0 = top[4..7] in the low dword
    psllq       mm3, 32
    psrlq       mm3, 32                     ; mm3 = top[0..3], high dword cleared
    psadbw      mm0, mm4                    ; mm0 = sum(top[4..7])
    psadbw      mm3, mm4                    ; mm3 = sum(top[0..3])

    ; ---- left column, rows 0..3 -> mm1 -------------------------------
    movzx       r3, byte [r1+r2-1]          ; left[0]
    lea         r1, [r1+2*r2]
    movzx       r4, byte [r1-1]             ; left[1]
    add         r3, r4
    movzx       r4, byte [r1+r2-1]          ; left[2]
    add         r3, r4
    lea         r1, [r1+2*r2]
    movzx       r4, byte [r1-1]             ; left[3]
    add         r3, r4
    movd        mm1, r3d                    ; mm1 = sum(left[0..3])

    ; ---- left column, rows 4..7 -> mm2 -------------------------------
    movzx       r3, byte [r1+r2-1]          ; left[4]
    lea         r1, [r1+2*r2]
    movzx       r4, byte [r1-1]             ; left[5]
    add         r3, r4
    movzx       r4, byte [r1+r2-1]          ; left[6]
    add         r3, r4
    lea         r1, [r1+2*r2]
    movzx       r4, byte [r1-1]             ; left[7]
    add         r3, r4
    movd        mm2, r3d                    ; mm2 = sum(left[4..7])

    ; ---- combine into the four quadrant sums -------------------------
    movq        mm4, [pic(mmx_0x02)]        ; rounding term
    paddq       mm3, mm1                    ; mm3 = top[0..3] + left[0..3]  (top-left)
    movq        mm1, mm2
    paddq       mm1, mm0                    ; mm1 = left[4..7] + top[4..7]  (bottom-right)

    ; ---- upper half: mm0 = [top-right x4 | top-left x4] ----------------
    paddq       mm3, mm4
    paddq       mm3, mm4
    psrlq       mm3, 3                      ; top-left DC
    pmuludq     mm3, [pic(mmx_01bytes)]     ; copy the DC byte into 4 bytes
    paddq       mm0, mm4
    psrlq       mm0, 2                      ; top-right DC
    pmuludq     mm0, [pic(mmx_01bytes)]
    psllq       mm0, 32                     ; top-right goes to bytes 4..7
    pxor        mm0, mm3                    ; merge with top-left in bytes 0..3

    ; ---- lower half: mm1 = [bottom-right x4 | bottom-left x4] ----------
    paddq       mm2, mm4
    psrlq       mm2, 2                      ; bottom-left DC
    pmuludq     mm2, [pic(mmx_01bytes)]
    paddq       mm1, mm4
    paddq       mm1, mm4
    psrlq       mm1, 3                      ; bottom-right DC
    pmuludq     mm1, [pic(mmx_01bytes)]
    psllq       mm1, 32                     ; bottom-right goes to bytes 4..7
    pxor        mm1, mm2                    ; merge with bottom-left in bytes 0..3

    ; ---- rows 0..3 take the upper pattern, rows 4..7 the lower one ----
    movq        [r0], mm0
    movq        [r0+8], mm0
    movq        [r0+16], mm0
    movq        [r0+24], mm0

    movq        [r0+32], mm1
    movq        [r0+40], mm1
    movq        [r0+48], mm1
    movq        [r0+56], mm1

    DEINIT_X86_32_PIC
    pop         r4
    pop         r3
    WELSEMMS
    ret
1067
1068
1069
;***********************************************************************
;   void WelsI16x16LumaPredDc_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
;
;   16x16 luma intra DC prediction.
;
;   The 16 bytes at pRef - stride (row above the block) are summed with
;   psadbw; left-column bytes (the byte just before each row start) are
;   accumulated in r3.  The DC value is (total + 16) >> 5 and is written
;   to every byte of the 16 rows x 16 bytes stored contiguously at pred.
;
;   NOTE(review): LOAD_2_LEFT_AND_ADD is defined outside this block;
;   presumably each expansion steps r1 down two rows and adds two more
;   left-column bytes to r3 (using r4 as scratch), so that the seven
;   expansions cover rows 2..15 -- confirm against the macro.
;   NOTE(review): the byte broadcast assumes mmx_01bytes is 0x01 in every
;   byte (defined outside this block).
;
;   movdqa is used for the top-row load and all stores, so pRef - stride
;   and pred must be 16-byte aligned.
;   r3/r4 are used as scratch and restored before returning.
;***********************************************************************
WELS_EXTERN WelsI16x16LumaPredDc_sse2
    push        r3
    push        r4
    %assign push_num 2
    INIT_X86_32_PIC r5
    LOAD_3_PARA
    SIGN_EXTENSION r2, r2d

    ; ---- sum of the 16 top neighbours --------------------------------
    sub         r1, r2                      ; r1 -> row above the block
    movdqa      xmm0, [r1]                  ; all 16 top samples
    pxor        xmm1, xmm1
    psadbw      xmm0, xmm1                  ; one partial sum per 8-byte half
    movdqa      xmm1, xmm0
    psrldq      xmm1, 8                     ; xmm1 = upper-half sum moved to the bottom
    pslldq      xmm0, 8
    psrldq      xmm0, 8                     ; xmm0 = lower-half sum, upper qword cleared
    paddw       xmm0, xmm1                  ; xmm0 = sum of the top row

    ; ---- sum of the left neighbours ----------------------------------
    movzx       r3, byte [r1+r2-1]          ; left of row 0
    movzx       r4, byte [r1+2*r2-1]        ; left of row 1
    add         r3, r4
    lea         r1, [r1+r2]                 ; r1 -> row 0
    %rep 7
    LOAD_2_LEFT_AND_ADD
    %endrep
    add         r3, 16                      ; rounding term for the >> 5 below

    ; ---- DC value, broadcast to all 16 bytes -------------------------
    movd        xmm1, r3d
    paddw       xmm0, xmm1                  ; top + left + 16
    psrld       xmm0, 5
    pmuludq     xmm0, [pic(mmx_01bytes)]    ; copy the DC byte into 4 bytes
    pshufd      xmm0, xmm0, 0               ; copy dword 0 into all four dwords

    ; ---- fill the 16 output rows (16 bytes each) ---------------------
    %assign i16dc_row_off 0
    %rep 16
    movdqa      [r0+i16dc_row_off], xmm0
    %assign i16dc_row_off i16dc_row_off+16
    %endrep
    %undef i16dc_row_off

    DEINIT_X86_32_PIC
    pop         r4
    pop         r3
    ret
1130