• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1;*!
2;* \copy
3;*     Copyright (c)  2009-2013, Cisco Systems
4;*     All rights reserved.
5;*
6;*     Redistribution and use in source and binary forms, with or without
7;*     modification, are permitted provided that the following conditions
8;*     are met:
9;*
10;*        * Redistributions of source code must retain the above copyright
11;*          notice, this list of conditions and the following disclaimer.
12;*
13;*        * Redistributions in binary form must reproduce the above copyright
14;*          notice, this list of conditions and the following disclaimer in
15;*          the documentation and/or other materials provided with the
16;*          distribution.
17;*
18;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29;*     POSSIBILITY OF SUCH DAMAGE.
30;*
31;*
32;*  score.asm
33;*
34;*  Abstract
35;*      Coefficient scan, score and nonzero-count routines (SSE2, with SSSE3 and SSE4.2 variants)
36;*
37;*  History
38;*      8/21/2009 Created
39;*
40;*
41;*************************************************************************/
42
43%include "asm_inc.asm"
44
45;***********************************************************************
46; Macros
47;***********************************************************************
48
49;***********************************************************************
50; Local Data (Read Only)
51;***********************************************************************
52%ifdef X86_32_PICASM
53SECTION .text align=16
54%else
55SECTION .rodata align=16
56%endif
57
58;align 16
59;se2_2 dw 2, 2, 2, 2, 2, 2, 2, 2
60align 16
61sse2_1: dw 1, 1, 1, 1, 1, 1, 1, 1
62align 16
63sse2_b1: db 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
64i_ds_table: db 3, 2, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
65align 16
66sse2_plane_inc_minus: dw -7, -6, -5, -4, -3, -2, -1, 0
67align 16
68sse2_plane_inc: dw 1, 2, 3, 4, 5, 6, 7, 8
69align 16
70sse2_plane_dec: dw 8, 7, 6, 5, 4, 3, 2, 1
71align 16
72pb_scanacdc_maska:db 0,1,2,3,8,9,14,15,10,11,4,5,6,7,12,13
73align 16
74pb_scanacdc_maskb:db 2,3,8,9,10,11,4,5,0,1,6,7,12,13,14,15
75align 16
76pb_scandc_maska:db 2,3,8,9,14,15,10,11,4,5,6,7,12,13,0,1
77align 16
78pb_scandc_maskb:db 8,9,10,11,4,5,0,1,6,7,12,13,14,15,128,128
79
80align 16
; 256-entry byte table: entry i holds the number of 1 bits in i (i = 0..255).
; WelsGetNoneZeroCount_sse2 looks up each byte of its 16-bit nonzero mask here
; and adds the two results.
81nozero_count_table:
82db  0,1,1,2,1,2,2,3,1,2
83db  2,3,2,3,3,4,1,2,2,3
84db  2,3,3,4,2,3,3,4,3,4
85db  4,5,1,2,2,3,2,3,3,4
86db  2,3,3,4,3,4,4,5,2,3
87db  3,4,3,4,4,5,3,4,4,5
88db  4,5,5,6,1,2,2,3,2,3
89db  3,4,2,3,3,4,3,4,4,5
90db  2,3,3,4,3,4,4,5,3,4
91db  4,5,4,5,5,6,2,3,3,4
92db  3,4,4,5,3,4,4,5,4,5
93db  5,6,3,4,4,5,4,5,5,6
94db  4,5,5,6,5,6,6,7,1,2
95db  2,3,2,3,3,4,2,3,3,4
96db  3,4,4,5,2,3,3,4,3,4
97db  4,5,3,4,4,5,4,5,5,6
98db  2,3,3,4,3,4,4,5,3,4
99db  4,5,4,5,5,6,3,4,4,5
100db  4,5,5,6,4,5,5,6,5,6
101db  6,7,2,3,3,4,3,4,4,5
102db  3,4,4,5,4,5,5,6,3,4
103db  4,5,4,5,5,6,4,5,5,6
104db  5,6,6,7,3,4,4,5,4,5
105db  5,6,4,5,5,6,5,6,6,7
106db  4,5,5,6,5,6,6,7,5,6
107db  6,7,6,7,7,8
108
109align 16
; Byte table indexed in WelsCalculateSingleCtr4x4_sse2 by bits 8..15 of the
; nonzero-coefficient mask; the byte read is added to the returned score.
; NOTE(review): how the individual values were derived is not shown in this
; file -- treat them as opaque precomputed data.
110high_mask_table:
111    db  0, 0, 0, 3, 0, 2, 3, 6, 0, 2
112    db  2, 5, 3, 5, 6, 9, 0, 1, 2, 5
113    db  2, 4, 5, 8, 3, 5, 5, 8, 6, 8
114    db  9,12, 0, 1, 1, 4, 2, 4, 5, 8
115    db  2, 4, 4, 7, 5, 7, 8,11, 3, 4
116    db  5, 8, 5, 7, 8,11, 6, 8, 8,11
117    db  9,11,12,15, 0, 1, 1, 4, 1, 3
118    db  4, 7, 2, 4, 4, 7, 5, 7, 8,11
119    db  2, 3, 4, 7, 4, 6, 7,10, 5, 7
120    db  7,10, 8,10,11,14, 3, 4, 4, 7
121    db  5, 7, 8,11, 5, 7, 7,10, 8,10
122    db 11,14, 6, 7, 8,11, 8,10,11,14
123    db  9,11,11,14,12,14,15,18, 0, 0
124    db  1, 4, 1, 3, 4, 7, 1, 3, 3, 6
125    db  4, 6, 7,10, 2, 3, 4, 7, 4, 6
126    db  7,10, 5, 7, 7,10, 8,10,11,14
127    db  2, 3, 3, 6, 4, 6, 7,10, 4, 6
128    db  6, 9, 7, 9,10,13, 5, 6, 7,10
129    db  7, 9,10,13, 8,10,10,13,11,13
130    db 14,17, 3, 4, 4, 7, 4, 6, 7,10
131    db  5, 7, 7,10, 8,10,11,14, 5, 6
132    db  7,10, 7, 9,10,13, 8,10,10,13
133    db 11,13,14,17, 6, 7, 7,10, 8,10
134    db 11,14, 8,10,10,13,11,13,14,17
135    db  9,10,11,14,11,13,14,17,12,14
136    db 14,17,15,17,18,21
137
138align 16
; Byte table indexed in WelsCalculateSingleCtr4x4_sse2 by bits 0..7 of the
; nonzero-coefficient mask; the byte read is added to the returned score.
; NOTE(review): how the individual values were derived is not shown in this
; file -- treat them as opaque precomputed data.
139low_mask_table:
140    db  0, 3, 2, 6, 2, 5, 5, 9, 1, 5
141    db  4, 8, 5, 8, 8,12, 1, 4, 4, 8
142    db  4, 7, 7,11, 4, 8, 7,11, 8,11
143    db 11,15, 1, 4, 3, 7, 4, 7, 7,11
144    db  3, 7, 6,10, 7,10,10,14, 4, 7
145    db  7,11, 7,10,10,14, 7,11,10,14
146    db 11,14,14,18, 0, 4, 3, 7, 3, 6
147    db  6,10, 3, 7, 6,10, 7,10,10,14
148    db  3, 6, 6,10, 6, 9, 9,13, 6,10
149    db  9,13,10,13,13,17, 4, 7, 6,10
150    db  7,10,10,14, 6,10, 9,13,10,13
151    db 13,17, 7,10,10,14,10,13,13,17
152    db 10,14,13,17,14,17,17,21, 0, 3
153    db  3, 7, 3, 6, 6,10, 2, 6, 5, 9
154    db  6, 9, 9,13, 3, 6, 6,10, 6, 9
155    db  9,13, 6,10, 9,13,10,13,13,17
156    db  3, 6, 5, 9, 6, 9, 9,13, 5, 9
157    db  8,12, 9,12,12,16, 6, 9, 9,13
158    db  9,12,12,16, 9,13,12,16,13,16
159    db 16,20, 3, 7, 6,10, 6, 9, 9,13
160    db  6,10, 9,13,10,13,13,17, 6, 9
161    db  9,13, 9,12,12,16, 9,13,12,16
162    db 13,16,16,20, 7,10, 9,13,10,13
163    db 13,17, 9,13,12,16,13,16,16,20
164    db 10,13,13,17,13,16,16,20,13,17
165    db 16,20,17,20,20,24
166
167
168SECTION .text
169
170;***********************************************************************
171;void WelsScan4x4DcAc_sse2( int16_t level[16], int16_t *pDct )
;
; Reorders the sixteen 16-bit coefficients of a 4x4 block using SSE2 only.
; The code reads 32 bytes from [r1] and writes 32 bytes to [r0]; LOAD_2_PARA
; (asm_inc.asm, not visible here) is expected to put level in r0 and pDct in r1.
; Resulting order, level[i] = pDct[k] with k =
;     0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
; All memory accesses are movdqa, so both buffers must be 16-byte aligned.
; Lane comments list the eight words of a register from most- to least-
; significant; each hex digit is the source index in pDct.
; On X86_32 builds r3 is saved and restored around the body, and push_num
; records that push (presumably so LOAD_2_PARA can locate stack arguments --
; confirm in asm_inc.asm).
172;***********************************************************************
173WELS_EXTERN WelsScan4x4DcAc_sse2
174    %ifdef X86_32
175    push r3
176    %assign push_num 1
177    %else
178    %assign push_num 0
179    %endif
180    LOAD_2_PARA
181    movdqa     xmm0, [r1]           ; 7 6 5 4 3 2 1 0
182    movdqa     xmm1, [r1+16]        ; f e d c b a 9 8
183    pextrw     r2d, xmm0, 7         ; r2d = coeff 7
184    pextrw     r3d, xmm1, 2         ; r3d = coeff a
185    pextrw     r1d, xmm0, 5         ; r1d = coeff 5 (the source pointer in r1 is no longer needed)
186    pinsrw     xmm1, r2d, 2         ; f e d c b 7 9 8
187    pinsrw     xmm0, r1d, 7         ; 5 6 5 4 3 2 1 0
188    pextrw     r2d, xmm1, 0         ; r2d = coeff 8
189    pinsrw     xmm0, r2d, 5         ; 5 6 8 4 3 2 1 0
190    pinsrw     xmm1, r3d, 0         ; f e d c b 7 9 a
191    pshufd     xmm2, xmm0, 0xd8     ; 5 6 3 2 8 4 1 0   (swap the two middle dwords)
192    pshufd     xmm3, xmm1, 0xd8     ; f e b 7 d c 9 a   (swap the two middle dwords)
193    pshufhw    xmm0, xmm2, 0x93     ; 6 3 2 5 8 4 1 0   (rotate the four high words)
194    pshuflw    xmm1, xmm3, 0x39     ; f e b 7 a d c 9   (rotate the four low words)
195    movdqa     [r0],xmm0            ; level[0..7]
196    movdqa     [r0+16], xmm1        ; level[8..15]
197    %ifdef X86_32
198    pop r3
199    %endif
200    ret
201
202;***********************************************************************
203;void WelsScan4x4DcAc_ssse3( int16_t level[16], int16_t *pDct )
;
; SSSE3 version of the 4x4 coefficient reordering: reads 32 bytes from [r1]
; and writes 32 bytes to [r0] (LOAD_2_PARA, defined in asm_inc.asm, is expected
; to put level in r0 and pDct in r1).
; Words 7 and 8 are the only ones that must cross between the two registers;
; after swapping them, one pshufb per register finishes the permutation.
; Resulting order, level[i] = pDct[k] with k =
;     0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
; All memory accesses are movdqa, so both buffers must be 16-byte aligned.
; INIT_X86_32_PIC / DEINIT_X86_32_PIC / pic() come from asm_inc.asm (not
; visible here); r3 is handed to INIT_X86_32_PIC, presumably as the base
; register for position-independent access to the masks -- confirm there.
204;***********************************************************************
205WELS_EXTERN WelsScan4x4DcAc_ssse3
206    %assign push_num 0
207    INIT_X86_32_PIC r3
208    LOAD_2_PARA
209    movdqa     xmm0, [r1]               ; words 0..7
210    movdqa     xmm1, [r1+16]            ; words 8..15
211    pextrw      r2d,  xmm0, 7           ; r2d = [7]
212    pextrw      r1d,  xmm1, 0           ; r1d = [8] (the source pointer in r1 is no longer needed)
213    pinsrw      xmm0, r1d, 7            ; xmm0[7]   =   [8]
214    pinsrw      xmm1, r2d, 0            ; xmm1[0]   =   [7]
215    pshufb      xmm1, [pic(pb_scanacdc_maskb)] ; low->high: 9 c d a 7 b e f
216    pshufb      xmm0, [pic(pb_scanacdc_maska)] ; low->high: 0 1 4 8 5 2 3 6
217
218    movdqa     [r0],xmm0                ; level[0..7]
219    movdqa     [r0+16], xmm1            ; level[8..15]
220    DEINIT_X86_32_PIC
221    ret
222;***********************************************************************
223;void WelsScan4x4Ac_sse2( int16_t* zig_value, int16_t* pDct )
;
; Reorders a 4x4 block of 16-bit coefficients and drops the first (DC) one:
; reads 32 bytes from [r1] and writes 32 bytes to [r0] (LOAD_2_PARA, defined
; in asm_inc.asm, is expected to put zig_value in r0 and pDct in r1).
; Result, zig_value[i] = pDct[k] with k =
;     1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
; for i = 0..14, and zig_value[15] = 0.
; All memory accesses are movdqa, so both buffers must be 16-byte aligned.
; Lane comments below list words from least- to most-significant.
224;***********************************************************************
225WELS_EXTERN WelsScan4x4Ac_sse2
226    %assign push_num 0
227    LOAD_2_PARA
228    movdqa     xmm0, [r1]           ; 0 1 2 3 4 5 6 7
229    movdqa     xmm1, [r1+16]        ; 8 9 a b c d e f
230    movdqa     xmm2, xmm0
231    punpcklqdq xmm0, xmm1           ; 0 1 2 3 8 9 a b
232    punpckhqdq xmm2, xmm1           ; 4 5 6 7 c d e f
233
234    movdqa     xmm3, xmm0
235    punpckldq  xmm0, xmm2           ; 0 1 4 5 2 3 6 7
236    punpckhdq  xmm3, xmm2           ; 8 9 c d a b e f
; Exchange the three words that sit in the wrong register half:
; 5 -> xmm0[7], 7 -> xmm3[4], a -> xmm3[0], 8 -> xmm0[3].
237    pextrw     r1d , xmm0, 3        ; r1d = 5 (the source pointer in r1 is no longer needed)
238    pextrw     r2d , xmm0, 7        ; r2d = 7
239    pinsrw     xmm0, r1d,  7        ; 0 1 4 5 2 3 6 5
240    pextrw     r1d,  xmm3, 4        ; r1d = a
241    pinsrw     xmm3, r2d,  4        ; 8 9 c d 7 b e f
242    pextrw     r2d,  xmm3, 0        ; r2d = 8
243    pinsrw     xmm3, r1d,  0        ; a 9 c d 7 b e f
244    pinsrw     xmm0, r2d,  3        ; 0 1 4 8 2 3 6 5
245
246    pshufhw    xmm1, xmm0, 0x93     ; 0 1 4 8 5 2 3 6
247    pshuflw    xmm2, xmm3, 0x39     ; 9 c d a 7 b e f
248
; Shift the whole 16-word sequence down by one word to discard the DC term.
249    movdqa     xmm3, xmm2
250    psrldq     xmm1, 2              ; 1 4 8 5 2 3 6 0(zero)
251    pslldq     xmm3, 14             ; 0 0 0 0 0 0 0 9
252    por        xmm1, xmm3           ; 1 4 8 5 2 3 6 9
253    psrldq     xmm2, 2              ; c d a 7 b e f 0(zero)
254    movdqa     [r0],xmm1            ; zig_value[0..7]
255    movdqa     [r0+16], xmm2        ; zig_value[8..15]
256    ret
257
258
259;***********************************************************************
260;int32_t WelsCalculateSingleCtr4x4_sse2( int16_t *pDct );
;
; Computes a score for the 16 coefficients at [r0] from their zero/nonzero
; pattern. Steps:
;   1. Build a 16-bit mask in r3 with bit i set when coefficient i is nonzero.
;   2. r2 = highest set bit position among bits 7..1 (0 if none of them is
;      set); r1 = lowest set bit position among bits 8..15 (16 if none is
;      set). i_ds_table[r1 - r2 - 1] is added to the score.
;   3. low_mask_table[mask & 0xff] and high_mask_table[mask >> 8] are added.
; The sum is accumulated in the low byte of r0 and returned.
; The input is read with movdqa, so pDct must be 16-byte aligned.
; On X86_32 builds r3 is saved and restored around the body. On other builds
; the result is copied from r0d to retrd; on X86_32 no copy is made, so r0 is
; presumably already the return register there (confirm in asm_inc.asm).
; INIT_X86_32_PIC / DEINIT_X86_32_PIC / pic() / LOAD_1_PARA come from
; asm_inc.asm, which is not visible here.
261;***********************************************************************
262WELS_EXTERN WelsCalculateSingleCtr4x4_sse2
263    %ifdef X86_32
264    push r3
265    %assign push_num 1
266    %else
267    %assign push_num 0
268    %endif
269    INIT_X86_32_PIC r4
270    LOAD_1_PARA
271    movdqa    xmm0, [r0]            ; coefficients 0..7
272    movdqa    xmm1, [r0+16]         ; coefficients 8..15
273
274    packsswb  xmm0, xmm1            ; saturate 16 words to 16 bytes; a nonzero word stays nonzero
275    ; NOTE(review): legacy register-map comment follows (r0 - eax, r1 - ebx, r2 - ecx, r3 - edx); the actual rN mapping is defined in asm_inc.asm and is not visible here -- confirm before relying on it
276    xor r3, r3
277    pxor      xmm3, xmm3
278    pcmpeqb   xmm0, xmm3            ; byte = 0xff where the coefficient is zero
279    pmovmskb  r3d,  xmm0            ; bit i set <=> coefficient i is zero
280
281    xor       r3,  0xffff           ; invert: bit i set <=> coefficient i is nonzero
282
283    xor       r0,  r0               ; r0 = score accumulator (the pointer is no longer needed)
284    mov       r2,  7                ; scan the low byte downwards from bit 7
285    mov       r1,  8                ; scan the high byte upwards from bit 8
286.loop_low8_find1:
287    bt        r3,  r2               ; CF = mask bit r2
288    jc        .loop_high8_find1     ; found the highest set bit of the low byte
289    dec       r2
290    jnz      .loop_low8_find1       ; bit 0 is never tested: falling through leaves r2 = 0
291.loop_high8_find1:
292    bt        r3, r1                ; CF = mask bit r1
293    jc        .find1end             ; found the lowest set bit of the high byte
294    inc       r1
295    cmp       r1,16
296    jb        .loop_high8_find1     ; falling through leaves r1 = 16
297.find1end:
298    sub       r1, r2
299    sub       r1, 1                 ; r1 = gap between the two positions, range 0..15
300    lea   r2,  [pic(i_ds_table)]
301    add       r0b,  [r2+r1]
302    mov       r1, r3
303    and       r3, 0xff              ; r3 = mask bits 0..7
304    shr       r1, 8
305    and       r1, 0xff              ; r1 = mask bits 8..15
306    lea   r2 , [pic(low_mask_table)]
307    add       r0b,  [r2 +r3]
308    lea   r2, [pic(high_mask_table)]
309    add       r0b,  [r2+r1]
310    DEINIT_X86_32_PIC
311    %ifdef X86_32
312    pop r3
313    %else
314    mov retrd, r0d                  ; upper bits of r0 were cleared by the xor above
315    %endif
316    ret
317
318
319;***********************************************************************
320; int32_t WelsGetNoneZeroCount_sse2(int16_t* level);
;
; Returns the number of nonzero values among the sixteen 16-bit coefficients
; at [r0]. A 16-bit "nonzero" mask is built with SSE2, split into two bytes,
; and each byte is converted to a bit count through nozero_count_table.
; The input is read with movdqa, so level must be 16-byte aligned.
; INIT_X86_32_PIC / DEINIT_X86_32_PIC / pic() / LOAD_1_PARA and the retrq
; alias come from asm_inc.asm, which is not visible here.
321;***********************************************************************
322WELS_EXTERN WelsGetNoneZeroCount_sse2
323    %assign push_num 0
324    INIT_X86_32_PIC r3
325    LOAD_1_PARA
326    movdqa    xmm0, [r0]            ; coefficients 0..7
327    movdqa    xmm1, [r0+16]         ; coefficients 8..15
328    pxor      xmm2, xmm2
329    pcmpeqw   xmm0, xmm2            ; word = 0xffff where the coefficient is zero
330    pcmpeqw   xmm1, xmm2
331    packsswb  xmm1, xmm0            ; 16 bytes of 0x00/0xff; coefficients 8..15 land in the low half, which does not matter for a count
332    xor r1, r1
333    pmovmskb  r1d,  xmm1            ; bit set <=> coefficient is zero
334    xor       r1d,  0xffff          ; invert: bit set <=> coefficient is nonzero
335    mov       r2,  r1
336    and       r1,  0xff             ; r1 = low byte of the mask
337    shr       r2,  8                ; r2 = high byte of the mask
338;   and       ecx,  0xff    ; not needed: bits 16 and above of the mask are already zero
339;   xor       retr,  retr
340    ;add       al,  [nozero_count_table+r2]
341    lea       r0 , [pic(nozero_count_table)]
342    movzx     r2, byte [r0+r2]      ; bit count of the high byte
343    movzx     r1,   byte [r0+r1]    ; bit count of the low byte
344    mov   retrq, r2
345    add   retrq, r1                 ; return value = sum of the two counts
346    ;add       al,  [nozero_count_table+r1]
347    DEINIT_X86_32_PIC
348    ret
349
350;***********************************************************************
351; int32_t WelsGetNoneZeroCount_sse42(int16_t* level);
;
; Returns the number of nonzero values among the sixteen 16-bit coefficients
; at [r0], using popcnt on a 16-bit "nonzero" mask instead of a lookup table.
; movdqa and the memory operand of packsswb both require level to be
; 16-byte aligned.
; LOAD_1_PARA and the retrd alias come from asm_inc.asm, which is not visible here.
352;***********************************************************************
353WELS_EXTERN WelsGetNoneZeroCount_sse42
354    %assign push_num 0
355    LOAD_1_PARA
356    movdqa          xmm0, [r0]          ; coefficients 0..7
357    packsswb        xmm0, [r0 + 16]     ; saturate all 16 words to bytes; a nonzero word stays nonzero
358    pxor            xmm1, xmm1
359    pcmpeqb         xmm0, xmm1          ; byte = 0xff where the coefficient is zero
360    pmovmskb        retrd, xmm0         ; bit i set <=> coefficient i is zero
361    xor             retrd, 0FFFFh       ; invert: bit i set <=> coefficient i is nonzero
362    popcnt          retrd, retrd        ; number of set bits = nonzero coefficient count
363    ret
364