; NOTE(review): removed code-viewer navigation chrome ("Home / Line# /
; Scopes / Navigate / Raw / Download") that leaked into this source file
; during extraction.
;*!
;* \copy
;*     Copyright (c)  2010-2013, Cisco Systems
;*     All rights reserved.
;*
;*     Redistribution and use in source and binary forms, with or without
;*     modification, are permitted provided that the following conditions
;*     are met:
;*
;*        * Redistributions of source code must retain the above copyright
;*          notice, this list of conditions and the following disclaimer.
;*
;*        * Redistributions in binary form must reproduce the above copyright
;*          notice, this list of conditions and the following disclaimer in
;*          the documentation and/or other materials provided with the
;*          distribution.
;*
;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;*     POSSIBILITY OF SUCH DAMAGE.
;*
;*
;*  vaa.asm
;*
;*  Abstract
;*      sse2 for pVaa routines
;*
;*  History
;*      04/14/2010  Created
;*      06/07/2010  Added AnalysisVaaInfoIntra_sse2(ssse3)
;*      06/10/2010  Tune rc_sad_frame_sse2 and got about 40% improvement
;*      08/11/2010  Added abs_difference_mbrow_sse2 & sum_sqrsum_mbrow_sse2
;*
;*************************************************************************/
%include "asm_inc.asm"


;***********************************************************************
; Macros and other preprocessor constants
;***********************************************************************

51; by comparing it outperforms than phaddw(SSSE3) sets
52%macro SUM_WORD_8x2_SSE2    2   ; dst(pSrc), tmp
53    ; @sum_8x2 begin
54    pshufd %2, %1, 04Eh ; 01001110 B
55    paddw %1, %2
56    pshuflw %2, %1, 04Eh    ; 01001110 B
57    paddw %1, %2
58    pshuflw %2, %1, 0B1h    ; 10110001 B
59    paddw %1, %2
60    ; end of @sum_8x2
61%endmacro   ; END of SUM_WORD_8x2_SSE2
62
63
64%macro VAA_AVG_BLOCK_SSE2 6 ; dst, t0, t1, t2, t3, t4
65    movdqa %1, [r0    ] ; line 0
66    movdqa %2, [r0+r1]  ; line 1
67    movdqa %3, %1
68    punpcklbw %1, xmm7
69    punpckhbw %3, xmm7
70    movdqa %4, %2
71    punpcklbw %4, xmm7
72    punpckhbw %2, xmm7
73    paddw %1, %4
74    paddw %2, %3
75    movdqa %3, [r0+r2]  ; line 2
76    movdqa %4, [r0+r3]  ; line 3
77    movdqa %5, %3
78    punpcklbw %3, xmm7
79    punpckhbw %5, xmm7
80    movdqa %6, %4
81    punpcklbw %6, xmm7
82    punpckhbw %4, xmm7
83    paddw %3, %6
84    paddw %4, %5
85    paddw %1, %3    ; block 0, 1
86    paddw %2, %4    ; block 2, 3
87    pshufd %3, %1, 0B1h
88    pshufd %4, %2, 0B1h
89    paddw %1, %3
90    paddw %2, %4
91    movdqa %3, %1
92    movdqa %4, %2
93    pshuflw %5, %1, 0B1h
94    pshufhw %6, %3, 0B1h
95    paddw %1, %5
96    paddw %3, %6
97    pshuflw %5, %2, 0B1h
98    pshufhw %6, %4, 0B1h
99    paddw %2, %5
100    paddw %4, %6
101    punpcklwd %1, %2
102    punpckhwd %3, %4
103    punpcklwd %1, %3
104    psraw %1, $04
105%endmacro
106
107%macro VAA_AVG_BLOCK_SSSE3 6 ; dst, t0, t1, t2, t3, t4
108    movdqa %1, [r0    ] ; line 0
109    movdqa %2, [r0+r1]  ; line 1
110    movdqa %3, %1
111    punpcklbw %1, xmm7
112    punpckhbw %3, xmm7
113    movdqa %4, %2
114    punpcklbw %4, xmm7
115    punpckhbw %2, xmm7
116    paddw %1, %4
117    paddw %2, %3
118    movdqa %3, [r0+r2]  ; line 2
119    movdqa %4, [r0+r3]  ; line 3
120    movdqa %5, %3
121    punpcklbw %3, xmm7
122    punpckhbw %5, xmm7
123    movdqa %6, %4
124    punpcklbw %6, xmm7
125    punpckhbw %4, xmm7
126    paddw %3, %6
127    paddw %4, %5
128    paddw %1, %3    ; block 0, 1
129    paddw %2, %4    ; block 2, 3
130    phaddw %1, %2   ; block[0]: 0-15, 16-31; block[1]: 32-47, 48-63; ..
131    phaddw %1, xmm7 ; block[0]: 0-15; block[1]: 16-31; block[2]: 32-47; block[3]: 48-63; ....
132    psraw %1, $04
133%endmacro
134
135
136
;***********************************************************************
; Code
;***********************************************************************

SECTION .text

; , 6/7/2010

145;***********************************************************************
146;   int32_t AnalysisVaaInfoIntra_sse2(  uint8_t *pDataY, const int32_t iLineSize );
147;***********************************************************************
148WELS_EXTERN AnalysisVaaInfoIntra_sse2
149
150    %assign push_num 0
151    LOAD_2_PARA
152    PUSH_XMM 8
153    SIGN_EXTENSION r1,r1d
154
155%ifdef X86_32
156    push r3
157    push r4
158    push r5
159    push r6
160    %assign push_num push_num+4
161%endif
162
163    mov  r5,r7
164    and  r5,0fh
165    sub  r7,r5
166    sub  r7,32
167
168
169    mov r2,r1
170    sal r2,$01   ;r2 = 2*iLineSize
171    mov r3,r2
172    add r3,r1   ;r3 = 3*iLineSize
173
174    mov r4,r2
175    sal r4,$01   ;r4 = 4*iLineSize
176
177    pxor xmm7, xmm7
178
179    ; loops
180    VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
181    movq [r7], xmm0
182
183    lea r0, [r0+r4]
184    VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
185    movq [r7+8], xmm0
186
187    lea r0, [r0+r4]
188    VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
189    movq [r7+16], xmm0
190
191    lea r0, [r0+r4]
192    VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
193    movq [r7+24], xmm0
194
195    movdqa xmm0, [r7]       ; block 0~7
196    movdqa xmm1, [r7+16]    ; block 8~15
197    movdqa xmm2, xmm0
198    paddw xmm0, xmm1
199    SUM_WORD_8x2_SSE2 xmm0, xmm3
200
201    pmullw xmm1, xmm1
202    pmullw xmm2, xmm2
203    movdqa xmm3, xmm1
204    movdqa xmm4, xmm2
205    punpcklwd xmm1, xmm7
206    punpckhwd xmm3, xmm7
207    punpcklwd xmm2, xmm7
208    punpckhwd xmm4, xmm7
209    paddd xmm1, xmm2
210    paddd xmm3, xmm4
211    paddd xmm1, xmm3
212    pshufd xmm2, xmm1, 01Bh
213    paddd xmm1, xmm2
214    pshufd xmm2, xmm1, 0B1h
215    paddd xmm1, xmm2
216
217
218
219    movd r2d, xmm0
220    and r2, 0ffffh      ; effective low work truncated
221    mov r3, r2
222    imul r2, r3
223    sar r2, $04
224    movd retrd, xmm1
225    sub retrd, r2d
226
227    add r7,32
228    add r7,r5
229
230%ifdef X86_32
231    pop r6
232    pop r5
233    pop r4
234    pop r3
235%endif
236    POP_XMM
237
238    ret
239
240;***********************************************************************
241;   int32_t AnalysisVaaInfoIntra_ssse3( uint8_t *pDataY, const int32_t iLineSize );
242;***********************************************************************
243WELS_EXTERN AnalysisVaaInfoIntra_ssse3
244
245    %assign push_num 0
246    LOAD_2_PARA
247    PUSH_XMM 8
248    SIGN_EXTENSION r1,r1d
249
250%ifdef X86_32
251    push r3
252    push r4
253    push r5
254    push r6
255    %assign push_num push_num+4
256%endif
257
258    mov  r5,r7
259    and  r5,0fh
260    sub  r7,r5
261    sub  r7,32
262
263
264    mov r2,r1
265    sal r2,$01   ;r2 = 2*iLineSize
266    mov r3,r2
267    add r3,r1   ;r3 = 3*iLineSize
268
269    mov r4,r2
270    sal r4,$01   ;r4 = 4*iLineSize
271
272    pxor xmm7, xmm7
273
274    ; loops
275    VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
276    movq [r7],xmm0
277
278    lea r0,[r0+r4]
279    VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
280    movq [r7+8],xmm1
281
282
283    lea r0,[r0+r4]
284    VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
285    movq [r7+16],xmm0
286
287    lea r0,[r0+r4]
288    VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
289    movq [r7+24],xmm1
290
291
292    movdqa xmm0,[r7]
293    movdqa xmm1,[r7+16]
294    movdqa xmm2, xmm0
295    paddw xmm0, xmm1
296    SUM_WORD_8x2_SSE2 xmm0, xmm3    ; better performance than that of phaddw sets
297
298    pmullw xmm1, xmm1
299    pmullw xmm2, xmm2
300    movdqa xmm3, xmm1
301    movdqa xmm4, xmm2
302    punpcklwd xmm1, xmm7
303    punpckhwd xmm3, xmm7
304    punpcklwd xmm2, xmm7
305    punpckhwd xmm4, xmm7
306    paddd xmm1, xmm2
307    paddd xmm3, xmm4
308    paddd xmm1, xmm3
309    pshufd xmm2, xmm1, 01Bh
310    paddd xmm1, xmm2
311    pshufd xmm2, xmm1, 0B1h
312    paddd xmm1, xmm2
313
314
315    movd r2d, xmm0
316    and r2, 0ffffh          ; effective low work truncated
317    mov r3, r2
318    imul r2, r3
319    sar r2, $04
320    movd retrd, xmm1
321    sub retrd, r2d
322
323    add r7,32
324    add r7,r5
325%ifdef X86_32
326    pop r6
327    pop r5
328    pop r4
329    pop r3
330%endif
331    POP_XMM
332
333    ret
334
335;***********************************************************************
336;   uint8_t MdInterAnalysisVaaInfo_sse41( int32_t *pSad8x8 )
337;***********************************************************************
338WELS_EXTERN MdInterAnalysisVaaInfo_sse41
339    %assign push_num 0
340    LOAD_1_PARA
341    movdqa xmm0,[r0]
342    pshufd xmm1, xmm0, 01Bh
343    paddd xmm1, xmm0
344    pshufd xmm2, xmm1, 0B1h
345    paddd xmm1, xmm2
346    psrad xmm1, 02h     ; iAverageSad
347    movdqa xmm2, xmm1
348    psrad xmm2, 06h
349    movdqa xmm3, xmm0   ; iSadBlock
350    psrad xmm3, 06h
351    psubd xmm3, xmm2
352    pmulld xmm3, xmm3   ; [comment]: pmulld from SSE4.1 instruction sets
353    pshufd xmm4, xmm3, 01Bh
354    paddd xmm4, xmm3
355    pshufd xmm3, xmm4, 0B1h
356    paddd xmm3, xmm4
357    movd r0d, xmm3
358    cmp r0d, 20 ; INTER_VARIANCE_SAD_THRESHOLD
359
360    jb near .threshold_exit
361    pshufd xmm0, xmm0, 01Bh
362    pcmpgtd xmm0, xmm1  ; iSadBlock > iAverageSad
363    movmskps retrd, xmm0
364    ret
365.threshold_exit:
366    mov retrd, 15
367    ret
368
369;***********************************************************************
370;   uint8_t MdInterAnalysisVaaInfo_sse2( int32_t *pSad8x8 )
371;***********************************************************************
372WELS_EXTERN MdInterAnalysisVaaInfo_sse2
373    %assign push_num 0
374    LOAD_1_PARA
375    movdqa xmm0, [r0]
376    pshufd xmm1, xmm0, 01Bh
377    paddd xmm1, xmm0
378    pshufd xmm2, xmm1, 0B1h
379    paddd xmm1, xmm2
380    psrad xmm1, 02h     ; iAverageSad
381    movdqa xmm2, xmm1
382    psrad xmm2, 06h
383    movdqa xmm3, xmm0   ; iSadBlock
384    psrad xmm3, 06h
385    psubd xmm3, xmm2
386
387    ; to replace pmulld functionality as below
388    movdqa xmm2, xmm3
389    pmuludq xmm2, xmm3
390    pshufd xmm4, xmm3, 0B1h
391    pmuludq xmm4, xmm4
392    movdqa xmm5, xmm2
393    punpckldq xmm5, xmm4
394    punpckhdq xmm2, xmm4
395    punpcklqdq xmm5, xmm2
396
397    pshufd xmm4, xmm5, 01Bh
398    paddd xmm4, xmm5
399    pshufd xmm5, xmm4, 0B1h
400    paddd xmm5, xmm4
401
402    movd r0d, xmm5
403    cmp r0d, 20 ; INTER_VARIANCE_SAD_THRESHOLD
404    jb near .threshold_exit
405    pshufd xmm0, xmm0, 01Bh
406    pcmpgtd xmm0, xmm1  ; iSadBlock > iAverageSad
407    movmskps retrd, xmm0
408    ret
409.threshold_exit:
410    mov retrd, 15
411    ret
412