;*!
;* \copy
;*     Copyright (c)  2009-2013, Cisco Systems
;*     All rights reserved.
;*
;*     Redistribution and use in source and binary forms, with or without
;*     modification, are permitted provided that the following conditions
;*     are met:
;*
;*        * Redistributions of source code must retain the above copyright
;*          notice, this list of conditions and the following disclaimer.
;*
;*        * Redistributions in binary form must reproduce the above copyright
;*          notice, this list of conditions and the following disclaimer in
;*          the documentation and/or other materials provided with the
;*          distribution.
;*
;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;*     POSSIBILITY OF SUCH DAMAGE.
;*
;*
;*  quant.asm
;*
;*  Abstract
;*      sse2 quantize inter-block
;*
;*  History
;*      7/6/2009 Created
;*
;*
;*************************************************************************/
42
43%include "asm_inc.asm"
44
45
46SECTION .text
47;************************************************
48;NEW_QUANT
49;************************************************
50
51%macro SSE2_Quant8  5
52    MOVDQ   %1, %5
53    pxor    %2, %2
54    pcmpgtw %2, %1
55    pxor    %1, %2
56    psubw   %1, %2
57    paddusw %1, %3
58    pmulhuw %1, %4
59    pxor    %1, %2
60    psubw   %1, %2
61    MOVDQ   %5, %1
62%endmacro
63
64%macro SSE2_QuantMax8  6
65    MOVDQ   %1, %5
66    pxor    %2, %2
67    pcmpgtw %2, %1
68    pxor    %1, %2
69    psubw   %1, %2
70    paddusw %1, %3
71    pmulhuw %1, %4
72    pmaxsw  %6, %1
73    pxor    %1, %2
74    psubw   %1, %2
75    MOVDQ   %5, %1
76%endmacro
77
78%define pDct                esp + 4
79%define ff                  esp + 8
80%define mf                  esp + 12
81%define max                 esp + 16
82;***********************************************************************
83;   void WelsQuant4x4_sse2(int16_t *pDct, int16_t* ff,  int16_t *mf);
84;***********************************************************************
85WELS_EXTERN WelsQuant4x4_sse2
86    %assign push_num 0
87    LOAD_3_PARA
88    movdqa  xmm2, [r1]
89    movdqa  xmm3, [r2]
90
91    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0]
92    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10]
93
94    ret
95
96;***********************************************************************
97;void WelsQuant4x4Dc_sse2(int16_t *pDct, const int16_t ff, int16_t mf);
98;***********************************************************************
99WELS_EXTERN WelsQuant4x4Dc_sse2
100    %assign push_num 0
101    LOAD_3_PARA
102    SIGN_EXTENSIONW r1, r1w
103    SIGN_EXTENSIONW r2, r2w
104    SSE2_Copy8Times xmm3, r2d
105
106    SSE2_Copy8Times xmm2, r1d
107
108    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0]
109    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10]
110
111    ret
112
113;***********************************************************************
114;   void WelsQuantFour4x4_sse2(int16_t *pDct, int16_t* ff,  int16_t *mf);
115;***********************************************************************
116WELS_EXTERN WelsQuantFour4x4_sse2
117    %assign push_num 0
118    LOAD_3_PARA
119    MOVDQ   xmm2, [r1]
120    MOVDQ   xmm3, [r2]
121
122    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0]
123    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10]
124    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x20]
125    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x30]
126    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x40]
127    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x50]
128    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x60]
129    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x70]
130
131    ret
132
133;***********************************************************************
134;   void WelsQuantFour4x4Max_sse2(int16_t *pDct, int32_t* f,  int16_t *mf, int16_t *max);
135;***********************************************************************
136WELS_EXTERN WelsQuantFour4x4Max_sse2
137    %assign push_num 0
138    LOAD_4_PARA
139    PUSH_XMM 8
140    MOVDQ   xmm2, [r1]
141    MOVDQ   xmm3, [r2]
142
143    pxor    xmm4, xmm4
144    pxor    xmm5, xmm5
145    pxor    xmm6, xmm6
146    pxor    xmm7, xmm7
147    SSE2_QuantMax8  xmm0, xmm1, xmm2, xmm3, [r0   ], xmm4
148    SSE2_QuantMax8  xmm0, xmm1, xmm2, xmm3, [r0 + 0x10], xmm4
149    SSE2_QuantMax8  xmm0, xmm1, xmm2, xmm3, [r0 + 0x20], xmm5
150    SSE2_QuantMax8  xmm0, xmm1, xmm2, xmm3, [r0 + 0x30], xmm5
151    SSE2_QuantMax8  xmm0, xmm1, xmm2, xmm3, [r0 + 0x40], xmm6
152    SSE2_QuantMax8  xmm0, xmm1, xmm2, xmm3, [r0 + 0x50], xmm6
153    SSE2_QuantMax8  xmm0, xmm1, xmm2, xmm3, [r0 + 0x60], xmm7
154    SSE2_QuantMax8  xmm0, xmm1, xmm2, xmm3, [r0 + 0x70], xmm7
155
156    SSE2_TransTwo4x4W xmm4, xmm5, xmm6, xmm7, xmm0
157    pmaxsw  xmm0,  xmm4
158    pmaxsw  xmm0,  xmm5
159    pmaxsw  xmm0,  xmm7
160    movdqa  xmm1,  xmm0
161    punpckhqdq  xmm0, xmm1
162    pmaxsw  xmm0, xmm1
163
164    movq    [r3], xmm0
165    POP_XMM
166    LOAD_4_PARA_POP
167    ret
168
169%macro MMX_Copy4Times 2
170    movd        %1, %2
171    punpcklwd   %1, %1
172    punpckldq   %1, %1
173%endmacro
174
175SECTION .text
176
177%macro MMX_Quant4  4
178    pxor    %2, %2
179    pcmpgtw %2, %1
180    pxor    %1, %2
181    psubw   %1, %2
182    paddusw %1, %3
183    pmulhuw %1, %4
184    pxor    %1, %2
185    psubw   %1, %2
186%endmacro
187
188;***********************************************************************
189;int32_t WelsHadamardQuant2x2_mmx(int16_t *rs, const int16_t ff, int16_t mf, int16_t * pDct, int16_t * block);
190;***********************************************************************
191WELS_EXTERN WelsHadamardQuant2x2_mmx
192    %assign push_num 0
193    LOAD_5_PARA
194    SIGN_EXTENSIONW r1, r1w
195    SIGN_EXTENSIONW r2, r2w
196    movd        mm0,            [r0]
197    movd        mm1,            [r0 + 0x20]
198    punpcklwd   mm0,            mm1
199    movd        mm3,            [r0 + 0x40]
200    movd        mm1,            [r0 + 0x60]
201    punpcklwd   mm3,            mm1
202
203    ;hdm_2x2,   mm0 = dct0 dct1, mm3 = dct2 dct3
204    movq        mm5,            mm3
205    paddw       mm3,            mm0
206    psubw       mm0,            mm5
207    punpcklwd   mm3,            mm0
208    movq        mm1,            mm3
209    psrlq       mm1,            32
210    movq        mm5,            mm1
211    paddw       mm1,            mm3
212    psubw       mm3,            mm5
213    punpcklwd   mm1,            mm3
214
215    ;quant_2x2_dc
216    MMX_Copy4Times  mm3,        r2d
217    MMX_Copy4Times  mm2,        r1d
218    MMX_Quant4      mm1,    mm0,    mm2,    mm3
219
220    ; store dct_2x2
221    movq        [r3],           mm1
222    movq        [r4],           mm1
223
224    ; pNonZeroCount of dct_2x2
225    pcmpeqb     mm2,            mm2     ; mm2 = FF
226    pxor        mm3,            mm3
227    packsswb    mm1,            mm3
228    pcmpeqb     mm1,            mm3     ; set FF if equal, 0 if not equal
229    psubsb      mm1,            mm2     ; set 0 if equal, 1 if not equal
230    psadbw      mm1,            mm3     ;
231    mov         r1w,                0
232    mov         [r0],           r1w
233    mov         [r0 + 0x20],    r1w
234    mov         [r0 + 0x40],    r1w
235    mov         [r0 + 0x60],    r1w
236
237
238    movd        retrd,      mm1
239
240    WELSEMMS
241    LOAD_5_PARA_POP
242    ret
243
244;***********************************************************************
245;int32_t WelsHadamardQuant2x2Skip_mmx(int16_t *pDct, int16_t ff,  int16_t mf);
246;***********************************************************************
247WELS_EXTERN WelsHadamardQuant2x2Skip_mmx
248    %assign push_num 0
249    LOAD_3_PARA
250    SIGN_EXTENSIONW r1, r1w
251    SIGN_EXTENSIONW r2, r2w
252    movd        mm0,            [r0]
253    movd        mm1,            [r0 + 0x20]
254    punpcklwd   mm0,            mm1
255    movd        mm3,            [r0 + 0x40]
256    movd        mm1,            [r0 + 0x60]
257    punpcklwd   mm3,            mm1
258
259    ;hdm_2x2,   mm0 = dct0 dct1, mm3 = dct2 dct3
260    movq        mm5,            mm3
261    paddw       mm3,            mm0
262    psubw       mm0,            mm5
263    punpcklwd   mm3,            mm0
264    movq        mm1,            mm3
265    psrlq       mm1,            32
266    movq        mm5,            mm1
267    paddw       mm1,            mm3
268    psubw       mm3,            mm5
269    punpcklwd   mm1,            mm3
270
271    ;quant_2x2_dc
272    MMX_Copy4Times  mm3,        r2d
273    MMX_Copy4Times  mm2,        r1d
274    MMX_Quant4      mm1,    mm0,    mm2,    mm3
275
276    ; pNonZeroCount of dct_2x2
277    pcmpeqb     mm2,            mm2     ; mm2 = FF
278    pxor        mm3,            mm3
279    packsswb    mm1,            mm3
280    pcmpeqb     mm1,            mm3     ; set FF if equal, 0 if not equal
281    psubsb      mm1,            mm2     ; set 0 if equal, 1 if not equal
282    psadbw      mm1,            mm3     ;
283    movd        retrd,          mm1
284
285    WELSEMMS
286    ret
287
288
289%macro SSE2_DeQuant8 3
290    MOVDQ  %2, %1
291    pmullw %2, %3
292    MOVDQ  %1, %2
293%endmacro
294
295
296;***********************************************************************
297; void WelsDequant4x4_sse2(int16_t *pDct, const uint16_t* mf);
298;***********************************************************************
299WELS_EXTERN WelsDequant4x4_sse2
300    %assign push_num 0
301    LOAD_2_PARA
302
303    movdqa  xmm1, [r1]
304    SSE2_DeQuant8 [r0   ],  xmm0, xmm1
305    SSE2_DeQuant8 [r0 + 0x10],  xmm0, xmm1
306
307    ret
308
309;***********************************************************************
310;void WelsDequantFour4x4_sse2(int16_t *pDct, const uint16_t* mf);
311;***********************************************************************
312
313WELS_EXTERN WelsDequantFour4x4_sse2
314    %assign push_num 0
315    LOAD_2_PARA
316
317    movdqa  xmm1, [r1]
318    SSE2_DeQuant8 [r0   ],  xmm0, xmm1
319    SSE2_DeQuant8 [r0+0x10  ],  xmm0, xmm1
320    SSE2_DeQuant8 [r0+0x20  ],  xmm0, xmm1
321    SSE2_DeQuant8 [r0+0x30  ],  xmm0, xmm1
322    SSE2_DeQuant8 [r0+0x40  ],  xmm0, xmm1
323    SSE2_DeQuant8 [r0+0x50  ],  xmm0, xmm1
324    SSE2_DeQuant8 [r0+0x60  ],  xmm0, xmm1
325    SSE2_DeQuant8 [r0+0x70  ],  xmm0, xmm1
326
327    ret
328
329;***********************************************************************
330;void WelsDequantIHadamard4x4_sse2(int16_t *rs, const uint16_t mf);
331;***********************************************************************
332WELS_EXTERN WelsDequantIHadamard4x4_sse2
333    %assign push_num 0
334    LOAD_2_PARA
335    %ifndef X86_32
336    movzx r1, r1w
337    %endif
338
339    ; WelsDequantLumaDc4x4
340    SSE2_Copy8Times xmm1,       r1d
341    ;psrlw      xmm1,       2       ; for the (>>2) in ihdm
342    MOVDQ       xmm0,       [r0]
343    MOVDQ       xmm2,       [r0+0x10]
344    pmullw      xmm0,       xmm1
345    pmullw      xmm2,       xmm1
346
347    ; ihdm_4x4
348    movdqa      xmm1,       xmm0
349    psrldq      xmm1,       8
350    movdqa      xmm3,       xmm2
351    psrldq      xmm3,       8
352
353    SSE2_SumSub     xmm0, xmm3, xmm5                    ; xmm0 = xmm0 - xmm3, xmm3 = xmm0 + xmm3
354    SSE2_SumSub     xmm1, xmm2, xmm5                    ; xmm1 = xmm1 - xmm2, xmm2 = xmm1 + xmm2
355    SSE2_SumSub     xmm3, xmm2, xmm5                    ; xmm3 = xmm3 - xmm2, xmm2 = xmm3 + xmm2
356    SSE2_SumSub     xmm0, xmm1, xmm5                    ; xmm0 = xmm0 - xmm1, xmm1 = xmm0 + xmm1
357
358    SSE2_TransTwo4x4W   xmm2, xmm1, xmm3, xmm0, xmm4
359    SSE2_SumSub     xmm2, xmm4, xmm5
360    SSE2_SumSub     xmm1, xmm0, xmm5
361    SSE2_SumSub     xmm4, xmm0, xmm5
362    SSE2_SumSub     xmm2, xmm1, xmm5
363    SSE2_TransTwo4x4W   xmm0, xmm1, xmm4, xmm2, xmm3
364
365    punpcklqdq  xmm0,       xmm1
366    MOVDQ       [r0],       xmm0
367
368    punpcklqdq  xmm2,       xmm3
369    MOVDQ       [r0+16],    xmm2
370    ret
371
372
373%ifdef HAVE_AVX2
374; data=%1 abs_out=%2 ff=%3 mf=%4 7FFFh=%5
375%macro AVX2_Quant 5
376    vpabsw          %2, %1
377    vpor            %1, %1, %5  ; ensure non-zero before vpsignw
378    vpaddusw        %2, %2, %3
379    vpmulhuw        %2, %2, %4
380    vpsignw         %1, %2, %1
381%endmacro
382
383
384;***********************************************************************
385;   void WelsQuant4x4_avx2(int16_t *pDct, int16_t* ff, int16_t *mf);
386;***********************************************************************
387
388WELS_EXTERN WelsQuant4x4_avx2
389    %assign push_num 0
390    LOAD_3_PARA
391    PUSH_XMM 5
392    vbroadcasti128  ymm0, [r1]
393    vbroadcasti128  ymm1, [r2]
394    WELS_DW32767_VEX ymm2
395    vmovdqu         ymm3, [r0]
396    AVX2_Quant      ymm3, ymm4, ymm0, ymm1, ymm2
397    vmovdqu         [r0], ymm3
398    vzeroupper
399    POP_XMM
400    ret
401
402
403;***********************************************************************
404;void WelsQuant4x4Dc_avx2(int16_t *pDct, int16_t ff, int16_t mf);
405;***********************************************************************
406
407WELS_EXTERN WelsQuant4x4Dc_avx2
408    %assign push_num 0
409    LOAD_1_PARA
410    PUSH_XMM 5
411%ifidni r1, arg2
412    vmovd           xmm0, arg2d
413    vpbroadcastw    ymm0, xmm0
414%else
415    vpbroadcastw    ymm0, arg2
416%endif
417%ifidni r2, arg3
418    vmovd           xmm1, arg3d
419    vpbroadcastw    ymm1, xmm1
420%else
421    vpbroadcastw    ymm1, arg3
422%endif
423    WELS_DW32767_VEX ymm2
424    vmovdqu         ymm3, [r0]
425    AVX2_Quant      ymm3, ymm4, ymm0, ymm1, ymm2
426    vmovdqu         [r0], ymm3
427    vzeroupper
428    POP_XMM
429    ret
430
431
432;***********************************************************************
433;   void WelsQuantFour4x4_avx2(int16_t *pDct, int16_t* ff, int16_t *mf);
434;***********************************************************************
435
436WELS_EXTERN WelsQuantFour4x4_avx2
437    %assign push_num 0
438    LOAD_3_PARA
439    PUSH_XMM 6
440    vbroadcasti128  ymm0, [r1]
441    vbroadcasti128  ymm1, [r2]
442    WELS_DW32767_VEX ymm4
443    vmovdqu         ymm3, [r0 + 0x00]
444    vmovdqu         ymm5, [r0 + 0x20]
445    AVX2_Quant      ymm3, ymm2, ymm0, ymm1, ymm4
446    vmovdqu         [r0 + 0x00], ymm3
447    AVX2_Quant      ymm5, ymm2, ymm0, ymm1, ymm4
448    vmovdqu         [r0 + 0x20], ymm5
449    vmovdqu         ymm3, [r0 + 0x40]
450    vmovdqu         ymm5, [r0 + 0x60]
451    AVX2_Quant      ymm3, ymm2, ymm0, ymm1, ymm4
452    vmovdqu         [r0 + 0x40], ymm3
453    AVX2_Quant      ymm5, ymm2, ymm0, ymm1, ymm4
454    vmovdqu         [r0 + 0x60], ymm5
455    vzeroupper
456    POP_XMM
457    ret
458
459
460;***********************************************************************
461;   void WelsQuantFour4x4Max_avx2(int16_t *pDct, int32_t* ff, int16_t *mf, int16_t *max);
462;***********************************************************************
463
464WELS_EXTERN WelsQuantFour4x4Max_avx2
465    %assign push_num 0
466    LOAD_4_PARA
467    PUSH_XMM 7
468    vbroadcasti128  ymm0, [r1]
469    vbroadcasti128  ymm1, [r2]
470    WELS_DW32767_VEX ymm6
471    vmovdqu         ymm4, [r0 + 0x00]
472    vmovdqu         ymm5, [r0 + 0x20]
473    AVX2_Quant      ymm4, ymm2, ymm0, ymm1, ymm6
474    vmovdqu         [r0 + 0x00], ymm4
475    AVX2_Quant      ymm5, ymm3, ymm0, ymm1, ymm6
476    vmovdqu         [r0 + 0x20], ymm5
477    vperm2i128      ymm4, ymm2, ymm3, 00100000b
478    vperm2i128      ymm3, ymm2, ymm3, 00110001b
479    vpmaxsw         ymm2, ymm4, ymm3
480    vmovdqu         ymm4, [r0 + 0x40]
481    vmovdqu         ymm5, [r0 + 0x60]
482    AVX2_Quant      ymm4, ymm3, ymm0, ymm1, ymm6
483    vmovdqu         [r0 + 0x40], ymm4
484    AVX2_Quant      ymm5, ymm4, ymm0, ymm1, ymm6
485    vmovdqu         [r0 + 0x60], ymm5
486    vperm2i128      ymm5, ymm3, ymm4, 00100000b
487    vperm2i128      ymm4, ymm3, ymm4, 00110001b
488    vpmaxsw         ymm3, ymm5, ymm4
489    vpxor           ymm2, ymm2, ymm6  ; flip bits so as to enable use of vphminposuw to find max value.
490    vpxor           ymm3, ymm3, ymm6  ; flip bits so as to enable use of vphminposuw to find max value.
491    vextracti128    xmm4, ymm2, 1
492    vextracti128    xmm5, ymm3, 1
493    vphminposuw     xmm2, xmm2
494    vphminposuw     xmm3, xmm3
495    vphminposuw     xmm4, xmm4
496    vphminposuw     xmm5, xmm5
497    vpunpcklwd      xmm2, xmm2, xmm4
498    vpunpcklwd      xmm3, xmm3, xmm5
499    vpunpckldq      xmm2, xmm2, xmm3
500    vpxor           xmm2, xmm2, xmm6  ; restore non-flipped values.
501    vmovq           [r3], xmm2        ; store max values.
502    vzeroupper
503    POP_XMM
504    LOAD_4_PARA_POP
505    ret
506%endif