• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1;*!
2;* \copy
3;*     Copyright (c)  2009-2013, Cisco Systems
4;*     All rights reserved.
5;*
6;*     Redistribution and use in source and binary forms, with or without
7;*     modification, are permitted provided that the following conditions
8;*     are met:
9;*
10;*        * Redistributions of source code must retain the above copyright
11;*          notice, this list of conditions and the following disclaimer.
12;*
13;*        * Redistributions in binary form must reproduce the above copyright
14;*          notice, this list of conditions and the following disclaimer in
15;*          the documentation and/or other materials provided with the
16;*          distribution.
17;*
18;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29;*     POSSIBILITY OF SUCH DAMAGE.
30;*
31;*
32;*  deblock.asm
33;*
34;*  Abstract
35;*      edge loop
36;*
37;*  History
38;*      08/07/2009 Created
39;*
40;*
41;*************************************************************************/
42%include "asm_inc.asm"
43
44;*******************************************************************************
45; Macros and other preprocessor constants
46;*******************************************************************************
47
; Constant tables shared by the deblocking kernels in this file.
; With X86_32_PICASM defined they are emitted into .text rather than .rodata
; (presumably so that pic() can reach them relative to the code address --
; confirm against asm_inc.asm).
48%ifdef X86_32_PICASM
49SECTION .text align=16
50%else
51SECTION .rodata align=16
52%endif
53
54ALIGN   16
; Eight words of value 4.
; NOTE(review): no reference to this label is visible in this file.
55FOUR_16B_SSE2:   dw   4, 4, 4, 4, 4, 4, 4, 4
56
57ALIGN   16
; 16 bytes of 1; passed as the db1 operand of SSE2_DeblockLumaEq4_3x16P.
58WELS_DB1_16:
59    times 16 db 1
; 16 bytes of 0x7f; the XOR mask that SSE2_CmpltUB uses for unsigned compares.
60WELS_DB127_16:
61    times 16 db 127
; 16 bytes of 96; the bias that SSE2_DeblockP0Q0_Lt4 removes from its delta.
62WELS_DB96_16:
63    times 16 db 96
; pshufb control replicating each of source bytes 0..3 into four adjacent
; lanes; DeblockLumaLt4V_ssse3 uses it to expand the four pTC bytes.
64WELS_SHUFB0000111122223333:
65    times 4 db 0
66    times 4 db 1
67    times 4 db 2
68    times 4 db 3
69
70
71SECTION .text
72
73; Unsigned byte absolute difference.
74; a=%1 b=%2 clobber=%3
75; Subtract once in each direction with saturation and return the maximum.
; Out: %1 = |a - b| per byte. %2 is only read; %3 is overwritten.
; Per byte at most one of the two saturated differences is non-zero, so OR-ing
; them is the same as taking their maximum.
76%macro SSE2_AbsDiffUB 3
77    movdqa   %3, %2
78    psubusb  %3, %1  ; clip(b - a, 0, 255)
79    psubusb  %1, %2  ; clip(a - b, 0, 255)
80    por      %1, %3  ; |a - b|
81%endmacro
82
83; Unsigned byte compare less than.
84; lhs=%1 rhs^0x7f=%2 0x7f=%3
85; No unsigned byte lt/gt compare instruction available; xor by 0x7f and use a
86; signed compare. Some other options do exist. This one allows modifying the lhs
87; without mov and uses a bitwise op which can be executed on most ports on
88; common architectures.
; Read as a signed byte, x ^ 0x7f equals 127 - x, so the unsigned order is
; reversed: (lhs ^ 0x7f) > (rhs ^ 0x7f) signed  <=>  lhs < rhs unsigned.
; Out: %1 = 0xff in every byte where lhs < rhs, 0x00 elsewhere.
; The caller must supply rhs already XORed with 0x7f in %2; %2 and %3 are only
; read.
89%macro SSE2_CmpltUB 3
90    pxor     %1, %3  ; lhs ^ 0x7f
91    pcmpgtb  %1, %2  ; signed (lhs ^ 0x7f) > (rhs ^ 0x7f)
92%endmacro
93
94; Unsigned byte compare greater than or equal.
; lhs=%1 rhs=%2
; Out: %1 = 0xff in every byte where lhs >= rhs (unsigned), 0x00 elsewhere.
; %2 is only read. Relies on min(lhs, rhs) == rhs  <=>  lhs >= rhs.
95%macro SSE2_CmpgeUB 2
96    pminub   %1, %2  ; min(lhs, rhs)
97    pcmpeqb  %1, %2  ; == rhs ?
98%endmacro
99
100; Clip unsigned bytes to ref +/- diff.
101; data=%1 ref=%2 maxdiff_from_ref=%3 clobber=%4
; Out: %1 = data limited to [ref - maxdiff, ref + maxdiff], where both bounds
; are computed with unsigned saturation (so they stay inside [0, 255]).
; %2 is only read. %3 is overwritten with the upper bound and %4 with the lower
; bound, so the caller loses maxdiff.
102%macro SSE2_ClipUB 4
103    movdqa   %4, %2
104    psubusb  %4, %3  ; lower bound = clip(ref - maxdiff, 0, 255)
105    paddusb  %3, %2  ; upper bound = clip(ref + maxdiff, 0, 255)
106    pmaxub   %1, %4
107    pminub   %1, %3
108%endmacro
109
110; (a + b + 1 - c) >> 1
111; a=%1 b=%2 c=%3 [out:a^b&c]=%4
; Computes pavgb(a, b) - ((a ^ b) & c) per byte. With c holding 0 or 1 in each
; byte (the way every caller in this file passes it) this equals the formula
; above: c = 0 gives the rounded-up average, c = 1 the rounded-down average.
; Out: %1 = result, %4 = (a ^ b) & c. %2 and %3 are only read.
112%macro SSE2_AvgbFloor1 4
113    movdqa   %4, %1
114    pxor     %4, %2  ; a ^ b; bit 0 is set when a + b is odd
115    pavgb    %1, %2  ; (a + b + 1) >> 1
116    pand     %4, %3  ; rounding correction, applied only where c selects it
117    psubb    %1, %4
118%endmacro
119
120; (a + b + carry) >> 1
121; a=%1 b=%2 carry-1=%3
; %3 holds carry - 1 per byte, i.e. 0x00 for carry = 1 and 0xff for carry = 0.
; Where %3 is 0xff both inputs are complemented before pavgb and the result is
; complemented back, which turns the rounded-up average into a rounded-down one.
; Out: %1 = result. %2 is left XORed with %3 (so it is modified wherever
; carry = 0); %3 is only read.
122%macro SSE2_AvgbFloor2 3
123    pxor     %1, %3
124    pxor     %2, %3
125    pavgb    %1, %2
126    pxor     %1, %3
127%endmacro
128
129; a = (a & m) | (b & ~m)
130; a=%1 b=%2 m=%3
; Bitwise select: bits of a are kept where m is set, bits of b are taken
; elsewhere. Out: %1 = blended value. %2 is only read. The mask register %3 is
; destroyed (it ends up holding b & ~m), so callers that need the mask again
; must pass a copy.
131%macro SSE2_Blend 3
132    pand     %1, %3  ; a & m
133    pandn    %3, %2  ; b & ~m
134    por      %1, %3
135%endmacro
136
137; Compute
138; p0 = clip(p0 + clip((q0 - p0 + ((p1 - q1) >> 2) + 1) >> 1, -iTc, iTc), 0, 255)
139; q0 = clip(q0 - clip((q0 - p0 + ((p1 - q1) >> 2) + 1) >> 1, -iTc, iTc), 0, 255)
140; 16-wide parallel in packed byte representation in xmm registers.
141;
142; p1=%1 p0=%2 q0=%3 q1=%4 iTc=%5 FFh=%6 xmmclobber=%7,%8
;
; In/out: %2 (p0) and %3 (q0) are replaced by their filtered values.
; Destroyed: %1 (p1), %4 (q1), %6 (reloaded with WELS_DB96_16), %7 and %8.
; Only read: %5 (iTc). Bytes where iTc is 0 leave p0 and q0 unchanged.
; %6 must hold 0xff in every byte on entry.
; Reads the WELS_DB96_16 table through pic(), so on builds where pic() needs a
; base register the caller must have it set up (see INIT_X86_32_PIC uses below).
143%macro SSE2_DeblockP0Q0_Lt4 8
144    ; (q0 - p0 + ((p1 - q1) >> 2) + 1) >> 1 clipped to [-96, 159] and biased to [0, 255].
145    ; A limited range is sufficient because the value is clipped to [-iTc, iTc] later.
146    ; Bias so that unsigned saturation can be used.
147    ; Get ((p1 - q1) >> 2) + 192 via a pxor and two pavgbs.
148    ; q0 - p0 is split into a non-negative and non-positive part. The latter is
149    ; subtracted from the biased value.
150    movdqa     %7, %2
151    psubusb    %7, %3  ; clip(p0 - q0, 0, 255)
152    ; ((p1 - q1) >> 2) + 0xc0
153    pxor       %4, %6  ; q1 ^ 0xff aka -q1 - 1 & 0xff
154    pavgb      %1, %4  ; (((p1 - q1 + 0x100) >> 1)
155    pavgb      %1, %6  ;  + 0x100) >> 1
156    psubusb    %1, %7  ; -= clip(p0 - q0, 0, 255) saturate.
157    movdqa     %8, %3
158    psubusb    %8, %2  ; (clip(q0 - p0, 0, 255)
159    pavgb      %8, %1  ;  + clip(((p1 - q1 + 0x300) >> 2) - clip(p0 - q0, 0, 255), 0, 255) + 1) >> 1
160
161    ; Unbias and split into a non-negative and a non-positive part.
162    ; Clip each part to iTc via minub.
163    ; Add/subtract each part to/from p0/q0 and clip.
    ; After the final pavgb the bias of 192 has been halved to 96, hence the
    ; WELS_DB96_16 constant: %6 becomes the magnitude of the negative part and
    ; %8 the magnitude of the positive part of the delta.
164    movdqa     %6, [pic(WELS_DB96_16)]
165    psubusb    %6, %8
166    psubusb    %8, [pic(WELS_DB96_16)]
167    pminub     %6, %5
168    pminub     %8, %5
169    psubusb    %2, %6
170    paddusb    %2, %8  ; p0
171    paddusb    %3, %6
172    psubusb    %3, %8  ; q0
173%endmacro
174
175
176;*******************************************************************************
177;    void DeblockLumaLt4V_ssse3(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
178;                                 int32_t iBeta, int8_t * pTC)
;
;    Applies the "Lt4" luma filter to 16 adjacent byte columns in place.
;    Reads six rows of 16 bytes: p2, p1, p0 at pPix - 3/2/1 * iStride and
;    q0, q1, q2 at pPix + 0/1/2 * iStride. Writes back p1, p0, q0 and q1.
;    Four bytes are read from pTC; each one is replicated over four adjacent
;    columns. Columns whose tc byte is negative, or that fail the iAlpha/iBeta
;    tests, are written back unchanged.
;    Register use after LOAD_5_PARA, as the code below relies on it:
;    r0 = pPix, r1 = iStride (negated below), r4 = pTC; r2 and r3 are reused as
;    +iStride and pPix - iStride. r5 is handed to INIT_X86_32_PIC.
179;*******************************************************************************
180
181WELS_EXTERN DeblockLumaLt4V_ssse3
182    %assign push_num 0
183    INIT_X86_32_PIC r5
184    LOAD_5_PARA
185    PUSH_XMM 8
186    SIGN_EXTENSION r1, r1d
187    movd     xmm1, arg3d
188    movd     xmm2, arg4d
189    pxor     xmm3, xmm3
    ; The thresholds are pre-XORed with 0x7f because SSE2_CmpltUB expects its
    ; right-hand side in that form; pshufb with an all-zero control then
    ; broadcasts byte 0 to all 16 lanes.
190    pxor     xmm1, [pic(WELS_DB127_16)]
191    pxor     xmm2, [pic(WELS_DB127_16)]
192    pshufb   xmm1, xmm3                       ; iAlpha ^ 0x7f
193    pshufb   xmm2, xmm3                       ; iBeta  ^ 0x7f
194    mov      r2, r1                           ; iStride
195    neg      r1                               ; -iStride
196    lea      r3, [r0 + r1]                    ; pPix - iStride
197
198    ; Compute masks to enable/disable deblocking.
199    MOVDQ    xmm6, [r3 + 0 * r1]              ; p0
200    MOVDQ    xmm7, [r3 + 1 * r1]              ; p1
201    MOVDQ    xmm0, [r0 + 0 * r2]              ; q0
202    movdqa   xmm4, xmm6
203    SSE2_AbsDiffUB xmm6, xmm0, xmm3           ; |p0 - q0|
204    SSE2_CmpltUB xmm6, xmm1, [pic(WELS_DB127_16)]  ; bDeltaP0Q0 = |p0 - q0| < iAlpha
205    MOVDQ    xmm1, [r0 + 1 * r2]              ; q1
206    SSE2_AbsDiffUB xmm7, xmm4, xmm3           ; |p1 - p0|
207    SSE2_AbsDiffUB xmm0, xmm1, xmm3           ; |q1 - q0|
208    pmaxub   xmm7, xmm0                       ; max(|p1 - p0|, |q1 - q0|)
209    SSE2_CmpltUB xmm7, xmm2, [pic(WELS_DB127_16)]  ; bDeltaP1P0 & bDeltaQ1Q0 = max(|p1 - p0|, |q1 - q0|) < iBeta
210    pand     xmm6, xmm7                       ; bDeltaP0Q0P1P0Q1Q0 = bDeltaP0Q0 & bDeltaP1P0 & bDeltaQ1Q0
211    MOVDQ    xmm7, [r3 + 2 * r1]              ; p2
212    movdqa   xmm0, xmm7
213    SSE2_AbsDiffUB xmm7, xmm4, xmm3           ; |p2 - p0|
214    SSE2_CmpltUB xmm7, xmm2, [pic(WELS_DB127_16)]  ; bDeltaP2P0 = |p2 - p0| < iBeta
215    MOVDQ    xmm5, [r0 + 2 * r2]              ; q2
216    MOVDQ    xmm3, [r0 + 0 * r2]              ; q0
217    movdqa   xmm1, xmm5
218    SSE2_AbsDiffUB xmm5, xmm3, xmm4           ; |q2 - q0|
219    SSE2_CmpltUB xmm5, xmm2, [pic(WELS_DB127_16)]  ; bDeltaQ2Q0 = |q2 - q0| < iBeta
220
    ; Candidate p1/q1 values. The rounded-up average of p0 and q0 is complemented
    ; once; averaging it with the complemented p2 (or q2) and complementing the
    ; result yields the rounded-down outer average without widening to words.
221    pavgb    xmm3, [r3 + 0 * r1]              ; (p0 + q0 + 1) >> 1
222    pcmpeqw  xmm2, xmm2  ; FFh
223    pxor     xmm3, xmm2
224    ; (p2 + ((p0 + q0 + 1) >> 1)) >> 1
225    pxor     xmm0, xmm2
226    pavgb    xmm0, xmm3
227    pxor     xmm0, xmm2
228    ; (q2 + ((p0 + q0 + 1) >> 1)) >> 1
229    pxor     xmm1, xmm2
230    pavgb    xmm1, xmm3
231    pxor     xmm1, xmm2
232
    ; Expand the four tc bytes to 16 lanes and derive the clipping limits.
    ; The comparison masks are 0xff (-1) where true, so subtracting them adds 1.
233    movd     xmm3, [r4]
234    pshufb   xmm3, [pic(WELS_SHUFB0000111122223333)] ; iTc
235    movdqa   xmm4, xmm3  ; iTc0 = iTc
236    pcmpgtb  xmm3, xmm2  ; iTc > -1 ? 0xff : 0x00
237    pand     xmm6, xmm3  ; bDeltaP0Q0P1P0Q1Q0 &= iTc > -1
238    movdqa   xmm3, xmm4
239    psubb    xmm3, xmm7  ; iTc -= bDeltaP2P0 ? -1 : 0
240    psubb    xmm3, xmm5  ; iTc -= bDeltaQ2Q0 ? -1 : 0
241    pand     xmm3, xmm6  ; iTc &= bDeltaP0Q0P1P0Q1Q0 ? 0xff : 0
242    pand     xmm7, xmm6  ; bDeltaP2P0 &= bDeltaP0Q0P1P0Q1Q0
243    pand     xmm5, xmm6  ; bDeltaQ2Q0 &= bDeltaP0Q0P1P0Q1Q0
244    pand     xmm7, xmm4  ; iTc0 & (bDeltaP2P0 ? 0xff : 0)
245    pand     xmm5, xmm4  ; iTc0 & (bDeltaQ2Q0 ? 0xff : 0)
246
    ; A clip range of zero leaves p1/q1 at their original value, which is how
    ; disabled columns stay untouched.
247    MOVDQ    xmm4, [r3 + 1 * r1]              ; p1 (original, clip reference)
248    SSE2_ClipUB xmm0, xmm4, xmm7, xmm6  ; clip p1.
249    MOVDQ    xmm6, [r0 + 1 * r2]              ; q1 (original, clip reference)
250    MOVDQ    [r3 + 1 * r1], xmm0        ; store p1.
251    SSE2_ClipUB xmm1, xmm6, xmm5, xmm7  ; clip q1.
252    MOVDQ    [r0 + 1 * r2], xmm1        ; store q1.
253
    ; xmm4/xmm6 still hold the unfiltered p1/q1 needed by the p0/q0 update;
    ; xmm2 still holds FFh.
254    MOVDQ    xmm1, [r3 + 0 * r1]  ; p0
255    MOVDQ    xmm0, [r0 + 0 * r2]  ; q0
256    SSE2_DeblockP0Q0_Lt4 xmm4, xmm1, xmm0, xmm6, xmm3, xmm2, xmm5, xmm7
257    MOVDQ    [r3 + 0 * r1], xmm1  ; store p0.
258    MOVDQ    [r0 + 0 * r2], xmm0  ; store q0.
259
260    POP_XMM
261    LOAD_5_PARA_POP
262    DEINIT_X86_32_PIC
263    ret
264
265
266; Deblock 3x16 luma pixels for the eq4 case.
267;
268; Compose 8-bit averages from pavgbs. Ie. (p1 + p0 + p2 + q0 + 2) >> 2 can be
269; written as (((p1 + p0) >> 1) + ((p2 + q0 + (p1 ^ p0 & 1)) >> 1) + 1) >> 1,
270; which maps to 3 pavgbs.
271;
272; pPix=%1 iStride=%2 [in:q0,out:p0]=%3 [in:q1,out:p1]=%4 bDeltaP0Q0P1P0Q1Q0=%5 bDeltaP2P0=%6 clobber=%7,%8,%9,%10 preserve_p0p1=%11 db1=%12
;
; Names follow the "p" side: %1 points at the p0 row and %1 + k * %2 at row pk
; (k = 1..3). Both callers below also run the macro on the q side by passing the
; q0 row pointer and the opposite stride sign; the roles are then mirrored.
; Rows p0, p1 and p2 are rewritten in place; p3 is only read.
; Side effects worth knowing before reusing the macro:
;   * %1 is advanced by %2 (it points at the p1 row on exit).
;   * %6 is ANDed with %5 and then consumed as a blend mask.
;   * preserve_p0p1 != 0: the original (unfiltered) p0 and p1 rows are loaded
;     into %3 and %4 before they are overwritten in memory, and %5 is kept
;     intact by blending with a copy. This is what lets the second, mirrored
;     invocation receive them as its q0/q1 inputs.
;   * preserve_p0p1 == 0: the blends take p0/p1 directly from memory and %5 is
;     destroyed as a blend mask.
; db1 (%12) must hold 1 in every byte; it may be a register or a memory operand.
273%macro SSE2_DeblockLumaEq4_3x16P 12
274    movdqa   %7, %3
275    movdqa   %8, %6
276    MOVDQ    %10, [%1 + 1 * %2]                      ; p1
277    SSE2_Blend %7, %10, %8                           ; t0 = bDeltaP2P0 ? q0 : p1
278    movdqa   %8, %6
279    MOVDQ    %9, [%1 + 2 * %2]                       ; p2
280    SSE2_Blend %9, %4, %8                            ; t1 = bDeltaP2P0 ? p2 : q1
281    SSE2_AvgbFloor1 %4,  %9,   %12, %8               ; t1 = (t1 + q1) >> 1
282    SSE2_AvgbFloor1 %10, [%1], %12, %8               ; (p0 + p1) >> 1, p0 ^ p1
283    pxor     %8, %12                                 ; 1 - (p0 ^ p1 & 1), the "c" operand for the next two averages
284    SSE2_AvgbFloor1 %7, %4, %8, %9                   ; (t0 + t1 + (p0 ^ p1 & 1)) >> 1
285    MOVDQ    %9, [%1 + 2 * %2]                       ; p2
286    SSE2_AvgbFloor1 %3, %9, %8, %4                   ; (p2 + q0 + (p0 ^ p1 & 1)) >> 1
287    pavgb    %7, %10                                 ; p0' = (p0 + p1 + t0 + t1 + 2) >> 2
288    movdqa   %8, %10
289    pxor     %8, %3                                  ; (p0 + p1) >> 1 ^ (p2 + q0 + (p0 ^ p1 & 1)) >> 1
290    pand     %8, %12                                 ; & 1
291    pavgb    %10, %3                                 ; p1' = (p0 + p1 + p2 + q0 + 2) >> 2
292    pand     %6, %5                                  ; bDeltaP2P0 &= bDeltaP0Q0P1P0Q1Q0
293%if %11
294    MOVDQ    %3, [%1 + 0 * %2]                       ; p0
295    movdqa   %4, %5
296    SSE2_Blend %7, %3, %4                            ; p0out = bDeltaP0Q0P1P0Q1Q0 ? p0' : p0
297%else
298    SSE2_Blend %7, [%1 + 0 * %2], %5                 ; p0out = bDeltaP0Q0P1P0Q1Q0 ? p0' : p0
299%endif
300    MOVDQ    [%1 + 0 * %2], %7                       ; store p0
    ; From here on %1 points at the p1 row, hence the (k - 1) row offsets.
301    add      %1, %2
302    movdqa   %7, %10
303    psubb    %10, %8                                 ; (p0 + p1 + p2 + q0) >> 2
304    psubb    %8, %12                                 ; carry - 1, the form SSE2_AvgbFloor2 expects
305    MOVDQ    %4, [%1 + (3 - 1) * %2]                 ; p3
306    SSE2_AvgbFloor2 %4, %9, %8                       ; (p2 + p3 + ((p0 + p1) >> 1 ^ (p2 + q0 + (p0 ^ p1 & 1)) >> 1 & 1)) >> 1
307    pavgb    %10, %4                                 ; p2' = (((p0 + p1 + p2 + q0) >> 1) + p2 + p3 + 2) >> 2
308    movdqa   %8, %6
309    SSE2_Blend %10, [%1 + (2 - 1) * %2], %8          ; p2out = bDeltaP2P0 ? p2' : p2
310    MOVDQ    [%1 + (2 - 1) * %2], %10                ; store p2
311%if %11
312    MOVDQ    %4, [%1 + (1 - 1) * %2]                 ; p1
313    SSE2_Blend %7, %4, %6                            ; p1out = bDeltaP2P0 ? p1' : p1
314%else
315    SSE2_Blend %7, [%1 + (1 - 1) * %2], %6           ; p1out = bDeltaP2P0 ? p1' : p1
316%endif
317    MOVDQ    [%1 + (1 - 1) * %2], %7                 ; store p1
318%endmacro
319
320
321;*******************************************************************************
322;    void DeblockLumaEq4V_ssse3(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
323;                                 int32_t iBeta)
;
;    Applies the "Eq4" luma filter to 16 adjacent byte columns in place.
;    Reads eight rows of 16 bytes (p3..p0 above pPix, q0..q3 from pPix down) and
;    rewrites p2, p1, p0, q0, q1 and q2. Each side is produced by one invocation
;    of SSE2_DeblockLumaEq4_3x16P: first the p side (base r3 = pPix - iStride,
;    stride -iStride), then the mirrored q side (base r0 = pPix, +iStride).
;    Register use after LOAD_4_PARA, as the code below relies on it:
;    r0 = pPix, r1 = iStride (negated below), r2 = iAlpha until it is reused as
;    +iStride. r4 is handed to INIT_X86_32_PIC.
324;*******************************************************************************
325
326WELS_EXTERN DeblockLumaEq4V_ssse3
327    %assign push_num 0
328    INIT_X86_32_PIC r4
329    LOAD_4_PARA
330    PUSH_XMM 10
331    SIGN_EXTENSION r1, r1d
332    movd     xmm1, arg3d
333    movd     xmm2, arg4d
    ; r2 still holds iAlpha here; only the low byte of the result is broadcast.
334    shr      r2, 2
335    add      r2, 1
336    movd     xmm3, r2d
337    pxor     xmm4, xmm4
338    pxor     xmm1, [pic(WELS_DB127_16)]
339    pxor     xmm2, [pic(WELS_DB127_16)]
340    pshufb   xmm1, xmm4                       ; iAlpha ^ 0x7f
341    pshufb   xmm2, xmm4                       ; iBeta  ^ 0x7f
342    pshufb   xmm3, xmm4                       ; (iAlpha >> 2) + 1
343    mov      r2, r1                           ; iStride
344    neg      r1                               ; -iStride
345    lea      r3, [r0 + r1]                    ; pPix - iStride
346
347    ; Compute masks to enable/disable filtering.
348    MOVDQ    xmm7, [r3 + 1 * r1]              ; p1
349    MOVDQ    xmm6, [r3 + 0 * r1]              ; p0
350    MOVDQ    xmm0, [r0 + 0 * r2]              ; q0
351    movdqa   xmm4, xmm6
352    SSE2_AbsDiffUB xmm6, xmm0, xmm5           ; |p0 - q0|
    ; Tested as (iAlpha >> 2) + 1 >= |p0 - q0|, which is the same condition.
353    SSE2_CmpgeUB xmm3, xmm6                   ; |p0 - q0| < (iAlpha >> 2) + 2
354    SSE2_CmpltUB xmm6, xmm1, [pic(WELS_DB127_16)]  ; bDeltaP0Q0 = |p0 - q0| < iAlpha
355    MOVDQ    xmm1, [r0 + 1 * r2]              ; q1
356    SSE2_AbsDiffUB xmm7, xmm4, xmm5           ; |p1 - p0|
357    SSE2_AbsDiffUB xmm0, xmm1, xmm5           ; |q1 - q0|
358    pmaxub   xmm7, xmm0                       ; max(|p1 - p0|, |q1 - q0|)
359    SSE2_CmpltUB xmm7, xmm2, [pic(WELS_DB127_16)]  ; bDeltaP1P0 & bDeltaQ1Q0 = max(|p1 - p0|, |q1 - q0|) < iBeta
360    pand     xmm6, xmm7                       ; & bDeltaP0Q0
361
362    MOVDQ    xmm7, [r3 + 2 * r1]              ; p2
363    SSE2_AbsDiffUB xmm7, xmm4, xmm5           ; |p2 - p0|
364    SSE2_CmpltUB xmm7, xmm2, [pic(WELS_DB127_16)]  ; bDeltaP2P0 = |p2 - p0| < iBeta
365    pand     xmm7, xmm3                       ; &= |p0 - q0| < (iAlpha >> 2) + 2
366
367    MOVDQ    xmm0, [r0 + 0 * r2]              ; q0
368    MOVDQ    xmm5, [r0 + 2 * r2]              ; q2
369    SSE2_AbsDiffUB xmm5, xmm0, xmm4           ; |q2 - q0|
370    SSE2_CmpltUB xmm5, xmm2, [pic(WELS_DB127_16)]  ; bDeltaQ2Q0 = |q2 - q0| < iBeta
371    pand     xmm5, xmm3                       ; &= |p0 - q0| < (iAlpha >> 2) + 2
372
    ; At this point: xmm0 = q0, xmm1 = q1, xmm6 = common enable mask,
    ; xmm7 = p-side strong-filter mask, xmm5 = q-side strong-filter mask.
373%ifdef X86_32
374    ; Push xmm5 to free up one register. Align stack so as to ensure that failed
375    ; store forwarding penalty cannot occur (up to ~50 cycles for 128-bit on IVB).
    ; r2 (+iStride) is sacrificed to remember esp, so the q-side pass below gets
    ; its stride by negating r1 back to +iStride instead.
376    mov      r2, esp
377    sub      esp,  16
378    and      esp, -16
379    movdqa   [esp], xmm5
380    SSE2_DeblockLumaEq4_3x16P r3, r1, xmm0, xmm1, xmm6, xmm7, xmm2, xmm3, xmm5, xmm4, 1, [pic(WELS_DB1_16)]
381    movdqa   xmm5, [esp]
382    mov      esp, r2
383    neg      r1
384    SSE2_DeblockLumaEq4_3x16P r0, r1, xmm0, xmm1, xmm6, xmm5, xmm2, xmm3, xmm7, xmm4, 0, [pic(WELS_DB1_16)]
385%else
    ; This path has xmm8/xmm9 available, so nothing is spilled and the db1
    ; constant is kept in a register.
386    movdqa   xmm9, [WELS_DB1_16]
387    SSE2_DeblockLumaEq4_3x16P r3, r1, xmm0, xmm1, xmm6, xmm7, xmm2, xmm3, xmm8, xmm4, 1, xmm9
388    SSE2_DeblockLumaEq4_3x16P r0, r2, xmm0, xmm1, xmm6, xmm5, xmm2, xmm3, xmm7, xmm4, 0, xmm9
389%endif
390
391    POP_XMM
392    LOAD_4_PARA_POP
393    DEINIT_X86_32_PIC
394    ret
395
396
397; [out:p1,p0,q0,q1]=%1,%2,%3,%4 pPixCb=%5 pPixCr=%6 iStride=%7 3*iStride-1=%8 xmmclobber=%9,%10,%11
;
; Gathers the four bytes [p1,p0,q0,q1] that straddle a vertical edge (byte
; offsets -2..+1 from the pointer) from 8 rows of Cb and 8 rows of Cr, and
; transposes them into one register per pixel position.
; Lane order inside every output register: bytes 0-3 = cb lines 0,2,4,6,
; bytes 4-7 = cr lines 0,2,4,6, bytes 8-11 = cb lines 1,3,5,7,
; bytes 12-15 = cr lines 1,3,5,7.
; Row 6 is addressed as 2 * %8 = 6 * iStride - 2 because a scale factor of 6
; cannot be encoded directly.
; Side effect: %5 and %6 are advanced by iStride and left that way
; (SSE2_StoreCbCr_4x16H expects them in this state and undoes it).
398%macro SSE2_LoadCbCr_4x16H 11
399    movd       %1,  [%5 + 0 * %7 - 2]  ; [p1,p0,q0,q1] cb line 0
400    movd       %2,  [%5 + 2 * %7 - 2]  ; [p1,p0,q0,q1] cb line 2
401    punpcklbw  %1,  %2                 ; [p1,p1,p0,p0,q0,q0,q1,q1] cb line 0,2
402    movd       %2,  [%5 + 4 * %7 - 2]  ; [p1,p0,q0,q1] cb line 4
403    movd       %9,  [%5 + 2 * %8]      ; [p1,p0,q0,q1] cb line 6
404    punpcklbw  %2,  %9                 ; [p1,p1,p0,p0,q0,q0,q1,q1] cb line 4,6
405    punpcklwd  %1,  %2                 ; [p1,p1,p1,p1,p0,p0,p0,p0,q0,q0,q0,q0,q1,q1,q1,q1] cb line 0,2,4,6
406    movd       %2,  [%6 + 0 * %7 - 2]  ; [p1,p0,q0,q1] cr line 0
407    movd       %9,  [%6 + 2 * %7 - 2]  ; [p1,p0,q0,q1] cr line 2
408    punpcklbw  %2,  %9                 ; [p1,p1,p0,p0,q0,q0,q1,q1] cr line 0,2
409    movd       %9,  [%6 + 4 * %7 - 2]  ; [p1,p0,q0,q1] cr line 4
410    movd       %10, [%6 + 2 * %8]      ; [p1,p0,q0,q1] cr line 6
411    punpcklbw  %9,  %10                ; [p1,p1,p0,p0,q0,q0,q1,q1] cr line 4,6
412    punpcklwd  %2,  %9                 ; [p1,p1,p1,p1,p0,p0,p0,p0,q0,q0,q0,q0,q1,q1,q1,q1] cr line 0,2,4,6
    ; Step one row down so that the same offsets now reach the odd lines.
413    add        %5,  %7                 ; pPixCb += iStride
414    add        %6,  %7                 ; pPixCr += iStride
415    movd       %9,  [%5 + 0 * %7 - 2]  ; [p1,p0,q0,q1] cb line 1
416    movd       %10, [%5 + 2 * %7 - 2]  ; [p1,p0,q0,q1] cb line 3
417    punpcklbw  %9,  %10                ; [p1,p1,p0,p0,q0,q0,q1,q1] cb line 1,3
418    movd       %10, [%5 + 4 * %7 - 2]  ; [p1,p0,q0,q1] cb line 5
419    movd       %3,  [%5 + 2 * %8]      ; [p1,p0,q0,q1] cb line 7
420    punpcklbw  %10, %3                 ; [p1,p1,p0,p0,q0,q0,q1,q1] cb line 5,7
421    punpcklwd  %9,  %10                ; [p1,p1,p1,p1,p0,p0,p0,p0,q0,q0,q0,q0,q1,q1,q1,q1] cb line 1,3,5,7
422    movd       %10, [%6 + 0 * %7 - 2]  ; [p1,p0,q0,q1] cr line 1
423    movd       %3,  [%6 + 2 * %7 - 2]  ; [p1,p0,q0,q1] cr line 3
424    punpcklbw  %10, %3                 ; [p1,p1,p0,p0,q0,q0,q1,q1] cr line 1,3
425    movd       %3,  [%6 + 4 * %7 - 2]  ; [p1,p0,q0,q1] cr line 5
426    movd       %4,  [%6 + 2 * %8]      ; [p1,p0,q0,q1] cr line 7
427    punpcklbw  %3,  %4                 ; [p1,p1,p0,p0,q0,q0,q1,q1] cr line 5,7
428    punpcklwd  %10, %3                 ; [p1,p1,p1,p1,p0,p0,p0,p0,q0,q0,q0,q0,q1,q1,q1,q1] cr line 1,3,5,7
    ; Final merge: combine cb with cr, then even lines with odd lines.
429    movdqa     %3,  %1
430    punpckldq  %1,  %2                 ; [p1,p1,p1,p1,p1,p1,p1,p1,p0,p0,p0,p0,p0,p0,p0,p0] cb/cr line 0,2,4,6
431    punpckhdq  %3,  %2                 ; [q0,q0,q0,q0,q0,q0,q0,q0,q1,q1,q1,q1,q1,q1,q1,q1] cb/cr line 0,2,4,6
432    movdqa     %11, %9
433    punpckldq  %9,  %10                ; [p1,p1,p1,p1,p1,p1,p1,p1,p0,p0,p0,p0,p0,p0,p0,p0] cb/cr line 1,3,5,7
434    punpckhdq  %11, %10                ; [q0,q0,q0,q0,q0,q0,q0,q0,q1,q1,q1,q1,q1,q1,q1,q1] cb/cr line 1,3,5,7
435    movdqa     %2,  %1
436    punpcklqdq %1,  %9                 ; [p1,p1,p1,p1,p1,p1,p1,p1,p1,p1,p1,p1,p1,p1,p1,p1] cb/cr line 0,2,4,6,1,3,5,7
437    punpckhqdq %2,  %9                 ; [p0,p0,p0,p0,p0,p0,p0,p0,p0,p0,p0,p0,p0,p0,p0,p0] cb/cr line 0,2,4,6,1,3,5,7
438    movdqa     %4,  %3
439    punpcklqdq %3,  %11                ; [q0,q0,q0,q0,q0,q0,q0,q0,q0,q0,q0,q0,q0,q0,q0,q0] cb/cr line 0,2,4,6,1,3,5,7
440    punpckhqdq %4,  %11                ; [q1,q1,q1,q1,q1,q1,q1,q1,q1,q1,q1,q1,q1,q1,q1,q1] cb/cr line 0,2,4,6,1,3,5,7
441%endmacro
442
443; pPixCb+iStride=%1 pPixCr+iStride=%2 iStride=%3 3*iStride-1=%4 p0=%5 q0=%6 rclobber=%7 dwclobber={%8,%9} xmmclobber=%10
;
; Counterpart of SSE2_LoadCbCr_4x16H: writes the filtered [p0,q0] byte pair
; (byte offsets -1 and 0 from the pointer) back to 8 rows of Cb and 8 rows of
; Cr. p0/q0 must use the lane order that SSE2_LoadCbCr_4x16H produces.
; The pairs are interleaved in registers, spilled to a 32-byte, 16-byte-aligned
; scratch area carved out below the stack pointer, and then moved row by row
; through a general purpose register: %8 is a 32-bit register holding two
; pairs, %9 must be the 16-bit name of the same register.
; Row 6 (and row 7 after the one-row offset) is addressed as
; 2 * %4 + 1 = 6 * iStride - 1.
; Side effects: %1 and %2 are decremented by iStride (undoing the load macro's
; increment); %5 and %10 are overwritten; %7 holds the saved stack pointer for
; the duration and r7 is restored from it at the end.
444%macro SSE2_StoreCbCr_4x16H 10
445    movdqa     %10, %5
446    punpcklbw  %10, %6                 ; [p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0] cb/cr line 0,2,4,6
447    punpckhbw  %5, %6                  ; [p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0] cb/cr line 1,3,5,7
448    mov        %7, r7                  ; preserve stack pointer
449    and        r7, -16                 ; align stack pointer
450    sub        r7, 32                  ; allocate stack space
451    movdqa     [r7     ], %10          ; store [p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0] cb/cr line 0,2,4,6 on the stack
452    movdqa     [r7 + 16], %5           ; store [p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0] cb/cr line 1,3,5,7 on the stack
    ; Odd lines first, because %1/%2 currently point one row below line 0.
453    mov        %8, [r7 + 16]           ; [p0,q0,p0,q0] cb line 1,3
454    mov        [%1 + 0 * %3 - 1], %9   ; store [p0,q0] cb line 1
455    shr        %8, 16                  ; [p0,q0] cb line 3
456    mov        [%1 + 2 * %3 - 1], %9   ; store [p0,q0] cb line 3
457    mov        %8, [r7 + 20]           ; [p0,q0,p0,q0] cb line 5,7
458    mov        [%1 + 4 * %3 - 1], %9   ; store [p0,q0] cb line 5
459    shr        %8, 16                  ; [p0,q0] cb line 7
460    mov        [%1 + 2 * %4 + 1], %9   ; store [p0,q0] cb line 7
461    mov        %8, [r7 + 24]           ; [p0,q0,p0,q0] cr line 1,3
462    mov        [%2 + 0 * %3 - 1], %9   ; store [p0,q0] cr line 1
463    shr        %8, 16                  ; [p0,q0] cr line 3
464    mov        [%2 + 2 * %3 - 1], %9   ; store [p0,q0] cr line 3
465    mov        %8, [r7 + 28]           ; [p0,q0,p0,q0] cr line 5,7
466    mov        [%2 + 4 * %3 - 1], %9   ; store [p0,q0] cr line 5
467    shr        %8, 16                  ; [p0,q0] cr line 7
468    mov        [%2 + 2 * %4 + 1], %9   ; store [p0,q0] cr line 7
    ; Step back to line 0 and repeat for the even lines.
469    sub        %1, %3                  ; pPixCb -= iStride
470    sub        %2, %3                  ; pPixCr -= iStride
471    mov        %8, [r7     ]           ; [p0,q0,p0,q0] cb line 0,2
472    mov        [%1 + 0 * %3 - 1], %9   ; store [p0,q0] cb line 0
473    shr        %8, 16                  ; [p0,q0] cb line 2
474    mov        [%1 + 2 * %3 - 1], %9   ; store [p0,q0] cb line 2
475    mov        %8, [r7 +  4]           ; [p0,q0,p0,q0] cb line 4,6
476    mov        [%1 + 4 * %3 - 1], %9   ; store [p0,q0] cb line 4
477    shr        %8, 16                  ; [p0,q0] cb line 6
478    mov        [%1 + 2 * %4 + 1], %9   ; store [p0,q0] cb line 6
479    mov        %8, [r7 +  8]           ; [p0,q0,p0,q0] cr line 0,2
480    mov        [%2 + 0 * %3 - 1], %9   ; store [p0,q0] cr line 0
481    shr        %8, 16                  ; [p0,q0] cr line 2
482    mov        [%2 + 2 * %3 - 1], %9   ; store [p0,q0] cr line 2
483    mov        %8, [r7 + 12]           ; [p0,q0,p0,q0] cr line 4,6
484    mov        [%2 + 4 * %3 - 1], %9   ; store [p0,q0] cr line 4
485    shr        %8, 16                  ; [p0,q0] cr line 6
486    mov        [%2 + 2 * %4 + 1], %9   ; store [p0,q0] cr line 6
487    mov        r7, %7                  ; restore stack pointer
488%endmacro
489
490; p1=%1 p0=%2 q0=%3 q1=%4 iAlpha=%5 iBeta=%6 pTC=%7 xmmclobber=%8,%9,%10 interleaveTC=%11
;
; "Lt4" chroma filter on 16 byte lanes held in registers.
; In:  %5 = iAlpha broadcast to all bytes (xmm); %6 = iBeta as a 32-bit operand
;      for movd (only its low byte is broadcast); %7 = register holding the
;      address of four tc bytes.
; Out: %2 = filtered p0, %3 = filtered q0.
; Destroyed: %1, %4, %5 (reused as scratch, then iBeta, then iTc), %8-%10.
; interleaveTC selects how the four tc bytes t0..t3 are spread over the lanes:
;   != 0: t0,t0,t1,t1,t2,t2,t3,t3 in each 8-byte half (each tc covers two
;         adjacent lanes; used by the vertical-edge-free "V" caller where the
;         halves are Cb and Cr rows);
;   == 0: t0,t1,t2,t3 repeated in every 4-byte group (matches the lane order of
;         SSE2_LoadCbCr_4x16H, where each group is lines 0,2,4,6 or 1,3,5,7).
; The comparisons yield "disable" masks (>=), hence the pandn when they are
; combined with iTc. Lanes with a negative tc byte, or failing the threshold
; tests, get iTc = 0 and therefore come out unchanged.
; Expands SSE2_DeblockP0Q0_Lt4, which reads a constant through pic().
491%macro SSSE3_DeblockChromaLt4 11
492    movdqa     %8, %3
493    SSE2_AbsDiffUB %8, %2, %9           ; |p0 - q0|
494    SSE2_CmpgeUB %8, %5                 ; !bDeltaP0Q0 = |p0 - q0| >= iAlpha
495    movdqa     %9, %4
496    SSE2_AbsDiffUB %9, %3, %5           ; |q1 - q0|
497    movdqa     %10, %1
498    SSE2_AbsDiffUB %10, %2, %5          ; |p1 - p0|
499    pmaxub     %9, %10                  ; max(|q1 - q0|, |p1 - p0|)
500    pxor       %10, %10
501    movd       %5, %6
502    pshufb     %5, %10                  ; iBeta
503    SSE2_CmpgeUB %9, %5                 ; !bDeltaQ1Q0 | !bDeltaP1P0 = max(|q1 - q0|, |p1 - p0|) >= iBeta
504    por        %8, %9                   ; | !bDeltaP0Q0
505    movd       %5, [%7]
506%if %11
507    punpckldq  %5, %5
508    punpcklbw  %5, %5                   ; iTc
509%else
510    pshufd     %5, %5, 0                ; iTc
511%endif
512    pcmpeqw    %10, %10                 ; FFh
513    movdqa     %9, %5
514    pcmpgtb    %9, %10                  ; iTc > -1 ? FFh : 00h
515    pandn      %8, %5                   ; iTc & bDeltaP0Q0 & bDeltaP1P0 & bDeltaQ1Q0
516    pand       %8, %9                   ; &= (iTc > -1 ? FFh : 00h)
517    SSE2_DeblockP0Q0_Lt4 %1, %2, %3, %4, %8, %10, %5, %9
518%endmacro
519
520; p1=%1 p0=%2 q0=%3 q1=%4 iAlpha=%5 iBeta=%6 xmmclobber=%7,%8,%9
;
; "Eq4" chroma filter on 16 byte lanes held in registers.
; In:  %5 = iAlpha broadcast to all bytes (xmm); %6 = iBeta as a 32-bit operand
;      for movd (only its low byte is broadcast).
; Out: %2 = filtered p0, %3 = filtered q0; lanes failing the threshold tests
;      keep their original value.
; Destroyed: %1 (ends up holding q0'), %5, %7-%9. %4 (q1) is only read.
; The comparisons yield a "disable" mask, so the original pixel is the first
; (mask-selected) operand of each SSE2_Blend and the filtered value the second.
; No table is read through pic(); the byte-1 constant comes from WELS_DB1
; (defined outside this file; presumably it fills the register with bytes of 1
; without touching memory -- confirm in asm_inc.asm).
521%macro SSSE3_DeblockChromaEq4 9
522    movdqa   %7, %3
523    SSE2_AbsDiffUB %7, %2, %8         ; |p0 - q0|
524    SSE2_CmpgeUB %7, %5               ; !bDeltaP0Q0 = |p0 - q0| >= iAlpha
525    movdqa   %8, %4
526    SSE2_AbsDiffUB %8, %3, %5         ; |q1 - q0|
527    movdqa   %9, %1
528    SSE2_AbsDiffUB %9, %2, %5         ; |p1 - p0|
529    pmaxub   %8, %9                   ; max(|q1 - q0|, |p1 - p0|)
530    pxor     %9, %9
531    movd     %5, %6
532    pshufb   %5, %9                   ; iBeta
533    SSE2_CmpgeUB %8, %5               ; !bDeltaQ1Q0 | !bDeltaP1P0 = max(|q1 - q0|, |p1 - p0|) >= iBeta
534    por      %7, %8                   ; !bDeltaP0Q0P1P0Q1Q0 = !bDeltaP0Q0 | !bDeltaQ1Q0 | !bDeltaP1P0
535    WELS_DB1 %5
536    movdqa   %8, %2
537    SSE2_AvgbFloor1 %8, %4, %5, %9    ; (p0 + q1) >> 1
538    pavgb    %8, %1                   ; p0' = (p1 + ((p0 + q1) >> 1) + 1) >> 1
539    movdqa   %9, %7                   ; keep a copy: the blend below destroys its mask
540    SSE2_Blend %2, %8, %7             ; p0out = bDeltaP0Q0P1P0Q1Q0 ? p0' : p0
541    SSE2_AvgbFloor1 %1, %3, %5, %7    ; (q0 + p1) >> 1
542    pavgb    %1, %4                   ; q0' = (q1 + ((q0 + p1) >> 1) + 1) >> 1
543    SSE2_Blend %3, %1, %9             ; q0out = bDeltaP0Q0P1P0Q1Q0 ? q0' : q0
544%endmacro
545
546
547;******************************************************************************
548; void DeblockChromaLt4V_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
549;                           int32_t iAlpha, int32_t iBeta, int8_t * pTC);
;
; "Lt4" chroma filter across a horizontal edge, Cb and Cr together.
; Eight bytes per row are read from each plane; Cb occupies the low half and Cr
; the high half of every xmm register. Rows p1, p0 (at -2, -1 * iStride) and
; q0, q1 (at 0, +1 * iStride) are read; p0 and q0 are written back.
; Register use after LOAD_4_PARA, as the code below relies on it:
; r0 = pPixCb, r1 = pPixCr, r2 = iStride, r3 = -iStride (computed here).
; Only four parameters are loaded by the macro; iAlpha, iBeta and pTC are taken
; through arg4d, arg5d and arg6.
550;*******************************************************************************
551
552WELS_EXTERN DeblockChromaLt4V_ssse3
553    %assign push_num 0
554    INIT_X86_32_PIC r4
555    LOAD_4_PARA
556    PUSH_XMM 8
557    SIGN_EXTENSION r2, r2d
558    movd     xmm7, arg4d
559    pxor     xmm0, xmm0
560    pshufb   xmm7, xmm0                       ; iAlpha
561    mov      r3, r2
562    neg      r3                               ; -iStride
563
564    movq     xmm0, [r0 + 0 * r2]              ; q0 cb
565    movhps   xmm0, [r1 + 0 * r2]              ; q0 cr
566    movq     xmm2, [r0 + 1 * r3]              ; p0 cb
567    movhps   xmm2, [r1 + 1 * r3]              ; p0 cr
568    movq     xmm1, [r0 + 1 * r2]              ; q1 cb
569    movhps   xmm1, [r1 + 1 * r2]              ; q1 cr
570    movq     xmm3, [r0 + 2 * r3]              ; p1 cb
571    movhps   xmm3, [r1 + 2 * r3]              ; p1 cr
572
    ; The macro needs pTC in a register. If arg6 is not already the register r5
    ; it is copied into r2; +iStride is no longer needed because the stores
    ; below only use r3 or no index at all.
573%ifidni arg6, r5
574    SSSE3_DeblockChromaLt4 xmm3, xmm2, xmm0, xmm1, xmm7, arg5d, arg6, xmm4, xmm5, xmm6, 1
575%else
576    mov      r2, arg6
577    SSSE3_DeblockChromaLt4 xmm3, xmm2, xmm0, xmm1, xmm7, arg5d, r2,   xmm4, xmm5, xmm6, 1
578%endif
579
580    movlps   [r0 + 1 * r3], xmm2              ; store p0 cb
581    movhps   [r1 + 1 * r3], xmm2              ; store p0 cr
582    movlps   [r0         ], xmm0              ; store q0 cb
583    movhps   [r1         ], xmm0              ; store q0 cr
584
585    POP_XMM
586    LOAD_4_PARA_POP
587    DEINIT_X86_32_PIC
588    ret
589
590
591;********************************************************************************
592;  void DeblockChromaEq4V_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
593;                             int32_t iAlpha, int32_t iBeta)
;
;  "Eq4" chroma filter across a horizontal edge, Cb and Cr together.
;  Eight bytes per row are read from each plane; Cb occupies the low half and
;  Cr the high half of every xmm register. Rows p1, p0 (at -2, -1 * iStride)
;  and q0, q1 (at 0, +1 * iStride) are read; p0 and q0 are written back.
;  Register use after LOAD_4_PARA, as the code below relies on it:
;  r0 = pPixCb, r1 = pPixCr, r2 = iStride, r3 = -iStride (computed here).
;  No INIT_X86_32_PIC here: nothing in this function or in
;  SSSE3_DeblockChromaEq4 goes through pic().
594;********************************************************************************
595
596WELS_EXTERN DeblockChromaEq4V_ssse3
597    %assign push_num 0
598    LOAD_4_PARA
599    PUSH_XMM 8
600    SIGN_EXTENSION r2, r2d
601    movd     xmm7, arg4d
602    pxor     xmm0, xmm0
603    pshufb   xmm7, xmm0                       ; iAlpha
604    mov      r3, r2
605    neg      r3                               ; -iStride
606
607    movq     xmm0, [r0 + 0 * r2]              ; q0 cb
608    movhps   xmm0, [r1 + 0 * r2]              ; q0 cr
609    movq     xmm2, [r0 + 1 * r3]              ; p0 cb
610    movhps   xmm2, [r1 + 1 * r3]              ; p0 cr
611    movq     xmm1, [r0 + 1 * r2]              ; q1 cb
612    movhps   xmm1, [r1 + 1 * r2]              ; q1 cr
613    movq     xmm3, [r0 + 2 * r3]              ; p1 cb
614    movhps   xmm3, [r1 + 2 * r3]              ; p1 cr
615
616    SSSE3_DeblockChromaEq4 xmm3, xmm2, xmm0, xmm1, xmm7, arg5d, xmm4, xmm5, xmm6
617
618    movlps   [r0 + 1 * r3], xmm2              ; store p0 cb
619    movhps   [r1 + 1 * r3], xmm2              ; store p0 cr
620    movlps   [r0 + 0 * r2], xmm0              ; store q0 cb
621    movhps   [r1 + 0 * r2], xmm0              ; store q0 cr
622
623    POP_XMM
624    LOAD_4_PARA_POP
625    ret
626
627
628;*******************************************************************************
629;    void DeblockChromaLt4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
630;                                int32_t iAlpha, int32_t iBeta, int8_t * pTC);
;
;    "Lt4" chroma filter across a vertical edge, Cb and Cr together.
;    For 8 rows of each plane the bytes at offsets -2..+1 from the pointer are
;    gathered and transposed (SSE2_LoadCbCr_4x16H), filtered, and the p0/q0
;    bytes at offsets -1 and 0 are written back (SSE2_StoreCbCr_4x16H).
;    Register use after LOAD_6_PARA, as the code below relies on it:
;    r0 = pPixCb, r1 = pPixCr, r2 = iStride, r5 = pTC; r3 is set to
;    3 * iStride - 1 for the row-6/7 addressing of the load/store macros.
;    Once filtering is done r5 and r4 are reused as scratch by the store macro.
631;*******************************************************************************
632
633WELS_EXTERN DeblockChromaLt4H_ssse3
634    %assign push_num 0
635    LOAD_6_PARA
636    PUSH_XMM 8
637    SIGN_EXTENSION r2, r2d
638    movd       xmm7, arg4d
639    pxor       xmm0, xmm0
640    pshufb     xmm7, xmm0                       ; iAlpha
641    lea        r3, [3 * r2 - 1]                 ; 3 * iStride - 1
642
643    SSE2_LoadCbCr_4x16H xmm0, xmm1, xmm4, xmm5, r0, r1, r2, r3, xmm2, xmm3, xmm6
    ; The PIC setup is scoped to the filter macro only. NOTE(review): it is
    ; given r1, which still holds the Cr pointer needed by the store below;
    ; presumably INIT/DEINIT_X86_32_PIC save and restore it -- confirm in
    ; asm_inc.asm.
644    INIT_X86_32_PIC r1
645    SSSE3_DeblockChromaLt4 xmm0, xmm1, xmm4, xmm5, xmm7, arg5d, r5, xmm2, xmm3, xmm6, 0
646    DEINIT_X86_32_PIC
647    SSE2_StoreCbCr_4x16H r0, r1, r2, r3, xmm1, xmm4, r5, r4d, r4w, xmm0
648
649    POP_XMM
650    LOAD_6_PARA_POP
651    ret
652
653
654;***************************************************************************
655;  void DeblockChromaEq4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
656;          int32_t iAlpha, int32_t iBeta)
;
;  "Eq4" chroma filter across a vertical edge, Cb and Cr together.
;  For 8 rows of each plane the bytes at offsets -2..+1 from the pointer are
;  gathered and transposed (SSE2_LoadCbCr_4x16H), filtered, and the p0/q0
;  bytes at offsets -1 and 0 are written back (SSE2_StoreCbCr_4x16H).
;  Register use after LOAD_4_PARA, as the code below relies on it:
;  r0 = pPixCb, r1 = pPixCr, r2 = iStride; r3 is set to 3 * iStride - 1.
657;***************************************************************************
658
659WELS_EXTERN DeblockChromaEq4H_ssse3
660    %assign push_num 0
661    LOAD_4_PARA
662    PUSH_XMM 8
663    SIGN_EXTENSION r2, r2d
664    movd       xmm7, arg4d
665    pxor       xmm0, xmm0
666    pshufb     xmm7, xmm0                       ; iAlpha
667    lea        r3, [3 * r2 - 1]                 ; 3 * iStride - 1
668
669    SSE2_LoadCbCr_4x16H xmm0, xmm1, xmm4, xmm5, r0, r1, r2, r3, xmm2, xmm3, xmm6
670    SSSE3_DeblockChromaEq4 xmm0, xmm1, xmm4, xmm5, xmm7, arg5d, xmm2, xmm3, xmm6
    ; The store macro uses r5 and r4 as scratch. LOAD_4_PARA only covers four
    ; registers, so on X86_32 they are saved and restored by hand here
    ; (presumably they are callee-saved there and freely usable on the other
    ; targets -- confirm against the register mapping in asm_inc.asm).
671%ifdef X86_32
672    push r4
673    push r5
674    SSE2_StoreCbCr_4x16H r0, r1, r2, r3, xmm1, xmm4, r5, r4d, r4w, xmm0
675    pop r5
676    pop r4
677%else
678    SSE2_StoreCbCr_4x16H r0, r1, r2, r3, xmm1, xmm4, r5, r4d, r4w, xmm0
679%endif
680
681    POP_XMM
682    LOAD_4_PARA_POP
683    ret
684
685
686;********************************************************************************
687;
688;   void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst);
689;
;   Reads 16 rows of 8 bytes starting at pPixY (row pitch iStride), transposes
;   them with SSE2_TransTwo8x8B (defined outside this file) and writes 8 rows
;   of 16 bytes (128 bytes) to pDst. pDst is written with movdqa and therefore
;   must be 16-byte aligned.
;   Register use after LOAD_3_PARA, as the code below relies on it:
;   r0 = pPixY, r1 = iStride, r2 = pDst. r3, r4 and r5 are pushed on entry and
;   used as scratch: r3 = address of row 8, r4 = 3 * iStride, r5 = saved r7.
690;********************************************************************************
691
692WELS_EXTERN DeblockLumaTransposeH2V_sse2
693    push     r3
694    push     r4
695    push     r5
696
697%assign   push_num   3
698    LOAD_3_PARA
699    PUSH_XMM 8
700
701    SIGN_EXTENSION   r1, r1d
702
    ; Reserve one 16-byte-aligned scratch slot at [r7]: round the stack pointer
    ; down to a multiple of 16, then drop another 16 bytes. r5 keeps the
    ; original value for the epilogue.
703    mov      r5,    r7
704    mov      r3,    r7
705    and      r3,    0Fh
706    sub      r7,    r3
707    sub      r7,    10h
708
709    lea      r3,    [r0 + r1 * 8]
710    lea      r4,    [r1 * 3]
711
    ; xmmN = row N in the low qword, row N + 8 in the high qword; xmm7 is the
    ; staging register for the high half.
712    movq    xmm0,  [r0]
713    movq    xmm7,  [r3]
714    punpcklqdq   xmm0,  xmm7
715    movq    xmm1,  [r0 + r1]
716    movq    xmm7,  [r3 + r1]
717    punpcklqdq   xmm1,  xmm7
718    movq    xmm2,  [r0 + r1*2]
719    movq    xmm7,  [r3 + r1*2]
720    punpcklqdq   xmm2,  xmm7
721    movq    xmm3,  [r0 + r4]
722    movq    xmm7,  [r3 + r4]
723    punpcklqdq   xmm3,  xmm7
724
725    lea     r0,   [r0 + r1 * 4]
726    lea     r3,   [r3 + r1 * 4]
727    movq    xmm4,  [r0]
728    movq    xmm7,  [r3]
729    punpcklqdq   xmm4,  xmm7
730    movq    xmm5,  [r0 + r1]
731    movq    xmm7,  [r3 + r1]
732    punpcklqdq   xmm5,  xmm7
733    movq    xmm6,  [r0 + r1*2]
734    movq    xmm7,  [r3 + r1*2]
735    punpcklqdq   xmm6,  xmm7
736
    ; Rows 7 and 15 must end up in xmm7 itself, so no staging register is left:
    ; park xmm0 in the scratch slot and borrow it for the high half.
737    movdqa  [r7],   xmm0
738    movq    xmm7,  [r0 + r4]
739    movq    xmm0,  [r3 + r4]
740    punpcklqdq   xmm7,  xmm0
741    movdqa  xmm0,   [r7]
742
743    SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r7]
744    ;pOut: m5, m3, m4, m8, m6, m2, m7, m1
    ; (1-based register order of the transposed rows; the stores below follow it)
745
746    movdqa  [r2],    xmm4
747    movdqa  [r2 + 10h],  xmm2
748    movdqa  [r2 + 20h],  xmm3
749    movdqa  [r2 + 30h],  xmm7
750    movdqa  [r2 + 40h],  xmm5
751    movdqa  [r2 + 50h],  xmm1
752    movdqa  [r2 + 60h],  xmm6
753    movdqa  [r2 + 70h],  xmm0
754
755    mov     r7,   r5
756    POP_XMM
757    pop     r5
758    pop     r4
759    pop     r3
760    ret
761
762
763;*******************************************************************************************
764;
765;   void DeblockLumaTransposeV2H_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pSrc);
766;
;   Inverse of DeblockLumaTransposeH2V_sse2: reads 8 rows of 16 bytes
;   (128 bytes) from pSrc, transposes them with SSE2_TransTwo8x8B (defined
;   outside this file) and writes 16 rows of 8 bytes to pPixY with row pitch
;   iStride. pSrc is read with movdqa and therefore must be 16-byte aligned.
;   Register use after LOAD_3_PARA, as the code below relies on it:
;   r0 = pPixY, r1 = iStride, r2 = pSrc (later reused as 3 * iStride).
;   r3 and r4 are pushed on entry and used as scratch; r4 = saved r7.
767;*******************************************************************************************
768
769WELS_EXTERN DeblockLumaTransposeV2H_sse2
770    push     r3
771    push     r4
772
773%assign  push_num 2
774    LOAD_3_PARA
775    PUSH_XMM 8
776
777    SIGN_EXTENSION   r1, r1d
778
    ; Reserve one 16-byte-aligned scratch slot at [r7] for the transpose macro.
779    mov      r4,    r7
780    mov      r3,    r7
781    and      r3,    0Fh
782    sub      r7,    r3
783    sub      r7,    10h
784
785    movdqa   xmm0,   [r2]
786    movdqa   xmm1,   [r2 + 10h]
787    movdqa   xmm2,   [r2 + 20h]
788    movdqa   xmm3,   [r2 + 30h]
789    movdqa   xmm4,   [r2 + 40h]
790    movdqa   xmm5,   [r2 + 50h]
791    movdqa   xmm6,   [r2 + 60h]
792    movdqa   xmm7,   [r2 + 70h]
793
794    SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r7]
795    ;pOut: m5, m3, m4, m8, m6, m2, m7, m1
    ; (1-based register order of the transposed rows; the stores below follow it)
796
797    lea      r2,   [r1 * 3]
798
    ; Low qwords are destination rows 0-7.
799    movq     [r0],  xmm4
800    movq     [r0 + r1],  xmm2
801    movq     [r0 + r1*2],  xmm3
802    movq     [r0 + r2],  xmm7
803
804    lea      r0,   [r0 + r1*4]
805    movq     [r0],  xmm5
806    movq     [r0 + r1],  xmm1
807    movq     [r0 + r1*2],  xmm6
808    movq     [r0 + r2],  xmm0
809
    ; Bring the high qwords down; they are destination rows 8-15.
810    psrldq    xmm4,   8
811    psrldq    xmm2,   8
812    psrldq    xmm3,   8
813    psrldq    xmm7,   8
814    psrldq    xmm5,   8
815    psrldq    xmm1,   8
816    psrldq    xmm6,   8
817    psrldq    xmm0,   8
818
819    lea       r0,  [r0 + r1*4]
820    movq     [r0],  xmm4
821    movq     [r0 + r1],  xmm2
822    movq     [r0 + r1*2],  xmm3
823    movq     [r0 + r2],  xmm7
824
825    lea      r0,   [r0 + r1*4]
826    movq     [r0],  xmm5
827    movq     [r0 + r1],  xmm1
828    movq     [r0 + r1*2],  xmm6
829    movq     [r0 + r2],  xmm0
830
831
832    mov      r7,   r4
833    POP_XMM
834    pop      r4
835    pop      r3
836    ret
837
; Clamps 24 consecutive unsigned bytes at the address in r0 to at most 1, in
; place (16 bytes via xmm0 plus 8 bytes via xmm1), i.e. every non-zero byte
; becomes 1 and zero bytes stay 0.
; Takes a single pointer argument (LOAD_1_PARA); the C prototype is not stated
; in this file. The pointer may be unaligned (movdqu/movq are used).
; WELS_DB1 is defined outside this file; presumably it fills xmm2 with bytes of
; value 1 -- confirm in asm_inc.asm.
; NOTE(review): unlike the functions above there is no PUSH_XMM/POP_XMM pair;
; only xmm0-xmm2 are touched.
838WELS_EXTERN WelsNonZeroCount_sse2
839    %assign  push_num 0
840    LOAD_1_PARA
841    movdqu  xmm0, [r0]
842    movq    xmm1, [r0+16]
843    WELS_DB1 xmm2
844    pminub  xmm0, xmm2
845    pminub  xmm1, xmm2
846    movdqu  [r0], xmm0
847    movq    [r0+16], xmm1
848    ret
849