• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1;*!
2;* \copy
3;*     Copyright (c)  2009-2013, Cisco Systems
4;*     All rights reserved.
5;*
6;*     Redistribution and use in source and binary forms, with or without
7;*     modification, are permitted provided that the following conditions
8;*     are met:
9;*
10;*        * Redistributions of source code must retain the above copyright
11;*          notice, this list of conditions and the following disclaimer.
12;*
13;*        * Redistributions in binary form must reproduce the above copyright
14;*          notice, this list of conditions and the following disclaimer in
15;*          the documentation and/or other materials provided with the
16;*          distribution.
17;*
18;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29;*     POSSIBILITY OF SUCH DAMAGE.
30;*
31;*
32;*  expand_picture.asm
33;*
34;*  Abstract
35;*      mmxext/sse for expand_frame
36;*
37;*  History
38;*      09/25/2009 Created
39;*
40;*
41;*************************************************************************/
42
43%include "asm_inc.asm"
44
45
46
47;***********************************************************************
48; Macros and other preprocessor constants
49;***********************************************************************
50
51;***********************************************************************
52; Code
53;***********************************************************************
54
55
56
57SECTION .text
58
59
60;;;;;;;expanding result;;;;;;;
61
62;aaaa|attttttttttttttttb|bbbb
63;aaaa|attttttttttttttttb|bbbb
64;aaaa|attttttttttttttttb|bbbb
65;aaaa|attttttttttttttttb|bbbb
66;----------------------------
67;aaaa|attttttttttttttttb|bbbb
68;llll|l                r|rrrr
69;llll|l                r|rrrr
70;llll|l                r|rrrr
71;llll|l                r|rrrr
72;llll|l                r|rrrr
73;cccc|ceeeeeeeeeeeeeeeed|dddd
74;----------------------------
75;cccc|ceeeeeeeeeeeeeeeed|dddd
76;cccc|ceeeeeeeeeeeeeeeed|dddd
77;cccc|ceeeeeeeeeeeeeeeed|dddd
78;cccc|ceeeeeeeeeeeeeeeed|dddd
79
;-----------------------------------------------------------------------
; mov_line_8x4_mmx  dst, stride, mm?
;   Stores the 8 bytes held in MMX register %3 to four successive rows:
;   [%1], [%1+%2], [%1+2*%2] and [%1+3*%2].
;   On exit %1 has been advanced by 4*%2, i.e. it points at the row that
;   follows the four rows just written, ready for the next group.
;   %2 may be negative; the rows are then written in the opposite direction.
;   Only lea is used to move the pointer, so the flags are untouched.
;-----------------------------------------------------------------------
%macro mov_line_8x4_mmx 3       ; dst, stride, mm?
%rep 2                          ; two rows per pass, two passes
    movq    [%1], %3
    movq    [%1+%2], %3
    lea     %1, [%1+2*%2]
%endrep
%endmacro
88
;-----------------------------------------------------------------------
; mov_line_end8x4_mmx  dst, stride, mm?
;   Same four 8-byte row stores as mov_line_8x4_mmx, but meant for the
;   final group of a column: %1 is advanced by only 3*%2 in total, so on
;   exit it still points at the LAST row that was written rather than one
;   row past it.
;-----------------------------------------------------------------------
%macro mov_line_end8x4_mmx 3    ; dst, stride, mm?
    ; rows 0 and 1
    movq    [%1], %3
    movq    [%1+%2], %3
    lea     %1, [%1+2*%2]
    ; rows 2 and 3
    movq    [%1], %3
    movq    [%1+%2], %3
    lea     %1, [%1+%2]         ; stop on row 3 instead of stepping past it
%endmacro
97
;-----------------------------------------------------------------------
; mov_line_16x4_sse2  dst, stride, xmm?, u/a
;   Stores the 16 bytes held in %3 to four successive rows starting at
;   [%1], rows being %2 bytes apart.  %4 is pasted onto "movdq" and so
;   selects movdqa (a) or movdqu (u) for every store.
;   On exit %1 has been advanced by 4*%2 (one row past the last store).
;   Flags are untouched (pointer updates use lea).
;-----------------------------------------------------------------------
%macro mov_line_16x4_sse2 4     ; dst, stride, xmm?, u/a
%rep 2                          ; two rows per pass, two passes
    movdq%4 [%1], %3
    movdq%4 [%1+%2], %3
    lea     %1, [%1+2*%2]
%endrep
%endmacro
106
;-----------------------------------------------------------------------
; mov_line_end16x4_sse2  dst, stride, xmm?, u/a
;   Final-group variant of mov_line_16x4_sse2: the same four 16-byte row
;   stores (movdqa or movdqu according to %4), but %1 is advanced by only
;   3*%2, leaving it on the LAST row written.
;-----------------------------------------------------------------------
%macro mov_line_end16x4_sse2 4  ; dst, stride, xmm?, u/a
    ; rows 0 and 1
    movdq%4 [%1], %3
    movdq%4 [%1+%2], %3
    lea     %1, [%1+2*%2]
    ; rows 2 and 3
    movdq%4 [%1], %3
    movdq%4 [%1+%2], %3
    lea     %1, [%1+%2]         ; stop on row 3 instead of stepping past it
%endmacro
115
;-----------------------------------------------------------------------
; mov_line_32x4_sse2  dst, stride, xmm?
;   Fills 32 bytes (two aligned 16-byte stores of %3) on each of four
;   successive rows starting at [%1], rows being %2 bytes apart.
;   Every store is movdqa, so [%1] and %2 must keep the addresses
;   16-byte aligned.
;   On exit %1 has been advanced by 4*%2 (one row past the last store).
;-----------------------------------------------------------------------
%macro mov_line_32x4_sse2 3     ; dst, stride, xmm?
%rep 2                          ; two rows per pass, two passes
    movdqa  [%1], %3            ; even row, bytes  0..15
    movdqa  [%1+16], %3         ; even row, bytes 16..31
    movdqa  [%1+%2], %3         ; odd row,  bytes  0..15
    movdqa  [%1+%2+16], %3      ; odd row,  bytes 16..31
    lea     %1, [%1+2*%2]
%endrep
%endmacro
128
;-----------------------------------------------------------------------
; mov_line_end32x4_sse2  dst, stride, xmm?
;   Final-group variant of mov_line_32x4_sse2: the same 32-byte aligned
;   fill of four successive rows, but %1 is advanced by only 3*%2, leaving
;   it on the LAST row written.
;-----------------------------------------------------------------------
%macro mov_line_end32x4_sse2 3  ; dst, stride, xmm?
    ; rows 0 and 1
    movdqa  [%1], %3
    movdqa  [%1+16], %3
    movdqa  [%1+%2], %3
    movdqa  [%1+%2+16], %3
    lea     %1, [%1+2*%2]
    ; rows 2 and 3
    movdqa  [%1], %3
    movdqa  [%1+16], %3
    movdqa  [%1+%2], %3
    movdqa  [%1+%2+16], %3
    lea     %1, [%1+%2]         ; stop on row 3 instead of stepping past it
%endmacro
141
;-----------------------------------------------------------------------
; exp_top_bottom_sse2  iPaddingSize          [32 = luma, 16 = chroma]
;
;   Replicates the first and the last picture row into the top and the
;   bottom border, one 16-byte wide column at a time.
;
;   Registers on entry:
;     r0 = source: first picture row
;     r3 = source: last picture row
;     r5 = first border row to write for the top border
;     r4 = first border row to write for the bottom border
;     r1 = signed row step used for BOTH borders
;     r2 = picture width in bytes
;   Clobbers: r0, r2, r3, r4, r5, xmm0, xmm1, flags, and the sign of r1;
;   the chroma variant additionally clobbers r6, mm0 and mm1.
;
;   Each column writes iPaddingSize rows.  The last group of a column uses
;   the "end" store macro, which leaves the destination pointer on the
;   last row written; r1 is then negated, so the next column is filled in
;   the opposite direction starting from exactly that row.  Because of
;   this zig-zag the final sign of r1 depends on the number of columns
;   processed - callers must not rely on it.
;
;   All 16-byte accesses are movdqa: r0, r3, r4, r5 and r1 must be
;   16-byte aligned.
;
;   A width below 16 yields a zero column count.  The loop tests its
;   counter only after the first pass, so without the guard below the
;   counter would wrap and the loop would keep writing far outside the
;   picture; the guard skips the 16-byte columns in that case.
;
;   Defines local labels, so it can be expanded once per global label.
;-----------------------------------------------------------------------
%macro exp_top_bottom_sse2 1    ; iPaddingSize [luma(32)/chroma(16)]

%if %1 == 32                    ; luma: 32 border rows per side
    sar     r2, 04h                             ; r2 = number of 16-byte columns
    jz      near .top_bottom_done               ; sar set ZF: no full column, nothing to do
.top_bottom_loops:
    ; top border: 8 groups of 4 rows
    movdqa  xmm0, [r0]                          ; 16 pixels of the first picture row
%rep 7
    mov_line_16x4_sse2 r5, r1, xmm0, a
%endrep
    mov_line_end16x4_sse2 r5, r1, xmm0, a

    ; bottom border: 8 groups of 4 rows
    movdqa  xmm1, [r3]                          ; 16 pixels of the last picture row
%rep 7
    mov_line_16x4_sse2 r4, r1, xmm1, a
%endrep
    mov_line_end16x4_sse2 r4, r1, xmm1, a

    ; next 16-byte column
    lea     r0, [r0+16]                         ; top source
    lea     r5, [r5+16]                         ; top destination
    lea     r3, [r3+16]                         ; bottom source
    lea     r4, [r4+16]                         ; bottom destination
    neg     r1                                  ; walk the next column the other way

    dec     r2
    jnz     near .top_bottom_loops
.top_bottom_done:

%elif %1 == 16                  ; chroma: 16 border rows per side
    mov     r6, r2                              ; keep the full width for the tail test
    sar     r2, 04h                             ; r2 = number of 16-byte columns
    jz      near .top_bottom_tail               ; sar set ZF: width < 16, only a tail can exist
.top_bottom_loops:
    ; top border: 4 groups of 4 rows
    movdqa  xmm0, [r0]                          ; 16 pixels of the first picture row
%rep 3
    mov_line_16x4_sse2 r5, r1, xmm0, a
%endrep
    mov_line_end16x4_sse2 r5, r1, xmm0, a

    ; bottom border: 4 groups of 4 rows
    movdqa  xmm1, [r3]                          ; 16 pixels of the last picture row
%rep 3
    mov_line_16x4_sse2 r4, r1, xmm1, a
%endrep
    mov_line_end16x4_sse2 r4, r1, xmm1, a

    ; next 16-byte column
    lea     r0, [r0+16]                         ; top source
    lea     r5, [r5+16]                         ; top destination
    lea     r3, [r3+16]                         ; bottom source
    lea     r4, [r4+16]                         ; bottom destination
    neg     r1                                  ; walk the next column the other way

    dec     r2
    jnz     near .top_bottom_loops

.top_bottom_tail:
    ; Width not a multiple of 16: one more 8-byte column is written with MMX.
    ; NOTE(review): exactly 8 bytes are handled, which assumes width % 16 is
    ; either 0 or 8 - confirm against the callers.
    and     r6, 0fh                             ; and sets ZF itself, no extra test needed
    jz      near .to_be_continued

    ; top border, 8 bytes wide
    movq    mm0, [r0]
%rep 3
    mov_line_8x4_mmx r5, r1, mm0
%endrep
    mov_line_end8x4_mmx r5, r1, mm0

    ; bottom border, 8 bytes wide
    movq    mm1, [r3]
%rep 3
    mov_line_8x4_mmx r4, r1, mm1
%endrep
    mov_line_end8x4_mmx r4, r1, mm1
    WELSEMMS                                    ; defined in asm_inc.asm; presumably emms to leave MMX state

.to_be_continued:
%endif
%endmacro
229
;-----------------------------------------------------------------------
; exp_left_right_sse2  iPaddingSize, u/a     [32 = luma, 16 = chroma]
;
;   Replicates, row by row, the first pixel of the row into the left
;   border and the last pixel of the row into the right border.
;
;   Registers on entry:
;     r6 = number of rows to process (must be > 0: tested after the row)
;     r0 = source for the left border  (first pixel of the first row)
;     r5 = destination of the left border
;     r3 = source for the right border (last pixel of the first row)
;     r4 = destination of the right border
;     r1 = row step in bytes
;   Clobbers: r0, r2, r3, r4, r5, r6 (ends at 0), xmm0, xmm1, flags.
;
;   Luma writes 32 bytes per side with movdqa.  Chroma writes 16 bytes per
;   side: movdqa on the left, and movdqa/movdqu according to %2 on the
;   right, whose address may not be 16-byte aligned for chroma planes.
;   %2 is not used by the luma variant.
;
;   SSE2_Copy16Times comes from asm_inc.asm; it is used here to fill all
;   16 bytes of the xmm register with the pixel value - confirm there.
;
;   Defines a local label, so it can be expanded once per global label.
;   Any iPaddingSize other than 32 or 16 emits no code.
;-----------------------------------------------------------------------
%macro exp_left_right_sse2 2    ; iPaddingSize [luma(32)/chroma(16)], u/a
%if %1 == 32 || %1 == 16
.left_right_loops:
    ; left border <- first pixel of this row
    movzx   r2d, byte [r0]
    SSE2_Copy16Times xmm0, r2d
    movdqa  [r5], xmm0
%if %1 == 32
    movdqa  [r5+16], xmm0                       ; luma border is 32 bytes wide
%endif

    ; right border <- last pixel of this row
    movzx   r2d, byte [r3]
    SSE2_Copy16Times xmm1, r2d
%if %1 == 32
    movdqa  [r4], xmm1
    movdqa  [r4+16], xmm1                       ; luma border is 32 bytes wide
%else
    movdq%2 [r4], xmm1                          ; may be unaligned for chroma planes
%endif

    ; step all four pointers to the next row
    lea     r0, [r0+r1]
    lea     r5, [r5+r1]
    lea     r3, [r3+r1]
    lea     r4, [r4+r1]

    dec     r6
    jnz     near .left_right_loops
%endif
%endmacro
277
;-----------------------------------------------------------------------
; exp_cross_sse2  iPaddingSize, u/a          [32 = luma, 16 = chroma]
;
;   Fills the four corner blocks of the border with a constant each.
;
;   Registers on entry:
;     r3 = first row to write of the top-left block,     value in xmm3
;     r4 = first row to write of the top-right block,    value in xmm4
;     r5 = first row to write of the bottom-left block,  value in xmm5
;     r6 = first row to write of the bottom-right block, value in xmm6
;     r1 = signed row step shared by all four blocks
;   Clobbers: r3, r4, r5, r6 (each ends on the last row it wrote).
;   Flags and r1 are left untouched.
;
;   Luma:   32x32 bytes per corner, always movdqa.
;   Chroma: 16x16 bytes per corner; the two left corners use movdqa, the
;           two right corners use movdqa/movdqu according to %2.
;   Any iPaddingSize other than 32 or 16 emits no code.
;-----------------------------------------------------------------------
%macro exp_cross_sse2 2         ; iPaddingSize [luma(32)/chroma(16)], u/a
%if %1 == 32                    ; luma: 8 groups of 4 rows per corner
    ; top-left
%rep 7
    mov_line_32x4_sse2 r3, r1, xmm3
%endrep
    mov_line_end32x4_sse2 r3, r1, xmm3

    ; top-right
%rep 7
    mov_line_32x4_sse2 r4, r1, xmm4
%endrep
    mov_line_end32x4_sse2 r4, r1, xmm4

    ; bottom-left
%rep 7
    mov_line_32x4_sse2 r5, r1, xmm5
%endrep
    mov_line_end32x4_sse2 r5, r1, xmm5

    ; bottom-right
%rep 7
    mov_line_32x4_sse2 r6, r1, xmm6
%endrep
    mov_line_end32x4_sse2 r6, r1, xmm6

%elif %1 == 16                  ; chroma: 4 groups of 4 rows per corner
    ; top-left (aligned)
%rep 3
    mov_line_16x4_sse2 r3, r1, xmm3, a
%endrep
    mov_line_end16x4_sse2 r3, r1, xmm3, a

    ; top-right (alignment selected by the caller)
%rep 3
    mov_line_16x4_sse2 r4, r1, xmm4, %2
%endrep
    mov_line_end16x4_sse2 r4, r1, xmm4, %2

    ; bottom-left (aligned)
%rep 3
    mov_line_16x4_sse2 r5, r1, xmm5, a
%endrep
    mov_line_end16x4_sse2 r5, r1, xmm5, a

    ; bottom-right (alignment selected by the caller)
%rep 3
    mov_line_16x4_sse2 r6, r1, xmm6, %2
%endrep
    mov_line_end16x4_sse2 r6, r1, xmm6, %2
%endif
%endmacro
348
;***********************************************************************
; void ExpandPictureLuma_sse2 (uint8_t *pDst,
;                              const int32_t iStride,
;                              const int32_t iWidth,
;                              const int32_t iHeight);
;
; Pads a luma plane in place: the edge pixels of the iWidth x iHeight
; picture at pDst are replicated into a 32-pixel border on all four sides,
; including the four 32x32 corners.
;
; Register roles below assume LOAD_4_PARA (asm_inc.asm) delivers the four
; arguments in r0..r3 in declaration order:
;   r0 = pDst, r1 = iStride, r2 = iWidth, r3 = iHeight
; Corner fill values: xmm3 = top-left, xmm4 = top-right,
;                     xmm5 = bottom-left, xmm6 = bottom-right pixel.
;
; Every 16-byte access is movdqa, so pDst, iStride and iWidth have to be
; multiples of 16 - presumably guaranteed by the caller; confirm there.
;***********************************************************************
WELS_EXTERN ExpandPictureLuma_sse2

    push    r4
    push    r5
    push    r6

    %assign push_num 3
    LOAD_4_PARA
    PUSH_XMM 7

    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r2, r2d
    SIGN_EXTENSION r3, r3d

    ; ---- pointers and corner values for the top / bottom pass ----------
    movzx   r6d, byte [r0]
    SSE2_Copy16Times xmm3, r6d          ; xmm3 <- top-left pixel pDst[0]

    mov     r5, r0
    sub     r5, r1                      ; r5 = pDst - stride: top border row nearest the picture

    push    r3                          ; park iHeight, r3 becomes a pointer

    dec     r3
    imul    r3, r1
    add     r3, r0                      ; r3 = pDst + (h-1)*stride: last picture row

    mov     r4, r1
    shl     r4, 05h
    add     r4, r3                      ; r4 = r3 + 32*stride: outermost bottom border row

    movzx   r6d, byte [r3]
    SSE2_Copy16Times xmm5, r6d          ; xmm5 <- bottom-left pixel

    movzx   r6d, byte [r3+r2-1]
    SSE2_Copy16Times xmm6, r6d          ; xmm6 <- bottom-right pixel

    neg     r1                          ; r1 = -stride: both borders are first walked upwards

    ; ---- top and bottom borders ----------------------------------------
    push    r0
    push    r1
    push    r2

    exp_top_bottom_sse2 32

    pop     r2                          ; the macro clobbers r0 and r2 and may flip r1
    pop     r1
    pop     r0

    ; ---- left and right borders ----------------------------------------
    lea     r5, [r0-32]                 ; left border destination
    lea     r3, [r0+r2-1]               ; right border source: last pixel of row 0
    lea     r4, [r0+r2]                 ; right border destination

    movzx   r6d, byte [r3]
    SSE2_Copy16Times xmm4, r6d          ; xmm4 <- top-right pixel

    neg     r1                          ; r1 = +stride
    pop     r6                          ; r6 = iHeight (row counter)

    push    r0
    push    r1
    push    r2
    push    r6

    exp_left_right_sse2 32, a

    pop     r6
    pop     r2
    pop     r1
    pop     r0

    ; ---- four corners (xmm3..xmm6 prepared above) -----------------------
    add     r6, 32
    imul    r6, r1                      ; r6 = (h+32)*stride
    neg     r1                          ; r1 = -stride: corners are filled bottom-up

    lea     r3, [r0+r1-32]              ; top-left:  row -1, column -32
    lea     r4, [r0+r2]
    add     r4, r1                      ; top-right: row -1, column width
    lea     r5, [r3+r6]                 ; bottom-left:  row h+31, column -32
    add     r6, r4                      ; bottom-right: row h+31, column width

    exp_cross_sse2 32, a

    POP_XMM
    LOAD_4_PARA_POP

    pop     r6
    pop     r5
    pop     r4

    %assign push_num 0

    ret
476
;***********************************************************************
; void ExpandPictureChromaAlign_sse2 (uint8_t *pDst,
;                                     const int32_t iStride,
;                                     const int32_t iWidth,
;                                     const int32_t iHeight);
;
; Pads a chroma plane in place: the edge pixels of the iWidth x iHeight
; picture at pDst are replicated into a 16-pixel border on all four sides,
; including the four 16x16 corners.  This is the aligned variant: the
; right-hand border and corners are written with movdqa as well.
;
; Register roles below assume LOAD_4_PARA (asm_inc.asm) delivers the four
; arguments in r0..r3 in declaration order:
;   r0 = pDst, r1 = iStride, r2 = iWidth, r3 = iHeight
; Corner fill values: xmm3 = top-left, xmm4 = top-right,
;                     xmm5 = bottom-left, xmm6 = bottom-right pixel.
;
; NOTE(review): with movdqa used for pDst + iWidth, iWidth presumably has
; to be a multiple of 16 for this variant - confirm against the caller.
;***********************************************************************
WELS_EXTERN ExpandPictureChromaAlign_sse2

    push    r4
    push    r5
    push    r6

    %assign push_num 3
    LOAD_4_PARA
    PUSH_XMM 7

    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r2, r2d
    SIGN_EXTENSION r3, r3d

    ; ---- pointers and corner values for the top / bottom pass ----------
    movzx   r6d, byte [r0]
    SSE2_Copy16Times xmm3, r6d          ; xmm3 <- top-left pixel pDst[0]

    mov     r5, r0
    sub     r5, r1                      ; r5 = pDst - stride: top border row nearest the picture

    push    r3                          ; park iHeight, r3 becomes a pointer

    dec     r3
    imul    r3, r1
    add     r3, r0                      ; r3 = pDst + (h-1)*stride: last picture row

    mov     r4, r1
    shl     r4, 04h
    add     r4, r3                      ; r4 = r3 + 16*stride: outermost bottom border row

    movzx   r6d, byte [r3]
    SSE2_Copy16Times xmm5, r6d          ; xmm5 <- bottom-left pixel

    movzx   r6d, byte [r3+r2-1]
    SSE2_Copy16Times xmm6, r6d          ; xmm6 <- bottom-right pixel

    neg     r1                          ; r1 = -stride: both borders are first walked upwards

    ; ---- top and bottom borders ----------------------------------------
    push    r0
    push    r1
    push    r2

    exp_top_bottom_sse2 16

    pop     r2                          ; the macro clobbers r0, r2, r6 and may flip r1
    pop     r1
    pop     r0

    ; ---- left and right borders ----------------------------------------
    lea     r5, [r0-16]                 ; left border destination
    lea     r3, [r0+r2-1]               ; right border source: last pixel of row 0
    lea     r4, [r0+r2]                 ; right border destination

    movzx   r6d, byte [r3]
    SSE2_Copy16Times xmm4, r6d          ; xmm4 <- top-right pixel

    neg     r1                          ; r1 = +stride
    pop     r6                          ; r6 = iHeight (row counter)

    push    r0
    push    r1
    push    r2
    push    r6

    exp_left_right_sse2 16, a

    pop     r6
    pop     r2
    pop     r1
    pop     r0

    ; ---- four corners (xmm3..xmm6 prepared above) -----------------------
    add     r6, 16
    imul    r6, r1                      ; r6 = (h+16)*stride
    neg     r1                          ; r1 = -stride: corners are filled bottom-up

    lea     r3, [r0+r1-16]              ; top-left:  row -1, column -16
    lea     r4, [r0+r2]
    add     r4, r1                      ; top-right: row -1, column width
    lea     r5, [r3+r6]                 ; bottom-left:  row h+15, column -16
    add     r6, r4                      ; bottom-right: row h+15, column width

    exp_cross_sse2 16, a

    POP_XMM
    LOAD_4_PARA_POP

    pop     r6
    pop     r5
    pop     r4

    %assign push_num 0

    ret
603
;***********************************************************************
; void ExpandPictureChromaUnalign_sse2 (uint8_t *pDst,
;                                       const int32_t iStride,
;                                       const int32_t iWidth,
;                                       const int32_t iHeight);
;
; Pads a chroma plane in place: the edge pixels of the iWidth x iHeight
; picture at pDst are replicated into a 16-pixel border on all four sides,
; including the four 16x16 corners.  This is the unaligned variant: the
; right-hand border and the two right-hand corners, which start at
; pDst + iWidth, are written with movdqu.  Everything else still uses
; movdqa.
;
; Register roles below assume LOAD_4_PARA (asm_inc.asm) delivers the four
; arguments in r0..r3 in declaration order:
;   r0 = pDst, r1 = iStride, r2 = iWidth, r3 = iHeight
; Corner fill values: xmm3 = top-left, xmm4 = top-right,
;                     xmm5 = bottom-left, xmm6 = bottom-right pixel.
;***********************************************************************
WELS_EXTERN ExpandPictureChromaUnalign_sse2

    push    r4
    push    r5
    push    r6

    %assign push_num 3
    LOAD_4_PARA
    PUSH_XMM 7

    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r2, r2d
    SIGN_EXTENSION r3, r3d

    ; ---- pointers and corner values for the top / bottom pass ----------
    movzx   r6d, byte [r0]
    SSE2_Copy16Times xmm3, r6d          ; xmm3 <- top-left pixel pDst[0]

    mov     r5, r0
    sub     r5, r1                      ; r5 = pDst - stride: top border row nearest the picture

    push    r3                          ; park iHeight, r3 becomes a pointer

    dec     r3
    imul    r3, r1
    add     r3, r0                      ; r3 = pDst + (h-1)*stride: last picture row

    mov     r4, r1
    shl     r4, 04h
    add     r4, r3                      ; r4 = r3 + 16*stride: outermost bottom border row

    movzx   r6d, byte [r3]
    SSE2_Copy16Times xmm5, r6d          ; xmm5 <- bottom-left pixel

    movzx   r6d, byte [r3+r2-1]
    SSE2_Copy16Times xmm6, r6d          ; xmm6 <- bottom-right pixel

    neg     r1                          ; r1 = -stride: both borders are first walked upwards

    ; ---- top and bottom borders ----------------------------------------
    push    r0
    push    r1
    push    r2

    exp_top_bottom_sse2 16

    pop     r2                          ; the macro clobbers r0, r2, r6 and may flip r1
    pop     r1
    pop     r0

    ; ---- left and right borders ----------------------------------------
    lea     r5, [r0-16]                 ; left border destination
    lea     r3, [r0+r2-1]               ; right border source: last pixel of row 0
    lea     r4, [r0+r2]                 ; right border destination

    movzx   r6d, byte [r3]
    SSE2_Copy16Times xmm4, r6d          ; xmm4 <- top-right pixel

    neg     r1                          ; r1 = +stride
    pop     r6                          ; r6 = iHeight (row counter)

    push    r0
    push    r1
    push    r2
    push    r6

    exp_left_right_sse2 16, u

    pop     r6
    pop     r2
    pop     r1
    pop     r0

    ; ---- four corners (xmm3..xmm6 prepared above) -----------------------
    add     r6, 16
    imul    r6, r1                      ; r6 = (h+16)*stride
    neg     r1                          ; r1 = -stride: corners are filled bottom-up

    lea     r3, [r0+r1-16]              ; top-left:  row -1, column -16
    lea     r4, [r0+r2]
    add     r4, r1                      ; top-right: row -1, column width
    lea     r5, [r3+r6]                 ; bottom-left:  row h+15, column -16
    add     r6, r4                      ; bottom-right: row h+15, column width

    exp_cross_sse2 16, u

    POP_XMM
    LOAD_4_PARA_POP

    pop     r6
    pop     r5
    pop     r4

    %assign push_num 0

    ret
729