;*!
;* \copy
;*     Copyright (c)  2009-2013, Cisco Systems
;*     All rights reserved.
;*
;*     Redistribution and use in source and binary forms, with or without
;*     modification, are permitted provided that the following conditions
;*     are met:
;*
;*        * Redistributions of source code must retain the above copyright
;*          notice, this list of conditions and the following disclaimer.
;*
;*        * Redistributions in binary form must reproduce the above copyright
;*          notice, this list of conditions and the following disclaimer in
;*          the documentation and/or other materials provided with the
;*          distribution.
;*
;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;*     POSSIBILITY OF SUCH DAMAGE.
;*
;*
;*  mb_copy.asm
;*
;*  Abstract
;*      mb_copy and mb_copy1
;*
;*  History
;*      15/09/2009 Created
;*      12/28/2009 Modified for higher throughput
;*      12/29/2011 Tuned WelsCopy16x16NotAligned_sse2; added UpdateMbMv_sse2,
;*                 WelsCopy16x8NotAligned_sse2, WelsCopy16x8_mmx,
;*                 WelsCopy8x16_mmx, etc.
;*
;*
;*********************************************************************************************/
%include "asm_inc.asm"

%ifdef __NASM_VER__
    %use smartalign
%endif
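; With smartalign in effect, the ALIGN directives later in this file pad with
; a few multi-byte NOPs rather than long runs of single-byte NOPs.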

;***********************************************************************
; Macros and other preprocessor constants
;***********************************************************************

;***********************************************************************
; Code
;***********************************************************************

SECTION .text


;***********************************************************************
; void WelsCopy16x16_sse2(  uint8_t* Dst,
;                           int32_t  iStrideD,
;                           uint8_t* Src,
;                           int32_t  iStrideS )
;***********************************************************************
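; For orientation, the SSE2 routine below behaves like this plain-C loop
; (an illustrative sketch, not part of the original source; the movdqa
; loads and stores assume both Src and Dst are 16-byte aligned):
;
;   void WelsCopy16x16_ref (uint8_t* Dst, int32_t iStrideD,
;                           uint8_t* Src, int32_t iStrideS) {
;     for (int i = 0; i < 16; i++)                 // 16 rows of 16 bytes
;       memcpy (Dst + i * iStrideD, Src + i * iStrideS, 16);
;   }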
WELS_EXTERN WelsCopy16x16_sse2

    push r4
    push r5
    %assign  push_num 2
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d

    lea r4, [r1+2*r1]   ; r4 = 3 * iStrideD
    lea r5, [r3+2*r3]   ; r5 = 3 * iStrideS

    movdqa xmm0, [r2]
    movdqa xmm1, [r2+r3]
    movdqa xmm2, [r2+2*r3]
    movdqa xmm3, [r2+r5]
    lea r2, [r2+4*r3]
    movdqa xmm4, [r2]
    movdqa xmm5, [r2+r3]
    movdqa xmm6, [r2+2*r3]
    movdqa xmm7, [r2+r5]
    lea r2, [r2+4*r3]

    movdqa [r0], xmm0
    movdqa [r0+r1], xmm1
    movdqa [r0+2*r1], xmm2
    movdqa [r0+r4], xmm3
    lea r0, [r0+4*r1]
    movdqa [r0], xmm4
    movdqa [r0+r1], xmm5
    movdqa [r0+2*r1], xmm6
    movdqa [r0+r4], xmm7
    lea r0, [r0+4*r1]

    movdqa xmm0, [r2]
    movdqa xmm1, [r2+r3]
    movdqa xmm2, [r2+2*r3]
    movdqa xmm3, [r2+r5]
    lea r2, [r2+4*r3]
    movdqa xmm4, [r2]
    movdqa xmm5, [r2+r3]
    movdqa xmm6, [r2+2*r3]
    movdqa xmm7, [r2+r5]

    movdqa [r0], xmm0
    movdqa [r0+r1], xmm1
    movdqa [r0+2*r1], xmm2
    movdqa [r0+r4], xmm3
    lea r0, [r0+4*r1]
    movdqa [r0], xmm4
    movdqa [r0+r1], xmm5
    movdqa [r0+2*r1], xmm6
    movdqa [r0+r4], xmm7
    POP_XMM
    LOAD_4_PARA_POP
    pop r5
    pop r4
    ret

;***********************************************************************
; void WelsCopy16x16NotAligned_sse2( uint8_t* Dst,
;                           int32_t  iStrideD,
;                           uint8_t* Src,
;                           int32_t  iStrideS )
;***********************************************************************
; Dst is assumed to be 16-byte aligned, but pSrc may not be, 12/29/2011
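; Accordingly, the loads below use movdqu (no alignment requirement) while
; the stores keep movdqa, which still requires a 16-byte-aligned destination.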
WELS_EXTERN WelsCopy16x16NotAligned_sse2
    push r4
    push r5
    %assign  push_num 2
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d

    lea r4, [r1+2*r1]   ; r4 = 3 * iStrideD
    lea r5, [r3+2*r3]   ; r5 = 3 * iStrideS

    movdqu xmm0, [r2]
    movdqu xmm1, [r2+r3]
    movdqu xmm2, [r2+2*r3]
    movdqu xmm3, [r2+r5]
    lea r2, [r2+4*r3]
    movdqu xmm4, [r2]
    movdqu xmm5, [r2+r3]
    movdqu xmm6, [r2+2*r3]
    movdqu xmm7, [r2+r5]
    lea r2, [r2+4*r3]

    movdqa [r0], xmm0
    movdqa [r0+r1], xmm1
    movdqa [r0+2*r1], xmm2
    movdqa [r0+r4], xmm3
    lea r0, [r0+4*r1]
    movdqa [r0], xmm4
    movdqa [r0+r1], xmm5
    movdqa [r0+2*r1], xmm6
    movdqa [r0+r4], xmm7
    lea r0, [r0+4*r1]

    movdqu xmm0, [r2]
    movdqu xmm1, [r2+r3]
    movdqu xmm2, [r2+2*r3]
    movdqu xmm3, [r2+r5]
    lea r2, [r2+4*r3]
    movdqu xmm4, [r2]
    movdqu xmm5, [r2+r3]
    movdqu xmm6, [r2+2*r3]
    movdqu xmm7, [r2+r5]

    movdqa [r0], xmm0
    movdqa [r0+r1], xmm1
    movdqa [r0+2*r1], xmm2
    movdqa [r0+r4], xmm3
    lea r0, [r0+4*r1]
    movdqa [r0], xmm4
    movdqa [r0+r1], xmm5
    movdqa [r0+2*r1], xmm6
    movdqa [r0+r4], xmm7
    POP_XMM
    LOAD_4_PARA_POP
    pop r5
    pop r4
    ret

; added 12/29/2011
;***********************************************************************
; void WelsCopy16x8NotAligned_sse2( uint8_t* Dst,
;                           int32_t  iStrideD,
;                           uint8_t* Src,
;                           int32_t  iStrideS )
;***********************************************************************
WELS_EXTERN WelsCopy16x8NotAligned_sse2
    push r4
    push r5
    %assign  push_num 2
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d

    lea r4, [r1+2*r1]   ; r4 = 3 * iStrideD
    lea r5, [r3+2*r3]   ; r5 = 3 * iStrideS

    movdqu xmm0, [r2]
    movdqu xmm1, [r2+r3]
    movdqu xmm2, [r2+2*r3]
    movdqu xmm3, [r2+r5]
    lea r2, [r2+4*r3]
    movdqu xmm4, [r2]
    movdqu xmm5, [r2+r3]
    movdqu xmm6, [r2+2*r3]
    movdqu xmm7, [r2+r5]

    movdqa [r0], xmm0
    movdqa [r0+r1], xmm1
    movdqa [r0+2*r1], xmm2
    movdqa [r0+r4], xmm3
    lea r0, [r0+4*r1]
    movdqa [r0], xmm4
    movdqa [r0+r1], xmm5
    movdqa [r0+2*r1], xmm6
    movdqa [r0+r4], xmm7
    POP_XMM
    LOAD_4_PARA_POP
    pop r5
    pop r4
    ret


;***********************************************************************
; void WelsCopy8x16_mmx( uint8_t* Dst,
;                        int32_t  iStrideD,
;                        uint8_t* Src,
;                        int32_t  iStrideS )
;***********************************************************************
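; Each row of this 8-pixel-wide block is exactly one 64-bit MMX quadword, so
; a single movq moves a full row; rows are handled two at a time, advancing
; both pointers by 2*stride per step.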
WELS_EXTERN WelsCopy8x16_mmx
    %assign  push_num 0
    LOAD_4_PARA
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d

    movq mm0, [r2]
    movq mm1, [r2+r3]
    lea r2, [r2+2*r3]
    movq mm2, [r2]
    movq mm3, [r2+r3]
    lea r2, [r2+2*r3]
    movq mm4, [r2]
    movq mm5, [r2+r3]
    lea r2, [r2+2*r3]
    movq mm6, [r2]
    movq mm7, [r2+r3]
    lea r2, [r2+2*r3]

    movq [r0], mm0
    movq [r0+r1], mm1
    lea r0, [r0+2*r1]
    movq [r0], mm2
    movq [r0+r1], mm3
    lea r0, [r0+2*r1]
    movq [r0], mm4
    movq [r0+r1], mm5
    lea r0, [r0+2*r1]
    movq [r0], mm6
    movq [r0+r1], mm7
    lea r0, [r0+2*r1]

    movq mm0, [r2]
    movq mm1, [r2+r3]
    lea r2, [r2+2*r3]
    movq mm2, [r2]
    movq mm3, [r2+r3]
    lea r2, [r2+2*r3]
    movq mm4, [r2]
    movq mm5, [r2+r3]
    lea r2, [r2+2*r3]
    movq mm6, [r2]
    movq mm7, [r2+r3]

    movq [r0], mm0
    movq [r0+r1], mm1
    lea r0, [r0+2*r1]
    movq [r0], mm2
    movq [r0+r1], mm3
    lea r0, [r0+2*r1]
    movq [r0], mm4
    movq [r0+r1], mm5
    lea r0, [r0+2*r1]
    movq [r0], mm6
    movq [r0+r1], mm7

    WELSEMMS
    LOAD_4_PARA_POP
    ret

;***********************************************************************
; void WelsCopy8x8_mmx(  uint8_t* Dst,
;                        int32_t  iStrideD,
;                        uint8_t* Src,
;                        int32_t  iStrideS )
;***********************************************************************
WELS_EXTERN WelsCopy8x8_mmx
    push r4
    %assign  push_num 1
    LOAD_4_PARA
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    lea r4, [r3+2*r3]   ; r4 = 3 * iStrideS

    ; prefetch the source rows for the next step (non-temporal hint)
    prefetchnta [r2+2*r3]
    prefetchnta [r2+r4]
    movq mm0, [r2]
    movq mm1, [r2+r3]
    lea r2, [r2+2*r3]
    ; prefetch the source rows for the next step
    prefetchnta [r2+2*r3]
    prefetchnta [r2+r4]
    movq mm2, [r2]
    movq mm3, [r2+r3]
    lea r2, [r2+2*r3]
    ; prefetch the source rows for the next step
    prefetchnta [r2+2*r3]
    prefetchnta [r2+r4]
    movq mm4, [r2]
    movq mm5, [r2+r3]
    lea r2, [r2+2*r3]
    movq mm6, [r2]
    movq mm7, [r2+r3]

    movq [r0], mm0
    movq [r0+r1], mm1
    lea r0, [r0+2*r1]
    movq [r0], mm2
    movq [r0+r1], mm3
    lea r0, [r0+2*r1]
    movq [r0], mm4
    movq [r0+r1], mm5
    lea r0, [r0+2*r1]
    movq [r0], mm6
    movq [r0+r1], mm7

    WELSEMMS
    LOAD_4_PARA_POP
    pop r4
    ret

; (dunhuang@cisco), 12/21/2011
;***********************************************************************
; void UpdateMbMv_sse2( SMVUnitXY *pMvBuffer, const SMVUnitXY sMv )
;***********************************************************************
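; The routine broadcasts one 4-byte motion vector (sMv packs two int16
; components) into 16 consecutive SMVUnitXY slots, one per 4x4 block of the
; macroblock. Roughly equivalent C (an illustrative sketch, not part of the
; original source):
;
;   void UpdateMbMv_ref (SMVUnitXY* pMvBuffer, const SMVUnitXY sMv) {
;     for (int i = 0; i < 16; i++)   // 16 x 4 bytes = the four 16-byte stores
;       pMvBuffer[i] = sMv;
;   }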
WELS_EXTERN UpdateMbMv_sse2

    %assign  push_num 0
    LOAD_2_PARA

    movd xmm0, r1d  ; sMv in the low dword
    pshufd xmm1, xmm0, $00
    movdqa [r0     ], xmm1
    movdqa [r0+0x10], xmm1
    movdqa [r0+0x20], xmm1
    movdqa [r0+0x30], xmm1
    ret

;*******************************************************************************
; Macros and other preprocessor constants
;*******************************************************************************

;*******************************************************************************
; Code
;*******************************************************************************

SECTION .text


;*******************************************************************************
; void PixelAvgWidthEq4_mmx( uint8_t *pDst,  int iDstStride,
;                           uint8_t *pSrcA, int iSrcAStride,
;                           uint8_t *pSrcB, int iSrcBStride,
;                           int iHeight );
;*******************************************************************************
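; pavgb computes a byte-wise rounded average, (a + b + 1) >> 1, so the loop
; below matches this C sketch (illustrative, not part of the original source):
;
;   void PixelAvgWidthEq4_ref (uint8_t* pDst, int iDstStride,
;                              uint8_t* pSrcA, int iSrcAStride,
;                              uint8_t* pSrcB, int iSrcBStride, int iHeight) {
;     for (int y = 0; y < iHeight; y++) {
;       for (int x = 0; x < 4; x++)
;         pDst[x] = (uint8_t) ((pSrcA[x] + pSrcB[x] + 1) >> 1);
;       pDst += iDstStride; pSrcA += iSrcAStride; pSrcB += iSrcBStride;
;     }
;   }
;
; The same averaging applies to the width-8 and width-16 variants that follow.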
WELS_EXTERN PixelAvgWidthEq4_mmx

    %assign  push_num 0
    LOAD_7_PARA

    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r5, r5d
    SIGN_EXTENSION  r6, r6d

ALIGN 4
.height_loop:
    movd        mm0, [r4]
    pavgb       mm0, [r2]
    movd        [r0], mm0

    dec         r6
    lea         r0, [r0+r1]
    lea         r2, [r2+r3]
    lea         r4, [r4+r5]
    jne         .height_loop

    WELSEMMS
    LOAD_7_PARA_POP
    ret


;*******************************************************************************
; void PixelAvgWidthEq8_mmx( uint8_t *pDst,  int iDstStride,
;                           uint8_t *pSrcA, int iSrcAStride,
;                           uint8_t *pSrcB, int iSrcBStride,
;                           int iHeight );
;*******************************************************************************
WELS_EXTERN PixelAvgWidthEq8_mmx
    %assign  push_num 0
    LOAD_7_PARA

    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r5, r5d
    SIGN_EXTENSION  r6, r6d

ALIGN 4
.height_loop:
    movq        mm0, [r2]
    pavgb       mm0, [r4]
    movq        [r0], mm0
    movq        mm0, [r2+r3]
    pavgb       mm0, [r4+r5]
    movq        [r0+r1], mm0

    lea         r2,  [r2+2*r3]
    lea         r4,  [r4+2*r5]
    lea         r0,  [r0+2*r1]

    sub         r6, 2
    jnz         .height_loop

    WELSEMMS
    LOAD_7_PARA_POP
    ret


;*******************************************************************************
; void PixelAvgWidthEq16_sse2( uint8_t *pDst,  int iDstStride,
;                          uint8_t *pSrcA, int iSrcAStride,
;                          uint8_t *pSrcB, int iSrcBStride,
;                          int iHeight );
;*******************************************************************************
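; The loop body below is unrolled to four rows per iteration (sub r6, 4), so
; iHeight is assumed to be a multiple of 4; movdqu is used throughout because
; neither source nor destination is guaranteed to be 16-byte aligned here.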
WELS_EXTERN PixelAvgWidthEq16_sse2

    %assign  push_num 0
    LOAD_7_PARA
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r5, r5d
    SIGN_EXTENSION  r6, r6d
ALIGN 4
.height_loop:
    movdqu      xmm0, [r2]
    movdqu      xmm1, [r4]
    pavgb       xmm0, xmm1
    ;pavgb      xmm0, [r4]  ; would need a 16-byte-aligned r4; movdqu load used instead
    movdqu      [r0], xmm0

    movdqu      xmm0, [r2+r3]
    movdqu      xmm1, [r4+r5]
    pavgb       xmm0, xmm1
    movdqu      [r0+r1], xmm0

    movdqu      xmm0, [r2+2*r3]
    movdqu      xmm1, [r4+2*r5]
    pavgb       xmm0, xmm1
    movdqu      [r0+2*r1], xmm0

    lea         r2, [r2+2*r3]
    lea         r4, [r4+2*r5]
    lea         r0, [r0+2*r1]

    movdqu      xmm0, [r2+r3]
    movdqu      xmm1, [r4+r5]
    pavgb       xmm0, xmm1
    movdqu      [r0+r1], xmm0

    lea         r2, [r2+2*r3]
    lea         r4, [r4+2*r5]
    lea         r0, [r0+2*r1]

    sub         r6, 4
    jne         .height_loop

    WELSEMMS
    LOAD_7_PARA_POP
    ret

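; The CopyStrided4N macro below copies a cnt-row block, four rows per
; iteration, using whichever load/store instructions are passed in (movq for
; 8-byte rows, lddqu/MOVDQ for 16-byte rows). In C terms (an illustrative
; sketch, with width set by the register size used):
;
;   for (; cnt > 0; cnt -= 4) {
;     for (int i = 0; i < 4; i++)
;       memcpy (p_dst + i * i_dststride, p_src + i * i_srcstride, width);
;     p_src += 4 * i_srcstride;
;     p_dst += 4 * i_dststride;
;   }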
; load_instr=%1 store_instr=%2 p_dst=%3 i_dststride=%4 p_src=%5 i_srcstride=%6 cnt=%7 r_tmp=%8,%9 mm_tmp=%10,%11
%macro CopyStrided4N 11
    lea             %8, [3 * %6]
    lea             %9, [3 * %4]
ALIGN 32
%%loop:
    %1              %10, [%5]
    %1              %11, [%5 + %6]
    %2              [%3], %10
    %2              [%3 + %4], %11
    %1              %10, [%5 + 2 * %6]
    %1              %11, [%5 + %8]
    %2              [%3 + 2 * %4], %10
    %2              [%3 + %9], %11
    lea             %5, [%5 + 4 * %6]
    lea             %3, [%3 + 4 * %4]
    sub             %7, 4
    jg              %%loop
%endmacro

;*******************************************************************************
;   void McCopyWidthEq8_mmx( uint8_t *pSrc, int iSrcStride,
;                            uint8_t *pDst, int iDstStride, int iHeight )
;*******************************************************************************
WELS_EXTERN McCopyWidthEq8_mmx
    %assign  push_num 0
%ifdef X86_32
    push            r5
    push            r6
    %assign  push_num 2
%endif
    LOAD_5_PARA

    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d

    CopyStrided4N   movq, movq, r2, r3, r0, r1, r4, r5, r6, mm0, mm1

    WELSEMMS
    LOAD_5_PARA_POP
%ifdef X86_32
    pop             r6
    pop             r5
%endif
    ret


;*******************************************************************************
;   void McCopyWidthEq16_sse2( uint8_t *pSrc, int iSrcStride, uint8_t *pDst, int iDstStride, int iHeight )
;*******************************************************************************
; read 16 unaligned bytes
%macro SSE_READ_UNA 2
    movq    %1, [%2]
    movhps  %1, [%2+8]
%endmacro

; write 16 unaligned bytes
%macro SSE_WRITE_UNA 2
    movq    [%1],   %2
    movhps  [%1+8], %2
%endmacro
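; Splitting a 16-byte access into movq (low 8 bytes) plus movhps (high 8
; bytes) is a classic unaligned-access idiom that tends to be cheaper than
; movdqu on older CPUs; SSE3's lddqu (used by McCopyWidthEq16_sse3 below)
; serves the same purpose on newer ones.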
WELS_EXTERN McCopyWidthEq16_sse2
    %assign  push_num 0
    LOAD_5_PARA
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d
ALIGN 4
.height_loop:
    SSE_READ_UNA    xmm0, r0
    SSE_READ_UNA    xmm1, r0+r1
    SSE_WRITE_UNA   r2, xmm0
    SSE_WRITE_UNA   r2+r3, xmm1

    sub     r4, 2
    lea     r0, [r0+r1*2]
    lea     r2, [r2+r3*2]
    jnz     .height_loop

    LOAD_5_PARA_POP
    ret


;*******************************************************************************
;   void McCopyWidthEq16_sse3( uint8_t *pSrc, int iSrcStride, uint8_t *pDst, int iDstStride, int iHeight )
;*******************************************************************************
WELS_EXTERN McCopyWidthEq16_sse3
    %assign push_num 0
%ifdef X86_32
    push            r5
    push            r6
    %assign push_num 2
%endif
    LOAD_5_PARA
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d

    CopyStrided4N   lddqu, MOVDQ, r2, r3, r0, r1, r4, r5, r6, xmm0, xmm1

    LOAD_5_PARA_POP
%ifdef X86_32
    pop             r6
    pop             r5
%endif
    ret