;******************************************************************************
;* SIMD-optimized quarterpel functions
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2003-2013 Michael Niedermayer
;* Copyright (c) 2013 Daniel Kang
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

; avg for 4-byte stores: load dst into the scratch register %3,
; average it with %1 and write the result back
%macro op_avgh 3
    movh   %3, %2
    pavgb  %1, %3
    movh   %2, %1
%endmacro

; avg for full-register stores
%macro op_avg 2
    pavgb  %1, %2
    mova   %2, %1
%endmacro

; put for 4-byte stores; the optional third argument keeps the
; signature compatible with op_avgh
%macro op_puth 2-3
    movh   %2, %1
%endmacro

; put for full-register stores
%macro op_put 2
    mova   %2, %1
%endmacro

; void ff_put/avg_pixels4_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
;                                   int dstStride, int src1Stride, int h)
%macro PIXELS4_L2 1
%define OP op_%1h
cglobal %1_pixels4_l2, 6,6
    movsxdifnidn r3, r3d
    movsxdifnidn r4, r4d
    test   r5d, 1
    je     .loop
    ; h is odd: peel one row up front so the loop can do 4 rows per iteration
    movd   m0, [r1]
    movd   m1, [r2]
    add    r1, r4
    add    r2, 4
    pavgb  m0, m1
    OP     m0, [r0], m3
    add    r0, r3
    dec    r5d
.loop:
    mova   m0, [r1]
    mova   m1, [r1+r4]
    lea    r1, [r1+2*r4]
    pavgb  m0, [r2]
    pavgb  m1, [r2+4]
    OP     m0, [r0], m3
    OP     m1, [r0+r3], m3
    lea    r0, [r0+2*r3]
    mova   m0, [r1]
    mova   m1, [r1+r4]
    lea    r1, [r1+2*r4]
    pavgb  m0, [r2+8]
    pavgb  m1, [r2+12]
    OP     m0, [r0], m3
    OP     m1, [r0+r3], m3
    lea    r0, [r0+2*r3]
    add    r2, 16      ; src2 is packed: 4 bytes per row
    sub    r5d, 4
    jne    .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PIXELS4_L2 put
PIXELS4_L2 avg

; void ff_put/avg_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
;                                   int dstStride, int src1Stride, int h)
%macro PIXELS8_L2 1
%define OP op_%1
cglobal %1_pixels8_l2, 6,6
    movsxdifnidn r3, r3d
    movsxdifnidn r4, r4d
    test   r5d, 1
    je     .loop
    ; h is odd: peel one row up front so the loop can do 4 rows per iteration
    mova   m0, [r1]
    mova   m1, [r2]
    add    r1, r4
    add    r2, 8
    pavgb  m0, m1
    OP     m0, [r0]
    add    r0, r3
    dec    r5d
.loop:
    mova   m0, [r1]
    mova   m1, [r1+r4]
    lea    r1, [r1+2*r4]
    pavgb  m0, [r2]
    pavgb  m1, [r2+8]
    OP     m0, [r0]
    OP     m1, [r0+r3]
    lea    r0, [r0+2*r3]
    mova   m0, [r1]
    mova   m1, [r1+r4]
    lea    r1, [r1+2*r4]
    pavgb  m0, [r2+16]
    pavgb  m1, [r2+24]
    OP     m0, [r0]
    OP     m1, [r0+r3]
    lea    r0, [r0+2*r3]
    add    r2, 32      ; src2 is packed: 8 bytes per row
    sub    r5d, 4
    jne    .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PIXELS8_L2 put
PIXELS8_L2 avg

; void ff_put/avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
;                                    int dstStride, int src1Stride, int h)
%macro PIXELS16_L2 1
%define OP op_%1
cglobal %1_pixels16_l2, 6,6
    movsxdifnidn r3, r3d
    movsxdifnidn r4, r4d
    test   r5d, 1
    je     .loop
    ; h is odd: peel one row up front so the loop can do 2 rows per iteration
    mova   m0, [r1]
    mova   m1, [r1+8]
    pavgb  m0, [r2]
    pavgb  m1, [r2+8]
    add    r1, r4
    add    r2, 16
    OP     m0, [r0]
    OP     m1, [r0+8]
    add    r0, r3
    dec    r5d
.loop:
    mova   m0, [r1]
    mova   m1, [r1+8]
    add    r1, r4
    pavgb  m0, [r2]
    pavgb  m1, [r2+8]
    OP     m0, [r0]
    OP     m1, [r0+8]
    add    r0, r3
    mova   m0, [r1]
    mova   m1, [r1+8]
    add    r1, r4
    pavgb  m0, [r2+16]
    pavgb  m1, [r2+24]
    OP     m0, [r0]
    OP     m1, [r0+8]
    add    r0, r3
    add    r2, 32      ; src2 is packed: 16 bytes per row
    sub    r5d, 2
    jne    .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PIXELS16_L2 put
PIXELS16_L2 avg
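
; For reference: the behaviour implemented by the three l2 kernels above,
; sketched as C-like pseudocode in this comment (not assembled; rnd_avg is
; a hypothetical helper matching pavgb's (a + b + 1) >> 1 rounding):
;
;   for (i = 0; i < h; i++) {
;       for (j = 0; j < W; j++)                  // W = 4, 8 or 16
;           dst[j] = rnd_avg(src1[j], src2[j]);
;       dst  += dstStride;
;       src1 += src1Stride;
;       src2 += W;                               // src2 rows are packed
;   }
;
; The avg variants additionally average this result with the bytes already
; in dst (again via pavgb) before storing.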