;*!
;* \copy
;*     Copyright (c)  2010-2013, Cisco Systems
;*     All rights reserved.
;*
;*     Redistribution and use in source and binary forms, with or without
;*     modification, are permitted provided that the following conditions
;*     are met:
;*
;*        * Redistributions of source code must retain the above copyright
;*          notice, this list of conditions and the following disclaimer.
;*
;*        * Redistributions in binary form must reproduce the above copyright
;*          notice, this list of conditions and the following disclaimer in
;*          the documentation and/or other materials provided with the
;*          distribution.
;*
;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;*     POSSIBILITY OF SUCH DAMAGE.
;*
;*
;*  predenoise.asm
;*
;*  Abstract
;*      denoise for SVC2.1
;*  History
;*      4/13/2010 Created
;*      7/30/2010 Modified
;*
;*
;*************************************************************************/
%include "asm_inc.asm"

;***********************************************************************
; Constant
;***********************************************************************
; The constant table lives in .text when X86_32_PICASM is defined and in
; .rodata otherwise.
%ifdef X86_32_PICASM
SECTION .text align=16
%else
SECTION .rodata align=16
%endif

sse2_32 times 8 dw 32           ; eight words of 32: luma weight ceiling
sse2_20 times 8 dw 20           ; eight words of 20: chroma centre-tap weight



;***********************************************************************
; Code
;***********************************************************************
SECTION .text

;-----------------------------------------------------------------------
; WEIGHT_LINE wgt, pix, maxw, totw, sum, centre, zero, diff, mem
;
;   %1  scratch; ends up holding the per-pixel weight (words)
;   %2  scratch; neighbour pixels widened to words, then weight * pixel
;   %3  words holding the weight ceiling (read only)
;   %4  running total of weights          (accumulated, saturating)
;   %5  running sum of weight * pixel     (accumulated, saturating)
;   %6  centre pixels as words            (read only)
;   %7  register used as the high half for byte->word widening
;       (the caller in this file passes an all-zero register)
;   %8  scratch; |neighbour - centre|
;   %9  memory operand addressing 8 neighbour bytes
;
; Per word lane:
;   weight = (max(maxw - |neighbour - centre|, 0) ^ 2) >> 5
;   totw  += weight
;   sum   += weight * neighbour
;-----------------------------------------------------------------------
%macro WEIGHT_LINE 9
    movq        %2, %9
    punpcklbw   %2, %7                  ; neighbour bytes -> words
    movdqa      %8, %2

    ; Absolute difference from two one-sided saturating subtractions.
    movdqa      %1, %6
    psubusb     %1, %8                  ; centre - neighbour, clamped at 0
    psubusb     %8, %6                  ; neighbour - centre, clamped at 0
    por         %8, %1                  ; ABS(curPixel - centerPixel);

    movdqa      %1, %3
    psubusb     %1, %8                  ; ceiling - abs diff, clamped at 0

    pmullw      %1, %1                  ; square it
    psrlw       %1, 5                   ; weight = square >> 5
    pmullw      %2, %1                  ; weight * neighbour
    paddusw     %4, %1                  ; accumulate weight
    paddusw     %5, %2                  ; accumulate weighted pixel
%endmacro

;-----------------------------------------------------------------------
; LUMA_WEIGHT_NEIGHBOR mem
;
; WEIGHT_LINE with the fixed register assignment used by
; BilateralLumaFilter8_sse2; only the neighbour address varies.
;-----------------------------------------------------------------------
%macro LUMA_WEIGHT_NEIGHBOR 1
    WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, %1
%endmacro

;-----------------------------------------------------------------------
; WEIGHT_TAP_UV src, tmp, acc, zero, byte_offset, log2_weight
;
; One horizontal tap of the chroma filter:
;   acc += widen(src >> (8 * byte_offset)) << log2_weight
; The byte shift and the weight shift are omitted when their count is 0,
; so the emitted instructions match a hand-written tap exactly.
;-----------------------------------------------------------------------
%macro WEIGHT_TAP_UV 6
    movdqa      %2, %1
%if %5 != 0
    psrldq      %2, %5
%endif
    punpcklbw   %2, %4
%if %6 != 0
    psllw       %2, %6
%endif
    paddw       %3, %2
%endmacro

;-----------------------------------------------------------------------
; WEIGHT_LINEn_UV src, tmp, acc, zero
;
;   %1  16 source bytes of one row (read only)
;   %2  scratch
;   %3  word accumulator (in/out)
;   %4  register used as the high half for byte->word widening
;
; Row weights:  LINE1 = 1 1  2 1 1
;               LINE2 = 1 2  4 2 1
;               LINE3 = 2 4 20 4 2
;-----------------------------------------------------------------------
%macro WEIGHT_LINE1_UV 4
    WEIGHT_TAP_UV %1, %2, %3, %4, 0, 0  ; x1
    WEIGHT_TAP_UV %1, %2, %3, %4, 1, 0  ; x1
    WEIGHT_TAP_UV %1, %2, %3, %4, 2, 1  ; x2
    WEIGHT_TAP_UV %1, %2, %3, %4, 3, 0  ; x1
    WEIGHT_TAP_UV %1, %2, %3, %4, 4, 0  ; x1
%endmacro

%macro WEIGHT_LINE2_UV 4
    WEIGHT_TAP_UV %1, %2, %3, %4, 0, 0  ; x1
    WEIGHT_TAP_UV %1, %2, %3, %4, 1, 1  ; x2
    WEIGHT_TAP_UV %1, %2, %3, %4, 2, 2  ; x4
    WEIGHT_TAP_UV %1, %2, %3, %4, 3, 1  ; x2
    WEIGHT_TAP_UV %1, %2, %3, %4, 4, 0  ; x1
%endmacro

%macro WEIGHT_LINE3_UV 4
    WEIGHT_TAP_UV %1, %2, %3, %4, 0, 1  ; x2
    WEIGHT_TAP_UV %1, %2, %3, %4, 1, 2  ; x4

    ; Centre tap: 20 is not a power of two, so multiply instead of shift.
    movdqa      %2, %1
    psrldq      %2, 2
    punpcklbw   %2, %4
    pmullw      %2, [pic(sse2_20)]
    paddw       %3, %2

    WEIGHT_TAP_UV %1, %2, %3, %4, 3, 2  ; x4
    WEIGHT_TAP_UV %1, %2, %3, %4, 4, 1  ; x2
%endmacro

;***********************************************************************
;  BilateralLumaFilter8_sse2(uint8_t *pixels, int stride);
;***********************************************************************
;       1       2       3
;       4       0       5
;       6       7       8
;       0:      the center point
;
; Filters 8 horizontally adjacent pixels in place, each against its own
; 3x3 neighbourhood.
;
; Register roles:
;   r0    walking neighbour pointer (starts as pixels)
;   r1    stride
;   r3    saved copy of pixels, used for the final store
;   xmm3  weight ceiling (32 per word)
;   xmm4  nTotWeight        xmm5  nSum
;   xmm6  centre pixels as words
;   xmm7  zero              xmm0-xmm2  scratch
;
; NOTE(review): LOAD_2_PARA, PUSH_XMM and POP_XMM come from asm_inc.asm and
; are not visible here; presumably they load the two arguments into r0/r1
; and save/restore XMM registers as the target ABI requires -- confirm.
; NOTE(review): stride is declared `int`; confirm it is sign-extended before
; being used in full-width address arithmetic on 64-bit targets.

WELS_EXTERN BilateralLumaFilter8_sse2

    push        r3
    %assign push_num 1
    LOAD_2_PARA
    PUSH_XMM    8

    pxor        xmm7, xmm7              ; zero for byte->word widening

    mov         r3, r0                  ; keep the output address

    movq        xmm6, [r0]
    punpcklbw   xmm6, xmm7              ; centre pixels as words
%ifdef X86_32_PICASM
    ; Build eight words of 32 in-register: all ones -> 1 -> 32.
    pcmpeqw     xmm3, xmm3
    psrlw       xmm3, 15
    psllw       xmm3, 5
%else
    movdqa      xmm3, [sse2_32]
%endif
    pxor        xmm4, xmm4              ; nTotWeight
    pxor        xmm5, xmm5              ; nSum

    ; Middle row: left and right neighbours.
    dec         r0
    LUMA_WEIGHT_NEIGHBOR [r0]           ; pixel 4
    LUMA_WEIGHT_NEIGHBOR [r0 + 2]       ; pixel 5

    ; Row above.
    sub         r0, r1
    LUMA_WEIGHT_NEIGHBOR [r0]           ; pixel 1
    LUMA_WEIGHT_NEIGHBOR [r0 + 1]       ; pixel 2
    LUMA_WEIGHT_NEIGHBOR [r0 + 2]       ; pixel 3

    ; Row below (two strides down from the row above).
    lea         r0, [r0 + r1 * 2]
    LUMA_WEIGHT_NEIGHBOR [r0]           ; pixel 6
    LUMA_WEIGHT_NEIGHBOR [r0 + 1]       ; pixel 7
    LUMA_WEIGHT_NEIGHBOR [r0 + 2]       ; pixel 8

    ; The centre pixel receives whatever weight is left out of 256:
    ;   result = (nSum + (256 - nTotWeight) * centre) >> 8
    pcmpeqw     xmm0, xmm0
    psrlw       xmm0, 15
    psllw       xmm0, 8                 ; eight words of 256
    psubusw     xmm0, xmm4              ; 256 - nTotWeight, clamped at 0
    pmullw      xmm0, xmm6
    paddusw     xmm5, xmm0
    psrlw       xmm5, 8
    packuswb    xmm5, xmm5              ; words -> bytes with saturation
    movq        [r3], xmm5              ; write 8 filtered pixels


    POP_XMM
    pop         r3
    %assign push_num 0

    ret

;***********************************************************************
; void      WaverageChromaFilter8_sse2(uint8_t *pixels, int stride);
;***********************************************************************
;5x5 filter:
;1  1   2   1   1
;1  2   4   2   1
;2  4   20  4   2
;1  2   4   2   1
;1  1   2   1   1
;
; Filters 8 horizontally adjacent pixels in place. The weights above sum
; to 64, hence the final shift right by 6.
;
; Register roles:
;   r0    row pointer, starts at pixels - 2 * stride - 2
;   r1    stride            r3    2 * stride
;   xmm0  zero              xmm1  current row (16 bytes, unaligned load)
;   xmm2  scratch           xmm3  word accumulator
;
; Each row is read with a 16-byte load starting two bytes left of the
; first output pixel.
;
; NOTE(review): INIT_X86_32_PIC / DEINIT_X86_32_PIC and pic() come from
; asm_inc.asm and are not visible here; presumably they set up r4 as the
; base for position-independent access to sse2_20 -- confirm.

WELS_EXTERN WaverageChromaFilter8_sse2

    push        r3

    %assign push_num 1

    INIT_X86_32_PIC r4
    LOAD_2_PARA

    mov         r3, r1
    add         r3, r3                  ; r3 = 2 * stride
    sub         r0, r3                  ; pixels - 2 * stride
    sub         r0, 2                   ; two columns to the left

    pxor        xmm0, xmm0              ; zero for byte->word widening
    pxor        xmm3, xmm3              ; accumulator

    movdqu      xmm1, [r0]              ; row -2
    WEIGHT_LINE1_UV xmm1, xmm2, xmm3, xmm0

    movdqu      xmm1, [r0 + r1]         ; row -1
    WEIGHT_LINE2_UV xmm1, xmm2, xmm3, xmm0

    add         r0, r3                  ; r0 = pixels - 2
    movdqu      xmm1, [r0]              ; row 0 (centre)
    WEIGHT_LINE3_UV xmm1, xmm2, xmm3, xmm0

    movdqu      xmm1, [r0 + r1]         ; row +1
    WEIGHT_LINE2_UV xmm1, xmm2, xmm3, xmm0

    movdqu      xmm1, [r0 + r1 * 2]     ; row +2
    WEIGHT_LINE1_UV xmm1, xmm2, xmm3, xmm0

    psrlw       xmm3, 6                 ; divide by the weight total (64)
    packuswb    xmm3, xmm3              ; words -> bytes with saturation
    movq        [r0 + 2], xmm3          ; r0 + 2 == pixels


    DEINIT_X86_32_PIC
    pop         r3

    %assign push_num 0
    ret