;******************************************************************************
;* x86 optimized discrete wavelet transform
;* Copyright (c) 2010 David Conrad
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
pw_1991: times 4 dw 9,-1

cextern pw_1
cextern pw_2
cextern pw_8
cextern pw_16

SECTION .text

; %1 -= (%2 + %3 + 2) >> 2    (%4 is pw_2)
%macro COMPOSE_53iL0 4
    paddw   %2, %3
    paddw   %2, %4
    psraw   %2, 2
    psubw   %1, %2
%endm

; m1 = %1 + (-m0 + 9*m1 + 9*%2 - %3 + 8) >> 4
; if %4 is supplied, %1 is loaded unaligned from there
; m2: clobbered    m3: pw_8    m4: pw_1991
%macro COMPOSE_DD97iH0 3-4
    paddw   m0, %3
    paddw   m1, %2
    psubw   m0, m3
    mova    m2, m1
    punpcklwd m1, m0
    punpckhwd m2, m0
    pmaddwd m1, m4
    pmaddwd m2, m4
%if %0 > 3
    movu    %1, %4
%endif
    psrad   m1, 4
    psrad   m2, 4
    packssdw m1, m2
    paddw   m1, %1
%endm

%macro COMPOSE_VERTICAL 1
; void vertical_compose53iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
;                            int width)
cglobal vertical_compose53iL0_%1, 4,4,1, b0, b1, b2, width
    mova    m2, [pw_2]
%if ARCH_X86_64
    mov     widthd, widthd    ; clear the high 32 bits of the width argument
%endif
.loop:
    sub     widthq, mmsize/2
    mova    m1, [b0q+2*widthq]
    mova    m0, [b1q+2*widthq]
    COMPOSE_53iL0 m0, m1, [b2q+2*widthq], m2
    mova    [b1q+2*widthq], m0
    jg      .loop
    REP_RET

; void vertical_compose_dirac53iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
;                                  int width)
cglobal vertical_compose_dirac53iH0_%1, 4,4,1, b0, b1, b2, width
    mova    m1, [pw_1]
%if ARCH_X86_64
    mov     widthd, widthd
%endif
.loop:
    sub     widthq, mmsize/2
    mova    m0, [b0q+2*widthq]
    paddw   m0, [b2q+2*widthq]
    paddw   m0, m1
    psraw   m0, 1
    paddw   m0, [b1q+2*widthq]
    mova    [b1q+2*widthq], m0
    jg      .loop
    REP_RET

; void vertical_compose_dd97iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
;                               IDWTELEM *b3, IDWTELEM *b4, int width)
cglobal vertical_compose_dd97iH0_%1, 6,6,5, b0, b1, b2, b3, b4, width
    mova    m3, [pw_8]
    mova    m4, [pw_1991]
%if ARCH_X86_64
    mov     widthd, widthd
%endif
.loop:
    sub     widthq, mmsize/2
    mova    m0, [b0q+2*widthq]
    mova    m1, [b1q+2*widthq]
    COMPOSE_DD97iH0 [b2q+2*widthq], [b3q+2*widthq], [b4q+2*widthq]
    mova    [b2q+2*widthq], m1
    jg      .loop
    REP_RET

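; For reference, the vertical Deslauriers-Dubuc steps here amount to the
; following scalar loops (an illustrative sketch derived from the macro
; comments above, not taken from the C reference code):
;
;     dd97iH0  (above): b2[i] += (-b0[i] + 9*b1[i] + 9*b3[i] - b4[i] +  8) >> 4;
;     dd137iL0 (below): b2[i] -= (-b0[i] + 9*b1[i] + 9*b3[i] - b4[i] + 16) >> 5;
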
; void vertical_compose_dd137iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
;                                IDWTELEM *b3, IDWTELEM *b4, int width)
cglobal vertical_compose_dd137iL0_%1, 6,6,6, b0, b1, b2, b3, b4, width
    mova    m3, [pw_16]
    mova    m4, [pw_1991]
%if ARCH_X86_64
    mov     widthd, widthd
%endif
.loop:
    sub     widthq, mmsize/2
    mova    m0, [b0q+2*widthq]
    mova    m1, [b1q+2*widthq]
    mova    m5, [b2q+2*widthq]
    paddw   m0, [b4q+2*widthq]
    paddw   m1, [b3q+2*widthq]
    psubw   m0, m3
    mova    m2, m1
    punpcklwd m1, m0
    punpckhwd m2, m0
    pmaddwd m1, m4
    pmaddwd m2, m4
    psrad   m1, 5
    psrad   m2, 5
    packssdw m1, m2
    psubw   m5, m1
    mova    [b2q+2*widthq], m5
    jg      .loop
    REP_RET

; void vertical_compose_haar(IDWTELEM *b0, IDWTELEM *b1, int width)
cglobal vertical_compose_haar_%1, 3,4,3, b0, b1, width
    mova    m3, [pw_1]
%if ARCH_X86_64
    mov     widthd, widthd
%endif
.loop:
    sub     widthq, mmsize/2
    mova    m1, [b1q+2*widthq]
    mova    m0, [b0q+2*widthq]
    mova    m2, m1
    paddw   m1, m3
    psraw   m1, 1
    psubw   m0, m1
    mova    [b0q+2*widthq], m0
    paddw   m2, m0
    mova    [b1q+2*widthq], m2
    jg      .loop
    REP_RET
%endmacro

; extend the left and right edges of the tmp array by %1 and %2 respectively
%macro EDGE_EXTENSION 3
    mov     %3, [tmpq]
%assign %%i 1
%rep %1
    mov     [tmpq-2*%%i], %3
    %assign %%i %%i+1
%endrep
    mov     %3, [tmpq+2*w2q-2]
%assign %%i 0
%rep %2
    mov     [tmpq+2*w2q+2*%%i], %3
    %assign %%i %%i+1
%endrep
%endmacro


%macro HAAR_HORIZONTAL 2
; void horizontal_compose_haari(IDWTELEM *b, IDWTELEM *tmp, int width)
cglobal horizontal_compose_haar%2i_%1, 3,6,4, b, tmp, w, x, w2, b_w2
    mov     w2d, wd
    xor     xq, xq
    shr     w2d, 1
    lea     b_w2q, [bq+wq]
    mova    m3, [pw_1]
.lowpass_loop:
    movu    m1, [b_w2q + 2*xq]
    mova    m0, [bq + 2*xq]
    paddw   m1, m3
    psraw   m1, 1
    psubw   m0, m1
    mova    [tmpq + 2*xq], m0
    add     xq, mmsize/2
    cmp     xq, w2q
    jl      .lowpass_loop

    xor     xq, xq
    and     w2q, ~(mmsize/2 - 1)
    cmp     w2q, mmsize/2
    jl      .end

.highpass_loop:
    movu    m1, [b_w2q + 2*xq]
    mova    m0, [tmpq + 2*xq]
    paddw   m1, m0

    ; shift and interleave
%if %2 == 1
    paddw   m0, m3
    paddw   m1, m3
    psraw   m0, 1
    psraw   m1, 1
%endif
    mova    m2, m0
    punpcklwd m0, m1
    punpckhwd m2, m1
    mova    [bq+4*xq], m0
    mova    [bq+4*xq+mmsize], m2

    add     xq, mmsize/2
    cmp     xq, w2q
    jl      .highpass_loop
.end:
    REP_RET
%endmacro


INIT_XMM
; void horizontal_compose_dd97i(IDWTELEM *b, IDWTELEM *tmp, int width)
cglobal horizontal_compose_dd97i_ssse3, 3,6,8, b, tmp, w, x, w2, b_w2
    mov     w2d, wd
    xor     xd, xd
    shr     w2d, 1
    lea     b_w2q, [bq+wq]
    movu    m4, [bq+wq]
    mova    m7, [pw_2]
    pslldq  m4, 14
.lowpass_loop:
    movu    m1, [b_w2q + 2*xq]
    mova    m0, [bq + 2*xq]
    mova    m2, m1
    palignr m1, m4, 14
    mova    m4, m2
    COMPOSE_53iL0 m0, m1, m2, m7
    mova    [tmpq + 2*xq], m0
    add     xd, mmsize/2
    cmp     xd, w2d
    jl      .lowpass_loop

    EDGE_EXTENSION 1, 2, xw
    ; leave the last up to 7 (sse) or 3 (mmx) values for C
    xor     xd, xd
    and     w2d, ~(mmsize/2 - 1)
    cmp     w2d, mmsize/2
    jl      .end

    mova    m7, [tmpq-mmsize]
    mova    m0, [tmpq]
    mova    m5, [pw_1]
    mova    m3, [pw_8]
    mova    m4, [pw_1991]
.highpass_loop:
    mova    m6, m0
    palignr m0, m7, 14
    mova    m7, [tmpq + 2*xq + 16]
    mova    m1, m7
    mova    m2, m7
    palignr m1, m6, 2
    palignr m2, m6, 4
    COMPOSE_DD97iH0 m0, m6, m2, [b_w2q + 2*xq]
    mova    m0, m7
    mova    m7, m6

    ; shift and interleave
    paddw   m6, m5
    paddw   m1, m5
    psraw   m6, 1
    psraw   m1, 1
    mova    m2, m6
    punpcklwd m6, m1
    punpckhwd m2, m1
    mova    [bq+4*xq], m6
    mova    [bq+4*xq+mmsize], m2

    add     xd, mmsize/2
    cmp     xd, w2d
    jl      .highpass_loop
.end:
    REP_RET

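; For reference, the horizontal DD 9,7 compose above corresponds roughly to
; the following scalar sketch (v is a temporary; the EDGE_EXTENSION handling
; and the tail values left to the C code are omitted; w2 = w/2):
;
;     tmp[x]   = b[x] - ((b[w2+x-1] + b[w2+x] + 2) >> 2);
;     v        = b[w2+x] + ((-tmp[x-1] + 9*tmp[x] + 9*tmp[x+1] - tmp[x+2] + 8) >> 4);
;     b[2*x]   = (tmp[x] + 1) >> 1;
;     b[2*x+1] = (v + 1) >> 1;
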
%if ARCH_X86_64 == 0
INIT_MMX
COMPOSE_VERTICAL mmx
HAAR_HORIZONTAL mmx, 0
HAAR_HORIZONTAL mmx, 1
%endif

INIT_XMM
COMPOSE_VERTICAL sse2
HAAR_HORIZONTAL sse2, 0
HAAR_HORIZONTAL sse2, 1
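
; For reference, the Haar lifting steps instantiated above amount to the
; following scalar operations (sketch only; the haar1i horizontal variant
; additionally rounds each interleaved output with (v + 1) >> 1):
;
;     vertical:   b0[i] -= (b1[i] + 1) >> 1;
;                 b1[i] += b0[i];
;     horizontal: tmp[x]   = b[x] - ((b[w2+x] + 1) >> 1);
;                 b[2*x]   = tmp[x];
;                 b[2*x+1] = b[w2+x] + tmp[x];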