;******************************************************************************
;* x86-SIMD-optimized IDCT for prores
;* this is identical to "simple" IDCT written by Michael Niedermayer
;* except for the clip range
;*
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

; add SECTION_RODATA and proper include before including this file!

%if ARCH_X86_64

; Re-point the w*-constant aliases at a suffixed constant set (e.g. "_10" or
; "_12" tables declared by the includer), so the macros below can be expanded
; once per bit depth.
; %1 = suffix appended to each constant name
%macro define_constants 1
    %undef w4_plus_w2
    %undef w4_min_w2
    %undef w4_plus_w6
    %undef w4_min_w6
    %undef w1_plus_w3
    %undef w3_min_w1
    %undef w7_plus_w3
    %undef w3_min_w7
    %define w4_plus_w2 w4_plus_w2%1
    %define w4_min_w2  w4_min_w2%1
    %define w4_plus_w6 w4_plus_w6%1
    %define w4_min_w6  w4_min_w6%1
    %define w1_plus_w3 w1_plus_w3%1
    %define w3_min_w1  w3_min_w1%1
    %define w7_plus_w3 w7_plus_w3%1
    %define w3_min_w7  w3_min_w7%1
%endmacro

; interleave data while maintaining source (3-operand punpck, so %4 survives)
; %1=type, %2=dstlo, %3=dsthi, %4=src, %5=interleave
%macro SBUTTERFLY3 5
    punpckl%1   m%2, m%4, m%5
    punpckh%1   m%3, m%4, m%5
%endmacro

; butterfly + shift + dword->word pack of one (a, b) pair:
; %1/%2=src1/dst1, %3/%4=dst2, %5/%6=src2, %7=shift
; action: %3/%4 = %1/%2 - %5/%6; %1/%2 += %5/%6
; %1/%2/%3/%4 >>= %7; dword -> word (in %1/%3)
%macro SUMSUB_SHPK 7
    psubd     %3,  %1, %5       ; { a0 - b0 }[0-3]
    psubd     %4,  %2, %6       ; { a0 - b0 }[4-7]
    paddd     %1,  %5           ; { a0 + b0 }[0-3]
    paddd     %2,  %6           ; { a0 + b0 }[4-7]
    psrad     %1,  %7
    psrad     %2,  %7
    psrad     %3,  %7
    psrad     %4,  %7
    packssdw  %1,  %2           ; row[0]
    packssdw  %3,  %4           ; row[7]
%endmacro

; One 8x8 1-D IDCT pass over all 8 rows at once (16-bit coeffs, 32-bit
; accumulation via pmaddwd on interleaved even/odd row pairs).
;
; Expects rows 0/2/4/6 preloaded in m10/m8/m13/m12; rows 1/3/5/7 are read
; from [blockq+16/48/80/112].  [blockq+0..127] is used as scratch for the
; even-part accumulators, so the block memory is clobbered.
; Output rows end up as: row0=m8, row1=m0, row2=m1, row3=m2,
;                        row4=m4, row5=m11, row6=m9, row7=m10
; (the order consumed by IDCT_FN's TRANSPOSE8x8W below).
; Clobbers m0-m15.
;
; %1 = initial bias ("" if nop)
; %2 = number of bits to shift at the end
; %3 = qmat (for prores)
%macro IDCT_1D 2-3
    ; a0 = (W4 * row[0]) + (1 << (15 - 1));
    ; a1 = a0;
    ; a2 = a0;
    ; a3 = a0;
    ; a0 += W2 * row[2];
    ; a1 += W6 * row[2];
    ; a2 -= W6 * row[2];
    ; a3 -= W2 * row[2];
%ifstr %1
    ; string arg: no pre-bias on the input words; the rounding constant is
    ; added to the 32-bit accumulators instead (see below)
    mova     m15, [pd_round_ %+ %2]
%else
    ; word-sized bias added directly to row[0] before the multiplies
    paddw    m10, [%1]
%endif
    SBUTTERFLY3 wd,  0,  1, 10,  8 ; { row[0], row[2] }[0-3]/[4-7]
    pmaddwd   m2,  m0, [w4_plus_w6]
    pmaddwd   m3,  m1, [w4_plus_w6]
    pmaddwd   m4,  m0, [w4_min_w6]
    pmaddwd   m5,  m1, [w4_min_w6]
    pmaddwd   m6,  m0, [w4_min_w2]
    pmaddwd   m7,  m1, [w4_min_w2]
    pmaddwd   m0, [w4_plus_w2]
    pmaddwd   m1, [w4_plus_w2]
%ifstr %1
    ; Adding 1<<(%2-1) for >=15 bits values
    paddd     m2, m15
    paddd     m3, m15
    paddd     m4, m15
    paddd     m5, m15
    paddd     m6, m15
    paddd     m7, m15
    paddd     m0, m15
    paddd     m1, m15
%endif

    ; a0: -1*row[0]-1*row[2]
    ; a1: -1*row[0]
    ; a2: -1*row[0]
    ; a3: -1*row[0]+1*row[2]

    ; a0 += W4*row[4] + W6*row[6]; i.e. -1*row[4]
    ; a1 -= W4*row[4] + W2*row[6]; i.e. -1*row[4]-1*row[6]
    ; a2 -= W4*row[4] - W2*row[6]; i.e. -1*row[4]+1*row[6]
    ; a3 += W4*row[4] - W6*row[6]; i.e. -1*row[4]
    SBUTTERFLY3 wd,  8,  9, 13, 12 ; { row[4], row[6] }[0-3]/[4-7]
    pmaddwd  m10,  m8, [w4_plus_w6]
    pmaddwd  m11,  m9, [w4_plus_w6]
    paddd     m0, m10               ; a0[0-3]
    paddd     m1, m11               ; a0[4-7]
    pmaddwd  m10,  m8, [w4_min_w6]
    pmaddwd  m11,  m9, [w4_min_w6]
    paddd     m6, m10               ; a3[0-3]
    paddd     m7, m11               ; a3[4-7]
    pmaddwd  m10,  m8, [w4_min_w2]
    pmaddwd  m11,  m9, [w4_min_w2]
    pmaddwd   m8, [w4_plus_w2]
    pmaddwd   m9, [w4_plus_w2]
    psubd     m4, m10               ; a2[0-3] intermediate
    psubd     m5, m11               ; a2[4-7] intermediate
    psubd     m2,  m8               ; a1[0-3] intermediate
    psubd     m3,  m9               ; a1[4-7] intermediate

    ; load/store: spill a0-a3 to the block (scratch) and fetch the odd rows
    mova [blockq+  0], m0
    mova [blockq+ 32], m2
    mova [blockq+ 64], m4
    mova [blockq+ 96], m6
    mova m10,[blockq+ 16]           ; { row[1] }[0-7]
    mova  m8,[blockq+ 48]           ; { row[3] }[0-7]
    mova m13,[blockq+ 80]           ; { row[5] }[0-7]
    mova m14,[blockq+112]           ; { row[7] }[0-7]
    mova [blockq+ 16], m1
    mova [blockq+ 48], m3
    mova [blockq+ 80], m5
    mova [blockq+112], m7
%if %0 == 3
    ; prores path: dequantize the odd rows (even rows were scaled by caller)
    pmullw m10,[%3+ 16]
    pmullw  m8,[%3+ 48]
    pmullw m13,[%3+ 80]
    pmullw m14,[%3+112]
%endif

    ; b0 = MUL(W1, row[1]);
    ; MAC(b0, W3, row[3]);
    ; b1 = MUL(W3, row[1]);
    ; MAC(b1, -W7, row[3]);
    ; b2 = MUL(W5, row[1]);
    ; MAC(b2, -W1, row[3]);
    ; b3 = MUL(W7, row[1]);
    ; MAC(b3, -W5, row[3]);
    SBUTTERFLY3 wd,  0,  1, 10,  8 ; { row[1], row[3] }[0-3]/[4-7]
    pmaddwd   m2,  m0, [w3_min_w7]
    pmaddwd   m3,  m1, [w3_min_w7]
    pmaddwd   m4,  m0, [w5_min_w1]
    pmaddwd   m5,  m1, [w5_min_w1]
    pmaddwd   m6,  m0, [w7_min_w5]
    pmaddwd   m7,  m1, [w7_min_w5]
    pmaddwd   m0, [w1_plus_w3]
    pmaddwd   m1, [w1_plus_w3]

    ; b0: +1*row[1]+2*row[3]
    ; b1: +2*row[1]-1*row[3]
    ; b2: -1*row[1]-1*row[3]
    ; b3: +1*row[1]+1*row[3]

    ; MAC(b0,  W5, row[5]);
    ; MAC(b0,  W7, row[7]);
    ; MAC(b1, -W1, row[5]);
    ; MAC(b1, -W5, row[7]);
    ; MAC(b2,  W7, row[5]);
    ; MAC(b2,  W3, row[7]);
    ; MAC(b3,  W3, row[5]);
    ; MAC(b3, -W1, row[7]);
    SBUTTERFLY3 wd,  8,  9, 13, 14 ; { row[5], row[7] }[0-3]/[4-7]

    ; b0: -1*row[5]+1*row[7]
    ; b1: -1*row[5]+1*row[7]
    ; b2: +1*row[5]+2*row[7]
    ; b3: +2*row[5]-1*row[7]

    pmaddwd  m10,  m8, [w1_plus_w5]
    pmaddwd  m11,  m9, [w1_plus_w5]
    pmaddwd  m12,  m8, [w5_plus_w7]
    pmaddwd  m13,  m9, [w5_plus_w7]
    psubd     m2, m10               ; b1[0-3]
    psubd     m3, m11               ; b1[4-7]
    paddd     m0, m12               ; b0[0-3]
    paddd     m1, m13               ; b0[4-7]
    pmaddwd  m12,  m8, [w7_plus_w3]
    pmaddwd  m13,  m9, [w7_plus_w3]
    pmaddwd   m8, [w3_min_w1]
    pmaddwd   m9, [w3_min_w1]
    paddd     m4, m12               ; b2[0-3]
    paddd     m5, m13               ; b2[4-7]
    paddd     m6,  m8               ; b3[0-3]
    paddd     m7,  m9               ; b3[4-7]

    ; row[0] = (a0 + b0) >> 15;
    ; row[7] = (a0 - b0) >> 15;
    ; row[1] = (a1 + b1) >> 15;
    ; row[6] = (a1 - b1) >> 15;
    ; row[2] = (a2 + b2) >> 15;
    ; row[5] = (a2 - b2) >> 15;
    ; row[3] = (a3 + b3) >> 15;
    ; row[4] = (a3 - b3) >> 15;
    mova      m8, [blockq+ 0]       ; a0[0-3]
    mova      m9, [blockq+16]       ; a0[4-7]
    SUMSUB_SHPK m8,  m9, m10, m11,  m0,  m1, %2
    mova      m0, [blockq+32]       ; a1[0-3]
    mova      m1, [blockq+48]       ; a1[4-7]
    SUMSUB_SHPK m0,  m1,  m9, m11,  m2,  m3, %2
    mova      m1, [blockq+64]       ; a2[0-3]
    mova      m2, [blockq+80]       ; a2[4-7]
    SUMSUB_SHPK m1,  m2, m11,  m3,  m4,  m5, %2
    mova      m2, [blockq+96]       ; a3[0-3]
    mova      m3, [blockq+112]      ; a3[4-7]
    SUMSUB_SHPK m2,  m3,  m4,  m5,  m6,  m7, %2
%endmacro

; void ff_prores_idct_put_10_<opt>(uint8_t *pixels, ptrdiff_t stride,
;                                  int16_t *block, const int16_t *qmat);

; %1 = row shift
; %2 = row bias macro
; %3 = column shift
; %4 = column bias macro
; %5 = final action (nothing, "store", "put", "add")
; %6 = min pixel value
; %7 = max pixel value
; %8 = qmat (for prores)

; Full 2-D 8x8 IDCT: row pass (IDCT_1D %1,%2), 8x8 transpose, column pass
; (IDCT_1D %3,%4), then optional clip/store.  Expects blockq = coefficient
; pointer; "put" additionally uses r0 = dest, r1 = stride, and clobbers r2.
; The 8-bit path (%2 == 11) also needs 32 bytes of scratch at [rsp].
; Clobbers m0-m15 and the block memory.
%macro IDCT_FN 4-8
    ; for (i = 0; i < 8; i++)
    ;     idctRowCondDC(block + i*8);
    mova m10,[blockq+ 0]            ; { row[0] }[0-7]
    mova  m8,[blockq+32]            ; { row[2] }[0-7]
    mova m13,[blockq+64]            ; { row[4] }[0-7]
    mova m12,[blockq+96]            ; { row[6] }[0-7]

%if %0 == 8
    ; prores: dequantize the even rows here; IDCT_1D scales the odd rows
    pmullw m10,[%8+ 0]
    pmullw  m8,[%8+32]
    pmullw m13,[%8+64]
    pmullw m12,[%8+96]

    IDCT_1D %1, %2, %8
%elif %2 == 11
    ; This copies the DC-only shortcut. When there is only a DC coefficient the
    ; C shifts the value and splats it to all coeffs rather than multiplying and
    ; doing the full IDCT. This causes a difference on 8-bit because the
    ; coefficient is 16383 rather than 16384 (which you can get with shifting).
    por      m1,  m8, m13           ; OR together every AC row...
    por      m1, m12
    por      m1, [blockq+ 16]       ; { row[1] }[0-7]
    por      m1, [blockq+ 48]       ; { row[3] }[0-7]
    por      m1, [blockq+ 80]       ; { row[5] }[0-7]
    por      m1, [blockq+112]       ; { row[7] }[0-7]
    pxor     m2, m2
    pcmpeqw  m1, m2                 ; m1 = per-word mask: all-AC-zero lanes
    psllw    m2, m10, 3             ; m2 = DC << 3 (the splat value)
    pand     m2, m1                 ; keep splat only in DC-only lanes
    pcmpeqb  m3, m3
    pxor     m1, m3                 ; invert: mask of lanes with AC present
    mova  [rsp], m1                 ; stash masks across IDCT_1D (it
    mova  [rsp+16], m2              ; clobbers all xmm registers)

    IDCT_1D %1, %2

    ; blend: keep the IDCT result where AC was present, the shifted DC splat
    ; elsewhere, for each of the 8 output-row registers
    mova     m5, [rsp]
    mova     m6, [rsp+16]
    pand     m8, m5
    por      m8, m6
    pand     m0, m5
    por      m0, m6
    pand     m1, m5
    por      m1, m6
    pand     m2, m5
    por      m2, m6
    pand     m4, m5
    por      m4, m6
    pand    m11, m5
    por     m11, m6
    pand     m9, m5
    por      m9, m6
    pand    m10, m5
    por     m10, m6
%else
    IDCT_1D %1, %2
%endif

    ; transpose for second part of IDCT
    TRANSPOSE8x8W 8, 0, 1, 2, 4, 11, 9, 10, 3
    mova [blockq+ 16], m0
    mova [blockq+ 48], m2
    mova [blockq+ 80], m11
    mova [blockq+112], m10
    ; move transposed rows into the registers IDCT_1D expects as input
    SWAP  8, 10
    SWAP  1,  8
    SWAP  4, 13
    SWAP  9, 12

    ; for (i = 0; i < 8; i++)
    ;     idctSparseColAdd(dest + i, line_size, block + i);
    IDCT_1D %3, %4

    ; clip/store
%if %0 >= 5
%ifidn %5, "store"
    ; No clamping, means pure idct
    mova [blockq+  0], m8
    mova [blockq+ 16], m0
    mova [blockq+ 32], m1
    mova [blockq+ 48], m2
    mova [blockq+ 64], m4
    mova [blockq+ 80], m11
    mova [blockq+ 96], m9
    mova [blockq+112], m10
%elifidn %5, "put"
%ifidn %6, 0
    pxor m3, m3                     ; min pixel value is literal 0
%else
    mova m3, [%6]                   ; min pixel value from memory
%endif ; ifidn %6, 0
    mova m5, [%7]                   ; max pixel value
    ; clamp every output row to [min, max]
    pmaxsw   m8,  m3
    pmaxsw   m0,  m3
    pmaxsw   m1,  m3
    pmaxsw   m2,  m3
    pmaxsw   m4,  m3
    pmaxsw  m11,  m3
    pmaxsw   m9,  m3
    pmaxsw  m10,  m3
    pminsw   m8,  m5
    pminsw   m0,  m5
    pminsw   m1,  m5
    pminsw   m2,  m5
    pminsw   m4,  m5
    pminsw  m11,  m5
    pminsw   m9,  m5
    pminsw  m10,  m5

    ; write the 8 rows to dest (r0) with stride r1; r2 = 3*stride
    lea      r2, [r1*3]
    mova [r0     ], m8
    mova [r0+r1  ], m0
    mova [r0+r1*2], m1
    mova [r0+r2  ], m2
    lea      r0, [r0+r1*4]
    mova [r0     ], m4
    mova [r0+r1  ], m11
    mova [r0+r1*2], m9
    mova [r0+r2  ], m10
%endif ; %5 action
%endif ; if %0 >= 5
%endmacro

%endif