;******************************************************************************
;* x86-optimized functions for the CFHD encoder
;* Copyright (c) 2021 Paul B Mahol
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; Word-pair coefficient tables for pmaddwd. Each table has a mirrored twin
; because the vertical filter interleaves the low halves as (a, b) and the
; high halves as (b, a).
pw_p1_n1:  times 4 dw   1,  -1
pw_n1_p1:  times 4 dw  -1,   1
pw_p5_n11: times 4 dw   5, -11
pw_n5_p11: times 4 dw  -5,  11
pw_p11_n5: times 4 dw  11,  -5
pw_n11_p5: times 4 dw -11,   5
pd_4:      times 4 dd   4           ; rounding bias applied before the >> 3
pw_n4:     times 8 dw  -4
cextern pw_m1
cextern pw_1
cextern pw_4

SECTION .text

%if ARCH_X86_64

;------------------------------------------------------------------------------
; Helper macros for cfhdenc_horiz_filter.
; They use the register names declared by that function's cglobal line
; (inputq, xq, accq) and clobber xq and/or m0 as noted.
;------------------------------------------------------------------------------

; Saturate the low 32 bits of acc to a signed 16-bit value (packssdw) and
; store it as one word at %1. Clobbers m0 and acc.
%macro CFHD_STORE_CLIPPED 1 ; %1 = destination memory operand
    movd          xm0, accd
    packssdw       m0, m0
    movd         accd, m0
    mov       word %1, accw
%endmacro

; acc += %2 * (sign-extended 16-bit sample at inputq + %1). Clobbers x.
%macro CFHD_TAP 2 ; %1 = byte displacement, %2 = integer coefficient
    movsx          xq, word [inputq + (%1)]
    imul           xq, %2
    add          accq, xq
%endmacro

; Load 2 * mmsize bytes from inputq + xq * 2 + %3 into m%1 / m%2 and multiply
; each adjacent word pair by the coefficients in m%4, leaving dword sums.
%macro CFHD_MADD_PAIR 4 ; %1, %2 = dst regs, %3 = byte displacement, %4 = coeff reg
    movu          m%1, [inputq + xq * 2 + (%3)]
    movu          m%2, [inputq + xq * 2 + (%3) + mmsize]
    pmaddwd       m%1, m%4
    pmaddwd       m%2, m%4
%endmacro

;------------------------------------------------------------------------------
; cfhdenc_horiz_filter(input, low, high, istride, lwidth, hwidth, width, y)
;
; Splits every row of 16-bit samples into a "low" and a "high" output row, each
; holding one output per pair of input samples.
;
; NOTE(review): the C prototype is not visible in this file; presumably
; (int16_t *input, int16_t *low, int16_t *high, ptrdiff_t in_stride,
;  ptrdiff_t low_stride, ptrdiff_t high_stride, int width, int height) --
; confirm against the C init code.
;
; As implemented here:
;   * istride / lwidth / hwidth are doubled on entry and then added to the
;     three pointers once per row.
;   * width and y are sign-extended from 32 bits; y is negated and counted up
;     to zero, one step per row.
;   * per row, with in[] the input samples and n = width:
;       low[k]      = sat16(in[2k] + in[2k+1])
;       high[0]     = sat16(( 5*in[0] - 11*in[1] + 4*in[2] + 4*in[3]
;                            -   in[4] -    in[5] + 4) >> 3)
;       high[i/2]   = sat16(((-in[i-2] - in[i-1] + in[i+2] + in[i+3] + 4) >> 3)
;                            + in[i] - in[i+1])
;       high[n/2-1] = sat16((11*in[n-2] - 5*in[n-1] - 4*in[n-3] - 4*in[n-4]
;                            +  in[n-5] +   in[n-6] + 4) >> 3)
;   * the vector loop always runs at least once and handles mmsize output
;     bytes per pass, so its loads and stores can run beyond `width`.
;     NOTE(review): assumes the caller pads the buffers accordingly -- confirm.
;
; Registers: m7 = pd_4, m8 = pw_1, m9 = pw_m1, m10 = pw_p1_n1;
;            x   = scalar scratch in the edge code, output byte offset in the
;                  vector loop;
;            acc = scalar accumulator.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal cfhdenc_horiz_filter, 8, 10, 11, input, low, high, istride, lwidth, hwidth, width, y, x, acc
    shl      istrideq, 1
    shl       lwidthq, 1
    shl       hwidthq, 1

    mova           m7, [pd_4]
    mova           m8, [pw_1]
    mova           m9, [pw_m1]
    mova          m10, [pw_p1_n1]

    movsxdifnidn   yq, yd
    movsxdifnidn widthq, widthd
    neg            yq                       ; count rows from -y up to 0

.row_loop:
    ; ---- left edge, low[0] = in[0] + in[1] --------------------------------
    movsx          xq, word [inputq]
    movsx        accq, word [inputq + 2]
    add          accq, xq
    CFHD_STORE_CLIPPED [lowq]

    ; ---- left edge, high[0]: taps 5, -11, 4, 4, -1, -1 ---------------------
    movsx          xq, word [inputq]
    imul           xq, 5
    movsx        accq, word [inputq + 2]
    imul         accq, -11
    add          accq, xq
    CFHD_TAP        4,  4
    CFHD_TAP        6,  4
    CFHD_TAP        8, -1
    CFHD_TAP       10, -1
    add          accq, 4
    sar          accq, 3
    CFHD_STORE_CLIPPED [highq]

    mov            xq, 2                    ; output byte offset of element 1

.pairs_loop:
    ; low: pairwise sums, saturated to 16 bits
    CFHD_MADD_PAIR  0, 1,  0, 8
    packssdw       m0, m1
    movu  [lowq + xq], m0

    ; high: (-(in[i-2] + in[i-1]) + (in[i+2] + in[i+3]) + 4) >> 3 ...
    CFHD_MADD_PAIR  2, 3, -4, 9
    CFHD_MADD_PAIR  0, 1,  4, 8

    paddd          m0, m2
    paddd          m1, m3

    paddd          m0, m7
    paddd          m1, m7

    psrad          m0, 3
    psrad          m1, 3

    ; ... + (in[i] - in[i+1])
    CFHD_MADD_PAIR  5, 6,  0, 10

    paddd          m0, m5
    paddd          m1, m6

    packssdw       m0, m1
    movu [highq + xq], m0

    add            xq, mmsize
    cmp            xq, widthq
    jl .pairs_loop

    ; ---- right edge: move all three pointers to the end of the row ---------
    add          lowq, widthq
    add         highq, widthq
    lea        inputq, [inputq + widthq * 2]

    ; last low = in[n-2] + in[n-1]; overwrites the vector loop's last slot
    movsx          xq, word [inputq - 4]
    movsx        accq, word [inputq - 2]
    add          accq, xq
    CFHD_STORE_CLIPPED [lowq - 2]

    ; last high: taps 11, -5, -4, -4, 1, 1 walking backwards from in[n-2]
    movsx        accq, word [inputq - 4]
    imul         accq, 11
    CFHD_TAP       -2, -5
    CFHD_TAP       -6, -4
    CFHD_TAP       -8, -4

    movsx          xq, word [inputq - 10]
    add          accq, xq

    movsx          xq, word [inputq - 12]
    add          accq, xq

    add          accq, 4
    sar          accq, 3
    CFHD_STORE_CLIPPED [highq - 2]

    ; ---- rewind to the row start, then step to the next row ---------------
    sub        inputq, widthq
    sub        inputq, widthq
    add        inputq, istrideq

    sub         highq, widthq
    add         highq, hwidthq

    sub          lowq, widthq
    add          lowq, lwidthq

    add            yq, 1
    jl .row_loop

    RET

;------------------------------------------------------------------------------
; Helper macros for cfhdenc_vert_filter.
; They use the register names declared by that function's cglobal line
; (inputq, lowq, istrideq, lwidthq, colq, rowq, offq).
;------------------------------------------------------------------------------

; off = %1 * row + col
%macro CFHD_SET_OFF 1 ; %1 = per-row step
    mov          offq, %1
    imul         offq, rowq
    add          offq, colq
%endmacro

; Load six consecutive rows, starting at inputq + off, into m0..m5.
; Leaves off advanced by five rows.
%macro CFHD_LOAD_6ROWS 0
    movu           m0, [inputq + offq]
    add          offq, istrideq
    movu           m1, [inputq + offq]
    add          offq, istrideq
    movu           m2, [inputq + offq]
    add          offq, istrideq
    movu           m3, [inputq + offq]
    add          offq, istrideq
    movu           m4, [inputq + offq]
    add          offq, istrideq
    movu           m5, [inputq + offq]
%endmacro

; Interleave the row pairs (m0,m1), (m2,m3), (m4,m5) word by word so that
; pmaddwd can combine vertically adjacent samples. The even register receives
; the low halves ordered (even row, odd row); the odd register receives the
; high halves ordered (odd row, even row). Clobbers m6.
%macro CFHD_ZIP_ROWS 0
    mova           m6, m0
    punpcklwd      m0, m1
    punpckhwd      m1, m6

    mova           m6, m2
    punpcklwd      m2, m3
    punpckhwd      m3, m6

    mova           m6, m4
    punpcklwd      m4, m5
    punpckhwd      m5, m6
%endmacro

; low output for the current row: saturating sum of input rows `row` and
; `row + 1`, stored at lowq + lwidth * row + col. Clobbers m0, m1, off.
%macro CFHD_VERT_LOW 0
    CFHD_SET_OFF istrideq
    movu           m0, [inputq + offq]
    add          offq, istrideq
    movu           m1, [inputq + offq]

    paddsw         m0, m1

    CFHD_SET_OFF  lwidthq
    movu [lowq + offq], m0
%endmacro

;------------------------------------------------------------------------------
; cfhdenc_vert_filter(input, low, high, istride, lwidth, hwidth, width, height)
;
; Column-wise counterpart of cfhdenc_horiz_filter: the same low / high taps are
; applied down the rows, mmsize bytes (eight 16-bit columns) at a time.
;
; NOTE(review): the C prototype is not visible in this file; presumably the
; same argument list as cfhdenc_horiz_filter -- confirm against the C init code.
;
; As implemented here:
;   * istride and width are doubled on entry; lwidth and hwidth are used as
;     given and multiplied by the input row index, which advances in steps
;     of 2.
;   * height is reduced by 2 on entry; the middle loop handles rows
;     2, 4, ... while row < height - 2 and always runs at least once.
;   * with r[y] the input row y, per group of columns:
;       low  @ row y    = sat16(r[y] + r[y+1])                       (paddsw)
;       high @ row 0    = sat16(( 5*r[0] - 11*r[1] + 4*r[2] + 4*r[3]
;                                -   r[4] -    r[5] + 4) >> 3)
;       high @ row y    = sat16(((-r[y-2] - r[y-1] + r[y+2] + r[y+3] + 4) >> 3)
;                                + r[y] - r[y+1])
;       high @ last row = sat16((11*r[y] - 5*r[y+1] - 4*r[y-1] - 4*r[y-2]
;                                +  r[y-3] +   r[y-4] + 4) >> 3)
;   * the column loop steps by mmsize bytes until col >= width, so the final
;     pass can touch columns beyond `width`.
;     NOTE(review): assumes the caller pads the buffers accordingly -- confirm.
;
; Registers: m7 = pd_4,      m8 = pw_1,   m9 = pw_m1, m10 = pw_p1_n1,
;            m11 = pw_n1_p1, m12 = pw_4, m13 = pw_n4;
;            col = byte offset of the current column group,
;            row = current input row index,
;            off = scratch byte offset.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal cfhdenc_vert_filter, 8, 11, 14, input, low, high, istride, lwidth, hwidth, width, height, col, row, off
    shl      istrideq, 1

    shl        widthd, 1
    sub       heightd, 2

    xor          colq, colq

    mova           m7, [pd_4]
    mova           m8, [pw_1]
    mova           m9, [pw_m1]
    mova          m10, [pw_p1_n1]
    mova          m11, [pw_n1_p1]
    mova          m12, [pw_4]
    mova          m13, [pw_n4]

.column_loop:
    mov          rowq, 2

    ; ---- top edge, low = r[0] + r[1] ---------------------------------------
    mov          offq, colq
    movu           m0, [inputq + offq]
    add          offq, istrideq
    movu           m1, [inputq + offq]

    paddsw         m0, m1

    movu [lowq + colq], m0

    ; ---- top edge, high: taps 5, -11, 4, 4, -1, -1 on rows 0..5 ------------
    mov          offq, colq
    CFHD_LOAD_6ROWS
    CFHD_ZIP_ROWS

    pmaddwd        m0, [pw_p5_n11]
    pmaddwd        m1, [pw_n11_p5]
    pmaddwd        m2, m12
    pmaddwd        m3, m12
    pmaddwd        m4, m9
    pmaddwd        m5, m9

    paddd          m0, m2
    paddd          m1, m3
    paddd          m0, m4
    paddd          m1, m5

    paddd          m0, m7
    paddd          m1, m7

    psrad          m0, 3
    psrad          m1, 3
    packssdw       m0, m1

    movu [highq + colq], m0

.row_loop:
    CFHD_VERT_LOW

    ; high needs rows row-2 .. row+3: step back two rows for the loads
    add          rowq, -2
    CFHD_SET_OFF istrideq
    CFHD_LOAD_6ROWS
    add          rowq, 2

    CFHD_ZIP_ROWS

    pmaddwd        m0, m9                   ; -(r[y-2] + r[y-1])
    pmaddwd        m1, m9
    pmaddwd        m2, m10                  ;   r[y]   - r[y+1]
    pmaddwd        m3, m11
    pmaddwd        m4, m8                   ;   r[y+2] + r[y+3]
    pmaddwd        m5, m8

    paddd          m0, m4
    paddd          m1, m5

    paddd          m0, m7
    paddd          m1, m7

    psrad          m0, 3
    psrad          m1, 3
    paddd          m0, m2                   ; difference term is added unscaled
    paddd          m1, m3
    packssdw       m0, m1

    CFHD_SET_OFF  hwidthq
    movu [highq + offq], m0

    add          rowq, 2
    cmp          rowq, heightq
    jl .row_loop

    ; ---- bottom edge -------------------------------------------------------
    CFHD_VERT_LOW

    ; high needs rows row-4 .. row+1
    sub          rowq, 4
    CFHD_SET_OFF istrideq
    CFHD_LOAD_6ROWS
    add          rowq, 4

    CFHD_ZIP_ROWS

    pmaddwd        m0, m8
    pmaddwd        m1, m8
    pmaddwd        m2, m13
    pmaddwd        m3, m13
    pmaddwd        m4, [pw_p11_n5]
    pmaddwd        m5, [pw_n5_p11]

    paddd          m4, m2
    paddd          m5, m3

    paddd          m4, m0
    paddd          m5, m1

    paddd          m4, m7
    paddd          m5, m7

    psrad          m4, 3
    psrad          m5, 3
    packssdw       m4, m5

    CFHD_SET_OFF  hwidthq
    movu [highq + offq], m4

    add          colq, mmsize
    cmp          colq, widthq
    jl .column_loop
    RET
%endif