; *****************************************************************************
; * Provide SIMD optimizations for add_residual functions for HEVC decoding
; * Copyright (c) 2014 Pierre-Edouard LEPERE
; *
; * This file is part of FFmpeg.
; *
; * FFmpeg is free software; you can redistribute it and/or
; * modify it under the terms of the GNU Lesser General Public
; * License as published by the Free Software Foundation; either
; * version 2.1 of the License, or (at your option) any later version.
; *
; * FFmpeg is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
; * Lesser General Public License for more details.
; *
; * You should have received a copy of the GNU Lesser General Public
; * License along with FFmpeg; if not, write to the Free Software
; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
; ******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

cextern pw_1023
%define max_pixels_10 pw_1023

; the add_res macros and functions were largely inspired by h264_idct.asm from the x264 project

; add two 4-pixel rows of int16_t residuals at r1 to the 8-bit rows at r0 and
; r0+r2, with unsigned saturation (m4 must be zero)
%macro ADD_RES_MMX_4_8 0
    mova         m0, [r1]
    mova         m2, [r1+8]

    movd         m1, [r0]
    movd         m3, [r0+r2]
    punpcklbw    m1, m4
    punpcklbw    m3, m4

    paddsw       m0, m1
    paddsw       m2, m3
    packuswb     m0, m4
    packuswb     m2, m4

    movd         [r0], m0
    movd         [r0+r2], m2
%endmacro


INIT_MMX mmxext
; void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride)
cglobal hevc_add_residual_4_8, 3, 3, 6
    pxor         m4, m4
    ADD_RES_MMX_4_8
    add          r1, 16
    lea          r0, [r0+r2*2]
    ADD_RES_MMX_4_8
    RET

; add four 8-pixel rows of residuals at r1 to the 8-bit rows at r0, r0+r2,
; r0+r2*2 and r0+r3 (r3 = 3*stride, m4 must be zero)
%macro ADD_RES_SSE_8_8 0
    movq         m0, [r0]
    movq         m1, [r0+r2]
    punpcklbw    m0, m4
    punpcklbw    m1, m4
    mova         m2, [r1]
    mova         m3, [r1+16]
    paddsw       m0, m2
    paddsw       m1, m3
    packuswb     m0, m1

    movq         m2, [r0+r2*2]
    movq         m3, [r0+r3]
    punpcklbw    m2, m4
    punpcklbw    m3, m4
    mova         m6, [r1+32]
    mova         m7, [r1+48]
    paddsw       m2, m6
    paddsw       m3, m7
    packuswb     m2, m3

    movq         [r0], m0
    movhps       [r0+r2], m0
    movq         [r0+r2*2], m2
    movhps       [r0+r3], m2
%endmacro

; %1: byte offset into the residual buffer at r1
; %2, %3: destination addresses (one full register width of pixels each)
; m0 must be zero
%macro ADD_RES_SSE_16_32_8 3
    mova         m1, [%2]
    mova         m2, m1
    punpcklbw    m1, m0
    punpckhbw    m2, m0
    mova         xm5, [r1+%1]
    mova         xm6, [r1+%1+16]
%if cpuflag(avx2)
    vinserti128  m5, m5, [r1+%1+32], 1
    vinserti128  m6, m6, [r1+%1+48], 1
%endif
    paddsw       m1, m5
    paddsw       m2, m6

    mova         m3, [%3]
    mova         m4, m3
    punpcklbw    m3, m0
    punpckhbw    m4, m0
    mova         xm5, [r1+%1+mmsize*2]
    mova         xm6, [r1+%1+mmsize*2+16]
%if cpuflag(avx2)
    vinserti128  m5, m5, [r1+%1+96], 1
    vinserti128  m6, m6, [r1+%1+112], 1
%endif
    paddsw       m3, m5
    paddsw       m4, m6

    packuswb     m1, m2
    packuswb     m3, m4
    mova         [%2], m1
    mova         [%3], m3
%endmacro


%macro TRANSFORM_ADD_8 0
; void ff_hevc_add_residual_8_8_<opt>(uint8_t *dst, int16_t *res, ptrdiff_t stride)
cglobal hevc_add_residual_8_8, 3, 4, 8
    pxor         m4, m4
    lea          r3, [r2*3]
    ADD_RES_SSE_8_8
    add          r1, 64
    lea          r0, [r0+r2*4]
    ADD_RES_SSE_8_8
    RET

; void ff_hevc_add_residual_16_8_<opt>(uint8_t *dst, int16_t *res, ptrdiff_t stride)
cglobal hevc_add_residual_16_8, 3, 5, 7
    pxor         m0, m0
    lea          r3, [r2*3]
    mov          r4d, 4
.loop:
    ADD_RES_SSE_16_32_8 0, r0, r0+r2
    ADD_RES_SSE_16_32_8 64, r0+r2*2, r0+r3
    add          r1, 128
    lea          r0, [r0+r2*4]
    dec          r4d
    jg           .loop
    RET

; void ff_hevc_add_residual_32_8_<opt>(uint8_t *dst, int16_t *res, ptrdiff_t stride)
cglobal hevc_add_residual_32_8, 3, 5, 7
    pxor         m0, m0
    mov          r4d, 16
.loop:
    ADD_RES_SSE_16_32_8 0, r0, r0+16
    ADD_RES_SSE_16_32_8 64, r0+r2, r0+r2+16
    add          r1, 128
    lea          r0, [r0+r2*2]
    dec          r4d
    jg           .loop
    RET
%endmacro

INIT_XMM sse2
TRANSFORM_ADD_8
INIT_XMM avx
TRANSFORM_ADD_8

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
; void ff_hevc_add_residual_32_8_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride)
cglobal hevc_add_residual_32_8, 3, 5, 7
    pxor         m0, m0
    lea          r3, [r2*3]
    mov          r4d, 8
.loop:
    ADD_RES_SSE_16_32_8 0, r0, r0+r2
    ADD_RES_SSE_16_32_8 128, r0+r2*2, r0+r3
    add          r1, 256
    lea          r0, [r0+r2*4]
    dec          r4d
    jg           .loop
    RET
%endif ;HAVE_AVX2_EXTERNAL

; %1: dst, %2: stride, %3: 3*stride, %4: residual pointer; add four rows of
; 8 pixels and clip to [0, 1023] (m4 = 0, m5 = max_pixels_10)
%macro ADD_RES_SSE_8_10 4
    mova         m0, [%4]
    mova         m1, [%4+16]
    mova         m2, [%4+32]
    mova         m3, [%4+48]
    paddw        m0, [%1+0]
    paddw        m1, [%1+%2]
    paddw        m2, [%1+%2*2]
    paddw        m3, [%1+%3]
    CLIPW        m0, m4, m5
    CLIPW        m1, m4, m5
    CLIPW        m2, m4, m5
    CLIPW        m3, m4, m5
    mova         [%1+0], m0
    mova         [%1+%2], m1
    mova         [%1+%2*2], m2
    mova         [%1+%3], m3
%endmacro

; %1: dst, %2: stride, %3: residual pointer; two rows of 4 pixels (m2 = 0, m3 = max)
%macro ADD_RES_MMX_4_10 3
    mova         m0, [%1+0]
    mova         m1, [%1+%2]
    paddw        m0, [%3]
    paddw        m1, [%3+8]
    CLIPW        m0, m2, m3
    CLIPW        m1, m2, m3
    mova         [%1+0], m0
    mova         [%1+%2], m1
%endmacro

; %1: dst, %2: stride, %3: residual pointer; two rows of 16 pixels
%macro ADD_RES_SSE_16_10 3
    mova         m0, [%3]
    mova         m1, [%3+16]
    mova         m2, [%3+32]
    mova         m3, [%3+48]
    paddw        m0, [%1]
    paddw        m1, [%1+16]
    paddw        m2, [%1+%2]
    paddw        m3, [%1+%2+16]
    CLIPW        m0, m4, m5
    CLIPW        m1, m4, m5
    CLIPW        m2, m4, m5
    CLIPW        m3, m4, m5
    mova         [%1], m0
    mova         [%1+16], m1
    mova         [%1+%2], m2
    mova         [%1+%2+16], m3
%endmacro

; %1: dst, %2: residual pointer; one row of 32 pixels
%macro ADD_RES_SSE_32_10 2
    mova         m0, [%2]
    mova         m1, [%2+16]
    mova         m2, [%2+32]
    mova         m3, [%2+48]

    paddw        m0, [%1]
    paddw        m1, [%1+16]
    paddw        m2, [%1+32]
    paddw        m3, [%1+48]
    CLIPW        m0, m4, m5
    CLIPW        m1, m4, m5
    CLIPW        m2, m4, m5
    CLIPW        m3, m4, m5
    mova         [%1], m0
    mova         [%1+16], m1
    mova         [%1+32], m2
    mova         [%1+48], m3
%endmacro

; %1: dst, %2: stride, %3: 3*stride, %4: residual pointer; four rows of 16 pixels
%macro ADD_RES_AVX2_16_10 4
    mova         m0, [%4]
    mova         m1, [%4+32]
    mova         m2, [%4+64]
    mova         m3, [%4+96]

    paddw        m0, [%1+0]
    paddw        m1, [%1+%2]
    paddw        m2, [%1+%2*2]
    paddw        m3, [%1+%3]

    CLIPW        m0, m4, m5
    CLIPW        m1, m4, m5
    CLIPW        m2, m4, m5
    CLIPW        m3, m4, m5
    mova         [%1+0], m0
    mova         [%1+%2], m1
    mova         [%1+%2*2], m2
    mova         [%1+%3], m3
%endmacro

; %1: dst, %2: stride, %3: residual pointer; two rows of 32 pixels
%macro ADD_RES_AVX2_32_10 3
    mova         m0, [%3]
    mova         m1, [%3+32]
    mova         m2, [%3+64]
    mova         m3, [%3+96]

    paddw        m0, [%1]
    paddw        m1, [%1+32]
    paddw        m2, [%1+%2]
    paddw        m3, [%1+%2+32]

    CLIPW        m0, m4, m5
    CLIPW        m1, m4, m5
    CLIPW        m2, m4, m5
    CLIPW        m3, m4, m5
    mova         [%1], m0
    mova         [%1+32], m1
    mova         [%1+%2], m2
    mova         [%1+%2+32], m3
%endmacro

; void ff_hevc_add_residual_<4|8|16|32>_10(pixel *dst, int16_t *block, ptrdiff_t stride)
INIT_MMX mmxext
cglobal hevc_add_residual_4_10, 3, 3, 6
    pxor         m2, m2
    mova         m3, [max_pixels_10]
    ADD_RES_MMX_4_10 r0, r2, r1
    add          r1, 16
    lea          r0, [r0+2*r2]
    ADD_RES_MMX_4_10 r0, r2, r1
    RET

INIT_XMM sse2
cglobal hevc_add_residual_8_10, 3, 4, 6
    pxor         m4, m4
    mova         m5, [max_pixels_10]
    lea          r3, [r2*3]

    ADD_RES_SSE_8_10 r0, r2, r3, r1
    lea          r0, [r0+r2*4]
    add          r1, 64
    ADD_RES_SSE_8_10 r0, r2, r3, r1
    RET

cglobal hevc_add_residual_16_10, 3, 5, 6
    pxor         m4, m4
    mova         m5, [max_pixels_10]

    mov          r4d, 8
.loop:
    ADD_RES_SSE_16_10 r0, r2, r1
    lea          r0, [r0+r2*2]
    add          r1, 64
    dec          r4d
    jg           .loop
    RET

cglobal hevc_add_residual_32_10, 3, 5, 6
    pxor         m4, m4
    mova         m5, [max_pixels_10]

    mov          r4d, 32
.loop:
    ADD_RES_SSE_32_10 r0, r1
    lea          r0, [r0+r2]
    add          r1, 64
    dec          r4d
    jg           .loop
    RET

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
cglobal hevc_add_residual_16_10, 3, 5, 6
    pxor         m4, m4
    mova         m5, [max_pixels_10]
    lea          r3, [r2*3]

    mov          r4d, 4
.loop:
    ADD_RES_AVX2_16_10 r0, r2, r3, r1
    lea          r0, [r0+r2*4]
    add          r1, 128
    dec          r4d
    jg           .loop
    RET

cglobal hevc_add_residual_32_10, 3, 5, 6
    pxor         m4, m4
    mova         m5, [max_pixels_10]

    mov          r4d, 16
.loop:
    ADD_RES_AVX2_32_10 r0, r2, r1
    lea          r0, [r0+r2*2]
    add          r1, 128
    dec          r4d
    jg           .loop
    RET
%endif ;HAVE_AVX2_EXTERNAL