;******************************************************************************
;* SIMD optimized SAO functions for HEVC 10/12bit decoding
;*
;* Copyright (c) 2013 Pierre-Edouard LEPERE
;* Copyright (c) 2014 James Almer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

pw_m2:     times 16 dw -2
pw_mask10: times 16 dw 0x03FF
pw_mask12: times 16 dw 0x0FFF
; (x_a, y_a, x_b, y_b) neighbour displacements for the four edge-offset
; directions: horizontal, vertical, 135 degree and 45 degree diagonal
pb_eo:     db -1, 0, 1, 0, 0, -1, 0, 1, -1, -1, 1, 1, 1, -1, -1, 1
cextern pw_m1
cextern pw_1
cextern pw_2

SECTION .text

;******************************************************************************
;SAO Band Filter
;******************************************************************************

; %1: bit depth. Broadcasts the four active band indices
; (sao_left_class..sao_left_class+3, mod 32) into m0-m3 and the
; corresponding offsets sao_offset_val[1..4] into m4-m7.
%macro HEVC_SAO_BAND_FILTER_INIT 1
    and            leftq, 31
    movd             xm0, leftd
    add            leftq, 1
    and            leftq, 31
    movd             xm1, leftd
    add            leftq, 1
    and            leftq, 31
    movd             xm2, leftd
    add            leftq, 1
    and            leftq, 31
    movd             xm3, leftd

    SPLATW            m0, xm0
    SPLATW            m1, xm1
    SPLATW            m2, xm2
    SPLATW            m3, xm3
%if mmsize > 16
    SPLATW            m4, [offsetq + 2]
    SPLATW            m5, [offsetq + 4]
    SPLATW            m6, [offsetq + 6]
    SPLATW            m7, [offsetq + 8]
%else
    movq              m7, [offsetq + 2]
    SPLATW            m4, m7, 0
    SPLATW            m5, m7, 1
    SPLATW            m6, m7, 2
    SPLATW            m7, m7, 3
%endif

%if ARCH_X86_64
    mova             m13, [pw_mask %+ %1]
    pxor             m14, m14

%else ; ARCH_X86_32
    mova  [rsp+mmsize*0], m0
    mova  [rsp+mmsize*1], m1
    mova  [rsp+mmsize*2], m2
    mova  [rsp+mmsize*3], m3
    mova  [rsp+mmsize*4], m4
    mova  [rsp+mmsize*5], m5
    mova  [rsp+mmsize*6], m6
    mova              m1, [pw_mask %+ %1]
    pxor              m0, m0
    %define m14 m0
    %define m13 m1
    %define m9  m2
    %define m8  m3
%endif ; ARCH
DEFINE_ARGS dst, src, dststride, srcstride, offset, height
    mov          heightd, r7m
%endmacro

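; The band filter below classifies each sample into one of 32 bands by its
; five most significant bits and, when the band is one of the four active
; ones, adds the matching offset before clipping to the valid sample range.
; Worked example (10 bit): a sample of 511 is shifted right by 10-5 = 5,
; giving band 15; with sao_left_class = 14 the active bands are 14..17, so
; the sample falls in the second band, sao_offset_val[2] is added and the
; result is clipped to [0, 0x03FF].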
;void ff_hevc_sao_band_filter_<width>_<depth>_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
;                                                   int16_t *sao_offset_val, int sao_left_class, int width, int height);
%macro HEVC_SAO_BAND_FILTER 3
cglobal hevc_sao_band_filter_%2_%1, 6, 6, 15, 7*mmsize*ARCH_X86_32, dst, src, dststride, srcstride, offset, left
    HEVC_SAO_BAND_FILTER_INIT %1

align 16
.loop:

%assign i 0
%assign j 0
%rep %3
%assign k 8+(j&1)
%assign l 9-(j&1)
    mova          m %+ k, [srcq + i]
    psraw         m %+ l, m %+ k, %1-5    ; band index = sample >> (bitdepth - 5)
%if ARCH_X86_64
    pcmpeqw          m10, m %+ l, m0      ; compare against the four active bands
    pcmpeqw          m11, m %+ l, m1
    pcmpeqw          m12, m %+ l, m2
    pcmpeqw       m %+ l, m3
    pand             m10, m4              ; keep the offset of the matching band
    pand             m11, m5
    pand             m12, m6
    pand          m %+ l, m7
    por              m10, m11
    por              m12, m %+ l
    por              m10, m12
    paddw         m %+ k, m10
%else ; ARCH_X86_32
    pcmpeqw           m4, m %+ l, [rsp+mmsize*0]
    pcmpeqw           m5, m %+ l, [rsp+mmsize*1]
    pcmpeqw           m6, m %+ l, [rsp+mmsize*2]
    pcmpeqw       m %+ l, [rsp+mmsize*3]
    pand              m4, [rsp+mmsize*4]
    pand              m5, [rsp+mmsize*5]
    pand              m6, [rsp+mmsize*6]
    pand          m %+ l, m7
    por               m4, m5
    por               m6, m %+ l
    por               m4, m6
    paddw         m %+ k, m4
%endif ; ARCH
    CLIPW         m %+ k, m14, m13        ; clip to [0, (1 << bitdepth) - 1]
    mova    [dstq + i], m %+ k
%assign i i+mmsize
%assign j j+1
%endrep

    add             dstq, dststrideq
    add             srcq, srcstrideq
    dec          heightd
    jg .loop
    REP_RET
%endmacro

%macro HEVC_SAO_BAND_FILTER_FUNCS 0
HEVC_SAO_BAND_FILTER 10,  8, 1
HEVC_SAO_BAND_FILTER 10, 16, 2
HEVC_SAO_BAND_FILTER 10, 32, 4
HEVC_SAO_BAND_FILTER 10, 48, 6
HEVC_SAO_BAND_FILTER 10, 64, 8

HEVC_SAO_BAND_FILTER 12,  8, 1
HEVC_SAO_BAND_FILTER 12, 16, 2
HEVC_SAO_BAND_FILTER 12, 32, 4
HEVC_SAO_BAND_FILTER 12, 48, 6
HEVC_SAO_BAND_FILTER 12, 64, 8
%endmacro

INIT_XMM sse2
HEVC_SAO_BAND_FILTER_FUNCS
INIT_XMM avx
HEVC_SAO_BAND_FILTER_FUNCS

%if HAVE_AVX2_EXTERNAL
INIT_XMM avx2
HEVC_SAO_BAND_FILTER 10,  8, 1
INIT_YMM avx2
HEVC_SAO_BAND_FILTER 10, 16, 1
HEVC_SAO_BAND_FILTER 10, 32, 2
HEVC_SAO_BAND_FILTER 10, 48, 3
HEVC_SAO_BAND_FILTER 10, 64, 4

INIT_XMM avx2
HEVC_SAO_BAND_FILTER 12,  8, 1
INIT_YMM avx2
HEVC_SAO_BAND_FILTER 12, 16, 1
HEVC_SAO_BAND_FILTER 12, 32, 2
HEVC_SAO_BAND_FILTER 12, 48, 3
HEVC_SAO_BAND_FILTER 12, 64, 4
%endif

;******************************************************************************
;SAO Edge Filter
;******************************************************************************

%define MAX_PB_SIZE  64
%define PADDING_SIZE 64 ; AV_INPUT_BUFFER_PADDING_SIZE
%define EDGE_SRCSTRIDE 2 * MAX_PB_SIZE + PADDING_SIZE

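; Unsigned word minimum: pminuw itself requires SSE4.1, so the fallback
; builds it from unsigned saturating arithmetic. psubusw yields
; max(a - b, 0), and a - max(a - b, 0) = min(a, b); e.g. for a = 2, b = 5
; the saturating subtraction gives 0, and 2 - 0 = 2.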
%macro PMINUW 4
%if cpuflag(sse4)
    pminuw            %1, %2, %3
%else
    psubusw           %4, %2, %3
    psubw             %1, %2, %4
%endif
%endmacro

; Computes the displacements of the two neighbour samples (a and b) for the
; selected edge-offset direction from the pb_eo table, in units of samples.
%macro HEVC_SAO_EDGE_FILTER_INIT 0
%if WIN64
    movsxd           eoq, dword eom
%elif ARCH_X86_64
    movsxd           eoq, eod
%else
    mov              eoq, r4m
%endif
    lea            tmp2q, [pb_eo]
    movsx      a_strideq, byte [tmp2q+eoq*4+1]
    movsx      b_strideq, byte [tmp2q+eoq*4+3]
    imul       a_strideq, EDGE_SRCSTRIDE >> 1
    imul       b_strideq, EDGE_SRCSTRIDE >> 1
    movsx           tmpq, byte [tmp2q+eoq*4]
    add        a_strideq, tmpq
    movsx           tmpq, byte [tmp2q+eoq*4+2]
    add        b_strideq, tmpq
%endmacro

;void ff_hevc_sao_edge_filter_<width>_<depth>_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,
;                                                   int eo, int width, int height);
%macro HEVC_SAO_EDGE_FILTER 3
%if ARCH_X86_64
cglobal hevc_sao_edge_filter_%2_%1, 4, 9, 16, dst, src, dststride, offset, eo, a_stride, b_stride, height, tmp
%define tmp2q heightq
    HEVC_SAO_EDGE_FILTER_INIT
    mov          heightd, r6m
    add        a_strideq, a_strideq      ; sample offsets to byte offsets
    add        b_strideq, b_strideq

%else ; ARCH_X86_32
cglobal hevc_sao_edge_filter_%2_%1, 1, 6, 8, 5*mmsize, dst, src, dststride, a_stride, b_stride, height
%define eoq     srcq
%define tmpq    heightq
%define tmp2q   dststrideq
%define offsetq heightq
%define m8  m1
%define m9  m2
%define m10 m3
%define m11 m4
%define m12 m5
    HEVC_SAO_EDGE_FILTER_INIT
    mov             srcq, srcm
    mov          offsetq, r3m
    mov       dststrideq, dststridem
    add        a_strideq, a_strideq
    add        b_strideq, b_strideq

%endif ; ARCH

%if mmsize > 16
    SPLATW            m8, [offsetq+2]
    SPLATW            m9, [offsetq+4]
    SPLATW           m10, [offsetq+0]
    SPLATW           m11, [offsetq+6]
    SPLATW           m12, [offsetq+8]
%else
    movq             m10, [offsetq+0]
    movd             m12, [offsetq+6]
    SPLATW            m8, xm10, 1
    SPLATW            m9, xm10, 2
    SPLATW           m10, xm10, 0
    SPLATW           m11, xm12, 0
    SPLATW           m12, xm12, 1
%endif
    pxor              m0, m0
%if ARCH_X86_64
    mova             m13, [pw_m1]
    mova             m14, [pw_1]
    mova             m15, [pw_2]
%else
    mov          heightd, r6m
    mova  [rsp+mmsize*0], m8
    mova  [rsp+mmsize*1], m9
    mova  [rsp+mmsize*2], m10
    mova  [rsp+mmsize*3], m11
    mova  [rsp+mmsize*4], m12
%endif

align 16
.loop:

%assign i 0
%rep %3
    mova              m1, [srcq + i]
    movu              m2, [srcq+a_strideq + i]
    movu              m3, [srcq+b_strideq + i]
    PMINUW            m4, m1, m2, m6
    PMINUW            m5, m1, m3, m7
    pcmpeqw           m2, m4              ; a <= src
    pcmpeqw           m3, m5              ; b <= src
    pcmpeqw           m4, m1              ; src <= a
    pcmpeqw           m5, m1              ; src <= b
    psubw             m4, m2              ; sign(src - a)
    psubw             m5, m3              ; sign(src - b)

    paddw             m4, m5              ; edge index in [-2, 2]
    pcmpeqw           m2, m4, [pw_m2]
%if ARCH_X86_64
    pcmpeqw           m3, m4, m13
    pcmpeqw           m5, m4, m0
    pcmpeqw           m6, m4, m14
    pcmpeqw           m7, m4, m15
    pand              m2, m8              ; select the offset for this index
    pand              m3, m9
    pand              m5, m10
    pand              m6, m11
    pand              m7, m12
%else
    pcmpeqw           m3, m4, [pw_m1]
    pcmpeqw           m5, m4, m0
    pcmpeqw           m6, m4, [pw_1]
    pcmpeqw           m7, m4, [pw_2]
    pand              m2, [rsp+mmsize*0]
    pand              m3, [rsp+mmsize*1]
    pand              m5, [rsp+mmsize*2]
    pand              m6, [rsp+mmsize*3]
    pand              m7, [rsp+mmsize*4]
%endif
    paddw             m2, m3
    paddw             m5, m6
    paddw             m2, m7
    paddw             m2, m1
    paddw             m2, m5
    CLIPW             m2, m0, [pw_mask %+ %1]
    mova    [dstq + i], m2
%assign i i+mmsize
%endrep

    add             dstq, dststrideq
    add             srcq, EDGE_SRCSTRIDE
    dec          heightd
    jg .loop
    RET
%endmacro

INIT_XMM sse2
HEVC_SAO_EDGE_FILTER 10,  8, 1
HEVC_SAO_EDGE_FILTER 10, 16, 2
HEVC_SAO_EDGE_FILTER 10, 32, 4
HEVC_SAO_EDGE_FILTER 10, 48, 6
HEVC_SAO_EDGE_FILTER 10, 64, 8

HEVC_SAO_EDGE_FILTER 12,  8, 1
HEVC_SAO_EDGE_FILTER 12, 16, 2
HEVC_SAO_EDGE_FILTER 12, 32, 4
HEVC_SAO_EDGE_FILTER 12, 48, 6
HEVC_SAO_EDGE_FILTER 12, 64, 8

%if HAVE_AVX2_EXTERNAL
INIT_XMM avx2
HEVC_SAO_EDGE_FILTER 10,  8, 1
INIT_YMM avx2
HEVC_SAO_EDGE_FILTER 10, 16, 1
HEVC_SAO_EDGE_FILTER 10, 32, 2
HEVC_SAO_EDGE_FILTER 10, 48, 3
HEVC_SAO_EDGE_FILTER 10, 64, 4

INIT_XMM avx2
HEVC_SAO_EDGE_FILTER 12,  8, 1
INIT_YMM avx2
HEVC_SAO_EDGE_FILTER 12, 16, 1
HEVC_SAO_EDGE_FILTER 12, 32, 2
HEVC_SAO_EDGE_FILTER 12, 48, 3
HEVC_SAO_EDGE_FILTER 12, 64, 4
%endif