;******************************************************************************
;* SIMD-optimized MLP DSP functions
;* Copyright (c) 2014 James Almer <jamrial@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

%if ARCH_X86_64

; Variable-count left shift: %1 <<= %2.
; With BMI2, shlx takes the count from any GPR; without it, legacy shl
; needs the count in cl, so the register layout below maps mns onto rcx.
%macro SHLX 2
%if cpuflag(bmi2)
    shlx          %1, %1, %2q
%else
    shl           %1, %2b
%endif
%endmacro

; Multiply-accumulate samples[] * coeffs[] for 8 channels into m0 as
; 64-bit lanes. pmuldq multiplies the even 32-bit lanes; the q2301
; shuffle brings the odd lanes into even position for the second pmuldq.
; SSE4: two 16-byte loads per array; AVX2: one 32-byte load, then fold
; the high 128-bit half into the low half.
%macro REMATRIX 0
    movdqa        m0, [samplesq]
    movdqa        m1, [coeffsq ]
    pshufd        m2, m0, q2301
    pshufd        m3, m1, q2301
    pmuldq        m0, m1
    pmuldq        m3, m2
    paddq         m0, m3
%if notcpuflag(avx2)
    movdqa        m1, [samplesq + 16]
    movdqa        m2, [coeffsq  + 16]
    pshufd        m3, m1, q2301
    pshufd        m4, m2, q2301
    pmuldq        m1, m2
    pmuldq        m4, m3
    paddq         m0, m1
    paddq         m0, m4
%else
    vextracti128 xm1, m0, 1
    paddq        xm0, xm1
%endif
%endmacro

; Horizontal-add the two 64-bit lanes of xm0, finish one output sample
; (shift, mask, add bypassed LSBs), store it, and advance the per-block
; pointers. Leaves flags set from the cmp for the caller's loop branch.
%macro LOOP_END 0
    pshufd       xm1, xm0, q0032
    paddq        xm0, xm1
    movq      accumq, xm0
    movzx     blsbsd, byte [blsbs_ptrq]             ; load *bypassed_lsbs
    sar       accumq, 14                            ; accum >>= 14
    and       accumd, maskd                         ; accum &= mask
    add       accumd, blsbsd                        ; accum += *bypassed_lsbs
    mov   [samplesq + dest_chq], accumd             ; samples[dest_ch] = accum
    add   blsbs_ptrq, 8                             ; bypassed_lsbs += MAX_CHANNELS;
    add     samplesq, 32                            ; samples += MAX_CHANNELS;
    cmp   blsbs_ptrq, cntq
%endmacro

; Same as LOOP_END, but additionally mixes in shifted dither noise from
; noise_buffer[index] before the final shift/mask. The noise register is
; reused afterwards as a temporary for *bypassed_lsbs.
%macro LOOP_SHIFT_END 0
    pshufd       xm1, xm0, q0032
    paddq        xm0, xm1
    movq      accumq, xm0
    and       indexd, auspd                         ; index &= access_unit_size_pow2;
    movsx     noiseq, byte [noise_bufferq + indexq] ; load noise_buffer[index]
    add       indexd, index2d                       ; index += index2
    SHLX      noiseq, mns                           ; noise_buffer[index] <<= matrix_noise_shift
    add       accumq, noiseq                        ; accum += noise_buffer[index]
    movzx     noised, byte [blsbs_ptrq]             ; load *bypassed_lsbs (reuse tmp noise register)
    sar       accumq, 14                            ; accum >>= 14
    and       accumd, maskd                         ; accum &= mask
    add       accumd, noised                        ; accum += *bypassed_lsbs
    mov   [samplesq + dest_chq], accumd             ; samples[dest_ch] = accum
    add   blsbs_ptrq, 8                             ; bypassed_lsbs += MAX_CHANNELS;
    add     samplesq, 32                            ; samples += MAX_CHANNELS;
    cmp   blsbs_ptrq, cntq
%endmacro

;void ff_mlp_rematrix_channel(int32_t *samples, const int32_t *coeffs,
;                             const uint8_t *bypassed_lsbs, const int8_t *noise_buffer,
;                             int index, unsigned int dest_ch, uint16_t blockpos,
;                             unsigned int maxchan, int matrix_noise_shift,
;                             int access_unit_size_pow2, int32_t mask)
%macro MLP_REMATRIX_CHANNEL 0
cglobal mlp_rematrix_channel, 0, 13, 5, samples, coeffs, blsbs_ptr, blsbs, \
                                        index, dest_ch, blockpos, maxchan, mns, \
                                        accum, mask, cnt
    mov         mnsd, mnsm                          ; load matrix_noise_shift
    movzx  blockposq, word blockposm                ; load and zero extend blockpos (16bit)
    mov     maxchand, maxchanm                      ; load maxchan
    mov        maskd, maskm                         ; load mask
%if WIN64
    mov     dest_chd, dest_chm                      ; load dest_chd (not needed on UNIX64)
%endif
    shl     dest_chd, 2                             ; dest_ch -> byte offset into int32 samples
    lea         cntq, [blsbs_ptrq + blockposq*8]    ; end pointer: blockpos iterations
    test        mnsd, mnsd                          ; is matrix_noise_shift != 0?
    jne .shift                                      ; jump if true
    cmp     maxchand, 4                             ; is maxchan < 4?
    jl .loop4                                       ; jump if true

align 16
.loop8:
    ; Process 5 or more channels
    REMATRIX
    LOOP_END
    jne .loop8
    RET

align 16
.loop4:
    ; Process up to 4 channels (single 16-byte load per array)
    movdqa       xm0, [samplesq]
    movdqa       xm1, [coeffsq ]
    pshufd       xm2, xm0, q2301
    pshufd       xm3, xm1, q2301
    pmuldq       xm0, xm1
    pmuldq       xm3, xm2
    paddq        xm0, xm3
    LOOP_END
    jne .loop4
    RET

.shift:
%if WIN64
    mov       indexd, indexm                        ; load index (not needed on UNIX64)
%endif
    mov          r9d, r9m                           ; load access_unit_size_pow2
%if cpuflag(bmi2)
    ; bmi2 has shift functions that accept any gpr, not just cl, so keep things in place.
    DEFINE_ARGS samples, coeffs, blsbs_ptr, noise_buffer, \
                index, dest_ch, accum, index2, mns, \
                ausp, mask, cnt, noise
    add         mnsd, 7                             ; matrix_noise_shift += 7
%else ; sse4
    mov           r6, rcx                           ; move rcx elsewhere so we can use cl for matrix_noise_shift
%if WIN64
    ; r0 = rcx
    DEFINE_ARGS mns, coeffs, blsbs_ptr, noise_buffer, index, dest_ch, samples, \
                index2, accum, ausp, mask, cnt, noise
%else ; UNIX64
    ; r3 = rcx
    DEFINE_ARGS samples, coeffs, blsbs_ptr, mns, index, dest_ch, noise_buffer, \
                index2, accum, ausp, mask, cnt, noise
%endif
    lea         mnsd, [r8 + 7]                      ; rcx = matrix_noise_shift + 7
%endif ; cpuflag
    sub        auspd, 1                             ; access_unit_size_pow2 -= 1 (power-of-2 index mask)
    cmp          r7d, 4                             ; is maxchan < 4?
    lea      index2q, [indexq*2 + 1]                ; index2 = 2 * index + 1;
    jl .loop4_shift                                 ; jump if maxchan < 4

align 16
.loop8_shift:
    ; Process 5 or more channels
    REMATRIX
    LOOP_SHIFT_END
    jne .loop8_shift
    RET

align 16
.loop4_shift:
    ; Process up to 4 channels (single 16-byte load per array)
    movdqa       xm0, [samplesq]
    movdqa       xm1, [coeffsq ]
    pshufd       xm2, xm0, q2301
    pshufd       xm3, xm1, q2301
    pmuldq       xm0, xm1
    pmuldq       xm3, xm2
    paddq        xm0, xm3
    LOOP_SHIFT_END
    jne .loop4_shift
    RET
%endmacro

INIT_XMM sse4
MLP_REMATRIX_CHANNEL
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2, bmi2
MLP_REMATRIX_CHANNEL
%endif

%endif ; ARCH_X86_64