;******************************************************************************
;* linear least squares model
;*
;* Copyright (c) 2013 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86util.asm"

SECTION .text

%define MAX_VARS 32
%define MAX_VARS_ALIGN (MAX_VARS+4)
%define COVAR_STRIDE MAX_VARS_ALIGN*8
%define COVAR(x,y) [covarq + (x)*8 + (y)*COVAR_STRIDE]

struc LLSModel
    .covariance:  resq MAX_VARS_ALIGN*MAX_VARS_ALIGN
    .coeff:       resq MAX_VARS*MAX_VARS
    .variance:    resq MAX_VARS
    .indep_count: resd 1
endstruc

%macro ADDPD_MEM 2
%if cpuflag(avx)
    vaddpd %2, %2, %1
%else
    addpd  %2, %1
%endif
    mova   %1, %2
%endmacro

INIT_XMM sse2
%define movdqa movaps
cglobal update_lls, 2,5,8, ctx, var, i, j, covar2
    %define covarq ctxq
    mov     id, [ctxq + LLSModel.indep_count]
    lea     varq, [varq + iq*8]
    neg     iq
    mov     covar2q, covarq
.loopi:
    ; Compute all 3 pairwise products of a 2x2 block that lies on the diagonal
    mova    m1, [varq + iq*8]
    mova    m3, [varq + iq*8 + 16]
    pshufd  m4, m1, q1010
    pshufd  m5, m1, q3232
    pshufd  m6, m3, q1010
    pshufd  m7, m3, q3232
    mulpd   m0, m1, m4
    mulpd   m1, m1, m5
    lea     covarq, [covar2q + 16]
    ADDPD_MEM COVAR(-2,0), m0
    ADDPD_MEM COVAR(-2,1), m1
    lea     jq, [iq + 2]
    cmp     jd, -2
    jg .skip4x4
.loop4x4:
    ; Compute all 16 pairwise products of a 4x4 block
    mulpd   m0, m4, m3
    mulpd   m1, m5, m3
    mulpd   m2, m6, m3
    mulpd   m3, m3, m7
    ADDPD_MEM COVAR(0,0), m0
    ADDPD_MEM COVAR(0,1), m1
    ADDPD_MEM COVAR(0,2), m2
    ADDPD_MEM COVAR(0,3), m3
    mova    m3, [varq + jq*8 + 16]
    mulpd   m0, m4, m3
    mulpd   m1, m5, m3
    mulpd   m2, m6, m3
    mulpd   m3, m3, m7
    ADDPD_MEM COVAR(2,0), m0
    ADDPD_MEM COVAR(2,1), m1
    ADDPD_MEM COVAR(2,2), m2
    ADDPD_MEM COVAR(2,3), m3
    mova    m3, [varq + jq*8 + 32]
    add     covarq, 32
    add     jq, 4
    cmp     jd, -2
    jle .loop4x4
.skip4x4:
    test    jd, jd
    jg .skip2x4
    mulpd   m4, m3
    mulpd   m5, m3
    mulpd   m6, m3
    mulpd   m7, m3
    ADDPD_MEM COVAR(0,0), m4
    ADDPD_MEM COVAR(0,1), m5
    ADDPD_MEM COVAR(0,2), m6
    ADDPD_MEM COVAR(0,3), m7
.skip2x4:
    add     iq, 4
    add     covar2q, 4*COVAR_STRIDE+32
    cmp     id, -2
    jle .loopi
    test    id, id
    jg .ret
    mov     jq, iq
    %define covarq covar2q
.loop2x1:
    movsd   m0, [varq + iq*8]
    movlhps m0, m0
    mulpd   m0, [varq + jq*8]
    ADDPD_MEM COVAR(0,0), m0
    inc     iq
    add     covarq, COVAR_STRIDE
    test    id, id
    jle .loop2x1
.ret:
    REP_RET

%macro UPDATE_LLS 0
cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
    %define covarq ctxq
    mov     countd, [ctxq + LLSModel.indep_count]
    lea     count2d, [countq-2]
    xor     id, id
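    ; For reference, the loops below accumulate one sample's pairwise
    ; products into the upper triangle of the covariance matrix. A scalar
    ; sketch of the same update, modeled on the plain-C fallback in
    ; libavutil/lls.c (field names assume the LLSModel layout above):
    ;
    ;     static void update_lls_scalar(LLSModel *m, const double *var)
    ;     {
    ;         for (int i = 0; i <= m->indep_count; i++)
    ;             for (int j = i; j <= m->indep_count; j++)
    ;                 m->covariance[i][j] += var[i] * var[j];
    ;     }
    ;
    ; The asm vectorizes this in 4x4 tiles of the matrix, with narrower
    ; 2-wide and 1-wide tail cases for counts that are not multiples of 4.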
.loopi:
    ; Compute all 10 pairwise products of a 4x4 block that lies on the diagonal
    mova    ymm1, [varq + iq*8]
    vbroadcastsd ymm4, [varq + iq*8]
    vbroadcastsd ymm5, [varq + iq*8 + 8]
    vbroadcastsd ymm6, [varq + iq*8 + 16]
    vbroadcastsd ymm7, [varq + iq*8 + 24]
    vextractf128 xmm3, ymm1, 1
%if cpuflag(fma3)
    mova    ymm0, COVAR(iq  ,0)
    mova    xmm2, COVAR(iq+2,2)
    fmaddpd ymm0, ymm1, ymm4, ymm0
    fmaddpd xmm2, xmm3, xmm6, xmm2
    fmaddpd ymm1, ymm5, ymm1, COVAR(iq  ,1)
    fmaddpd xmm3, xmm7, xmm3, COVAR(iq+2,3)
    mova    COVAR(iq  ,0), ymm0
    mova    COVAR(iq  ,1), ymm1
    mova    COVAR(iq+2,2), xmm2
    mova    COVAR(iq+2,3), xmm3
%else
    vmulpd  ymm0, ymm1, ymm4
    vmulpd  ymm1, ymm1, ymm5
    vmulpd  xmm2, xmm3, xmm6
    vmulpd  xmm3, xmm3, xmm7
    ADDPD_MEM COVAR(iq  ,0), ymm0
    ADDPD_MEM COVAR(iq  ,1), ymm1
    ADDPD_MEM COVAR(iq+2,2), xmm2
    ADDPD_MEM COVAR(iq+2,3), xmm3
%endif ; cpuflag(fma3)
    lea     jd, [iq + 4]
    cmp     jd, count2d
    jg .skip4x4
.loop4x4:
    ; Compute all 16 pairwise products of a 4x4 block
    mova    ymm3, [varq + jq*8]
%if cpuflag(fma3)
    mova    ymm0, COVAR(jq, 0)
    mova    ymm1, COVAR(jq, 1)
    mova    ymm2, COVAR(jq, 2)
    fmaddpd ymm0, ymm3, ymm4, ymm0
    fmaddpd ymm1, ymm3, ymm5, ymm1
    fmaddpd ymm2, ymm3, ymm6, ymm2
    fmaddpd ymm3, ymm7, ymm3, COVAR(jq,3)
    mova    COVAR(jq, 0), ymm0
    mova    COVAR(jq, 1), ymm1
    mova    COVAR(jq, 2), ymm2
    mova    COVAR(jq, 3), ymm3
%else
    vmulpd  ymm0, ymm3, ymm4
    vmulpd  ymm1, ymm3, ymm5
    vmulpd  ymm2, ymm3, ymm6
    vmulpd  ymm3, ymm3, ymm7
    ADDPD_MEM COVAR(jq,0), ymm0
    ADDPD_MEM COVAR(jq,1), ymm1
    ADDPD_MEM COVAR(jq,2), ymm2
    ADDPD_MEM COVAR(jq,3), ymm3
%endif ; cpuflag(fma3)
    add     jd, 4
    cmp     jd, count2d
    jle .loop4x4
.skip4x4:
    cmp     jd, countd
    jg .skip2x4
    mova    xmm3, [varq + jq*8]
%if cpuflag(fma3)
    mova    xmm0, COVAR(jq, 0)
    mova    xmm1, COVAR(jq, 1)
    mova    xmm2, COVAR(jq, 2)
    fmaddpd xmm0, xmm3, xmm4, xmm0
    fmaddpd xmm1, xmm3, xmm5, xmm1
    fmaddpd xmm2, xmm3, xmm6, xmm2
    fmaddpd xmm3, xmm7, xmm3, COVAR(jq,3)
    mova    COVAR(jq, 0), xmm0
    mova    COVAR(jq, 1), xmm1
    mova    COVAR(jq, 2), xmm2
    mova    COVAR(jq, 3), xmm3
%else
    vmulpd  xmm0, xmm3, xmm4
    vmulpd  xmm1, xmm3, xmm5
    vmulpd  xmm2, xmm3, xmm6
    vmulpd  xmm3, xmm3, xmm7
    ADDPD_MEM COVAR(jq,0), xmm0
    ADDPD_MEM COVAR(jq,1), xmm1
    ADDPD_MEM COVAR(jq,2), xmm2
    ADDPD_MEM COVAR(jq,3), xmm3
%endif ; cpuflag(fma3)
.skip2x4:
    add     id, 4
    add     covarq, 4*COVAR_STRIDE
    cmp     id, count2d
    jle .loopi
    cmp     id, countd
    jg .ret
    mov     jd, id
.loop2x1:
    vmovddup xmm0, [varq + iq*8]
%if cpuflag(fma3)
    mova    xmm1, [varq + jq*8]
    fmaddpd xmm0, xmm1, xmm0, COVAR(jq,0)
    mova    COVAR(jq,0), xmm0
%else
    vmulpd  xmm0, [varq + jq*8]
    ADDPD_MEM COVAR(jq,0), xmm0
%endif ; cpuflag(fma3)
    inc     id
    add     covarq, COVAR_STRIDE
    cmp     id, countd
    jle .loop2x1
.ret:
    REP_RET
%endmacro ; UPDATE_LLS

%if HAVE_AVX_EXTERNAL
INIT_YMM avx
UPDATE_LLS
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
UPDATE_LLS
%endif

INIT_XMM sse2
cglobal evaluate_lls, 3,4,2, ctx, var, order, i
    ; This function is often called on the same buffer as update_lls, but
    ; with an offset. They can't both be aligned.
    ; Load halves rather than movu to avoid store-forwarding stalls, since
    ; the input was initialized immediately prior to this function using
    ; scalar math.
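    ; What this computes is a dot product of one row of m->coeff with the
    ; variables. A scalar sketch, modeled on the plain-C version in
    ; libavutil/lls.c (names assume that layout):
    ;
    ;     static double evaluate_lls_scalar(LLSModel *m, const double *param,
    ;                                       int order)
    ;     {
    ;         double out = 0;
    ;         for (int i = 0; i <= order; i++)
    ;             out += param[i] * m->coeff[order][i];
    ;         return out;
    ;     }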
    %define coefsq ctxq
    mov     id, orderd
    imul    orderd, MAX_VARS
    lea     coefsq, [ctxq + LLSModel.coeff + orderq*8]
    movsd   m0, [varq]
    movhpd  m0, [varq + 8]
    mulpd   m0, [coefsq]
    lea     coefsq, [coefsq + iq*8]
    lea     varq, [varq + iq*8]
    neg     iq
    add     iq, 2
.loop:
    movsd   m1, [varq + iq*8]
    movhpd  m1, [varq + iq*8 + 8]
    mulpd   m1, [coefsq + iq*8]
    addpd   m0, m1
    add     iq, 2
    jl .loop
    jg .skip1
    movsd   m1, [varq + iq*8]
    mulsd   m1, [coefsq + iq*8]
    addpd   m0, m1
.skip1:
    movhlps m1, m0
    addsd   m0, m1
%if ARCH_X86_32
    movsd   r0m, m0
    fld     qword r0m
%endif
    RET
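; Note on the ARCH_X86_32 branch in evaluate_lls above: the 32-bit calling
; convention returns floating-point values in x87 st0, so the SSE2 result is
; spilled to the stack slot r0m and reloaded with fld; on x86-64 the result
; is simply left in xmm0.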