1 /* 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #ifndef VPX_DSP_MIPS_FWD_TXFM_MSA_H_ 12 #define VPX_DSP_MIPS_FWD_TXFM_MSA_H_ 13 14 #include "vpx_dsp/mips/txfm_macros_msa.h" 15 #include "vpx_dsp/txfm_common.h" 16 17 #define VP9_FDCT4(in0, in1, in2, in3, out0, out1, out2, out3) \ 18 { \ 19 v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m; \ 20 v8i16 vec0_m, vec1_m, vec2_m, vec3_m; \ 21 v4i32 vec4_m, vec5_m, vec6_m, vec7_m; \ 22 v8i16 coeff_m = { \ 23 cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, -cospi_8_64, 0, 0, 0 \ 24 }; \ 25 \ 26 BUTTERFLY_4(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m); \ 27 ILVR_H2_SH(vec1_m, vec0_m, vec3_m, vec2_m, vec0_m, vec2_m); \ 28 SPLATI_H2_SH(coeff_m, 0, 1, cnst0_m, cnst1_m); \ 29 cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ 30 vec5_m = __msa_dotp_s_w(vec0_m, cnst1_m); \ 31 \ 32 SPLATI_H2_SH(coeff_m, 4, 3, cnst2_m, cnst3_m); \ 33 cnst2_m = __msa_ilvev_h(cnst3_m, cnst2_m); \ 34 vec7_m = __msa_dotp_s_w(vec2_m, cnst2_m); \ 35 \ 36 vec4_m = __msa_dotp_s_w(vec0_m, cnst0_m); \ 37 cnst2_m = __msa_splati_h(coeff_m, 2); \ 38 cnst2_m = __msa_ilvev_h(cnst2_m, cnst3_m); \ 39 vec6_m = __msa_dotp_s_w(vec2_m, cnst2_m); \ 40 \ 41 SRARI_W4_SW(vec4_m, vec5_m, vec6_m, vec7_m, DCT_CONST_BITS); \ 42 PCKEV_H4_SH(vec4_m, vec4_m, vec5_m, vec5_m, vec6_m, vec6_m, vec7_m, \ 43 vec7_m, out0, out2, out1, out3); \ 44 } 45 46 #define SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7) \ 47 { \ 48 v8i16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ 49 \ 50 SRLI_H4_SH(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m, 15); \ 51 SRLI_H4_SH(in4, in5, in6, in7, vec4_m, vec5_m, vec6_m, vec7_m, 15); \ 52 AVE_SH4_SH(vec0_m, in0, vec1_m, in1, vec2_m, in2, vec3_m, in3, in0, in1, \ 53 in2, in3); \ 54 AVE_SH4_SH(vec4_m, in4, vec5_m, in5, vec6_m, in6, vec7_m, in7, in4, in5, \ 55 in6, in7); \ 56 } 57 58 #define VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ 59 out3, out4, out5, out6, out7) \ 60 { \ 61 v8i16 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m; \ 62 v8i16 s7_m, x0_m, x1_m, x2_m, x3_m; \ 63 v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, \ 64 cospi_4_64, cospi_28_64, cospi_12_64, cospi_20_64 }; \ 65 \ 66 /* FDCT stage1 */ \ 67 BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, s2_m, \ 68 s3_m, s4_m, s5_m, s6_m, s7_m); \ 69 BUTTERFLY_4(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \ 70 ILVL_H2_SH(x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \ 71 ILVR_H2_SH(x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \ 72 SPLATI_H2_SH(coeff_m, 0, 1, x0_m, x1_m); \ 73 x1_m = __msa_ilvev_h(x1_m, x0_m); \ 74 out4 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \ 75 \ 76 SPLATI_H2_SH(coeff_m, 2, 3, x2_m, x3_m); \ 77 x2_m = -x2_m; \ 78 x2_m = __msa_ilvev_h(x3_m, x2_m); \ 79 out6 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \ 80 \ 81 out0 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \ 82 x2_m = __msa_splati_h(coeff_m, 2); \ 83 x2_m = __msa_ilvev_h(x2_m, x3_m); \ 84 out2 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \ 85 \ 86 /* stage2 */ \ 87 ILVRL_H2_SH(s5_m, s6_m, s1_m, s0_m); \ 88 \ 89 s6_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \ 90 s5_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \ 91 \ 92 /* stage3 */ \ 93 BUTTERFLY_4(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m); \ 94 \ 95 /* stage4 */ \ 96 ILVL_H2_SH(x3_m, x0_m, x2_m, x1_m, s4_m, s6_m); \ 97 ILVR_H2_SH(x3_m, x0_m, x2_m, x1_m, s5_m, s7_m); \ 98 \ 99 SPLATI_H2_SH(coeff_m, 4, 5, x0_m, x1_m); \ 100 x1_m = __msa_ilvev_h(x0_m, x1_m); \ 101 out1 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m); \ 102 \ 103 SPLATI_H2_SH(coeff_m, 6, 7, x2_m, x3_m); \ 104 x2_m = __msa_ilvev_h(x3_m, x2_m); \ 105 out5 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \ 106 \ 107 x1_m = __msa_splati_h(coeff_m, 5); \ 108 x0_m = -x0_m; \ 109 x0_m = __msa_ilvev_h(x1_m, x0_m); \ 110 out7 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m); \ 111 \ 112 x2_m = __msa_splati_h(coeff_m, 6); \ 113 x3_m = -x3_m; \ 114 x2_m = __msa_ilvev_h(x2_m, x3_m); \ 115 out3 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \ 116 } 117 118 #define FDCT8x16_EVEN(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ 119 out2, out3, out4, out5, out6, out7) \ 120 { \ 121 v8i16 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \ 122 v8i16 x0_m, x1_m, x2_m, x3_m; \ 123 v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, \ 124 cospi_4_64, cospi_28_64, cospi_12_64, cospi_20_64 }; \ 125 \ 126 /* FDCT stage1 */ \ 127 BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, s2_m, \ 128 s3_m, s4_m, s5_m, s6_m, s7_m); \ 129 BUTTERFLY_4(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \ 130 ILVL_H2_SH(x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \ 131 ILVR_H2_SH(x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \ 132 SPLATI_H2_SH(coeff_m, 0, 1, x0_m, x1_m); \ 133 x1_m = __msa_ilvev_h(x1_m, x0_m); \ 134 out4 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \ 135 \ 136 SPLATI_H2_SH(coeff_m, 2, 3, x2_m, x3_m); \ 137 x2_m = -x2_m; \ 138 x2_m = __msa_ilvev_h(x3_m, x2_m); \ 139 out6 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \ 140 \ 141 out0 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \ 142 x2_m = __msa_splati_h(coeff_m, 2); \ 143 x2_m = __msa_ilvev_h(x2_m, x3_m); \ 144 out2 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \ 145 \ 146 /* stage2 */ \ 147 ILVRL_H2_SH(s5_m, s6_m, s1_m, s0_m); \ 148 \ 149 s6_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \ 150 s5_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \ 151 \ 152 /* stage3 */ \ 153 BUTTERFLY_4(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m); \ 154 \ 155 /* stage4 */ \ 156 ILVL_H2_SH(x3_m, x0_m, x2_m, x1_m, s4_m, s6_m); \ 157 ILVR_H2_SH(x3_m, x0_m, x2_m, x1_m, s5_m, s7_m); \ 158 \ 159 SPLATI_H2_SH(coeff_m, 4, 5, x0_m, x1_m); \ 160 x1_m = __msa_ilvev_h(x0_m, x1_m); \ 161 out1 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m); \ 162 \ 163 SPLATI_H2_SH(coeff_m, 6, 7, x2_m, x3_m); \ 164 x2_m = __msa_ilvev_h(x3_m, x2_m); \ 165 out5 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \ 166 \ 167 x1_m = __msa_splati_h(coeff_m, 5); \ 168 x0_m = -x0_m; \ 169 x0_m = __msa_ilvev_h(x1_m, x0_m); \ 170 out7 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m); \ 171 \ 172 x2_m = __msa_splati_h(coeff_m, 6); \ 173 x3_m = -x3_m; \ 174 x2_m = __msa_ilvev_h(x2_m, x3_m); \ 175 out3 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \ 176 } 177 178 #define FDCT8x16_ODD(input0, input1, input2, input3, input4, input5, input6, \ 179 input7, out1, out3, out5, out7, out9, out11, out13, \ 180 out15) \ 181 { \ 182 v8i16 stp21_m, stp22_m, stp23_m, stp24_m, stp25_m, stp26_m; \ 183 v8i16 stp30_m, stp31_m, stp32_m, stp33_m, stp34_m, stp35_m; \ 184 v8i16 stp36_m, stp37_m, vec0_m, vec1_m; \ 185 v8i16 vec2_m, vec3_m, vec4_m, vec5_m, vec6_m; \ 186 v8i16 cnst0_m, cnst1_m, cnst4_m, cnst5_m; \ 187 v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, \ 188 -cospi_8_64, -cospi_24_64, cospi_12_64, cospi_20_64 }; \ 189 v8i16 coeff1_m = { cospi_2_64, cospi_30_64, cospi_14_64, cospi_18_64, \ 190 cospi_10_64, cospi_22_64, cospi_6_64, cospi_26_64 }; \ 191 v8i16 coeff2_m = { \ 192 -cospi_2_64, -cospi_10_64, -cospi_18_64, -cospi_26_64, 0, 0, 0, 0 \ 193 }; \ 194 \ 195 /* stp 1 */ \ 196 ILVL_H2_SH(input2, input5, input3, input4, vec2_m, vec4_m); \ 197 ILVR_H2_SH(input2, input5, input3, input4, vec3_m, vec5_m); \ 198 \ 199 cnst4_m = __msa_splati_h(coeff_m, 0); \ 200 stp25_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst4_m); \ 201 \ 202 cnst5_m = __msa_splati_h(coeff_m, 1); \ 203 cnst5_m = __msa_ilvev_h(cnst5_m, cnst4_m); \ 204 stp22_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst5_m); \ 205 stp24_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst4_m); \ 206 stp23_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst5_m); \ 207 \ 208 /* stp2 */ \ 209 BUTTERFLY_4(input0, input1, stp22_m, stp23_m, stp30_m, stp31_m, stp32_m, \ 210 stp33_m); \ 211 BUTTERFLY_4(input7, input6, stp25_m, stp24_m, stp37_m, stp36_m, stp35_m, \ 212 stp34_m); \ 213 \ 214 ILVL_H2_SH(stp36_m, stp31_m, stp35_m, stp32_m, vec2_m, vec4_m); \ 215 ILVR_H2_SH(stp36_m, stp31_m, stp35_m, stp32_m, vec3_m, vec5_m); \ 216 \ 217 SPLATI_H2_SH(coeff_m, 2, 3, cnst0_m, cnst1_m); \ 218 cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ 219 stp26_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m); \ 220 \ 221 cnst0_m = __msa_splati_h(coeff_m, 4); \ 222 cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ 223 stp21_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m); \ 224 \ 225 SPLATI_H2_SH(coeff_m, 5, 2, cnst0_m, cnst1_m); \ 226 cnst1_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ 227 stp25_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m); \ 228 \ 229 cnst0_m = __msa_splati_h(coeff_m, 3); \ 230 cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ 231 stp22_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m); \ 232 \ 233 /* stp4 */ \ 234 BUTTERFLY_4(stp30_m, stp37_m, stp26_m, stp21_m, vec6_m, vec2_m, vec4_m, \ 235 vec5_m); \ 236 BUTTERFLY_4(stp33_m, stp34_m, stp25_m, stp22_m, stp21_m, stp23_m, stp24_m, \ 237 stp31_m); \ 238 \ 239 ILVRL_H2_SH(vec2_m, vec6_m, vec1_m, vec0_m); \ 240 SPLATI_H2_SH(coeff1_m, 0, 1, cnst0_m, cnst1_m); \ 241 cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ 242 \ 243 out1 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ 244 \ 245 cnst0_m = __msa_splati_h(coeff2_m, 0); \ 246 cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ 247 out15 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ 248 \ 249 ILVRL_H2_SH(vec4_m, vec5_m, vec1_m, vec0_m); \ 250 SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m); \ 251 cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ 252 \ 253 out9 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \ 254 \ 255 cnst1_m = __msa_splati_h(coeff2_m, 2); \ 256 cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ 257 out7 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ 258 \ 259 ILVRL_H2_SH(stp23_m, stp21_m, vec1_m, vec0_m); \ 260 SPLATI_H2_SH(coeff1_m, 4, 5, cnst0_m, cnst1_m); \ 261 cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ 262 out5 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ 263 \ 264 cnst0_m = __msa_splati_h(coeff2_m, 1); \ 265 cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ 266 out11 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ 267 \ 268 ILVRL_H2_SH(stp24_m, stp31_m, vec1_m, vec0_m); \ 269 SPLATI_H2_SH(coeff1_m, 6, 7, cnst0_m, cnst1_m); \ 270 cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ 271 \ 272 out13 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \ 273 \ 274 cnst1_m = __msa_splati_h(coeff2_m, 3); \ 275 cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ 276 out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ 277 } 278 279 #define FDCT_POSTPROC_2V_NEG_H(vec0, vec1) \ 280 { \ 281 v8i16 tp0_m, tp1_m; \ 282 v8i16 one_m = __msa_ldi_h(1); \ 283 \ 284 tp0_m = __msa_clti_s_h(vec0, 0); \ 285 tp1_m = __msa_clti_s_h(vec1, 0); \ 286 vec0 += 1; \ 287 vec1 += 1; \ 288 tp0_m = one_m & tp0_m; \ 289 tp1_m = one_m & tp1_m; \ 290 vec0 += tp0_m; \ 291 vec1 += tp1_m; \ 292 vec0 >>= 2; \ 293 vec1 >>= 2; \ 294 } 295 296 #define FDCT32_POSTPROC_NEG_W(vec) \ 297 { \ 298 v4i32 temp_m; \ 299 v4i32 one_m = __msa_ldi_w(1); \ 300 \ 301 temp_m = __msa_clti_s_w(vec, 0); \ 302 vec += 1; \ 303 temp_m = one_m & temp_m; \ 304 vec += temp_m; \ 305 vec >>= 2; \ 306 } 307 308 #define FDCT32_POSTPROC_2V_POS_H(vec0, vec1) \ 309 { \ 310 v8i16 tp0_m, tp1_m; \ 311 v8i16 one = __msa_ldi_h(1); \ 312 \ 313 tp0_m = __msa_clei_s_h(vec0, 0); \ 314 tp1_m = __msa_clei_s_h(vec1, 0); \ 315 tp0_m = (v8i16)__msa_xori_b((v16u8)tp0_m, 255); \ 316 tp1_m = (v8i16)__msa_xori_b((v16u8)tp1_m, 255); \ 317 vec0 += 1; \ 318 vec1 += 1; \ 319 tp0_m = one & tp0_m; \ 320 tp1_m = one & tp1_m; \ 321 vec0 += tp0_m; \ 322 vec1 += tp1_m; \ 323 vec0 >>= 2; \ 324 vec1 >>= 2; \ 325 } 326 327 #define DOTP_CONST_PAIR_W(reg0_left, reg1_left, reg0_right, reg1_right, \ 328 const0, const1, out0, out1, out2, out3) \ 329 { \ 330 v4i32 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \ 331 v2i64 tp0_m, tp1_m, tp2_m, tp3_m; \ 332 v4i32 k0_m = __msa_fill_w((int32_t)const0); \ 333 \ 334 s0_m = __msa_fill_w((int32_t)const1); \ 335 k0_m = __msa_ilvev_w(s0_m, k0_m); \ 336 \ 337 ILVRL_W2_SW(-reg1_left, reg0_left, s1_m, s0_m); \ 338 ILVRL_W2_SW(reg0_left, reg1_left, s3_m, s2_m); \ 339 ILVRL_W2_SW(-reg1_right, reg0_right, s5_m, s4_m); \ 340 ILVRL_W2_SW(reg0_right, reg1_right, s7_m, s6_m); \ 341 \ 342 DOTP_SW2_SD(s0_m, s1_m, k0_m, k0_m, tp0_m, tp1_m); \ 343 DOTP_SW2_SD(s4_m, s5_m, k0_m, k0_m, tp2_m, tp3_m); \ 344 tp0_m = __msa_srari_d(tp0_m, DCT_CONST_BITS); \ 345 tp1_m = __msa_srari_d(tp1_m, DCT_CONST_BITS); \ 346 tp2_m = __msa_srari_d(tp2_m, DCT_CONST_BITS); \ 347 tp3_m = __msa_srari_d(tp3_m, DCT_CONST_BITS); \ 348 out0 = __msa_pckev_w((v4i32)tp0_m, (v4i32)tp1_m); \ 349 out1 = __msa_pckev_w((v4i32)tp2_m, (v4i32)tp3_m); \ 350 \ 351 DOTP_SW2_SD(s2_m, s3_m, k0_m, k0_m, tp0_m, tp1_m); \ 352 DOTP_SW2_SD(s6_m, s7_m, k0_m, k0_m, tp2_m, tp3_m); \ 353 tp0_m = __msa_srari_d(tp0_m, DCT_CONST_BITS); \ 354 tp1_m = __msa_srari_d(tp1_m, DCT_CONST_BITS); \ 355 tp2_m = __msa_srari_d(tp2_m, DCT_CONST_BITS); \ 356 tp3_m = __msa_srari_d(tp3_m, DCT_CONST_BITS); \ 357 out2 = __msa_pckev_w((v4i32)tp0_m, (v4i32)tp1_m); \ 358 out3 = __msa_pckev_w((v4i32)tp2_m, (v4i32)tp3_m); \ 359 } 360 361 void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr, 362 int32_t src_stride); 363 void fdct16x8_1d_row(int16_t *input, int16_t *output); 364 #endif // VPX_DSP_MIPS_FWD_TXFM_MSA_H_ 365