1/*! 2 * \copy 3 * Copyright (c) 2013, Cisco Systems 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 10 * * Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 21 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 22 * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 23 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 24 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 25 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 28 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 
 *
 */

#ifdef HAVE_NEON
#include "arm_arch_common_macro.S"

//===========================================================================
// ARMv7 (AArch32) NEON kernels: 4x4 DCT / IDCT, 4x4 and 2x2 Hadamard,
// quantization and dequantization on int16 coefficients.  GNU `as` syntax.
//
// NOTE(review): WELS_ASM_FUNC_BEGIN / WELS_ASM_FUNC_END are defined in
// arm_arch_common_macro.S; presumably they emit the exported symbol and the
// `bx lr` return — confirm there.  C prototypes for the functions below are
// inferred from register usage only and marked "presumably" — confirm
// against the declaring header.
//===========================================================================

// Load two 4x4 blocks of 8-bit pixels for the forward DCT.
// vld2.16 de-interleaves 16-bit units (each unit = two adjacent 8-bit
// pixels, as shown by the vsubl.u8 consumers at the call sites).
// input: \arg0~\arg3 = destination d-regs,
//        \arg4/\arg5 = src1 pointer / stride,
//        \arg6/\arg7 = src2 pointer / stride (both pointers post-incremented)
.macro LOAD_4x4_DATA_FOR_DCT arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
//  {
    vld2.16 {\arg0[0],\arg1[0]}, [\arg4], \arg5
    vld2.16 {\arg2[0],\arg3[0]}, [\arg6], \arg7
    vld2.16 {\arg0[1],\arg1[1]}, [\arg4], \arg5
    vld2.16 {\arg2[1],\arg3[1]}, [\arg6], \arg7

    vld2.16 {\arg0[2],\arg1[2]}, [\arg4], \arg5
    vld2.16 {\arg2[2],\arg3[2]}, [\arg6], \arg7
    vld2.16 {\arg0[3],\arg1[3]}, [\arg4], \arg5
    vld2.16 {\arg2[3],\arg3[3]}, [\arg6], \arg7
//  }
.endm

// Load two 8x4 rows-of-8 pixel blocks (one 8-byte row per vld1.64).
// input: \arg0~\arg3 = src1 rows, \arg4~\arg7 = src2 rows,
//        \arg8/\arg9 = src1/src2 pointers (post-incremented).
// Strides are taken from fixed registers: r2 = src1_stride, r4 = src2_stride
// (left untouched; callers must have them loaded).
.macro LOAD_8x8_DATA_FOR_DCT arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
//  {
    vld1.64 {\arg0}, [\arg8], r2
    vld1.64 {\arg4}, [\arg9], r4
    vld1.64 {\arg1}, [\arg8], r2
    vld1.64 {\arg5}, [\arg9], r4

    vld1.64 {\arg2}, [\arg8], r2
    vld1.64 {\arg6}, [\arg9], r4
    vld1.64 {\arg3}, [\arg8], r2
    vld1.64 {\arg7}, [\arg9], r4
//  }
.endm

// One 1-D pass of the H.264 4x4 forward core transform, all in int16.
// input: \arg0~\arg3 = data rows (overwritten with dct rows),
//        \arg4~\arg7 = scratch.
.macro DCT_ROW_TRANSFORM_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
//  {
    vadd.s16 \arg4, \arg0, \arg3        //int16 s[0] = data[i] + data[i3];
    vsub.s16 \arg7, \arg0, \arg3        //int16 s[3] = data[i] - data[i3];
    vadd.s16 \arg5, \arg1, \arg2        //int16 s[1] = data[i1] + data[i2];
    vsub.s16 \arg6, \arg1, \arg2        //int16 s[2] = data[i1] - data[i2];

    vadd.s16 \arg0, \arg4, \arg5        //int16 dct[i ] = s[0] + s[1];
    vsub.s16 \arg2, \arg4, \arg5        //int16 dct[i2] = s[0] - s[1];
    vshl.s16 \arg1, \arg7, #1           // 2*s[3]
    vshl.s16 \arg3, \arg6, #1           // 2*s[2]
    vadd.s16 \arg1, \arg1, \arg6        //int16 dct[i1] = (s[3] << 1) + s[2];
    vsub.s16 \arg3, \arg7, \arg3        //int16 dct[i3] = s[3] - (s[2] << 1);
//  }
.endm

// 4x4 int16 matrix transpose across four d (or q) registers.
.macro MATRIX_TRANSFORM_EACH_16BITS arg0, arg1, arg2, arg3
//  {   //  input & output: src_d[0]~[3]; [0 1 2 3]+[4 5 6 7]+[8 9 10 11]+[12 13 14 15]
    vtrn.s16 \arg0, \arg1               //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
    vtrn.s16 \arg2, \arg3               //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
    vtrn.32  \arg0, \arg2               //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
    vtrn.32  \arg1, \arg3               //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
//  }
.endm

// Quantize 8 coefficients:  out = sign(coef) * (((f + |coef|) * mf) >> 16).
// \arg0 = coef (q), \arg1 = f on entry / result on exit (q),
// \arg2/\arg3 MUST alias the low/high d-halves of \arg1 (see call sites),
// \arg4/\arg5 = mf halves, \arg6~\arg8 = scratch (\arg6 zeroed, reused as mask).
// NOTE(review): for coef==0 the sign step negates (f*mf)>>16; presumably the
// f/mf tables guarantee that product is 0 — confirm against the quant tables.
.macro NEWQUANT_COEF_EACH_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
//  {
    veor.s16 \arg6, \arg6               // init 0, and keep 0;
    vaba.s16 \arg1, \arg0, \arg6        // f + abs(coef - 0)
    vmull.s16 \arg7, \arg2, \arg4       // (f+|coef|) * mf, widened to s32 (low half)
    vmull.s16 \arg8, \arg3, \arg5       // (high half)
    vshr.s32 \arg7, #16
    vshr.s32 \arg8, #16
    vmovn.s32 \arg2, \arg7              // narrow back into the halves of \arg1
    vmovn.s32 \arg3, \arg8

    vcgt.s16 \arg7, \arg0, #0           // all-ones lanes where coef > 0
    vbif.s16 \arg6, \arg1, \arg7        // \arg6 = magnitude where coef <= 0, else 0
    vshl.s16 \arg6, #1                  // 2*magnitude on those lanes
    vsub.s16 \arg1, \arg1, \arg6        // coef > 0: unchanged; else q - 2q = -q
//  }
.endm

// Same as NEWQUANT_COEF_EACH_16BITS, additionally leaving the per-lane
// maximum of the two quantized-magnitude halves in \arg9 (taken before the
// sign is restored, so the maxima are non-negative).
.macro NEWQUANT_COEF_EACH_16BITS_MAX arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
//  {
    veor.s16 \arg6, \arg6               // init 0, and keep 0;
    vaba.s16 \arg1, \arg0, \arg6        // f + abs(coef - 0)
    vmull.s16 \arg7, \arg2, \arg4
    vmull.s16 \arg8, \arg3, \arg5
    vshr.s32 \arg7, #16
    vshr.s32 \arg8, #16
    vmovn.s32 \arg2, \arg7
    vmovn.s32 \arg3, \arg8

    vcgt.s16 \arg7, \arg0, #0           // all-ones lanes where coef > 0
    vbif.s16 \arg6, \arg1, \arg7        // magnitude where coef <= 0, else 0
    vshl.s16 \arg6, #1
    vmax.s16 \arg9, \arg2, \arg3        // record max quantized magnitude
    vsub.s16 \arg1, \arg1, \arg6        // restore sign as above
//  }
.endm

// Quantize the 4 coefficients of one d-register (used for the 2x2 DC block):
//   out = sign(coef) * (((f + |coef|) * mf) >> 16)
// \arg0 = coef, \arg1 = f in / result out, \arg2 = mf (clobbered: becomes the
// sign mask), \arg3 = all-zero scratch, \arg4 = widening scratch (q).
.macro QUANT_DUALWORD_COEF_EACH_16BITS arg0, arg1, arg2, arg3, arg4
//  {
    vaba.s16 \arg1, \arg0, \arg3        // f + abs(coef - 0)
    vmull.s16 \arg4, \arg1, \arg2       // *= mf
    vshr.s32 \arg4, #16
    vmovn.s32 \arg1, \arg4              // >> 16

    vcgt.s16 \arg2, \arg0, #0           // all-ones lanes where coef > 0
    vbif.s16 \arg3, \arg1, \arg2        // magnitude where coef <= 0, else 0
    vshl.s16 \arg3, #1
    vsub.s16 \arg1, \arg1, \arg3        // coef > 0: unchanged; else negate
//  }
.endm

// Count zero-valued lanes in one d-register of 4 coefficients.
// \arg0 = coef, \arg1 = result (count in lane 0), \arg2 = all-0x0001 mask.
.macro DC_ZERO_COUNT_IN_DUALWORD arg0, arg1, arg2
//  {
    vceq.s16 \arg1, \arg0, #0           // all-ones where coef == 0
    vand.s16 \arg1, \arg2               // -> 1 per zero lane
    vpadd.s16 \arg1, \arg1, \arg1       // horizontal sum
    vpadd.s16 \arg1, \arg1, \arg1
//  }
.endm

// Reduce two q-registers of (non-negative) magnitudes to running maxima.
// \arg2 must alias the q-register whose halves are \arg3/\arg4.
.macro SELECT_MAX_IN_ABS_COEF arg0, arg1, arg2, arg3, arg4
//  {   //  input: coef_0, coef_1, max_q (aliases \arg3/\arg4); output: \arg3
    vmax.s16 \arg2, \arg0, \arg1        // lane-wise max of the two inputs
    vpmax.s16 \arg3, \arg3, \arg4       // pairwise-reduce: max 1st in \arg3[0][1] & max 2nd in \arg3[2][3]
    vpmax.s16 \arg3, \arg3, \arg4       // max 1st in \arg3[0][1]
//  }
.endm

// Count zero coefficients in two q-registers (16 lanes total).
// \arg0/\arg1 = coefs (clobbered; their d-halves are \arg3\arg4 and
// \arg5\arg6), \arg2 = all-0x0001 mask; result count ends in \arg3[0].
.macro ZERO_COUNT_IN_2_QUARWORD arg0, arg1, arg2, arg3, arg4, arg5, arg6
//  {
    vceq.s16 \arg0, #0                  // all-ones where coef == 0
    vceq.s16 \arg1, #0
    vand.s16 \arg0, \arg2               // -> 1 per zero lane
    vand.s16 \arg1, \arg2

    vpadd.s16 \arg3, \arg3, \arg5       // horizontal sums
    vpadd.s16 \arg4, \arg4, \arg6
    vpadd.s16 \arg3, \arg3, \arg4       // 8-->4
    vpadd.s16 \arg3, \arg3, \arg3
    vpadd.s16 \arg3, \arg3, \arg3
//  }
.endm

// One butterfly stage of the 2x2 DC Hadamard on a d-register holding
// [rs[0] rs[16] rs[32] rs[48]]; result lands in \arg2 (applied twice by
// callers to complete the transform).  \arg1 = scratch.
.macro HDM_QUANT_2x2_TOTAL_16BITS arg0, arg1, arg2
//  {
    vshr.s64 \arg1, \arg0, #32          // move upper pair to lower lanes
    vadd.s16 \arg2, \arg0, \arg1        // [0] = rs[0] + rs[32]; [1] = rs[16] + rs[48];
    vsub.s16 \arg1, \arg0, \arg1        // [0] = rs[0] - rs[32]; [1] = rs[16] - rs[48];
    vtrn.s16 \arg2, \arg1
    vtrn.s32 \arg2, \arg1               // interleave sums/differences into \arg2
//  }
.endm

// In-place 1-D inverse Hadamard over groups of 4 lanes of \arg0
// (no scaling).  \arg1/\arg2 = scratch.
.macro IHDM_4x4_TOTAL_16BITS arg0, arg1, arg2
//  {
    vshr.s64 \arg1, \arg0, #32
    vadd.s16 \arg2, \arg0, \arg1        // [0] = rs[0] + rs[2]; [1] = rs[1] + rs[3];
    vsub.s16 \arg1, \arg0, \arg1        // [0] = rs[0] - rs[2]; [1] = rs[1] - rs[3];
    vtrn.s16 \arg2, \arg1
    vrev32.16 \arg1, \arg1
    vtrn.s32 \arg2, \arg1               // [0]=rs[0]+rs[2]; [1]=rs[0]-rs[2]; [2]=rs[1]-rs[3]; [3]=rs[1]+rs[3];

    vrev64.16 \arg1, \arg2
    vadd.s16 \arg0, \arg2, \arg1        // [0] = rs[0] + rs[3]; [1] = rs[1] + rs[2];
    vsub.s16 \arg1, \arg2, \arg1
    vrev32.16 \arg1, \arg1              // [0] = rs[1] - rs[2]; [1] = rs[0] - rs[3];
    vtrn.s32 \arg0, \arg1               // [0]=rs[0]+rs[3]; [1]=rs[1]+rs[2]; [2]=rs[1]-rs[2]; [3]=rs[0]-rs[3];
//  }
.endm

// rec = clip_u8(pred + residual): widen 8-bit pred, add int16 residual,
// saturate-narrow back to u8.
// \arg0/\arg1 = pred d-regs (in/out), \arg2/\arg3 = residual, \arg4/\arg5 = scratch.
.macro MB_PRED_8BITS_ADD_DCT_16BITS_CLIP arg0, arg1, arg2, arg3, arg4, arg5
//  {
    vmovl.u8 \arg4,\arg0
    vmovl.u8 \arg5,\arg1
    vadd.s16 \arg4,\arg2
    vadd.s16 \arg5,\arg3
    vqmovun.s16 \arg0,\arg4             // saturating narrow to [0, 255]
    vqmovun.s16 \arg1,\arg5
//  }
.endm

// First half of one 1-D inverse-core-transform pass, int16 only.
// input: src_d[0]~[3]; output: e_d[0]~[3].
.macro ROW_TRANSFORM_1_STEP_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
//  {
    vadd.s16 \arg4, \arg0, \arg2        //int16 e[i][0] = src[0] + src[2];
    vsub.s16 \arg5, \arg0, \arg2        //int16 e[i][1] = src[0] - src[2];
    vshr.s16 \arg6, \arg1, #1
    vshr.s16 \arg7, \arg3, #1
    vsub.s16 \arg6, \arg6, \arg3        //int16 e[i][2] = (src[1]>>1) - src[3];
    vadd.s16 \arg7, \arg1, \arg7        //int16 e[i][3] = src[1] + (src[3]>>1);
//  }
.endm

// Second half (butterfly) of a 1-D transform pass, int16; used for both
// row and column passes.
.macro TRANSFORM_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 // both row & col transform used
//  {   //  output: f_q[0]~[3], input: e_q[0]~[3];
    vadd.s16 \arg0, \arg4, \arg7        //int16 f[i][0] = e[i][0] + e[i][3];
    vadd.s16 \arg1, \arg5, \arg6        //int16 f[i][1] = e[i][1] + e[i][2];
    vsub.s16 \arg2, \arg5, \arg6        //int16 f[i][2] = e[i][1] - e[i][2];
    vsub.s16 \arg3, \arg4, \arg7        //int16 f[i][3] = e[i][0] - e[i][3];
//  }
.endm


// Widening (s16 -> s32) row pass without the >>1 terms (Hadamard/DC path).
.macro ROW_TRANSFORM_0_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
//  {   //  input: src_d[0]~[3], output: e_q[0]~[3];
    vaddl.s16 \arg4, \arg0, \arg2       //int32 e[i][0] = src[0] + src[2];
    vsubl.s16 \arg5, \arg0, \arg2       //int32 e[i][1] = src[0] - src[2];
    vsubl.s16 \arg6, \arg1, \arg3       //int32 e[i][2] = src[1] - src[3];
    vaddl.s16 \arg7, \arg1, \arg3       //int32 e[i][3] = src[1] + src[3];
//  }
.endm

// Widening (s16 -> s32) row pass with the >>1 terms (inverse core transform).
// \arg8/\arg9 = scratch for the pre-shifted operands.
.macro ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
//  {   //  input: src_d[0]~[3], output: e_q[0]~[3]; working: \arg8 \arg9
    vaddl.s16 \arg4, \arg0, \arg2       //int32 e[i][0] = src[0] + src[2];
    vsubl.s16 \arg5, \arg0, \arg2       //int32 e[i][1] = src[0] - src[2];
    vshr.s16 \arg8, \arg1, #1
    vshr.s16 \arg9, \arg3, #1
    vsubl.s16 \arg6, \arg8, \arg3       //int32 e[i][2] = (src[1]>>1) - src[3];
    vaddl.s16 \arg7, \arg1, \arg9       //int32 e[i][3] = src[1] + (src[3]>>1);
//  }
.endm

// Butterfly on int32 lanes; used for both row and column passes.
.macro TRANSFORM_4BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 // both row & col transform used
//  {   //  output: f_q[0]~[3], input: e_q[0]~[3];
    vadd.s32 \arg0, \arg4, \arg7        //int32 f[i][0] = e[i][0] + e[i][3];
    vadd.s32 \arg1, \arg5, \arg6        //int32 f[i][1] = e[i][1] + e[i][2];
    vsub.s32 \arg2, \arg5, \arg6        //int32 f[i][2] = e[i][1] - e[i][2];
    vsub.s32 \arg3, \arg4, \arg7        //int32 f[i][3] = e[i][0] - e[i][3];
//  }
.endm

// Column pass without >>1 terms (Hadamard/DC path), int32.
.macro COL_TRANSFORM_0_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
//  {   //  input: src_q[0]~[3], output: e_q[0]~[3];
    vadd.s32 \arg4, \arg0, \arg2        //int32 e[0][j] = f[0][j] + f[2][j];
    vsub.s32 \arg5, \arg0, \arg2        //int32 e[1][j] = f[0][j] - f[2][j];
    vsub.s32 \arg6, \arg1, \arg3        //int32 e[2][j] = f[1][j] - f[3][j]; (no >>1 in the DC path)
    vadd.s32 \arg7, \arg1, \arg3        //int32 e[3][j] = f[1][j] + f[3][j];
//  }
.endm

// Column pass with >>1 terms (inverse core transform), int32.
.macro COL_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
//  {   //  input: src_q[0]~[3], output: e_q[0]~[3];
    vadd.s32 \arg4, \arg0, \arg2        //int32 e[0][j] = f[0][j] + f[2][j];
    vsub.s32 \arg5, \arg0, \arg2        //int32 e[1][j] = f[0][j] - f[2][j];
    vshr.s32 \arg6, \arg1, #1
    vshr.s32 \arg7, \arg3, #1
    vsub.s32 \arg6, \arg6, \arg3        //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
    vadd.s32 \arg7, \arg1, \arg7        //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
//  }
.endm


//---------------------------------------------------------------------------
// 4x4 forward DCT of the residual between two pixel blocks.
// In:  r0 = int16 dct out (16 coeffs), r1/r2 = pix1 ptr/stride,
//      r3 = pix2 ptr, [sp] = pix2 stride.
// NOTE(review): presumably WelsDctT4(int16_t*, uint8_t*, int32_t,
// uint8_t*, int32_t) — confirm against the header.
//---------------------------------------------------------------------------
WELS_ASM_FUNC_BEGIN WelsDctT4_neon
    push {r4}
    ldr  r4, [sp, #4]                   // pix2 stride (was [sp] before push)

    LOAD_4x4_DATA_FOR_DCT d4, d5, d6, d7, r1, r2, r3, r4

    vsubl.u8 q0, d4, d6                 // residual = pix1 - pix2, widened to s16
    vsubl.u8 q1, d5, d7
    vtrn.s32 q0, q1                     // regroup de-interleaved columns
    vswp     d1, d2                     // into row order d0~d3

    // horizontal transform
    DCT_ROW_TRANSFORM_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7

    // transpose
    MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3

    // vertical transform
    DCT_ROW_TRANSFORM_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7

    // transpose back
    MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3

    vst1.s16 {q0, q1}, [r0]!

    pop {r4}
WELS_ASM_FUNC_END


//---------------------------------------------------------------------------
// Four 4x4 forward DCTs over an 8x8 residual area (two 8x4 halves).
// In:  r0 = int16 dct out (64 coeffs), r1/r2 = pix1 ptr/stride,
//      r3 = pix2 ptr, [sp] = pix2 stride.
//---------------------------------------------------------------------------
WELS_ASM_FUNC_BEGIN WelsDctFourT4_neon
    push {r4}
    ldr  r4, [sp, #4]                   // pix2 stride

    // ---- upper 8x4 half ----
    LOAD_8x8_DATA_FOR_DCT d16, d17, d18, d19, d20, d21, d22, d23, r1, r3

    vsubl.u8 q0, d16, d20               // residual rows, widened to s16
    vsubl.u8 q1, d17, d21
    vsubl.u8 q2, d18, d22
    vsubl.u8 q3, d19, d23
    MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3

    // horizontal transform
    DCT_ROW_TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11

    // transpose
    MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3

    // vertical transform
    DCT_ROW_TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11

    vswp d1, d2                         // regroup so each 4x4 block is stored
    vswp d5, d6                         // contiguously
    vswp q1, q2
    vst1.s16 {q0, q1}, [r0]!
    vst1.s16 {q2, q3}, [r0]!

    // ---- lower 8x4 half (same pipeline) ----
    LOAD_8x8_DATA_FOR_DCT d16, d17, d18, d19, d20, d21, d22, d23, r1, r3

    vsubl.u8 q0, d16, d20
    vsubl.u8 q1, d17, d21
    vsubl.u8 q2, d18, d22
    vsubl.u8 q3, d19, d23
    MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3

    // horizontal transform
    DCT_ROW_TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11

    // transpose
    MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3

    // vertical transform
    DCT_ROW_TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11

    vswp d1, d2
    vswp d5, d6
    vswp q1, q2
    vst1.s16 {q0, q1}, [r0]!
    vst1.s16 {q2, q3}, [r0]!

    pop {r4}
WELS_ASM_FUNC_END


//---------------------------------------------------------------------------
// Quantize one 4x4 block in place.
// In:  r0 = coefs in/out (16 x int16), r1 = ff table (8 x int16, reused for
//      both halves), r2 = mf table.
//---------------------------------------------------------------------------
WELS_ASM_FUNC_BEGIN WelsQuant4x4_neon
    vld1.s16 {q2}, [r1]                 // ff
    vld1.s16 {q0, q1}, [r0]             // coefficients
    vld1.s16 {q3}, [r2]                 // mf

    vmov q8, q2                         // second copy of ff (q2 is overwritten)

    NEWQUANT_COEF_EACH_16BITS q0, q2, d4, d5, d6, d7, q9, q10, q11
    vst1.s16 {q2}, [r0]!

    NEWQUANT_COEF_EACH_16BITS q1, q8, d16, d17, d6, d7, q9, q10, q11
    vst1.s16 {q8}, [r0]!

WELS_ASM_FUNC_END


//---------------------------------------------------------------------------
// Quantize one 4x4 block in place with scalar ff/mf (DC-only rates).
// In:  r0 = coefs in/out, r1 = ff scalar, r2 = mf scalar.
//---------------------------------------------------------------------------
WELS_ASM_FUNC_BEGIN WelsQuant4x4Dc_neon

    vld1.s16 {q0, q1}, [r0]
    vdup.s16 q2, r1                     // even ff range [0, 768]
    vdup.s16 q3, r2

    vmov q8, q2

    NEWQUANT_COEF_EACH_16BITS q0, q2, d4, d5, d6, d7, q9, q10, q11
    vst1.s16 {q2}, [r0]!

    NEWQUANT_COEF_EACH_16BITS q1, q8, d16, d17, d6, d7, q9, q10, q11
    vst1.s16 {q8}, [r0]!

WELS_ASM_FUNC_END


//---------------------------------------------------------------------------
// Quantize four consecutive 4x4 blocks in place.
// In:  r0 = coefs in/out (64 x int16), r1 = ff table, r2 = mf table.
//---------------------------------------------------------------------------
WELS_ASM_FUNC_BEGIN WelsQuantFour4x4_neon
    vld1.s16 {q2}, [r1]                 // ff (kept intact in q2; copied per use)
    vld1.s16 {q3}, [r2]                 // mf
    mov r1, r0                          // r1 = write cursor, r0 = read cursor

    // block 0
    vld1.s16 {q0, q1}, [r0]!
    vmov q8, q2
    NEWQUANT_COEF_EACH_16BITS q0, q8, d16, d17, d6, d7, q9, q10, q11
    vst1.s16 {q8}, [r1]!
    vmov q8, q2
    NEWQUANT_COEF_EACH_16BITS q1, q8, d16, d17, d6, d7, q9, q10, q11
    vst1.s16 {q8}, [r1]!

    // block 1
    vld1.s16 {q0, q1}, [r0]!
    vmov q8, q2
    NEWQUANT_COEF_EACH_16BITS q0, q8, d16, d17, d6, d7, q9, q10, q11
    vst1.s16 {q8}, [r1]!
    vmov q8, q2
    NEWQUANT_COEF_EACH_16BITS q1, q8, d16, d17, d6, d7, q9, q10, q11
    vst1.s16 {q8}, [r1]!

    // block 2
    vld1.s16 {q0, q1}, [r0]!
    vmov q8, q2
    NEWQUANT_COEF_EACH_16BITS q0, q8, d16, d17, d6, d7, q9, q10, q11
    vst1.s16 {q8}, [r1]!
    vmov q8, q2
    NEWQUANT_COEF_EACH_16BITS q1, q8, d16, d17, d6, d7, q9, q10, q11
    vst1.s16 {q8}, [r1]!

    // block 3
    vld1.s16 {q0, q1}, [r0]!
    vmov q8, q2
    NEWQUANT_COEF_EACH_16BITS q0, q8, d16, d17, d6, d7, q9, q10, q11
    vst1.s16 {q8}, [r1]!
    vmov q8, q2
    NEWQUANT_COEF_EACH_16BITS q1, q8, d16, d17, d6, d7, q9, q10, q11
    vst1.s16 {q8}, [r1]!

WELS_ASM_FUNC_END


//---------------------------------------------------------------------------
// Quantize four 4x4 blocks in place, also writing the per-pair-of-blocks
// maximum quantized magnitudes.
// In:  r0 = coefs in/out, r1 = ff table, r2 = mf table,
//      r3 = out: two 32-bit stores, each holding two 16-bit maxima.
//---------------------------------------------------------------------------
WELS_ASM_FUNC_BEGIN WelsQuantFour4x4Max_neon
    vld1.s16 {q2}, [r1]                 // ff
    vld1.s16 {q3}, [r2]                 // mf
    mov r1, r0                          // r1 = write cursor, r0 = read cursor

    // blocks 0 and 1; magnitudes accumulate in d26~d29
    vld1.s16 {q0, q1}, [r0]!
    vmov q8, q2
    NEWQUANT_COEF_EACH_16BITS_MAX q0, q8, d16, d17, d6, d7, q9, q10, q11, d26
    vst1.s16 {q8}, [r1]!
    vmov q12, q2
    NEWQUANT_COEF_EACH_16BITS_MAX q1, q12, d24, d25, d6, d7, q9, q10, q11, d28
    vst1.s16 {q12}, [r1]!               // then 1st 16 elem in d26 & d28

    vld1.s16 {q0, q1}, [r0]!
    vmov q8, q2
    NEWQUANT_COEF_EACH_16BITS_MAX q0, q8, d16, d17, d6, d7, q9, q10, q11, d27
    vst1.s16 {q8}, [r1]!
    vmov q12, q2
    NEWQUANT_COEF_EACH_16BITS_MAX q1, q12, d24, d25, d6, d7, q9, q10, q11, d29
    vst1.s16 {q12}, [r1]!               // then 2nd 16 elem in d27 & d29

    SELECT_MAX_IN_ABS_COEF q13, q14, q0, d0, d1
    vst1.s32 {d0[0]}, [r3]!             // two 16-bit maxima for blocks 0/1

    // blocks 2 and 3 (same pipeline)
    vld1.s16 {q0, q1}, [r0]!
    vmov q8, q2
    NEWQUANT_COEF_EACH_16BITS_MAX q0, q8, d16, d17, d6, d7, q9, q10, q11, d26
    vst1.s16 {q8}, [r1]!
    vmov q12, q2
    NEWQUANT_COEF_EACH_16BITS_MAX q1, q12, d24, d25, d6, d7, q9, q10, q11, d28
    vst1.s16 {q12}, [r1]!               // then 3rd 16 elem in d26 & d28

    vld1.s16 {q0, q1}, [r0]!
    vmov q8, q2
    NEWQUANT_COEF_EACH_16BITS_MAX q0, q8, d16, d17, d6, d7, q9, q10, q11, d27
    vst1.s16 {q8}, [r1]!
    vmov q12, q2
    NEWQUANT_COEF_EACH_16BITS_MAX q1, q12, d24, d25, d6, d7, q9, q10, q11, d29
    vst1.s16 {q12}, [r1]!               // then 4th 16 elem in d27 & d29

    SELECT_MAX_IN_ABS_COEF q13, q14, q0, d0, d1
    vst1.s32 {d0[0]}, [r3]!             // two 16-bit maxima for blocks 2/3

WELS_ASM_FUNC_END


//---------------------------------------------------------------------------
// 4x4 Hadamard transform of the 16 luma DC coefficients (gathered with a
// stride of 64 bytes = one 4x4 block of int16), result >> 1 with rounding.
// In:  r0 = int16 out (16 values), r1 = base of strided DC coefficients.
//---------------------------------------------------------------------------
WELS_ASM_FUNC_BEGIN WelsHadamardT4Dc_neon
    push {r2,r3}
    mov r2, #64                         // 2*16*sizeof(int16_t): one 4x4 block
    add r3, r1, #32                     // second interleaved read cursor

    // gather 16 DC values, transposing on the fly
    vld1.s16 {d0}, [r1], r2
    vld1.s16 {d1}, [r3], r2
    vld1.s16 {d4}, [r1], r2
    vld1.s16 {d5}, [r3], r2
    vld1.s16 {d2}, [r1], r2
    vld1.s16 {d3}, [r3], r2
    vld1.s16 {d6}, [r1], r2
    vld1.s16 {d7}, [r3], r2
    vtrn.16 q0, q2                      // d0[0 4], d1[1 5]
    vtrn.16 q1, q3                      // d2[2 6], d3[3 7]

    vld1.s16 {d16}, [r1], r2
    vld1.s16 {d17}, [r3], r2
    vld1.s16 {d20}, [r1], r2
    vld1.s16 {d21}, [r3], r2
    vld1.s16 {d18}, [r1], r2
    vld1.s16 {d19}, [r3], r2
    vld1.s16 {d22}, [r1], r2
    vld1.s16 {d23}, [r3], r2
    vtrn.16 q8, q10                     //d16[08 12],d17[09 13]
    vtrn.16 q9, q11                     //d18[10 14],d19[11 15]

    vtrn.32 q0, q8                      // d0 [0 4 08 12] = dct[idx], d1[1 5 09 13] = dct[idx+16]
    vtrn.32 q1, q9                      // d2 [2 6 10 14] = dct[idx+64], d3[3 7 11 15] = dct[idx+80]

    // row pass (widening to s32)
    ROW_TRANSFORM_0_STEP d0, d1, d3, d2, q8, q11, q10, q9

    TRANSFORM_4BYTES q0, q1, q3, q2, q8, q11, q10, q9

    // transpose the 4x4 of 32-bit elements
    vtrn.s32 q0, q1                     //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
    vtrn.s32 q2, q3                     //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
    vswp d1, d4                         //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
    vswp d3, d6                         //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]

    // column pass
    COL_TRANSFORM_0_STEP q0, q1, q3, q2, q8, q11, q10, q9

    TRANSFORM_4BYTES q0, q1, q3, q2, q8, q11, q10, q9

    vrshrn.s32 d16, q0, #1              // (x + 1) >> 1, narrowed back to s16
    vrshrn.s32 d17, q1, #1
    vrshrn.s32 d18, q2, #1
    vrshrn.s32 d19, q3, #1
    vst1.16 {q8, q9}, [r0]              //store

    pop {r2,r3}
WELS_ASM_FUNC_END


//---------------------------------------------------------------------------
// 2x2 Hadamard + quantization of the 4 chroma DC coefficients (gathered at
// a stride of 32 bytes, zeroed in the source as they are read).
// In:  r0 = rs (coef source, zeroed), r1 = ff scalar, r2 = mf scalar,
//      r3 = dct out (4 x int16), [sp] = block out (4 x int16).
// Out: r0 = number of non-zero quantized coefficients (4 - zero count).
//---------------------------------------------------------------------------
WELS_ASM_FUNC_BEGIN WelsHadamardQuant2x2_neon

    vdup.s16 d1, r1                     //ff
    vdup.s16 d2, r2                     //mf
    veor d3, d3                         // d3 = 0 (also the zero to store back)

    mov r1, #32                         // 16 int16 = one 4x4 block stride
    mov r2, r0

    vld1.s16 {d0[0]}, [r0], r1          //rs[00]
    vst1.s16 {d3[0]}, [r2], r1          //rs[00]=0
    vld1.s16 {d0[1]}, [r0], r1          //rs[16]
    vst1.s16 {d3[0]}, [r2], r1          //rs[16]=0
    vld1.s16 {d0[2]}, [r0], r1          //rs[32]
    vst1.s16 {d3[0]}, [r2], r1          //rs[32]=0
    vld1.s16 {d0[3]}, [r0], r1          //rs[48]
    vst1.s16 {d3[0]}, [r2], r1          //rs[48]=0

    HDM_QUANT_2x2_TOTAL_16BITS d0, d4, d5   // output d5

    HDM_QUANT_2x2_TOTAL_16BITS d5, d4, d0   // output d0

    QUANT_DUALWORD_COEF_EACH_16BITS d0, d1, d2, d3, q2

    vst1.s16 d1, [r3]                   // store to dct
    ldr r2, [sp, #0]
    vst1.s16 d1, [r2]                   // store to block

    mov r1, #1
    vdup.s16 d3, r1                     // all-ones mask for the zero count
    DC_ZERO_COUNT_IN_DUALWORD d1, d0, d3

    vmov r0, r1, d0
    and r0, #0x07                       // range [0~4]
    rsb r0, #4                          // r0 = 4 - zero_count
WELS_ASM_FUNC_END


//---------------------------------------------------------------------------
// 2x2 Hadamard of the 4 chroma DC coefficients, then test against a
// threshold (quant skip decision).
// In:  r0 = coef source (stride 32 bytes), r1 = threshold.
// Out: r0 != 0 iff any |dct| > threshold.
//---------------------------------------------------------------------------
WELS_ASM_FUNC_BEGIN WelsHadamardQuant2x2SkipKernel_neon

    vdup.s16 d3, r1                     // threshold
    mov r1, #32
    vld1.s16 {d0[0]}, [r0], r1          //rs[00]
    vld1.s16 {d0[1]}, [r0], r1          //rs[16]
    vld1.s16 {d0[2]}, [r0], r1          //rs[32]
    vld1.s16 {d0[3]}, [r0], r1          //rs[48]

    HDM_QUANT_2x2_TOTAL_16BITS d0, d1, d2   // output d2

    HDM_QUANT_2x2_TOTAL_16BITS d2, d1, d0   // output d0

    vabs.s16 d1, d0
    vcgt.s16 d1, d1, d3                 // abs(dct[i]) > threshold;
    vmov r0, r1, d1
    orr r0, r1                          // fold the 4 lane masks into one flag
WELS_ASM_FUNC_END


//---------------------------------------------------------------------------
// Count non-zero coefficients among 16 int16 values.
// In:  r0 = coef pointer.  Out: r0 = 16 - zero_count.
//---------------------------------------------------------------------------
WELS_ASM_FUNC_BEGIN WelsGetNoneZeroCount_neon
    push {r1}
    vld1.s16 {q0, q1}, [r0]
    vmov.s16 q8, #1                     // per-lane increment mask

    ZERO_COUNT_IN_2_QUARWORD q0, q1, q8, d0, d1, d2, d3
    vmov r0, r1, d0
    and r0, #0x1F                       // range [0~16]
    rsb r0, #16                         // r0 = 16 - zero_count
    pop {r1}
WELS_ASM_FUNC_END


//---------------------------------------------------------------------------
// Dequantize one 4x4 block in place: coef[i] *= mf[i].
// In:  r0 = coefs in/out (16 x int16), r1 = mf table.
//---------------------------------------------------------------------------
WELS_ASM_FUNC_BEGIN WelsDequant4x4_neon
    vld1.s16 {q0, q1}, [r0]
    vld1.u16 {q2}, [r1]                 // same 8-entry mf reused for both halves

    vmul.s16 q8, q0, q2
    vmul.s16 q9, q1, q2

    vst1.s16 {q8, q9}, [r0]
WELS_ASM_FUNC_END


//---------------------------------------------------------------------------
// Dequantize four consecutive 4x4 blocks in place (loads are interleaved
// with multiplies to hide latency).
// In:  r0 = coefs in/out (64 x int16), r1 = mf table.
//---------------------------------------------------------------------------
WELS_ASM_FUNC_BEGIN WelsDequantFour4x4_neon
    vld1.u16 {q12}, [r1]                // mf
    mov r1, r0                          // r1 = write cursor, r0 = read cursor
    vld1.s16 {q0, q1}, [r0]!
    vld1.s16 {q2, q3}, [r0]!
    vmul.s16 q0, q0, q12
    vld1.s16 {q8, q9}, [r0]!
    vmul.s16 q1, q1, q12
    vld1.s16 {q10, q11}, [r0]!

    vst1.s16 {q0, q1}, [r1]!

    vmul.s16 q2, q2, q12
    vmul.s16 q3, q3, q12
    vmul.s16 q8, q8, q12
    vst1.s16 {q2, q3}, [r1]!

    vmul.s16 q9, q9, q12
    vmul.s16 q10, q10, q12
    vmul.s16 q11, q11, q12
    vst1.s16 {q8, q9}, [r1]!
    vst1.s16 {q10, q11}, [r1]!

WELS_ASM_FUNC_END


//---------------------------------------------------------------------------
// 4x4 inverse Hadamard of the luma DC block, then scale by a scalar.
// In:  r0 = coefs in/out (16 x int16), r1 = scalar multiplier.
//---------------------------------------------------------------------------
WELS_ASM_FUNC_BEGIN WelsDequantIHadamard4x4_neon

    vld1.s16 {q0, q1}, [r0]
    vdup.s16 q8, r1                     // scale factor

    // horizontal pass
    IHDM_4x4_TOTAL_16BITS q0, q2, q3
    IHDM_4x4_TOTAL_16BITS q1, q2, q3

    MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3

    // vertical pass, fused with the scaling
    IHDM_4x4_TOTAL_16BITS q0, q2, q3
    vmul.s16 q0, q8

    IHDM_4x4_TOTAL_16BITS q1, q2, q3
    vmul.s16 q1, q8

    MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3
    vst1.s16 {q0, q1}, [r0]
WELS_ASM_FUNC_END


//---------------------------------------------------------------------------
// 4x4 inverse core transform + reconstruction: rec = clip(pred + idct).
// In:  r0/r1 = rec ptr/stride, r2/r3 = pred ptr/stride, [sp] = dct coeffs.
//---------------------------------------------------------------------------
WELS_ASM_FUNC_BEGIN WelsIDctT4Rec_neon
    vld1.u32 {d16[0]}, [r2], r3         // pred rows 0..3 gathered into d16/d17
    push {r4}
    ldr  r4, [sp, #4]                   // dct pointer (was [sp] before push)
    vld1.u32 {d16[1]}, [r2], r3

    vld4.s16 {d0, d1, d2, d3}, [r4]     // load dct transposed; cost 3 cycles!
    vld1.u32 {d17[0]}, [r2], r3
    vld1.u32 {d17[1]}, [r2], r3         // d16/d17 = 4x4 pred

    // horizontal pass
    ROW_TRANSFORM_1_STEP_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7

    TRANSFORM_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7

    MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3

    // vertical pass
    ROW_TRANSFORM_1_STEP_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7

    TRANSFORM_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7
    vrshr.s16 d0, d0, #6                // rounding >> 6
    vrshr.s16 d1, d1, #6
    vrshr.s16 d2, d2, #6
    vrshr.s16 d3, d3, #6

    //after rounding 6, clip into [0, 255]
    vmovl.u8 q2,d16
    vadd.s16 q0,q2
    vqmovun.s16 d16,q0
    vst1.32 {d16[0]},[r0],r1
    vst1.32 {d16[1]},[r0],r1

    vmovl.u8 q2,d17
    vadd.s16 q1,q2
    vqmovun.s16 d17,q1
    vst1.32 {d17[0]},[r0],r1
    vst1.32 {d17[1]},[r0]

    pop {r4}
WELS_ASM_FUNC_END


//---------------------------------------------------------------------------
// Four 4x4 inverse transforms + reconstruction over an 8x8 area
// (two 8x4 halves).
// In:  r0/r1 = rec ptr/stride, r2/r3 = pred ptr/stride, [sp] = dct coeffs.
//---------------------------------------------------------------------------
WELS_ASM_FUNC_BEGIN WelsIDctFourT4Rec_neon

    // ---- upper 8x4 half ----
    vld1.u64 {d24}, [r2], r3            // pred rows into d24~d27
    push {r4}
    ldr  r4, [sp, #4]                   // dct pointer
    vld1.u64 {d25}, [r2], r3

    vld4.s16 {d0, d1, d2, d3}, [r4]!    // load dct transposed; cost 3 cycles!
    vld1.u64 {d26}, [r2], r3
    vld1.u64 {d27}, [r2], r3
    vld4.s16 {d4, d5, d6, d7}, [r4]!    // cost 3 cycles!
    vswp d1, d4                         // regroup the two blocks side by side
    vswp d3, d6
    vswp q1, q2                         // q0~q3

    ROW_TRANSFORM_1_STEP_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11

    TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11

    MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3

    ROW_TRANSFORM_1_STEP_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11

    TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
    vrshr.s16 q0, q0, #6                // rounding >> 6
    vrshr.s16 q1, q1, #6
    vrshr.s16 q2, q2, #6
    vrshr.s16 q3, q3, #6

    //after rounding 6, clip into [0, 255]
    vmovl.u8 q8,d24
    vadd.s16 q0,q8
    vqmovun.s16 d24,q0
    vst1.u8 {d24},[r0],r1

    vmovl.u8 q8,d25
    vadd.s16 q1,q8
    vqmovun.s16 d25,q1
    vst1.u8 {d25},[r0],r1

    vmovl.u8 q8,d26
    vadd.s16 q2,q8
    vqmovun.s16 d26,q2
    vst1.u8 {d26},[r0],r1

    vmovl.u8 q8,d27
    vadd.s16 q3,q8
    vqmovun.s16 d27,q3
    vst1.u8 {d27},[r0],r1

    // ---- lower 8x4 half (same pipeline) ----
    vld1.u64 {d24}, [r2], r3
    vld1.u64 {d25}, [r2], r3

    vld4.s16 {d0, d1, d2, d3}, [r4]!    // cost 3 cycles!
    vld1.u64 {d26}, [r2], r3
    vld1.u64 {d27}, [r2], r3
    vld4.s16 {d4, d5, d6, d7}, [r4]!    // cost 3 cycles!
    vswp d1, d4
    vswp d3, d6
    vswp q1, q2                         // q0~q3

    ROW_TRANSFORM_1_STEP_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11

    TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11

    MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3

    ROW_TRANSFORM_1_STEP_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11

    TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
    vrshr.s16 q0, q0, #6
    vrshr.s16 q1, q1, #6
    vrshr.s16 q2, q2, #6
    vrshr.s16 q3, q3, #6

    //after rounding 6, clip into [0, 255]
    vmovl.u8 q8,d24
    vadd.s16 q0,q8
    vqmovun.s16 d24,q0
    vst1.u8 {d24},[r0],r1

    vmovl.u8 q8,d25
    vadd.s16 q1,q8
    vqmovun.s16 d25,q1
    vst1.u8 {d25},[r0],r1

    vmovl.u8 q8,d26
    vadd.s16 q2,q8
    vqmovun.s16 d26,q2
    vst1.u8 {d26},[r0],r1

    vmovl.u8 q8,d27
    vadd.s16 q3,q8
    vqmovun.s16 d27,q3
    vst1.u8 {d27},[r0],r1

    pop {r4}
WELS_ASM_FUNC_END


//---------------------------------------------------------------------------
// DC-only 16x16 reconstruction: each of the 16 4x4 sub-blocks is offset by
// its (rounded >> 6) DC value and clipped to [0, 255].
// In:  r0/r1 = rec ptr/stride, r2/r3 = pred ptr/stride,
//      [sp] = 16 int16 DC values.
//---------------------------------------------------------------------------
WELS_ASM_FUNC_BEGIN WelsIDctRecI16x16Dc_neon
    push {r4}
    ldr  r4, [sp, #4]                   // DC pointer (was [sp] before push)

    vld1.s16 {q8,q9}, [r4]
    vrshr.s16 q8, q8, #6                // rounding >> 6
    vrshr.s16 q9, q9, #6

    // rows 0-3: DC values of sub-block row 0 broadcast per 4-pixel group
    vdup.s16 d20, d16[0]
    vdup.s16 d21, d16[1]
    vdup.s16 d22, d16[2]
    vdup.s16 d23, d16[3]

    vld1.u8 {q0}, [r2], r3
    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
    vst1.u8 {q0}, [r0], r1

    vld1.u8 {q0}, [r2], r3
    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
    vst1.u8 {q0}, [r0], r1

    vld1.u8 {q0}, [r2], r3
    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
    vst1.u8 {q0}, [r0], r1

    vld1.u8 {q0}, [r2], r3
    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
    vst1.u8 {q0}, [r0], r1

    // rows 4-7: sub-block row 1
    vdup.s16 d20, d17[0]
    vdup.s16 d21, d17[1]
    vdup.s16 d22, d17[2]
    vdup.s16 d23, d17[3]

    vld1.u8 {q0}, [r2], r3
    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
    vst1.u8 {q0}, [r0], r1

    vld1.u8 {q0}, [r2], r3
    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
    vst1.u8 {q0}, [r0], r1

    vld1.u8 {q0}, [r2], r3
    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
    vst1.u8 {q0}, [r0], r1

    vld1.u8 {q0}, [r2], r3
    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
    vst1.u8 {q0}, [r0], r1

    // rows 8-11: sub-block row 2
    vdup.s16 d20, d18[0]
    vdup.s16 d21, d18[1]
    vdup.s16 d22, d18[2]
    vdup.s16 d23, d18[3]

    vld1.u8 {q0}, [r2], r3
    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
    vst1.u8 {q0}, [r0], r1

    vld1.u8 {q0}, [r2], r3
    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
    vst1.u8 {q0}, [r0], r1

    vld1.u8 {q0}, [r2], r3
    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
    vst1.u8 {q0}, [r0], r1

    vld1.u8 {q0}, [r2], r3
    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
    vst1.u8 {q0}, [r0], r1

    // rows 12-15: sub-block row 3
    vdup.s16 d20, d19[0]
    vdup.s16 d21, d19[1]
    vdup.s16 d22, d19[2]
    vdup.s16 d23, d19[3]

    vld1.u8 {q0}, [r2], r3
    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
    vst1.u8 {q0}, [r0], r1

    vld1.u8 {q0}, [r2], r3
    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
    vst1.u8 {q0}, [r0], r1

    vld1.u8 {q0}, [r2], r3
    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
    vst1.u8 {q0}, [r0], r1

    vld1.u8 {q0}, [r2], r3
    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
    vst1.u8 {q0}, [r0], r1

    pop {r4}
WELS_ASM_FUNC_END
#endif