/*!
 * \copy
 *     Copyright (c)  2013, Cisco Systems
 *     All rights reserved.
 *
 *     Redistribution and use in source and binary forms, with or without
 *     modification, are permitted provided that the following conditions
 *     are met:
 *
 *        * Redistributions of source code must retain the above copyright
 *          notice, this list of conditions and the following disclaimer.
 *
 *        * Redistributions in binary form must reproduce the above copyright
 *          notice, this list of conditions and the following disclaimer in
 *          the documentation and/or other materials provided with the
 *          distribution.
 *
 *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *     POSSIBILITY OF SUCH DAMAGE.
30 * 31 */ 32 33#ifdef HAVE_NEON_AARCH64 34#include "arm_arch64_common_macro.S" 35 36.macro ZERO_COUNT_IN_2_QUARWORD arg0, arg1, arg2 37// { // input: coef_0 (identy to \arg3\() \arg4\()), coef_1(identy to \arg5\() \arg6\()), mask_q 38 cmeq \arg0\().8h, \arg0\().8h, #0 39 cmeq \arg1\().8h, \arg1\().8h, #0 40 uzp1 \arg0\().16b, \arg0\().16b, \arg1\().16b 41 ushr \arg0\().16b, \arg0\().16b, 7 42 addv \arg2\(), \arg0\().16b 43// } 44.endm 45 46.macro NEWQUANT_COEF_EACH_16BITS arg0, arg1, arg2, arg3, arg4, arg5 47// if coef <= 0, - coef; else , coef; 48// { // input: coef, ff (dst), mf 49 eor \arg3\().16b, \arg3\().16b, \arg3\().16b // init 0 , and keep 0; 50 saba \arg1\().8h, \arg0\().8h, \arg3\().8h // f + abs(coef - 0) 51 smull \arg4\().4s, \arg1\().4h, \arg2\().4h 52 smull2 \arg5\().4s, \arg1\().8h, \arg2\().8h 53 shrn \arg1\().4h, \arg4\().4s, #16 54 shrn2 \arg1\().8h, \arg5\().4s, #16 55 56 cmgt \arg4\().8h, \arg0\().8h, #0 // if true, location of coef == 11111111 57 bif \arg3\().16b, \arg1\().16b, \arg4\().16b // if (x<0) reserved part; else keep 0 untouched 58 shl \arg3\().8h, \arg3\().8h, #1 59 sub \arg1\().8h, \arg1\().8h, \arg3\().8h // if x > 0, -= 0; else x-= 2x 60// } 61.endm 62 63.macro NEWQUANT_COEF_EACH_16BITS_MAX arg0, arg1, arg2, arg3, arg4, arg5, arg6 64// if coef <= 0, - coef; else , coef; 65// { // input: coef, ff (dst), mf 66 eor \arg3\().16b, \arg3\().16b, \arg3\().16b // init 0 , and keep 0; 67 saba \arg1\().8h, \arg0\().8h, \arg3\().8h // f + abs(coef - 0) 68 smull \arg4\().4s, \arg1\().4h, \arg2\().4h 69 smull2 \arg5\().4s, \arg1\().8h, \arg2\().8h 70 shrn \arg1\().4h, \arg4\().4s, #16 71 shrn2 \arg1\().8h, \arg5\().4s, #16 72 73 cmgt \arg4\().8h, \arg0\().8h, #0 // if true, location of coef == 11111111 74 bif \arg3\().16b, \arg1\().16b, \arg4\().16b // if (x<0) reserved part; else keep 0 untouched 75 shl \arg3\().8h, \arg3\().8h, #1 76 mov \arg6\().16b, \arg1\().16b 77 sub \arg1\().8h, \arg1\().8h, \arg3\().8h // if x > 0, -= 0; else x-= 2x 
78// } 79.endm 80 81.macro QUANT_DUALWORD_COEF_EACH_16BITS arg0, arg1, arg2, arg3, arg4 82// if coef <= 0, - coef; else , coef; 83// { // input: coef, ff (dst), mf 84 saba \arg1\().8h, \arg0\().8h, \arg3\().8h // f + abs(coef - 0) 85 smull \arg4\().4s, \arg1\().4h, \arg2\().4h 86 shrn \arg1\().4h, \arg4\().4s, #16 87 88 cmgt \arg4\().8h, \arg0\().8h, #0 // if true, location of coef == 11111111 89 bif \arg3\().16b, \arg1\().16b, \arg4\().16b // if (x<0) reserved part; else keep 0 untouched 90 shl \arg3\().8h, \arg3\().8h, #1 91 sub \arg1\().8h, \arg1\().8h, \arg3\().8h // if x > 0, -= 0; else x-= 2x 92// } 93.endm 94 95.macro SELECT_MAX_IN_ABS_COEF arg0, arg1, arg2, arg3, arg4, arg5 96// { // input: coef_0, coef_1, coef_2, coef_3, max_q (identy to follow two) 97 umax \arg0\().8h, \arg0\().8h, \arg1\().8h 98 umaxv \arg4\(), \arg0\().8h 99 umax \arg2\().8h, \arg2\().8h, \arg3\().8h 100 umaxv \arg5\(), \arg2\().8h 101// } 102.endm 103 104.macro HDM_QUANT_2x2_TOTAL_16BITS arg0, arg1, arg2 105// { // input: src_d[0][16][32][48], dst_d[0][16][32][48], working 106 sshr \arg1\().2d, \arg0\().2d, #32 107 add \arg2\().4h, \arg0\().4h, \arg1\().4h // [0] = rs[0] + rs[32];[1] = rs[16] + rs[48]; 108 sub \arg1\().4h, \arg0\().4h, \arg1\().4h // [0] = rs[0] - rs[32];[1] = rs[16] - rs[48]; 109 zip1 \arg1\().4h, \arg2\().4h, \arg1\().4h 110// } 111.endm 112 113 114.macro DC_ZERO_COUNT_IN_DUALWORD arg0, arg1, arg2 115// { // input: coef, dst_d, working_d (all 0x01) 116 cmeq \arg0\().4h, \arg0\().4h, #0 117 and \arg0\().8b, \arg0\().8b, \arg2\().8b 118 addv \arg1\(), \arg0\().4h 119// } 120.endm 121 122.macro IHDM_4x4_TOTAL_16BITS arg0, arg1, arg2 123// { // input: each src_d[0]~[3](dst), working_q0, working_q1 124 uzp2 \arg1\().4s, \arg0\().4s, \arg0\().4s 125 uzp1 \arg0\().4s, \arg0\().4s, \arg0\().4s 126 add \arg2\().8h, \arg0\().8h, \arg1\().8h // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];[2] = rs[4] + rs[6];[3] = rs[5] + rs[7]; 127 sub \arg1\().8h, \arg0\().8h, \arg1\().8h // [0] = 
rs[0] - rs[2];[1] = rs[1] - rs[3];[2] = rs[4] - rs[6];[3] = rs[5] - rs[7]; 128 zip1 \arg2\().8h, \arg2\().8h, \arg1\().8h // [0] = rs[0] + rs[2]; [1] = rs[0] - rs[2]; ... [2]; [3] 129 130 uzp2 \arg1\().4s, \arg2\().4s, \arg2\().4s 131 uzp1 \arg0\().4s, \arg2\().4s, \arg2\().4s 132 add \arg2\().8h, \arg0\().8h, \arg1\().8h // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];[2] = rs[4] + rs[6];[3] = rs[5] + rs[7]; 133 sub \arg1\().8h, \arg0\().8h, \arg1\().8h // [0] = rs[0] - rs[2];[1] = rs[1] - rs[3];[2] = rs[4] - rs[6];[3] = rs[5] - rs[7]; 134 rev32 \arg1\().4h, \arg1\().4h // [0] = rs[1] - rs[3];[1] = rs[0] - rs[2];[2] = rs[5] - rs[7];[3] = rs[4] - rs[6]; 135 zip1 \arg0\().4s, \arg2\().4s, \arg1\().4s 136 // } 137.endm 138 139.macro MATRIX_TRANSFORM_EACH_16BITS_2x8_OUT2 arg0, arg1, arg2, arg3 140// { // input & output: src_d[0]~[3];[0 1 2 3]+[4 5 6 7]+[8 9 10 11]+[12 13 14 15] 141 uzp1 \arg2\().4s, \arg0\().4s, \arg1\().4s //[0 1 4 5]+[8 9 12 13] 142 uzp2 \arg3\().4s, \arg0\().4s, \arg1\().4s //[2 3 6 7]+[10 11 14 15] 143 144 uzp1 \arg0\().8h, \arg2\().8h, \arg3\().8h //[0 4 8 12]+[2 6 10 14] 145 uzp2 \arg2\().8h, \arg2\().8h, \arg3\().8h //[1 5 9 13]+[3 7 11 15] 146 zip2 \arg1\().2d, \arg0\().2d, \arg2\().2d //[2 6 10 14]+[3 7 11 15] 147 zip1 \arg0\().2d, \arg0\().2d, \arg2\().2d //[0 4 8 12]+[1 5 9 13] 148// } 149.endm 150 151.macro MATRIX_TRANSFORM_EACH_16BITS_OUT4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 152// { // input & output: src_d[0]~[3];[0 4 8 12],[1 5 9 13],[2 6 10 14],[3 7 11 15] 153 trn1 \arg4\().8h, v0.8h, v1.8h 154 trn2 \arg5\().8h, v0.8h, v1.8h 155 trn1 \arg6\().8h, v2.8h, v3.8h 156 trn2 \arg7\().8h, v2.8h, v3.8h 157 158 trn1 \arg0\().4s, v4.4s, v6.4s 159 trn2 \arg2\().4s, v4.4s, v6.4s 160 trn1 \arg1\().4s, v5.4s, v7.4s 161 trn2 \arg3\().4s, v5.4s, v7.4s 162// } 163.endm 164 165.macro MATRIX_TRANSFORM_EACH_16BITS_4x4_OUT2 arg0, arg1, arg2, arg3 166// { // input & output: src_d[0]~[3];[0 1 2 3],[4 5 6 7],[8 9 10 11],[12 13 14 15] 167 mov 
\arg0\().d[1], \arg1\().d[0] //[0 1 2 3]+[4 5 6 7] 168 mov \arg2\().d[1], \arg3\().d[0] //[8 9 10 11]+[12 13 14 15] 169 uzp1 \arg1\().4s, \arg0\().4s, \arg2\().4s //[0 1 4 5]+[8 9 12 13] 170 uzp2 \arg3\().4s, \arg0\().4s, \arg2\().4s //[2 3 6 7]+[10 11 14 15] 171 172 uzp1 \arg0\().8h, \arg1\().8h, \arg3\().8h //[0 4 8 12]+[2 6 10 14] 173 uzp2 \arg2\().8h, \arg1\().8h, \arg3\().8h //[1 5 9 13]+[3 7 11 15] 174 zip2 \arg1\().2d, \arg0\().2d, \arg2\().2d //[2 6 10 14]+[3 7 11 15] 175 zip1 \arg0\().2d, \arg0\().2d, \arg2\().2d //[0 4 8 12]+[1 5 9 13] 176// } 177.endm 178 179.macro LOAD_4x4_DATA_FOR_DCT arg0, arg1, arg2, arg3, arg4, arg5 180 ld1 {\arg0\().s}[0], [\arg2\()], \arg3\() 181 ld1 {\arg0\().s}[1], [\arg2\()], \arg3\() 182 ld1 {\arg0\().s}[2], [\arg2\()], \arg3\() 183 ld1 {\arg0\().s}[3], [\arg2\()] 184 185 ld1 {\arg1\().s}[0], [\arg4\()], \arg5\() 186 ld1 {\arg1\().s}[1], [\arg4\()], \arg5\() 187 ld1 {\arg1\().s}[2], [\arg4\()], \arg5\() 188 ld1 {\arg1\().s}[3], [\arg4\()] 189.endm 190 191.macro DCT_ROW_TRANSFORM_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 192// { // input: src_d[0]~[3], working: [4]~[7] 193 add \arg4\().8h, \arg0\().8h, \arg3\().8h //int16 s[0] = data[i] + data[i3]; 194 sub \arg7\().8h, \arg0\().8h, \arg3\().8h //int16 s[3] = data[i] - data[i3]; 195 add \arg5\().8h, \arg1\().8h, \arg2\().8h //int16 s[1] = data[i1] + data[i2]; 196 sub \arg6\().8h, \arg1\().8h, \arg2\().8h //int16 s[2] = data[i1] - data[i2]; 197 198 add \arg0\().8h, \arg4\().8h, \arg5\().8h //int16 dct[i ] = s[0] + s[1]; 199 sub \arg2\().8h, \arg4\().8h, \arg5\().8h //int16 dct[i2] = s[0] - s[1]; 200 shl \arg1\().8h, \arg7\().8h, #1 201 shl \arg3\().8h, \arg6\().8h, #1 202 add \arg1\().8h, \arg1\().8h, \arg6\().8h //int16 dct[i1] = (s[3] << 1) + s[2]; 203 sub \arg3\().8h, \arg7\().8h, \arg3\().8h //int16 dct[i3] = s[3] - (s[2] << 1); 204// } 205.endm 206 207.macro LOAD_8x4_DATA_FOR_DCT arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9 208// { // input: 
\arg0\()~\arg3\(), src1*, src2*; untouched r2:src1_stride &r4:src2_stride 209 ld1 {\arg0\().d}[0], [\arg8\()], x2 210 ld1 {\arg1\().d}[0], [\arg8\()], x2 211 ld1 {\arg2\().d}[0], [\arg8\()], x2 212 ld1 {\arg3\().d}[0], [\arg8\()], x2 213 214 ld1 {\arg4\().d}[0], [\arg9\()], x4 215 ld1 {\arg5\().d}[0], [\arg9\()], x4 216 ld1 {\arg6\().d}[0], [\arg9\()], x4 217 ld1 {\arg7\().d}[0], [\arg9\()], x4 218// } 219.endm 220 221.macro ROW_TRANSFORM_1_STEP_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 222// { // input: src_d[0]~[3], output: e_d[0]~[3]; 223 add \arg4\().8h, \arg0\().8h, \arg2\().8h //int16 e[i][0] = src[0] + src[2]; 224 sub \arg5\().8h, \arg0\().8h, \arg2\().8h //int16 e[i][1] = src[0] - src[2]; 225 sshr \arg6\().8h, \arg1\().8h, #1 226 sshr \arg7\().8h, \arg3\().8h, #1 227 sub \arg6\().8h, \arg6\().8h, \arg3\().8h //int16 e[i][2] = (src[1]>>1)-src[3]; 228 add \arg7\().8h, \arg1\().8h, \arg7\().8h //int16 e[i][3] = src[1] + (src[3]>>1); 229// } 230.endm 231 232.macro TRANSFORM_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 233// both row & col transform used 234// { // output: f_q[0]~[3], input: e_q[0]~[3]; 235 add \arg0\().8h, \arg4\().8h, \arg7\().8h //int16 f[i][0] = e[i][0] + e[i][3]; 236 add \arg1\().8h, \arg5\().8h, \arg6\().8h //int16 f[i][1] = e[i][1] + e[i][2]; 237 sub \arg2\().8h, \arg5\().8h, \arg6\().8h //int16 f[i][2] = e[i][1] - e[i][2]; 238 sub \arg3\().8h, \arg4\().8h, \arg7\().8h //int16 f[i][3] = e[i][0] - e[i][3]; 239// } 240.endm 241 242.macro ROW_TRANSFORM_0_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 243// { // input: src_d[0]~[3], output: e_q[0]~[3]; 244 saddl \arg4\().4s, \arg0\().4h, \arg2\().4h //int32 e[i][0] = src[0] + src[2]; 245 ssubl \arg5\().4s, \arg0\().4h, \arg2\().4h //int32 e[i][1] = src[0] - src[2]; 246 ssubl \arg6\().4s, \arg1\().4h, \arg3\().4h //int32 e[i][2] = src[1] - src[3]; 247 saddl \arg7\().4s, \arg1\().4h, \arg3\().4h //int32 e[i][3] = src[1] + src[3]; 248// } 249.endm 250 
// 32-bit Hadamard column step. NOTE: this is the Hadamard (no >>1) variant;
// earlier revisions carried comments copied from the T4 transform.
.macro COL_TRANSFORM_0_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
// { // input: src_q[0]~[3], output: e_q[0]~[3]
    add \arg4\().4s, \arg0\().4s, \arg2\().4s  //int32 e[0][j] = f[0][j] + f[2][j];
    sub \arg5\().4s, \arg0\().4s, \arg2\().4s  //int32 e[1][j] = f[0][j] - f[2][j];
    sub \arg6\().4s, \arg1\().4s, \arg3\().4s  //int32 e[2][j] = f[1][j] - f[3][j];
    add \arg7\().4s, \arg1\().4s, \arg3\().4s  //int32 e[3][j] = f[1][j] + f[3][j];
// }
.endm

// Final butterfly shared by the 32-bit row and column Hadamard passes.
.macro TRANSFORM_4BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
// { // output: f_q[0]~[3], input: e_q[0]~[3]
    add \arg0\().4s, \arg4\().4s, \arg7\().4s  //int16 f[i][0] = e[i][0] + e[i][3];
    add \arg1\().4s, \arg5\().4s, \arg6\().4s  //int16 f[i][1] = e[i][1] + e[i][2];
    sub \arg2\().4s, \arg5\().4s, \arg6\().4s  //int16 f[i][2] = e[i][1] - e[i][2];
    sub \arg3\().4s, \arg4\().4s, \arg7\().4s  //int16 f[i][3] = e[i][0] - e[i][3];
// }
.endm

// Add 16 residual int16 values (\arg1, \arg2) to 16 prediction bytes in
// \arg0 and saturate back to [0, 255] in place. \arg3, \arg4 are scratch.
.macro MB_PRED_8BITS_ADD_DCT_16BITS_CLIP arg0, arg1, arg2, arg3, arg4
// { // input: pred_d[0] (output), dct_q0/1, working_q0/1
    uxtl \arg3\().8h, \arg0\().8b
    uxtl2 \arg4\().8h, \arg0\().16b
    add \arg3\().8h, \arg3\().8h, \arg1\().8h
    add \arg4\().8h, \arg4\().8h, \arg2\().8h
    sqxtun \arg0\().8b, \arg3\().8h            // unsigned saturating narrow: clip to [0,255]
    sqxtun2 \arg0\().16b,\arg4\().8h
// }
.endm

// Return the number of non-zero coefficients among 16 int16 values at x0.
// In: x0 = coefficient block. Out: x0 = 16 - (zero count). Clobbers x1, v0-v1.
WELS_ASM_AARCH64_FUNC_BEGIN WelsGetNoneZeroCount_AArch64_neon
    ld1 {v0.8h, v1.8h}, [x0]
    ZERO_COUNT_IN_2_QUARWORD v0, v1, b0       // b0 = zero count (upper bits of v0 cleared)
    mov x0, v0.d[0]
    mov x1, #16
    subs x0, x1, x0                            // x0 = number of non-zero coefficients
WELS_ASM_AARCH64_FUNC_END

// Quantize one 4x4 block in place.
// In: x0 = dct coeffs (in/out), x1 -> ff table, x2 -> mf table.
WELS_ASM_AARCH64_FUNC_BEGIN WelsQuant4x4_AArch64_neon
    ld1 {v2.8h}, [x1]                          // ff
    ld1 {v0.8h, v1.8h}, [x0]                   // 16 coefficients
    ld1 {v3.8h}, [x2]                          // mf
    mov v4.16b, v2.16b                         // second copy of ff (macro destroys it)
    NEWQUANT_COEF_EACH_16BITS v0, v2, v3, v5, v6, v7
    st1 {v2.8h}, [x0], #16
    NEWQUANT_COEF_EACH_16BITS v1, v4, v3, v5, v6, v7
    st1 {v4.8h}, [x0], #16
WELS_ASM_AARCH64_FUNC_END


// Quantize one 4x4 block with scalar ff/mf broadcast to all lanes.
// In: x0 = dct coeffs (in/out), w1 = ff, w2 = mf.
WELS_ASM_AARCH64_FUNC_BEGIN WelsQuant4x4Dc_AArch64_neon
    ld1 {v0.8h, v1.8h}, [x0]
    dup v2.8h, w1                              // ff; even, range [0, 768]
    dup v3.8h, w2                              // mf
    mov v4.16b, v2.16b
    NEWQUANT_COEF_EACH_16BITS v0, v2, v3, v5, v6, v7
    st1 {v2.8h}, [x0], #16
    NEWQUANT_COEF_EACH_16BITS v1, v4, v3, v5, v6, v7
    st1 {v4.8h}, [x0], #16
WELS_ASM_AARCH64_FUNC_END

// Quantize four consecutive 4x4 blocks in place.
// In: x0 = dct coeffs (in/out), x1 -> ff table, x2 -> mf table.
WELS_ASM_AARCH64_FUNC_BEGIN WelsQuantFour4x4_AArch64_neon
    ld1 {v2.8h}, [x1]                          // ff
    ld1 {v3.8h}, [x2]                          // mf
    mov x1, x0                                 // x1 = write cursor, x0 = read cursor

.rept 4
    ld1 {v0.8h, v1.8h}, [x0], #32
    mov v4.16b, v2.16b                         // fresh ff copy per half-block
    NEWQUANT_COEF_EACH_16BITS v0, v4, v3, v5, v6, v7
    st1 {v4.8h}, [x1], #16
    mov v4.16b, v2.16b
    NEWQUANT_COEF_EACH_16BITS v1, v4, v3, v5, v6, v7
    st1 {v4.8h}, [x1], #16
.endr
WELS_ASM_AARCH64_FUNC_END


// Quantize four 4x4 blocks and record the max |level| of each block at x3.
// In: x0 = dct coeffs (in/out), x1 -> ff, x2 -> mf, x3 -> int16 max[4].
WELS_ASM_AARCH64_FUNC_BEGIN WelsQuantFour4x4Max_AArch64_neon
    ld1 {v2.8h}, [x1]                          // ff
    ld1 {v3.8h}, [x2]                          // mf
    mov x1, x0                                 // x1 = write cursor, x0 = read cursor

    ld1 {v0.8h, v1.8h}, [x0], #32
    mov v4.16b, v2.16b
    NEWQUANT_COEF_EACH_16BITS_MAX v0, v4, v3, v5, v6, v7, v16
    st1 {v4.8h}, [x1], #16
    mov v4.16b, v2.16b
    NEWQUANT_COEF_EACH_16BITS_MAX v1, v4, v3, v5, v6, v7, v17
    st1 {v4.8h}, [x1], #16                     // 1st block's 16 levels in v16 & v17

    ld1 {v0.8h, v1.8h}, [x0], #32
    mov v4.16b, v2.16b
    NEWQUANT_COEF_EACH_16BITS_MAX v0, v4, v3, v5, v6, v7, v18
    st1 {v4.8h}, [x1], #16
    mov v4.16b, v2.16b
    NEWQUANT_COEF_EACH_16BITS_MAX v1, v4, v3, v5, v6, v7, v19
    st1 {v4.8h}, [x1], #16                     // 2nd block's 16 levels in v18 & v19

    SELECT_MAX_IN_ABS_COEF v16, v17, v18, v19, h20, h21 // maxima of blocks 0 and 1

    ld1 {v0.8h, v1.8h}, [x0], #32
    mov v4.16b, v2.16b
    NEWQUANT_COEF_EACH_16BITS_MAX v0, v4, v3, v5, v6, v7, v16
    st1 {v4.8h}, [x1], #16
    mov v4.16b, v2.16b
    NEWQUANT_COEF_EACH_16BITS_MAX v1, v4, v3, v5, v6, v7, v17
    st1 {v4.8h}, [x1], #16                     // 3rd block's 16 levels in v16 & v17

    ld1 {v0.8h, v1.8h}, [x0], #32
    mov v4.16b, v2.16b
    NEWQUANT_COEF_EACH_16BITS_MAX v0, v4, v3, v5, v6, v7, v18
    st1 {v4.8h}, [x1], #16
    mov v4.16b, v2.16b
    NEWQUANT_COEF_EACH_16BITS_MAX v1, v4, v3, v5, v6, v7, v19
    st1 {v4.8h}, [x1], #16                     // 4th block's 16 levels in v18 & v19

    SELECT_MAX_IN_ABS_COEF v16, v17, v18, v19, h22, h23 // maxima of blocks 2 and 3

    st4 {v20.h,v21.h,v22.h,v23.h}[0], [x3]     // store the four per-block maxima
WELS_ASM_AARCH64_FUNC_END


// Dequantize one 4x4 block in place: coef *= mf, per lane.
// In: x0 = coeffs (in/out), x1 -> dequant multipliers.
WELS_ASM_AARCH64_FUNC_BEGIN WelsDequant4x4_AArch64_neon
    ld1 {v0.8h, v1.8h}, [x0]
    ld1 {v2.8h}, [x1]
    mul v3.8h, v0.8h, v2.8h
    mul v4.8h, v1.8h, v2.8h
    st1 {v3.8h, v4.8h}, [x0]
WELS_ASM_AARCH64_FUNC_END

// Dequantize four consecutive 4x4 blocks in place with the same multipliers.
WELS_ASM_AARCH64_FUNC_BEGIN WelsDequantFour4x4_AArch64_neon
    ld1 {v2.8h}, [x1]
    mov x1, x0
.rept 4
    ld1 {v0.8h,v1.8h}, [x0], #32
    mul v3.8h, v0.8h, v2.8h
    mul v4.8h, v1.8h, v2.8h
    st1 {v3.8h,v4.8h}, [x1], #32
.endr
WELS_ASM_AARCH64_FUNC_END

// 2x2 DC Hadamard + threshold test (skip decision).
// In: x0 -> DC coeffs at rs[0],rs[16],rs[32],rs[48]; w1 = threshold.
// Out: w0 != 0 iff any |hadamard(dc)| exceeds the threshold.
WELS_ASM_AARCH64_FUNC_BEGIN WelsHadamardQuant2x2SkipKernel_AArch64_neon
    dup v4.8h, w1                              // threshold
    mov x1, #32                                // stride: 16 int16 coefficients
    ld1 {v0.h}[0], [x0], x1                    //rs[0]
    ld1 {v0.h}[1], [x0], x1                    //rs[16]
    ld1 {v0.h}[2], [x0], x1                    //rs[32]
    ld1 {v0.h}[3], [x0], x1                    //rs[48]

    HDM_QUANT_2x2_TOTAL_16BITS v0, v1, v2      // first pass, output v1

    HDM_QUANT_2x2_TOTAL_16BITS v1, v0, v2      // second pass, output v0

    abs v1.4h, v0.4h
    cmhi v0.4h, v1.4h, v4.4h                   // abs(dct[i]) > threshold
    mov w0, v0.s[0]
    mov w1, v0.s[1]
    orr w0, w0, w1                             // non-zero iff any lane exceeded
WELS_ASM_AARCH64_FUNC_END


// 2x2 DC Hadamard transform + quantization; zeroes the DC positions in the
// source, stores the 4 quantized values to x3 and x4, and returns the count
// of non-zero quantized DC coefficients in x0.
// In: x0 -> DC coeffs (zeroed), w1 = ff, w2 = mf, x3 -> dct out, x4 -> block out.
WELS_ASM_AARCH64_FUNC_BEGIN WelsHadamardQuant2x2_AArch64_neon

    dup v1.8h, w1                              //ff
    dup v2.8h, w2                              //mf
    eor v3.16b, v3.16b, v3.16b                 // zero, also stored back over the DCs

    mov x1, #32
    mov x2, x0
    ld1 {v0.h}[0], [x0], x1                    //rs[0]
    st1 {v3.h}[0], [x2], x1                    //rs[00]=0
    ld1 {v0.h}[1], [x0], x1                    //rs[16]
    st1 {v3.h}[1], [x2], x1                    //rs[16]=0
    ld1 {v0.h}[2], [x0], x1                    //rs[32]
    st1 {v3.h}[2], [x2], x1                    //rs[32]=0
    ld1 {v0.h}[3], [x0], x1                    //rs[48]
    st1 {v3.h}[3], [x2], x1                    //rs[48]=0


    HDM_QUANT_2x2_TOTAL_16BITS v0, v4, v5      // first pass, output v4

    HDM_QUANT_2x2_TOTAL_16BITS v4, v0, v5      // second pass, output v0

    QUANT_DUALWORD_COEF_EACH_16BITS v0, v1, v2, v3, v4

    st1 {v1.d}[0], [x3]                        // store to dct
    st1 {v1.d}[0], [x4]                        // store to block

    movi v3.8h, #1, lsl #0                     // 0x0001 per lane, for zero counting

    movi v0.16b, #255

    DC_ZERO_COUNT_IN_DUALWORD v1, h0, v3       // h0 = zero count (clears rest of v0)

    mov x0, v0.d[0]
    mov x1, #4
    subs x0, x1, x0                            // x0 = number of non-zero DC levels
WELS_ASM_AARCH64_FUNC_END



// Inverse 4x4 Hadamard + per-coefficient scale (w1) in place at x0.
WELS_ASM_AARCH64_FUNC_BEGIN WelsDequantIHadamard4x4_AArch64_neon
    ld1 {v0.8h, v1.8h}, [x0]
    dup v4.8h, w1                              // scale factor

    IHDM_4x4_TOTAL_16BITS v0, v2, v3           // row pass
    IHDM_4x4_TOTAL_16BITS v1, v2, v3

    MATRIX_TRANSFORM_EACH_16BITS_2x8_OUT2 v0, v1, v2, v3

    IHDM_4x4_TOTAL_16BITS v0, v2, v3           // column pass
    mul v0.8h, v0.8h, v4.8h

    IHDM_4x4_TOTAL_16BITS v1, v2, v3
    mul v1.8h, v1.8h, v4.8h

    MATRIX_TRANSFORM_EACH_16BITS_2x8_OUT2 v0, v1, v2, v3
    st1 {v0.16b, v1.16b}, [x0]
WELS_ASM_AARCH64_FUNC_END

//void WelsDctT4_AArch64_neon (int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2);
// Forward 4x4 transform of the residual pPixel1 - pPixel2.
WELS_ASM_AARCH64_FUNC_BEGIN WelsDctT4_AArch64_neon
    SIGN_EXTENSION x2, w2                      // strides arrive as int32
    SIGN_EXTENSION x4, w4
    LOAD_4x4_DATA_FOR_DCT v0, v1, x1, x2, x3, x4
    usubl v2.8h, v0.8b, v1.8b                  // residual = pixel1 - pixel2
    usubl2 v4.8h, v0.16b, v1.16b
    uzp1 v3.8h, v2.8h, v4.8h
    uzp2 v5.8h, v2.8h, v4.8h
    uzp2 v2.8h, v3.8h, v5.8h                   // s[2, 6, 10, 14] [3, 7, 11, 15]
    uzp1 v0.8h, v3.8h, v5.8h                   // s[0, 4, 8, 12] [1, 5, 9, 13]
    mov v3.d[0], v2.d[1]                       // s[3, 7, 11, 15]
    mov v1.d[0], v0.d[1]                       // s[1, 5, 9, 13]

    // horizontal transform
    DCT_ROW_TRANSFORM_TOTAL_16BITS v0, v1, v2, v3, v4, v5, v6, v7
    // transform element
    MATRIX_TRANSFORM_EACH_16BITS_OUT4 v0, v1, v2, v3, v4, v5, v6, v7
    // vertical transform
    DCT_ROW_TRANSFORM_TOTAL_16BITS v0, v1, v2, v3, v4, v5, v6, v7

    st4 {v0.d, v1.d, v2.d, v3.d}[0], [x0]
WELS_ASM_AARCH64_FUNC_END

//void WelsDctFourT4_AArch64_neon (int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2);
// Forward 4x4 transform of four blocks (8x8 residual region).
WELS_ASM_AARCH64_FUNC_BEGIN WelsDctFourT4_AArch64_neon
    SIGN_EXTENSION x2,w2
    SIGN_EXTENSION x4,w4
.rept 2
    LOAD_8x4_DATA_FOR_DCT v0, v1, v2, v3, v4, v5, v6, v7, x1, x3
    usubl v0.8h, v0.8b, v4.8b                  // residual rows
    usubl v1.8h, v1.8b, v5.8b
    usubl v2.8h, v2.8b, v6.8b
    usubl v3.8h, v3.8b, v7.8b

    MATRIX_TRANSFORM_EACH_16BITS_OUT4 v0, v1, v2, v3, v4, v5, v6, v7

    // horizontal transform
    DCT_ROW_TRANSFORM_TOTAL_16BITS v0, v1, v2, v3, v4, v5, v6, v7

    // transform element
    MATRIX_TRANSFORM_EACH_16BITS_OUT4 v0, v1, v2, v3, v4, v5, v6, v7

    // vertical transform
    DCT_ROW_TRANSFORM_TOTAL_16BITS v0, v1, v2, v3, v4, v5, v6, v7

    uzp1 v4.2d, v0.2d, v1.2d                   // regroup rows into per-block order
    uzp2 v6.2d, v0.2d, v1.2d
    uzp1 v5.2d, v2.2d, v3.2d
    uzp2 v7.2d, v2.2d, v3.2d
    st1 {v4.16b, v5.16b}, [x0], #32
    st1 {v6.16b, v7.16b}, [x0], #32
.endr
WELS_ASM_AARCH64_FUNC_END
//void WelsIDctT4Rec_AArch64_neon (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride, int16_t* pDct)
// Inverse 4x4 transform of pDct, added to the prediction and clipped to pRec.
WELS_ASM_AARCH64_FUNC_BEGIN WelsIDctT4Rec_AArch64_neon
    SIGN_EXTENSION x1,w1
    SIGN_EXTENSION x3,w3
    ld1 {v16.s}[0], [x2], x3
    ld1 {v16.s}[1], [x2], x3
    ld1 {v16.s}[2], [x2], x3
    ld1 {v16.s}[3], [x2], x3                   // Pred
    ld4 {v0.4h, v1.4h, v2.4h, v3.4h}, [x4]     // dct coeff, de-interleaved by column

    ROW_TRANSFORM_1_STEP_TOTAL_16BITS v0, v1, v2, v3, v4, v5, v6, v7

    TRANSFORM_TOTAL_16BITS v0, v1, v2, v3, v4, v5, v6, v7

    MATRIX_TRANSFORM_EACH_16BITS_OUT4 v0, v1, v2, v3, v4, v5, v6, v7

    ROW_TRANSFORM_1_STEP_TOTAL_16BITS v0, v1, v2, v3, v4, v5, v6, v7

    TRANSFORM_TOTAL_16BITS v0, v1, v2, v3, v4, v5, v6, v7
    ins v0.d[1], v1.d[0]                       // pack rows 0-1 / 2-3 into q regs
    ins v2.d[1], v3.d[0]
    srshr v0.8h, v0.8h, #6                     // rounding shift by 6
    srshr v2.8h, v2.8h, #6
    // after rounding by 6, clip into [0, 255]
    uxtl v1.8h, v16.8b
    add v0.8h, v0.8h, v1.8h
    sqxtun v1.8b, v0.8h
    st1 {v1.s}[0],[x0],x1
    st1 {v1.s}[1],[x0],x1

    uxtl2 v1.8h, v16.16b
    add v2.8h, v2.8h, v1.8h
    sqxtun v1.8b, v2.8h
    st1 {v1.s}[0],[x0],x1
    st1 {v1.s}[1],[x0],x1
WELS_ASM_AARCH64_FUNC_END
//void WelsIDctFourT4Rec_AArch64_neon (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride, int16_t* pDct);
// Inverse transform + reconstruction for four 4x4 blocks (8x8 region).
WELS_ASM_AARCH64_FUNC_BEGIN WelsIDctFourT4Rec_AArch64_neon
    SIGN_EXTENSION x1,w1
    SIGN_EXTENSION x3,w3
.rept 2
    ld1 {v16.d}[0], [x2], x3
    ld1 {v16.d}[1], [x2], x3
    ld1 {v17.d}[0], [x2], x3
    ld1 {v17.d}[1], [x2], x3                   // Pred
    ld4 {v0.8h, v1.8h, v2.8h, v3.8h}, [x4], #64 // dct coeff, two blocks side by side

    ROW_TRANSFORM_1_STEP_TOTAL_16BITS v0, v1, v2, v3, v4, v5, v6, v7

    TRANSFORM_TOTAL_16BITS v0, v1, v2, v3, v4, v5, v6, v7

    MATRIX_TRANSFORM_EACH_16BITS_OUT4 v0, v1, v2, v3, v4, v5, v6, v7

    ROW_TRANSFORM_1_STEP_TOTAL_16BITS v0, v1, v2, v3, v4, v5, v6, v7

    TRANSFORM_TOTAL_16BITS v0, v1, v2, v3, v4, v5, v6, v7
    srshr v0.8h, v0.8h, #6                     // rounding shift by 6
    srshr v1.8h, v1.8h, #6
    srshr v2.8h, v2.8h, #6
    srshr v3.8h, v3.8h, #6

    // after rounding by 6, clip into [0, 255]
    uxtl v4.8h, v16.8b
    add v0.8h, v0.8h, v4.8h
    sqxtun v0.8b, v0.8h
    st1 {v0.d}[0],[x0],x1

    uxtl2 v5.8h, v16.16b
    add v1.8h, v1.8h, v5.8h
    sqxtun v1.8b, v1.8h
    st1 {v1.d}[0],[x0],x1

    uxtl v6.8h, v17.8b
    add v2.8h, v2.8h, v6.8h
    sqxtun v2.8b, v2.8h
    st1 {v2.d}[0],[x0],x1

    uxtl2 v7.8h, v17.16b
    add v3.8h, v3.8h, v7.8h
    sqxtun v3.8b, v3.8h
    st1 {v3.d}[0],[x0],x1
 .endr
WELS_ASM_AARCH64_FUNC_END

// 4x4 Hadamard transform of the 16 luma DC coefficients (gathered from x1
// with a 32-byte stride), result rounded by >>1 and stored at x0.
WELS_ASM_AARCH64_FUNC_BEGIN WelsHadamardT4Dc_AArch64_neon

    mov x2, #32                                // stride between DC positions
    ld1 {v0.h}[0], [x1], x2
    ld1 {v1.h}[0], [x1], x2
    ld1 {v0.h}[1], [x1], x2
    ld1 {v1.h}[1], [x1], x2

    ld1 {v2.h}[0], [x1], x2
    ld1 {v3.h}[0], [x1], x2
    ld1 {v2.h}[1], [x1], x2
    ld1 {v3.h}[1], [x1], x2

    ld1 {v0.h}[2], [x1], x2
    ld1 {v1.h}[2], [x1], x2
    ld1 {v0.h}[3], [x1], x2
    ld1 {v1.h}[3], [x1], x2

    ld1 {v2.h}[2], [x1], x2
    ld1 {v3.h}[2], [x1], x2
    ld1 {v2.h}[3], [x1], x2
    ld1 {v3.h}[3], [x1], x2                    // v0[0 4 08 12],v1[1 5 09 13],v2[2 6 10 14],v3[3 7 11 15]

    ROW_TRANSFORM_0_STEP v0, v1, v3, v2, v4, v7, v6, v5
    TRANSFORM_4BYTES v0, v1, v3, v2, v4, v7, v6, v5

    // transform element 32bits
    uzp1 v4.4s, v0.4s, v1.4s                   // 0 2 4 6
    uzp2 v5.4s, v0.4s, v1.4s                   // 1 3 5 7
    uzp1 v6.4s, v2.4s, v3.4s                   // 8 10 12 14
    uzp2 v7.4s, v2.4s, v3.4s                   // 9 11 13 15

    uzp1 v0.4s, v4.4s, v6.4s                   // 0 4 8 12
    uzp2 v2.4s, v4.4s, v6.4s                   // 2 6 10 14
    uzp1 v1.4s, v5.4s, v7.4s                   // 1 5 9 13
    uzp2 v3.4s, v5.4s, v7.4s                   // 3 7 11 15

    COL_TRANSFORM_0_STEP v0, v1, v3, v2, v4, v7, v6, v5
    TRANSFORM_4BYTES v0, v1, v3, v2, v4, v7, v6, v5
    sqrshrn v4.4h, v0.4s, #1                   // saturating rounded narrow, >>1
    sqrshrn2 v4.8h, v1.4s, #1
    sqrshrn v5.4h, v2.4s, #1
    sqrshrn2 v5.8h, v3.4s, #1
    st1 {v4.16b, v5.16b}, [x0]                 //store
WELS_ASM_AARCH64_FUNC_END

//void WelsIDctRecI16x16Dc_AArch64_neon (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride,
//                                       int16_t* pDctDc);
// DC-only reconstruction of a 16x16 luma macroblock: each 4x4 sub-block gets
// its rounded DC value added to the prediction, clipped to [0, 255].
WELS_ASM_AARCH64_FUNC_BEGIN WelsIDctRecI16x16Dc_AArch64_neon
    SIGN_EXTENSION x1,w1
    SIGN_EXTENSION x3,w3
    ld1 {v16.16b,v17.16b}, [x4]                // 16 DC values
    srshr v16.8h, v16.8h, #6                   // rounding shift by 6
    srshr v17.8h, v17.8h, #6

    // rows 0-3: broadcast DC 0..3, one per 4x4 column block
    dup v0.8h, v16.h[0]
    dup v1.8h, v16.h[1]
    ins v0.d[1], v1.d[0]
    dup v1.8h, v16.h[2]
    dup v2.8h, v16.h[3]
    ins v1.d[1], v2.d[0]

.rept 4
    ld1 {v3.16b}, [x2], x3
    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP v3, v0, v1, v4, v5
    st1 {v3.16b}, [x0], x1
.endr

    // rows 4-7: DC 4..7
    dup v0.8h, v16.h[4]
    dup v1.8h, v16.h[5]
    ins v0.d[1], v1.d[0]
    dup v1.8h, v16.h[6]
    dup v2.8h, v16.h[7]
    ins v1.d[1], v2.d[0]

.rept 4
    ld1 {v3.16b}, [x2], x3
    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP v3, v0, v1, v4, v5
    st1 {v3.16b}, [x0], x1
.endr

    // rows 8-11: DC 8..11
    dup v0.8h, v17.h[0]
    dup v1.8h, v17.h[1]
    ins v0.d[1], v1.d[0]
    dup v1.8h, v17.h[2]
    dup v2.8h, v17.h[3]
    ins v1.d[1], v2.d[0]

.rept 4
    ld1 {v3.16b}, [x2], x3
    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP v3, v0, v1, v4, v5
    st1 {v3.16b}, [x0], x1
.endr

    // rows 12-15: DC 12..15
    dup v0.8h, v17.h[4]
    dup v1.8h, v17.h[5]
    ins v0.d[1], v1.d[0]
    dup v1.8h, v17.h[6]
    dup v2.8h, v17.h[7]
    ins v1.d[1], v2.d[0]

.rept 4
    ld1 {v3.16b}, [x2], x3
    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP v3, v0, v1, v4, v5
    st1 {v3.16b}, [x0], x1
.endr
WELS_ASM_AARCH64_FUNC_END
#endif