/*!
 * \copy
 *     Copyright (c)  2013, Cisco Systems
 *     All rights reserved.
 *
 *     Redistribution and use in source and binary forms, with or without
 *     modification, are permitted provided that the following conditions
 *     are met:
 *
 *        * Redistributions of source code must retain the above copyright
 *          notice, this list of conditions and the following disclaimer.
 *
 *        * Redistributions in binary form must reproduce the above copyright
 *          notice, this list of conditions and the following disclaimer in
 *          the documentation and/or other materials provided with the
 *          distribution.
 *
 *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *     POSSIBILITY OF SUCH DAMAGE.
 *
 */

#ifdef HAVE_NEON_AARCH64
#include "arm_arch64_common_macro.S"

WELS_ASM_AARCH64_FUNC_BEGIN DyadicBilinearDownsampler_AArch64_neon

    //Initialize the registers
    mov x6, x2                          //x6: src row base
    mov x8, x0                          //x8: dst row base
    mov w9, #0                          //w9: source bytes consumed in this row
    lsr w5, w5, #1                      //w5: output height (source height / 2)

    //Save the 16 bytes at the next output-row position: the width loop below
    //rounds each row up to a multiple of 32 source bytes, so the last row's
    //final store can spill past the image
    smaddl x7, w1, w5, x0
    ld1 {v16.16b}, [x7]                 //v16 is not clobbered by the loop below

    add x7, x2, w3, sxtw                //x7: second source row
    //process one output row per pass
comp_ds_bilinear_loop0:

    ld1 {v0.16b, v1.16b}, [x2], #32
    ld1 {v2.16b, v3.16b}, [x7], #32
    uzp1 v4.16b, v0.16b, v1.16b         //even columns of row 0
    uzp2 v5.16b, v0.16b, v1.16b         //odd columns of row 0
    uzp1 v6.16b, v2.16b, v3.16b         //even columns of row 1
    uzp2 v7.16b, v2.16b, v3.16b         //odd columns of row 1
    urhadd v0.16b, v4.16b, v5.16b       //rounded horizontal average, row 0
    urhadd v1.16b, v6.16b, v7.16b       //rounded horizontal average, row 1
    urhadd v2.16b, v0.16b, v1.16b       //rounded vertical average
    st1 {v2.16b}, [x0], #16
    add w9, w9, #32

    cmp w9, w4
    b.cc comp_ds_bilinear_loop0

    mov w9, #0
    add x6, x6, w3, sxtw #1             //advance src by two rows
    mov x2, x6
    add x7, x2, w3, sxtw
    add x8, x8, w1, sxtw                //advance dst by one row
    mov x0, x8
    sub w5, w5, #1

    cbnz w5, comp_ds_bilinear_loop0

    //Restore the saved tail bytes
    st1 {v16.16b}, [x0]

WELS_ASM_AARCH64_FUNC_END

WELS_ASM_AARCH64_FUNC_BEGIN DyadicBilinearDownsamplerWidthx32_AArch64_neon
    sub w9, w3, w4                      //w9: src stride minus src width
    sub w1, w1, w4, lsr #1              //w1: dst stride minus output width
    lsr w5, w5, #1                      //w5: output height (source height / 2)

    //loop over the output rows
comp_ds_bilinear_w_x32_loop0:

    lsr w6, w4, #5                      //w6: iterations = src width / 32
    add x7, x2, w3, sxtw                //x7: second source row
    //loop over one line of data
comp_ds_bilinear_w_x32_loop1:

    ld1 {v0.16b, v1.16b}, [x2], #32
    ld1 {v2.16b, v3.16b}, [x7], #32
    uzp1 v4.16b, v0.16b, v1.16b
    uzp2 v5.16b, v0.16b, v1.16b
    uzp1 v6.16b, v2.16b, v3.16b
    uzp2 v7.16b, v2.16b, v3.16b
    urhadd v0.16b, v4.16b, v5.16b
    urhadd v1.16b, v6.16b, v7.16b
    urhadd v2.16b, v0.16b, v1.16b
    st1 {v2.16b}, [x0], #16

    sub w6, w6, #1
    cbnz w6, comp_ds_bilinear_w_x32_loop1

    add x2, x7, w9, sxtw                //skip to the next source row pair
    add x0, x0, w1, sxtw
    sub w5, w5, #1
    cbnz w5, comp_ds_bilinear_w_x32_loop0
WELS_ASM_AARCH64_FUNC_END
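/*
 * For reference, a minimal C model of the 1:2 dyadic bilinear kernel used by
 * the two functions above: each output pixel is the rounded average of a 2x2
 * source block, computed as two rounded horizontal averages followed by one
 * rounded vertical average, matching the uzp1/uzp2 + urhadd sequence (urhadd
 * computes (a + b + 1) >> 1). This sketch is illustrative only and is not
 * part of the build; the _ref name and parameter names are hypothetical.
 *
 *   #include <stdint.h>
 *
 *   static void DyadicBilinearDownsampler_ref (uint8_t* pDst, int32_t kiDstStride,
 *                                              const uint8_t* pSrc, int32_t kiSrcStride,
 *                                              int32_t kiSrcWidth, int32_t kiSrcHeight) {
 *     for (int32_t y = 0; y < kiSrcHeight / 2; y++) {
 *       const uint8_t* pRow0 = pSrc + 2 * y * kiSrcStride;
 *       const uint8_t* pRow1 = pRow0 + kiSrcStride;
 *       for (int32_t x = 0; x < kiSrcWidth / 2; x++) {
 *         uint32_t uiTop = (pRow0[2 * x] + pRow0[2 * x + 1] + 1) >> 1;
 *         uint32_t uiBot = (pRow1[2 * x] + pRow1[2 * x + 1] + 1) >> 1;
 *         pDst[y * kiDstStride + x] = (uint8_t) ((uiTop + uiBot + 1) >> 1);
 *       }
 *     }
 *   }
 */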
WELS_ASM_AARCH64_FUNC_BEGIN DyadicBilinearOneThirdDownsampler_AArch64_neon

    //Initialize the registers
    mov x6, x2                          //x6: src row base
    mov x8, x0                          //x8: dst row base
    mov w9, #0                          //w9: source bytes consumed in this row

    //Save the 16 bytes at the next output-row position: the width loop below
    //rounds each row up to a multiple of 48 source bytes, so the last row's
    //final store can spill past the image
    smaddl x7, w1, w5, x0
    ld1 {v16.16b}, [x7]

    add x7, x2, w3, sxtw                //x7: second source row
    //process one output row per pass
comp_ds_bilinear_onethird_loop0:

    ld3 {v0.16b, v1.16b, v2.16b}, [x2], #48     //deinterleave; every third column is dropped
    ld3 {v4.16b, v5.16b, v6.16b}, [x7], #48

    uaddl v2.8h, v0.8b, v1.8b                   //sum columns 0 and 1, row 0
    uaddl2 v3.8h, v0.16b, v1.16b
    uaddl v6.8h, v4.8b, v5.8b                   //sum columns 0 and 1, row 1
    uaddl2 v7.8h, v4.16b, v5.16b
    urshr v2.8h, v2.8h, #1                      //rounded horizontal averages
    urshr v3.8h, v3.8h, #1
    urshr v6.8h, v6.8h, #1
    urshr v7.8h, v7.8h, #1

    urhadd v0.8h, v2.8h, v6.8h                  //rounded vertical averages
    urhadd v1.8h, v3.8h, v7.8h
    xtn v0.8b, v0.8h
    xtn v1.8b, v1.8h
    st1 {v0.8b, v1.8b}, [x0], #16

    add w9, w9, #48

    cmp w9, w4
    b.cc comp_ds_bilinear_onethird_loop0

    mov w9, #0
    add x6, x6, w3, sxtw #1                     //advance src by three rows
    add x6, x6, w3, sxtw
    mov x2, x6
    add x7, x2, w3, sxtw
    add x8, x8, w1, sxtw
    mov x0, x8
    sub w5, w5, #1

    cbnz w5, comp_ds_bilinear_onethird_loop0

    //Restore the saved tail bytes
    st1 {v16.16b}, [x0]
WELS_ASM_AARCH64_FUNC_END

//void DyadicBilinearQuarterDownsampler_AArch64_neon (uint8_t* pDst, const int32_t kiDstStride,
//                                                    uint8_t* pSrc, const int32_t kiSrcStride,
//                                                    const int32_t kiSrcWidth, const int32_t kiHeight);

WELS_ASM_AARCH64_FUNC_BEGIN DyadicBilinearQuarterDownsampler_AArch64_neon
    //Initialize the registers
    mov x6, x2                          //x6: src row base
    mov x8, x0                          //x8: dst row base
    mov w9, #0                          //w9: source bytes consumed in this row
    lsr w5, w5, #2                      //w5: output height = kiHeight / 4

    //Save the 16 bytes at the next output-row position: the width loop below
    //rounds each row up to a multiple of 64 source bytes, so the last row's
    //final store can spill past the image
    smaddl x7, w1, w5, x0
    ld1 {v16.16b}, [x7]

    add x7, x2, w3, sxtw                //x7: second source row
    //process one output row per pass
comp_ds_bilinear_quarter_loop0:

    ld2 {v0.8h, v1.8h}, [x2], #32       //keep byte pairs 0-1 of each 4, drop pairs 2-3
    ld2 {v2.8h, v3.8h}, [x2], #32
    ld2 {v4.8h, v5.8h}, [x7], #32
    ld2 {v6.8h, v7.8h}, [x7], #32

    uaddlp v0.8h, v0.16b                //sum adjacent bytes
    uaddlp v1.8h, v2.16b
    uaddlp v4.8h, v4.16b
    uaddlp v5.8h, v6.16b
    urshr v0.8h, v0.8h, #1              //rounded horizontal averages
    urshr v1.8h, v1.8h, #1
    urshr v4.8h, v4.8h, #1
    urshr v5.8h, v5.8h, #1

    urhadd v0.8h, v0.8h, v4.8h          //rounded vertical averages
    urhadd v1.8h, v1.8h, v5.8h
    xtn v0.8b, v0.8h
    xtn v1.8b, v1.8h
    st1 {v0.8b, v1.8b}, [x0], #16

    add w9, w9, #64

    cmp w9, w4
    b.cc comp_ds_bilinear_quarter_loop0

    mov w9, #0
    add x6, x6, w3, sxtw #2             //advance src by four rows
    mov x2, x6
    add x7, x2, w3, sxtw
    add x8, x8, w1, sxtw
    mov x0, x8
    sub w5, w5, #1

    cbnz w5, comp_ds_bilinear_quarter_loop0

    //Restore the saved tail bytes
    st1 {v16.16b}, [x0]
WELS_ASM_AARCH64_FUNC_END
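/*
 * The 1:3 and 1:4 kernels above follow the same pattern at a coarser step:
 * for each 3x3 (or 4x4) source block, only the top-left 2x2 sub-block is
 * averaged; the remaining columns and rows are skipped by the deinterleaving
 * loads (ld3, and ld2 with 16-bit lanes) and by the per-pass row advance.
 * A hedged C model of one output pixel for the quarter (1:4) case,
 * illustrative only; the _ref name is hypothetical:
 *
 *   #include <stdint.h>
 *
 *   static uint8_t QuarterDownsamplePixel_ref (const uint8_t* pSrc, int32_t kiSrcStride) {
 *     // rounded average of the top-left 2x2 pixels of a 4x4 block, in the
 *     // same two stages as the NEON code (urshr #1, then urhadd)
 *     uint32_t uiTop = (pSrc[0] + pSrc[1] + 1) >> 1;
 *     uint32_t uiBot = (pSrc[kiSrcStride] + pSrc[kiSrcStride + 1] + 1) >> 1;
 *     return (uint8_t) ((uiTop + uiBot + 1) >> 1);
 *   }
 */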
//void GeneralBilinearAccurateDownsampler_AArch64_neon (uint8_t* pDst, const int32_t kiDstStride,
//                                                      const int32_t kiDstWidth, const int32_t kiDstHeight,
//                                                      uint8_t* pSrc, const int32_t kiSrcStride,
//                                                      const uint32_t kuiScaleX, const uint32_t kuiScaleY);
WELS_ASM_AARCH64_FUNC_BEGIN GeneralBilinearAccurateDownsampler_AArch64_neon
    mov w10, #32767
    and w8, w6, w10                     //w8: fractional part of kuiScaleX (Q15)
    mov w11, #-1
    mul w12, w11, w8                    //w12: -uinc

    dup v2.4h, w8
    dup v0.4h, w12
    zip1 v0.4h, v0.4h, v2.4h            //uinc -uinc uinc -uinc

    and w9, w7, w10                     //w9: fractional part of kuiScaleY (Q15)
    mul w12, w11, w9                    //w12: -vinc

    dup v2.4h, w9
    dup v5.4h, w12
    ins v5.s[1], v2.s[0]                //vinc vinc -vinc -vinc

    mov w11, #0x40000000
    mov w12, #0x3FFF
    add w11, w11, w12
    dup v1.2s, w11                      //init u 16384 16383 16384 16383

    mov w8, #16384
    dup v7.4h, w8
    sub w11, w8, #1
    dup v2.4h, w11
    ins v7.s[0], v2.s[0]                //init v 16384 16384 16383 16383

    eor v26.16b, v26.16b, v26.16b       //clear the pixel-gather registers
    eor v27.16b, v27.16b, v27.16b
    SIGN_EXTENSION x1, w1
    SIGN_EXTENSION x2, w2
    SIGN_EXTENSION x3, w3
    SIGN_EXTENSION x5, w5
    SIGN_EXTENSION x6, w6
    SIGN_EXTENSION x7, w7

    sub x1, x1, x2                      //x1: kiDstStride - kiDstWidth
    sub x3, x3, #1                      //the last row is handled separately

_HEIGHT:
    lsr w11, w8, #15                    //integer part of the y coordinate
    mul w11, w11, w5
    add x15, x4, w11, sxtw              //x15: top source row
    add x12, x15, w5, sxtw              //x12: bottom source row

    mov x9, #16384                      //x9: x coordinate (Q15)
    sub x10, x2, #1                     //the last column is handled separately
    orr v6.8b, v1.8b, v1.8b             //v6: current u weights

_WIDTH:
    lsr x13, x9, #15                    //integer part of the x coordinate
    add x14, x15, x13
    ld2 {v26.b, v27.b}[0], [x14]        //v26.b[0] = a, v27.b[0] = b (top pair)
    add x14, x12, x13
    ld2 {v26.b, v27.b}[4], [x14]        //v26.b[4] = c, v27.b[4] = d (bottom pair)
    zip1 v28.2s, v26.2s, v27.2s
    zip2 v29.2s, v26.2s, v27.2s

    umull v20.4s, v6.4h, v7.4h          //combined weights: u x v products
    umull v21.2d, v28.2s, v20.2s
    ins v20.d[0], v20.d[1]
    umlal v21.2d, v29.2s, v20.2s

    addp d21, v21.2d                    //sum the four weighted pixels
    urshr d21, d21, #30                 //normalize: the weights sum to about 2^30

    st1 {v21.b}[0], [x0], #1
    add x9, x9, x6                      //step x by kuiScaleX
    add v6.4h, v6.4h, v0.4h             //update the u weights
    shl v6.4h, v6.4h, #1                //wrap the weights to 15 bits
    ushr v6.4h, v6.4h, #1
    sub x10, x10, #1
    cbnz x10, _WIDTH

WIDTH_END:
    lsr x9, x9, #15
    add x14, x15, x9
    ld1 {v21.b}[0], [x14]               //last column: copy the nearest pixel
    st1 {v21.b}[0], [x0], #1
    add w8, w8, w7                      //step y by kuiScaleY
    add x0, x0, x1
    add v7.4h, v7.4h, v5.4h             //update the v weights
    shl v7.4h, v7.4h, #1                //wrap the weights to 15 bits
    ushr v7.4h, v7.4h, #1
    sub x3, x3, #1
    cbnz x3, _HEIGHT

LAST_ROW:
    lsr w8, w8, #15
    mul w8, w8, w5
    add x4, x4, w8, sxtw                //x4: last source row to sample
    mov x9, #16384

_LAST_ROW_WIDTH:
    mov x11, x9
    lsr x11, x11, #15
    add x3, x4, x11
    ld1 {v21.b}[0], [x3]                //last row: copy the nearest pixel
    st1 {v21.b}[0], [x0], #1
    add x9, x9, x6
    sub x2, x2, #1
    cbnz x2, _LAST_ROW_WIDTH

WELS_ASM_AARCH64_FUNC_END

#endif
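/*
 * Reference model for GeneralBilinearAccurateDownsampler_AArch64_neon above
 * (illustrative only, not compiled): the kernel walks the source image with
 * Q15 fixed-point coordinates (kuiScaleX/kuiScaleY are the per-pixel steps)
 * and blends each 2x2 neighborhood with Q15 x Q15 weights, so the weighted
 * sum is normalized with a rounding shift by 30 (the urshr #30 above). The
 * NEON code forms the complementary weight as 32767 - u rather than
 * 32768 - u, so this simplified model may differ from it by one LSB.
 *
 *   #include <stdint.h>
 *
 *   static uint8_t BilinearAccuratePixel_ref (const uint8_t* pSrc, int32_t kiSrcStride,
 *                                             uint32_t uiU, uint32_t uiV) {
 *     // uiU, uiV: Q15 fractional offsets inside the 2x2 source block
 *     uint64_t ullSum =
 *         (uint64_t) (32768 - uiU) * (32768 - uiV) * pSrc[0] +
 *         (uint64_t) uiU * (32768 - uiV) * pSrc[1] +
 *         (uint64_t) (32768 - uiU) * uiV * pSrc[kiSrcStride] +
 *         (uint64_t) uiU * uiV * pSrc[kiSrcStride + 1];
 *     // the four weights sum to 32768 * 32768 = 2^30, so round and shift
 *     return (uint8_t) ((ullSum + (1ull << 29)) >> 30);
 *   }
 */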