/*!
 * \copy
 *     Copyright (c)  2013, Cisco Systems
 *     All rights reserved.
 *
 *     Redistribution and use in source and binary forms, with or without
 *     modification, are permitted provided that the following conditions
 *     are met:
 *
 *        * Redistributions of source code must retain the above copyright
 *          notice, this list of conditions and the following disclaimer.
 *
 *        * Redistributions in binary form must reproduce the above copyright
 *          notice, this list of conditions and the following disclaimer in
 *          the documentation and/or other materials provided with the
 *          distribution.
 *
 *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *     POSSIBILITY OF SUCH DAMAGE.
 *
 */

#ifdef HAVE_NEON
#include "arm_arch_common_macro.S"

// General notes for every routine in this file
//   * 32-bit ARM + NEON. The first four arguments arrive in r0-r3, the
//     remaining ones are read from the caller's stack after the register push.
//   * Each routine pops its saved registers right before WELS_ASM_FUNC_END.
//     NOTE(review): the actual return instruction is presumably emitted by
//     that macro (arm_arch_common_macro.S, not visible here) - confirm.
//   * No loop counter is checked for zero before its first decrement, unless
//     stated otherwise, so widths/heights must yield at least one iteration.


//----------------------------------------------------------------------------
// DyadicBilinearDownsampler_neon
//   Halves a byte plane in both directions. Each output byte is built with
//   rounding-halving adds: first of the two bytes of a horizontal pair, then
//   of the results of two vertically adjacent source rows.
//
// In:   r0 = dst, r1 = dst stride, r2 = src, r3 = src stride
//       stack arg 0 = src_width, stack arg 1 = src_height
// Work: r4 = src_width, r5 = remaining output rows (src_height >> 1)
//       r6 = start of current source row pair, r7 = second source row cursor
//       r8 = start of current output row, lr = source bytes consumed in row
//       q15 = saved 16 bytes located right after the last output row
//
// Every iteration consumes 32 source bytes per row and stores 16 output
// bytes, so a width that is not a multiple of 32 is effectively rounded up
// and the stores can run past the end of a row. The 16 bytes at
// dst + dst_stride * (src_height >> 1) are therefore saved up front and
// written back at the end.
// NOTE(review): this means 16 bytes past the last output row are read and
// rewritten - confirm the destination buffer is large enough.
//----------------------------------------------------------------------------
WELS_ASM_FUNC_BEGIN DyadicBilinearDownsampler_neon
    stmdb       sp!, {r4-r8, lr}

    // Fetch the stack arguments (6 registers pushed => 24 bytes)
    ldr         r4, [sp, #24]               // src_width
    ldr         r5, [sp, #28]               // src_height

    // Initialise the row bookkeeping
    mov         r6, r2                      // r6 = src row-pair base
    mov         r8, r0                      // r8 = dst row base
    mov         lr, #0                      // lr = bytes consumed in this row
    lsr         r5, #1                      // r5 = number of output rows

    // Save the tail that the unaligned width may overwrite
    mla         r7, r1, r5, r0              // r7 = dst + dst_stride * rows
    vld1.32     {q15}, [r7]

    add         r7, r2, r3                  // r7 = second row of the pair

    // One iteration = 32 source columns of one row pair
comp_ds_bilinear_loop0:

    vld1.8      {q0, q1}, [r2]!             // 32 bytes of the upper row
    vld1.8      {q2, q3}, [r7]!             // 32 bytes of the lower row
    vuzp.8      q0, q1                      // q0 = even columns, q1 = odd columns
    vuzp.8      q2, q3
    vrhadd.u8   q0, q0, q1                  // horizontal pairs, upper row
    vrhadd.u8   q2, q2, q3                  // horizontal pairs, lower row
    vrhadd.u8   q0, q0, q2                  // combine the two rows
    vst1.32     {q0}, [r0]!
    add         lr, #32

    // When the row is finished (lr >= src_width) step to the next row pair.
    // If it is not finished, cmp leaves Z clear and the bne below is taken;
    // otherwise subscs provides the flags for the row-count test.
    cmp         lr, r4
    movcs       lr, #0
    addcs       r6, r6, r3, lsl #1          // src base += 2 * src_stride
    movcs       r2, r6
    addcs       r7, r2, r3
    addcs       r8, r1                      // dst base += dst_stride
    movcs       r0, r8
    subscs      r5, #1
    bne         comp_ds_bilinear_loop0

    // Restore the saved tail; r0 now equals the address it was loaded from
    vst1.32     {q15}, [r0]

    ldmia       sp!, {r4-r8, lr}
WELS_ASM_FUNC_END


//----------------------------------------------------------------------------
// comp_ds_bilinear_w_x8_neon
//   2:1 downsample for widths handled in steps of 8 source bytes.
//   Per step: pairwise-add the bytes of both rows, rounding-shift each sum
//   right by one, rounding-halving-add the two rows, narrow to 4 bytes.
//
// In:   r0 = dst, r1 = dst stride, r2 = src, r3 = src stride
//       stack arg 0 = src_width, stack arg 1 = src_height
// Work: r4 = src_width, r5 = remaining output rows, r6 = steps left in row
//       r7 = second source row cursor
//       lr = src_stride - src_width, r1 = dst_stride - src_width / 2
// The inner counter is src_width >> 3, so src_width must be at least 8;
// NOTE(review): presumably callers only pass multiples of 8 - confirm.
//----------------------------------------------------------------------------
WELS_ASM_FUNC_BEGIN comp_ds_bilinear_w_x8_neon
    stmdb       sp!, {r4-r7, lr}

    // Fetch the stack arguments (5 registers pushed => 20 bytes)
    ldr         r4, [sp, #20]               // src_width
    ldr         r5, [sp, #24]               // src_height

    // Gaps between the end of one processed row and the start of the next
    sub         lr, r3, r4
    sub         r1, r1, r4, lsr #1

    lsr         r5, #1                      // number of output rows

    // One iteration = one output row
comp_ds_bilinear_w_x8_loop0:

    lsr         r6, r4, #3                  // 8-byte steps per row
    add         r7, r2, r3                  // second row of the pair

    // One iteration = 8 source columns
comp_ds_bilinear_w_x8_loop1:

    vld1.8      {d0}, [r2]!                 // upper row
    vld1.8      {d1}, [r7]!                 // lower row
    vpaddl.u8   q0, q0                      // d0/d1 = horizontal pair sums
    vrshr.u16   q0, #1
    vrhadd.u16  d0, d1                      // combine the two rows

    vmovn.u16   d0, q0                      // low 4 bytes hold the results
    vst1.32     {d0[0]}, [r0]!
    subs        r6, #1
    bne         comp_ds_bilinear_w_x8_loop1

    add         r2, r7, lr                  // r7 ended at row 1 + width => row 2
    add         r0, r1
    subs        r5, #1
    bne         comp_ds_bilinear_w_x8_loop0

    ldmia       sp!, {r4-r7, lr}
WELS_ASM_FUNC_END


//----------------------------------------------------------------------------
// comp_ds_bilinear_w_x16_neon
//   Same scheme as comp_ds_bilinear_w_x8_neon, 16 source bytes (8 output
//   bytes) per inner step.
//
// In:   r0 = dst, r1 = dst stride, r2 = src, r3 = src stride
//       stack arg 0 = src_width, stack arg 1 = src_height
// Work: r4 = src_width, r5 = remaining output rows, r6 = steps left in row
//       r7 = second source row cursor
//       lr = src_stride - src_width, r1 = dst_stride - src_width / 2
// The inner counter is src_width >> 4, so src_width must be at least 16.
//----------------------------------------------------------------------------
WELS_ASM_FUNC_BEGIN comp_ds_bilinear_w_x16_neon
    stmdb       sp!, {r4-r7, lr}

    // Fetch the stack arguments (5 registers pushed => 20 bytes)
    ldr         r4, [sp, #20]               // src_width
    ldr         r5, [sp, #24]               // src_height

    // Gaps between the end of one processed row and the start of the next
    sub         lr, r3, r4
    sub         r1, r1, r4, lsr #1

    lsr         r5, #1                      // number of output rows

    // One iteration = one output row
comp_ds_bilinear_w_x16_loop0:

    lsr         r6, r4, #4                  // 16-byte steps per row
    add         r7, r2, r3                  // second row of the pair

    // One iteration = 16 source columns
comp_ds_bilinear_w_x16_loop1:

    vld1.8      {q0}, [r2]!                 // upper row
    vld1.8      {q1}, [r7]!                 // lower row
    vpaddl.u8   q0, q0                      // horizontal pair sums
    vpaddl.u8   q1, q1
    vrshr.u16   q0, #1
    vrshr.u16   q1, #1
    vrhadd.u16  q0, q1                      // combine the two rows

    vmovn.u16   d0, q0
    vst1.32     {d0}, [r0]!
    subs        r6, #1
    bne         comp_ds_bilinear_w_x16_loop1

    add         r2, r7, lr                  // r7 ended at row 1 + width => row 2
    add         r0, r1
    subs        r5, #1
    bne         comp_ds_bilinear_w_x16_loop0

    ldmia       sp!, {r4-r7, lr}
WELS_ASM_FUNC_END


//----------------------------------------------------------------------------
// DyadicBilinearDownsamplerWidthx32_neon
//   2:1 downsample, 32 source bytes (16 output bytes) per inner step, using
//   the same de-interleave + rounding-halving-add kernel as
//   DyadicBilinearDownsampler_neon but without the tail save/restore.
//
// In:   r0 = dst, r1 = dst stride, r2 = src, r3 = src stride
//       stack arg 0 = src_width, stack arg 1 = src_height
// Work: r4 = src_width, r5 = remaining output rows, r6 = steps left in row
//       r7 = second source row cursor
//       lr = src_stride - src_width, r1 = dst_stride - src_width / 2
// The inner counter is src_width >> 5, so src_width must be at least 32.
//----------------------------------------------------------------------------
WELS_ASM_FUNC_BEGIN DyadicBilinearDownsamplerWidthx32_neon
    stmdb       sp!, {r4-r7, lr}

    // Fetch the stack arguments (5 registers pushed => 20 bytes)
    ldr         r4, [sp, #20]               // src_width
    ldr         r5, [sp, #24]               // src_height

    // Gaps between the end of one processed row and the start of the next
    sub         lr, r3, r4
    sub         r1, r1, r4, lsr #1

    lsr         r5, #1                      // number of output rows

    // One iteration = one output row
comp_ds_bilinear_w_x32_loop0:

    lsr         r6, r4, #5                  // 32-byte steps per row
    add         r7, r2, r3                  // second row of the pair

    // One iteration = 32 source columns
comp_ds_bilinear_w_x32_loop1:

    vld1.8      {q0, q1}, [r2]!             // upper row
    vld1.8      {q2, q3}, [r7]!             // lower row
    vuzp.8      q0, q1                      // q0 = even columns, q1 = odd columns
    vuzp.8      q2, q3
    vrhadd.u8   q0, q0, q1                  // horizontal pairs, upper row
    vrhadd.u8   q2, q2, q3                  // horizontal pairs, lower row
    vrhadd.u8   q0, q0, q2                  // combine the two rows
    vst1.32     {q0}, [r0]!
    subs        r6, #1
    bne         comp_ds_bilinear_w_x32_loop1

    add         r2, r7, lr                  // r7 ended at row 1 + width => row 2
    add         r0, r1
    subs        r5, #1
    bne         comp_ds_bilinear_w_x32_loop0

    ldmia       sp!, {r4-r7, lr}
WELS_ASM_FUNC_END


//----------------------------------------------------------------------------
// GeneralBilinearAccurateDownsampler_neon
//   Arbitrary-ratio bilinear resampling with 15-bit fixed-point positions.
//   The last output column of every row and the whole last output row are
//   plain copies of the nearest source byte (no interpolation).
//
// In:   r0 = dst, r1 = dst stride, r2 = dst width, r3 = dst height
//       stack arg 0 = src, stack arg 1 = src stride,
//       stack arg 2 = scaleX, stack arg 3 = scaleY
// Work: r4 = src / scratch address, r5 = src stride, r6 = scaleX, r7 = scaleY
//       r8 = yInverse (vertical position), r9 = xInverse (horizontal position)
//       r10 = interpolated columns left, r3 = interpolated rows left
//       r11 / r12 = upper / lower source row address
//       d0 = u increment, d5 = v increment, d1 = initial u weights
//       d6 = current u weights, d7 = current v weights
//       q14 = the four source bytes a,b (upper row) and c,d (lower row)
//   Vector lane comments below list lanes from highest to lowest.
//
// A dst height or dst width of 1 leaves nothing to interpolate in that
// direction; those cases jump straight to the copy-only code instead of
// entering a loop whose counter would wrap around.
// Dimensions of 0 are not supported.
//----------------------------------------------------------------------------
WELS_ASM_FUNC_BEGIN GeneralBilinearAccurateDownsampler_neon
    stmdb       sp!, {r4-r12, lr}

    // Fetch the stack arguments (10 registers pushed => 40 bytes)
    ldr         r4, [sp, #40]               // address of src
    ldr         r5, [sp, #44]               // src_stride
    ldr         r6, [sp, #48]               // scaleX
    ldr         r7, [sp, #52]               // scaleY

    mov         r10, #32768
    sub         r10, #1                     // r10 = 0x7fff
    and         r8, r6, r10                 // r8 = uinc = scaleX & 32767 (mod 32768)
    mov         r11, #-1
    mul         r11, r8                     // r11 = -uinc

    vdup.s16    d2, r8
    vdup.s16    d0, r11
    vzip.s16    d0, d2                      // d0: uinc -uinc uinc -uinc

    and         r9, r7, r10                 // r9 = vinc = scaleY & 32767 (mod 32768)
    mov         r11, #-1
    mul         r11, r9                     // r11 = -vinc

    vdup.s16    d2, r9
    vdup.s16    d3, r11
    vext.8      d5, d3, d2, #4              // d5: vinc vinc -vinc -vinc

    mov         r11, #0x40000000
    mov         r12, #0x4000
    sub         r12, #1
    add         r11, r12                    // r11 = 0x40003fff
    vdup.s32    d1, r11                     // init u: 16384 16383 16384 16383

    mov         r11, #16384
    vdup.s16    d16, r11
    sub         r11, #1
    vdup.s16    d17, r11
    vext.8      d7, d17, d16, #4            // init v: 16384 16384 16383 16383

    veor        q14, q14                    // only bytes 0 and 4 of d28/d29 are ever reloaded
    sub         r1, r2                      // r1 = dst_stride - dst_width
    mov         r8, #16384                  // yInverse starts at half a step
    subs        r3, #1                      // rows that get interpolated
    beq         LAST_ROW                    // dst_height == 1: only the copied row exists

_HEIGHT:
    ldr         r4, [sp, #40]               // address of src
    mov         r11, r8
    lsr         r11, #15                    // integer source row
    mul         r11, r5
    add         r11, r4                     // r11 = upper source row
    mov         r12, r11
    add         r12, r5                     // r12 = lower source row

    mov         r9, #16384                  // xInverse starts at half a step
    vmov.s16    d6, d1                      // reset the u weights for this row
    subs        r10, r2, #1                 // columns that get interpolated
    beq         WIDTH_END                   // dst_width == 1: only the copied column exists

_WIDTH:
    mov         lr, r9
    lsr         lr, #15                     // integer source column
    add         r4, r11, lr
    vld2.8      {d28[0], d29[0]}, [r4]      // q14: 0000000b0000000a
    add         r4, r12, lr
    vld2.8      {d28[4], d29[4]}, [r4]      // q14: 000d000b000c000a
    vzip.32     d28, d29                    // q14: 000d000c000b000a

    vmull.u16   q13, d6, d7                 // q13 = four 32-bit weights (u * v)
    vmull.u32   q12, d26, d28               // weights 0,1 times a,b
    vmlal.u32   q12, d27, d29               // plus weights 2,3 times c,d
    vqadd.u64   d24, d24, d25               // sum of the four products
    vrshr.u64   d24, #30                    // drop the 2 x 15 fraction bits, rounding

    vst1.8      {d24[0]}, [r0]!
    add         r9, r6                      // xInverse += scaleX
    vadd.u16    d6, d0                      // step the u weights
    vshl.u16    d6, #1                      // shift pair clears bit 15 of each lane
    vshr.u16    d6, #1
    subs        r10, #1
    bne         _WIDTH

WIDTH_END:
    // Last column of the row: copy the nearest byte of the upper source row
    lsr         r9, #15
    add         r4, r11, r9
    vld1.8      {d24[0]}, [r4]
    vst1.8      {d24[0]}, [r0]
    add         r0, #1
    add         r8, r7                      // yInverse += scaleY
    add         r0, r1                      // skip to the start of the next dst row
    vadd.s16    d7, d5                      // step the v weights
    vshl.u16    d7, #1                      // shift pair clears bit 15 of each lane
    vshr.u16    d7, #1
    subs        r3, #1
    bne         _HEIGHT

LAST_ROW:
    // Last output row: copy the nearest byte for every column
    ldr         r4, [sp, #40]               // address of src
    lsr         r8, #15                     // integer source row
    mul         r8, r5
    add         r4, r8                      // r4 = source row
    mov         r9, #16384                  // xInverse starts at half a step

_LAST_ROW_WIDTH:
    mov         r11, r9
    lsr         r11, #15                    // integer source column

    add         r3, r4, r11
    vld1.8      {d0[0]}, [r3]
    vst1.8      {d0[0]}, [r0]
    add         r0, #1
    add         r9, r6                      // xInverse += scaleX
    subs        r2, #1
    bne         _LAST_ROW_WIDTH

    ldmia       sp!, {r4-r12, lr}
WELS_ASM_FUNC_END


//----------------------------------------------------------------------------
// DyadicBilinearOneThirdDownsampler_neon
//   3:1 downsample. Out of every 3x3 source block only the upper-left 2x2
//   bytes contribute: the first two columns of each triple (the third
//   de-interleaved register of every vld3 is ignored) and the first two rows
//   of each group of three.
//
// In:   r0 = dst, r1 = dst stride, r2 = src, r3 = src stride
//       stack arg 0 = src_width, stack arg 1 = row count
//       The row count is used unshifted as the number of OUTPUT rows (the
//       original comment called it src_height).
//       NOTE(review): confirm against the caller what is passed here.
// Work: r4 = src_width, r5 = remaining output rows
//       r6 = start of current source row group, r7 = second source row cursor
//       r8 = start of current output row, lr = source bytes consumed in row
//       q15 = saved 16 bytes located right after the last output row
//
// Every iteration consumes 48 source bytes per row and stores 16 output
// bytes; the stores may run past the end of a row, hence the save/restore of
// the 16 bytes at dst + dst_stride * rows.
//----------------------------------------------------------------------------
WELS_ASM_FUNC_BEGIN DyadicBilinearOneThirdDownsampler_neon
    stmdb       sp!, {r4-r8, lr}

    // Fetch the stack arguments (6 registers pushed => 24 bytes)
    ldr         r4, [sp, #24]               // src_width
    ldr         r5, [sp, #28]               // number of output rows (see header)

    // Initialise the row bookkeeping
    mov         r6, r2                      // r6 = src row-group base
    mov         r8, r0                      // r8 = dst row base
    mov         lr, #0                      // lr = bytes consumed in this row

    // Save the tail that the unaligned width may overwrite
    mla         r7, r1, r5, r0              // r7 = dst + dst_stride * rows
    vld1.32     {q15}, [r7]

    add         r7, r2, r3                  // r7 = second row of the group

    // One iteration = 48 source columns of one row group
comp_ds_bilinear_onethird_loop0:

    vld3.8      {d0, d1, d2}, [r2]!         // upper row, columns 0-23
    vld3.8      {d3, d4, d5}, [r2]!         // upper row, columns 24-47
    vld3.8      {d16, d17, d18}, [r7]!      // lower row, columns 0-23
    vld3.8      {d19, d20, d21}, [r7]!      // lower row, columns 24-47

    vaddl.u8    q11, d0, d1                 // column 0 + column 1 of each triple
    vaddl.u8    q12, d3, d4
    vaddl.u8    q13, d16, d17
    vaddl.u8    q14, d19, d20
    vrshr.u16   q11, #1
    vrshr.u16   q12, #1
    vrshr.u16   q13, #1
    vrshr.u16   q14, #1

    vrhadd.u16  q11, q13                    // combine the two rows
    vrhadd.u16  q12, q14

    vmovn.u16   d0, q11
    vmovn.u16   d1, q12
    vst1.8      {q0}, [r0]!

    // When the row is finished (lr >= src_width) step to the next row group.
    // If it is not finished, cmp leaves Z clear and the bne below is taken.
    add         lr, #48
    cmp         lr, r4
    movcs       lr, #0
    addcs       r6, r6, r3, lsl #1
    addcs       r6, r6, r3                  // src base += 3 * src_stride in total
    movcs       r2, r6
    addcs       r7, r2, r3
    addcs       r8, r1                      // dst base += dst_stride
    movcs       r0, r8
    subscs      r5, #1
    bne         comp_ds_bilinear_onethird_loop0

    // Restore the saved tail; r0 now equals the address it was loaded from
    vst1.32     {q15}, [r0]

    ldmia       sp!, {r4-r8, lr}
WELS_ASM_FUNC_END


//----------------------------------------------------------------------------
// DyadicBilinearQuarterDownsampler_neon
//   4:1 downsample. Out of every 4x4 source block only the upper-left 2x2
//   bytes contribute: vld2.16 puts the even halfwords (bytes 0,1 of each
//   group of four) into the first register of each pair, the second register
//   is ignored, and only the first two rows of each group of four are read.
//
// In:   r0 = dst, r1 = dst stride, r2 = src, r3 = src stride
//       stack arg 0 = src_width, stack arg 1 = src_height
// Work: r4 = src_width, r5 = remaining output rows (src_height >> 2)
//       r6 = start of current source row group, r7 = second source row cursor
//       r8 = start of current output row, lr = source bytes consumed in row
//       q15 = saved 16 bytes located right after the last output row
//
// Every iteration consumes 64 source bytes per row and stores 16 output
// bytes; the stores may run past the end of a row, hence the save/restore of
// the 16 bytes at dst + dst_stride * (src_height >> 2).
//----------------------------------------------------------------------------
WELS_ASM_FUNC_BEGIN DyadicBilinearQuarterDownsampler_neon
    stmdb       sp!, {r4-r8, lr}

    // Fetch the stack arguments (6 registers pushed => 24 bytes)
    ldr         r4, [sp, #24]               // src_width
    ldr         r5, [sp, #28]               // src_height

    // Initialise the row bookkeeping
    mov         r6, r2                      // r6 = src row-group base
    mov         r8, r0                      // r8 = dst row base
    mov         lr, #0                      // lr = bytes consumed in this row
    lsr         r5, #2                      // r5 = number of output rows

    // Save the tail that the unaligned width may overwrite
    mla         r7, r1, r5, r0              // r7 = dst + dst_stride * rows
    vld1.32     {q15}, [r7]

    add         r7, r2, r3                  // r7 = second row of the group

    // One iteration = 64 source columns of one row group
comp_ds_bilinear_quarter_loop0:

    vld2.16     {q0, q1}, [r2]!             // upper row, columns 0-31
    vld2.16     {q2, q3}, [r2]!             // upper row, columns 32-63
    vld2.16     {q8, q9}, [r7]!             // lower row, columns 0-31
    vld2.16     {q10, q11}, [r7]!           // lower row, columns 32-63

    vpaddl.u8   q0, q0                      // byte 0 + byte 1 of each group of four
    vpaddl.u8   q2, q2
    vpaddl.u8   q8, q8
    vpaddl.u8   q10, q10
    vrshr.u16   q0, #1
    vrshr.u16   q2, #1
    vrshr.u16   q8, #1
    vrshr.u16   q10, #1

    vrhadd.u16  q0, q8                      // combine the two rows
    vrhadd.u16  q2, q10
    vmovn.u16   d0, q0
    vmovn.u16   d1, q2
    vst1.8      {q0}, [r0]!

    // When the row is finished (lr >= src_width) step to the next row group.
    // If it is not finished, cmp leaves Z clear and the bne below is taken.
    add         lr, #64
    cmp         lr, r4
    movcs       lr, #0
    addcs       r6, r6, r3, lsl #2          // src base += 4 * src_stride
    movcs       r2, r6
    addcs       r7, r2, r3
    addcs       r8, r1                      // dst base += dst_stride
    movcs       r0, r8
    subscs      r5, #1
    bne         comp_ds_bilinear_quarter_loop0

    // Restore the saved tail; r0 now equals the address it was loaded from
    vst1.32     {q15}, [r0]

    ldmia       sp!, {r4-r8, lr}
WELS_ASM_FUNC_END

#endif