1/*! 2 * \copy 3 * Copyright (c) 2013, Cisco Systems 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 10 * * Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 21 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 22 * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 23 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 24 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 25 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 28 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 
 *
 */

#ifdef HAVE_NEON_AARCH64
#include "arm_arch64_common_macro.S"

//----------------------------------------------------------------------------
// int32_t SumOf8x8SingleBlock_AArch64_neon (uint8_t* pRef, const int32_t kiRefStride);
//   In:  x0 = pRef (top-left of an 8x8 block), w1 = kiRefStride (bytes)
//   Out: x0 = sum of the 64 pixels
//   Two 8-byte rows are packed per 128-bit register; the bytes are widened
//   pairwise to u16 and accumulated, then reduced horizontally.
//----------------------------------------------------------------------------
//int32_t SumOf8x8SingleBlock_AArch64_neon (uint8_t* pRef, const int32_t kiRefStride);
WELS_ASM_AARCH64_FUNC_BEGIN SumOf8x8SingleBlock_AArch64_neon
    SIGN_EXTENSION x1,w1                    // x1 = (int64_t)kiRefStride
    ld1 {v0.d}[0], [x0], x1                 // rows 0..7, two rows per vector
    ld1 {v0.d}[1], [x0], x1
    ld1 {v1.d}[0], [x0], x1
    ld1 {v1.d}[1], [x0], x1
    ld1 {v2.d}[0], [x0], x1
    ld1 {v2.d}[1], [x0], x1
    ld1 {v3.d}[0], [x0], x1
    ld1 {v3.d}[1], [x0]                     // last row: no post-increment needed
    uaddlp v0.8h, v0.16b                    // widen u8 pairs -> u16
    uadalp v0.8h, v1.16b                    // accumulate remaining rows
    uadalp v0.8h, v2.16b
    uadalp v0.8h, v3.16b
    uaddlv s0, v0.8h                        // horizontal reduce -> s0 (upper lanes of v0 are zeroed)
    mov x0, v0.d[0]                         // return zero-extended sum in x0
WELS_ASM_AARCH64_FUNC_END

//----------------------------------------------------------------------------
// int32_t SumOf16x16SingleBlock_AArch64_neon (uint8_t* pRef, const int32_t kiRefStride);
//   In:  x0 = pRef (top-left of a 16x16 block), w1 = kiRefStride (bytes)
//   Out: x0 = sum of the 256 pixels
//----------------------------------------------------------------------------
//int32_t SumOf16x16SingleBlock_AArch64_neon (uint8_t* pRef, const int32_t kiRefStride);
WELS_ASM_AARCH64_FUNC_BEGIN SumOf16x16SingleBlock_AArch64_neon
    SIGN_EXTENSION x1,w1                    // x1 = (int64_t)kiRefStride
    ld1 {v0.16b}, [x0], x1                  // row 0
    uaddlp v0.8h, v0.16b                    // widen u8 pairs -> u16 accumulator
.rept 15                                    // rows 1..15
    ld1 {v1.16b}, [x0], x1
    uadalp v0.8h, v1.16b
.endr
    uaddlv s0, v0.8h                        // horizontal reduce (max 256*255 fits in 32 bits)
    mov x0, v0.d[0]                         // return zero-extended sum in x0
WELS_ASM_AARCH64_FUNC_END

//void SumOf8x8BlockOfFrame_AArch64_neon (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
//                                        const int32_t kiRefStride,
//                                        uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
//----------------------------------------------------------------------------
// Computes the 8x8 pixel sum at every (x,y) position of the frame area:
//   pFeatureOfBlock[y*kiWidth + x] = sum of the 8x8 block at (x,y), and
//   pTimesOfFeatureValue[sum]++ (histogram of feature values).
// The first row of positions is computed with full 8x8 loads; every
// subsequent row is derived incrementally from the row above:
//   new_sum = prev_sum - (row leaving at the top) + (row entering at the bottom)
//----------------------------------------------------------------------------
WELS_ASM_AARCH64_FUNC_BEGIN SumOf8x8BlockOfFrame_AArch64_neon
//(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,const int32_t kiRefStride,uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[])
    //x5: pTimesOfFeatureValue
    //x4: pFeatureOfBlock

    SIGN_EXTENSION x1,w1
    SIGN_EXTENSION x2,w2
    SIGN_EXTENSION x3,w3
    mov x8, x0                              // x8 = pRefPicture + kiWidth (right end of current band)
    mov x6, x1                              // x6 = kiWidth (loop-invariant copy)
    add x8, x8, x6
    add x4, x4, x6, lsl #1                  // x4 = pFeatureOfBlock + kiWidth (uint16_t entries)

    mov x7, x6                              // x7 = columns remaining in this band
_width_loop8x8_1:                           // --- first band: full 8x8 sums ---
    subs x0, x8, x7                         // x0 = &pRef[column]   (subs used as plain sub)
    ld1 {v0.d}[0], [x0], x3
    ld1 {v0.d}[1], [x0], x3
    ld1 {v1.d}[0], [x0], x3
    ld1 {v1.d}[1], [x0], x3
    ld1 {v2.d}[0], [x0], x3
    ld1 {v2.d}[1], [x0], x3
    ld1 {v3.d}[0], [x0], x3
    ld1 {v3.d}[1], [x0]
    uaddlp v0.8h, v0.16b
    uadalp v0.8h, v1.16b
    uadalp v0.8h, v2.16b
    uadalp v0.8h, v3.16b
    uaddlv s0, v0.8h                        // s0 = 8x8 sum; rest of v0 zeroed

    subs x1, x4, x7, lsl #1                 // x1 = &pFeatureOfBlock[column]
    st1 {v0.h}[0], [x1]                     // sum -> pFeatureOfBlock[i]
    mov w0, #0
    ins v0.s[1], w0                         // clear lane 1 (defensive; uaddlv already zeroed it)
    mov x0, v0.d[0]                         // x0 = zero-extended sum, used as histogram index
    add x1, x5, x0, lsl #2                  // x1 = &pTimesOfFeatureValue[sum]
    ldr w0, [x1]
    add w0, w0, #1                          // pTimesOfFeatureValue[sum]++
    str w0, [x1]
    subs x7, x7, #1
    cbnz x7, _width_loop8x8_1

    add x8, x8, x3                          // advance source to next band
    add x4, x4, x6, lsl #1                  // advance feature row
    subs x2, x2, #1
    cbz x2, _SumOf8x8BlockOfFrame_AArch64_neon_end

_height_loop8x8:                            // --- remaining bands: incremental update ---
    mov x7, x6
_width_loop8x8_2:
    subs x0, x8, x7                         // x0 = &pRef[column] in current band
    subs x1, x4, x7, lsl #1                 // x1 = &pFeatureOfBlock[column] (current row)
    subs x9, x1, x6, lsl #1                 // last line of pFeatureOfBlock[i]
    ldrh w10, [x9]                          // sum of last line of pFeatureOfBlock[i]

    subs x11, x0, x3                        // x11 -> row leaving the 8x8 window (top)
    ld1 {v0.d}[1], [x11]                    // removed row  -> v0.d[1]
    add x0, x11, x3, lsl #3                 // x11 + 8*stride -> row entering the window (bottom)
    ld1 {v0.d}[0], [x0]                     // added row    -> v0.d[0]

    uaddlp v0.8h, v0.16b                    // pairwise u16 sums of both rows
    addp v0.8h, v0.8h, v1.8h                // pair-add; upper half from v1 is don't-care (never read)
    uaddlp v0.4s, v0.8h                     // s[0] = sum(added row), s[1] = sum(removed row)
    umov w11, v0.s[0]
    umov w12, v0.s[1]

    subs w10, w10, w12                      // prev_sum - removed_row
    mov x0, #0                              // (redundant: writing w0 below zeroes the top half anyway)
    add w0, w10, w11                        // + added_row = new 8x8 sum
    strh w0, [x1]                           // sum -> pFeatureOfBlock[i]
    add x1, x5, x0, lsl #2                  // x1 = &pTimesOfFeatureValue[sum]
    ldr w0, [x1]
    add w0, w0, #1                          // histogram increment
    str w0, [x1]
    subs x7, x7, #1
    cbnz x7, _width_loop8x8_2

    add x8, x8, x3
    add x4, x4, x6, lsl #1
    subs x2, x2, #1
    cbnz x2, _height_loop8x8
_SumOf8x8BlockOfFrame_AArch64_neon_end:
WELS_ASM_AARCH64_FUNC_END

//----------------------------------------------------------------------------
// Same as SumOf8x8BlockOfFrame_AArch64_neon but for 16x16 blocks:
// full 16x16 sums for the first band, then incremental row updates
// (new_sum = prev_sum - top row + bottom row, rows summed with uaddlv).
//----------------------------------------------------------------------------
WELS_ASM_AARCH64_FUNC_BEGIN SumOf16x16BlockOfFrame_AArch64_neon
//(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,const int32_t kiRefStride,uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[])
    //x5: pTimesOfFeatureValue
    //x4: pFeatureOfBlock

    SIGN_EXTENSION x1,w1
    SIGN_EXTENSION x2,w2
    SIGN_EXTENSION x3,w3
    mov x8, x0                              // x8 = pRefPicture + kiWidth
    mov x6, x1                              // x6 = kiWidth
    add x8, x8, x6
    add x4, x4, x6, lsl #1                  // x4 = pFeatureOfBlock + kiWidth

    mov x7, x6                              // x7 = columns remaining
_width_loop16x16_1:                         // --- first band: full 16x16 sums ---
    subs x0, x8, x7                         // x0 = &pRef[column]
    ld1 {v0.16b}, [x0], x3
    uaddlp v0.8h, v0.16b
.rept 15
    ld1 {v1.16b}, [x0], x3
    uadalp v0.8h, v1.16b
.endr
    uaddlv s0, v0.8h                        // s0 = 16x16 sum; rest of v0 zeroed

    subs x1, x4, x7, lsl #1                 // x1 = &pFeatureOfBlock[column]
    st1 {v0.h}[0], [x1]                     // sum -> pFeatureOfBlock[i]
    mov w0, #0
    ins v0.s[1], w0                         // clear lane 1 (defensive; uaddlv already zeroed it)
    mov x0, v0.d[0]                         // histogram index
    add x1, x5, x0, lsl #2                  // x1 = &pTimesOfFeatureValue[sum]
    ldr w0, [x1]
    add w0, w0, #1
    str w0, [x1]
    subs x7, x7, #1
    cbnz x7, _width_loop16x16_1

    add x8, x8, x3
    add x4, x4, x6, lsl #1
    subs x2, x2, #1
    cbz x2, _SumOf16x16BlockOfFrame_AArch64_neon_end

_height_loop16x16:                          // --- remaining bands: incremental update ---
    mov x7, x6
_width_loop16x16_2:
    subs x0, x8, x7                         // x0 = &pRef[column]

    subs x1, x4, x7, lsl #1                 // x1 = &pFeatureOfBlock[column]
    subs x9, x1, x6, lsl #1                 // last line of pFeatureOfBlock[i]
    ldrh w10, [x9]                          // sum of last line of pFeatureOfBlock[i]

    subs x11, x0, x3                        // row leaving the 16x16 window (top)
    ld1 {v1.16b}, [x11]
    add x0, x11, x3, lsl #4                 // x11 + 16*stride -> row entering (bottom)
    ld1 {v0.16b}, [x0]

    uaddlv h0, v0.16b                       // h0 = sum(added row); 16*255 fits in 16 bits
    uaddlv h1, v1.16b                       // h1 = sum(removed row)
    umov w11, v0.h[0]
    umov w12, v1.h[0]

    subs w10, w10, w12                      // prev_sum - removed_row
    mov x0, #0                              // (redundant: writing w0 below zeroes the top half anyway)
    add w0, w10, w11                        // + added_row = new 16x16 sum
    strh w0, [x1]                           // sum -> pFeatureOfBlock[i]
    add x1, x5, x0, lsl #2                  // x1 = &pTimesOfFeatureValue[sum]
    ldr w0, [x1]
    add w0, w0, #1                          // histogram increment
    str w0, [x1]
    subs x7, x7, #1
    cbnz x7, _width_loop16x16_2

    add x8, x8, x3
    add x4, x4, x6, lsl #1
    subs x2, x2, #1
    cbnz x2, _height_loop16x16
_SumOf16x16BlockOfFrame_AArch64_neon_end:
WELS_ASM_AARCH64_FUNC_END

//----------------------------------------------------------------------------
// Builds the per-feature-value bucket pointers from the histogram:
// for each feature value i, both pLocationOfFeature[i] and
// pFeatureValuePointerList[i] are set to the running cursor into pBuf,
// which then advances by pTimesOfFeatureValue[i] * 4 bytes (each recorded
// location is two uint16_t). Main loop handles 4 values per iteration,
// with a fast path when all 4 counts are zero; remainder done scalar.
// NOTE(review): the vector loop runs before its count is tested, so this
// assumes kiListSize >= 4 — TODO confirm with callers.
//----------------------------------------------------------------------------
WELS_ASM_AARCH64_FUNC_BEGIN InitializeHashforFeature_AArch64_neon
// (uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize, uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList);
    SIGN_EXTENSION x2,w2
    mov x9, #3
    bic x5, x2, x9                          // x5 = kiListSize & ~3 (vector-loop count)
    mov x8, #0
_hash_assign_loop_x4:
    ld1 {v0.16b}, [x0], #16                 // 4 counts from pTimesOfFeatureValue
    shl v0.4s, v0.4s, #2                    // counts * 4 = byte advance per bucket
    addv s1, v0.4s
    umov w7, v1.s[0]                        // total advance of this group
    cbz w7, _hash_assign_with_copy_x4       // all four counts zero -> just replicate x1

    ins v2.d[0], x1                         // bucket i+0 pointer
    umov w8, v0.s[0]
    add x1, x1, x8                          // cursor += count[0]*4
    ins v2.d[1], x1                         // bucket i+1 pointer
    umov w8, v0.s[1]
    add x1, x1, x8
    ins v3.d[0], x1                         // bucket i+2 pointer
    umov w8, v0.s[2]
    add x1, x1, x8
    ins v3.d[1], x1                         // bucket i+3 pointer
    umov w8, v0.s[3]
    add x1, x1, x8
    st1 {v2.16b, v3.16b}, [x3], #32         // -> pLocationOfFeature[i..i+3]
    st1 {v2.16b, v3.16b}, [x4], #32         // -> pFeatureValuePointerList[i..i+3]
    b _assign_next
_hash_assign_with_copy_x4:
    dup v2.2d, x1                           // all four buckets share the unchanged cursor
    dup v3.2d, x1
    st1 {v2.16b, v3.16b}, [x3], #32
    st1 {v2.16b, v3.16b}, [x4], #32

_assign_next:
    subs x5, x5, #4
    cbnz x5, _hash_assign_loop_x4

    and x5, x2, x9                          // x5 = kiListSize & 3 (remainder)
    cbz x5, _hash_assign_end


_hash_assign_loop_x4_rem:                   // scalar tail, one value at a time
    str x1, [x3], #8
    str x1, [x4], #8
    ldr w8, [x0], #4
    lsl w8, w8, #2                          // count * 4 bytes
    add x1, x1, x8
    subs x5, x5, #1
    cbnz x5, _hash_assign_loop_x4_rem

_hash_assign_end:
WELS_ASM_AARCH64_FUNC_END

// Literal pool for FillQpelLocationByFeatureValue (loaded PC-relative with ldr q).
// .align 4 = 16-byte alignment (power-of-two argument on AArch64 gas).
.align 4
mv_x_inc_x4: .short 0x10, 0x10, 0x10, 0x10, 0x00, 0x00, 0x00, 0x00     // x step for 4 columns: 4<<2
mv_y_inc_x4: .short 0x04, 0x04, 0x04, 0x04, 0x00, 0x00, 0x00, 0x00     // y step per row: 1<<2
mx_x_offset_x4: .short 0x00, 0x04, 0x08, 0x0c, 0x00, 0x00, 0x00, 0x00  // initial x offsets (label says "mx"; these are x<<2 for columns 0..3)

//----------------------------------------------------------------------------
// For each position (x,y): v = pFeatureOfBlock[y*kiWidth + x]; write the
// packed 32-bit position ((x<<2) | ((y<<2) << 16)) through the write cursor
// *pFeatureValuePointerList[v], then advance that cursor by 4 bytes.
// Positions appear to be in quarter-pel units (<<2) — TODO confirm.
// Four columns are processed per iteration; kiWidth is assumed to be a
// multiple of 4 (loop counts down by 4) — TODO confirm with callers.
//----------------------------------------------------------------------------
WELS_ASM_AARCH64_FUNC_BEGIN FillQpelLocationByFeatureValue_AArch64_neon
// void (uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight, uint16_t** pFeatureValuePointerList)
    ldr q7, mv_x_inc_x4                     // per-iteration x increment
    ldr q6, mv_y_inc_x4                     // per-row y increment
    ldr q5, mx_x_offset_x4                  // initial x offsets for a row
    SIGN_EXTENSION x1,w1
    SIGN_EXTENSION x2,w2
    eor v4.16b, v4.16b, v4.16b              // NOTE(review): v4 is never read afterwards — appears dead
    eor v3.16b, v3.16b, v3.16b              // v3 = current y offset (starts at 0)
    dup v16.2d, x3                          // v8->v16 (v8-v15 are callee-saved under AAPCS64; v16 is volatile)

_hash_height_loop:
    mov x7, x1                              // x7 = columns remaining in this row
    mov v2.16b, v5.16b                      //mx_x_offset_x4: reset x offsets at row start

_hash_width_loop:
    ld1 {v0.d}[0], [x0], #8                 // 4 feature values (uint16_t each)

    ushll v0.4s, v0.4h, #3                  // value * 8 = byte offset into the pointer table
    uaddw v17.2d, v16.2d, v0.2s             // &pFeatureValuePointerList[v] for lanes 0,1
    uaddw2 v18.2d, v16.2d, v0.4s            // ... and lanes 2,3
    zip1 v1.8h, v2.8h, v3.8h                // interleave x,y -> packed 32-bit positions

    umov x4, v17.d[0]                       // lane 0: load cursor, store position, bump cursor
    ldr x5, [x4]
    umov w6, v1.s[0]
    str w6, [x5]
    add x5, x5, #4
    str x5, [x4]

    umov x4, v17.d[1]                       // lane 1
    ldr x5, [x4]
    umov w6, v1.s[1]
    str w6, [x5]
    add x5, x5, #4
    str x5, [x4]

    umov x4, v18.d[0]                       // lane 2
    ldr x5, [x4]
    umov w6, v1.s[2]
    str w6, [x5]
    add x5, x5, #4
    str x5, [x4]

    umov x4, v18.d[1]                       // lane 3
    ldr x5, [x4]
    umov w6, v1.s[3]
    str w6, [x5]
    add x5, x5, #4
    str x5, [x4]

    add v2.8h, v2.8h, v7.8h                 // x offsets += 4<<2
    subs x7, x7, #4
    cbnz x7, _hash_width_loop

    add v3.8h, v3.8h, v6.8h                 // y offset += 1<<2
    subs x2, x2, #1
    cbnz x2, _hash_height_loop
WELS_ASM_AARCH64_FUNC_END
#endif