/*!
 * \copy
 *     Copyright (c)  2013, Cisco Systems
 *     All rights reserved.
 *
 *     Redistribution and use in source and binary forms, with or without
 *     modification, are permitted provided that the following conditions
 *     are met:
 *
 *        * Redistributions of source code must retain the above copyright
 *          notice, this list of conditions and the following disclaimer.
 *
 *        * Redistributions in binary form must reproduce the above copyright
 *          notice, this list of conditions and the following disclaimer in
 *          the documentation and/or other materials provided with the
 *          distribution.
 *
 *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *     POSSIBILITY OF SUCH DAMAGE.
 *
 */

#ifdef HAVE_NEON
#include "arm_arch_common_macro.S"

// Conventions used in this file:
//   * 32-bit ARM, NEON, AAPCS argument passing (r0-r3, then the stack).
//   * q4-q7 (d8-d15) are callee-saved under AAPCS; q0-q3 and q8-q15 are scratch.
//   * Every function relies on WELS_ASM_FUNC_END to emit the return.


//----------------------------------------------------------------------
// SumOf8x8SingleBlock_neon
// In:   r0 = address of the top-left byte of the block
//       r1 = distance in bytes between two consecutive rows
// Out:  r0 = sum of the 8x8 unsigned bytes
// Clobbers: r1, q0-q3
//----------------------------------------------------------------------
WELS_ASM_FUNC_BEGIN SumOf8x8SingleBlock_neon
    vld1.64     {d0}, [r0], r1
    vld1.64     {d1}, [r0], r1
    vld1.64     {d2}, [r0], r1
    vld1.64     {d3}, [r0], r1
    vld1.64     {d4}, [r0], r1
    vld1.64     {d5}, [r0], r1
    vld1.64     {d6}, [r0], r1
    vld1.64     {d7}, [r0]
    vpaddl.u8   q0, q0                  // rows 0-1 -> 8 x u16 pair sums
    vpadal.u8   q0, q1                  // accumulate rows 2-3
    vpadal.u8   q0, q2                  // accumulate rows 4-5
    vpadal.u8   q0, q3                  // accumulate rows 6-7

    vpaddl.u16  q0, q0                  // 8 x u16 -> 4 x u32
    vpadd.i32   d0, d1                  // 4 x u32 -> 2 x u32
    vpadd.i32   d0, d0                  // both lanes now hold the total
    vmov        r0, r1, d0              // r0 = total (r1 gets the same value)
WELS_ASM_FUNC_END


//----------------------------------------------------------------------
// SumOf16x16SingleBlock_neon
// In:   r0 = address of the top-left byte of the block
//       r1 = distance in bytes between two consecutive rows
// Out:  r0 = sum of the 16x16 unsigned bytes
// Clobbers: r1, q0, q1
//----------------------------------------------------------------------
WELS_ASM_FUNC_BEGIN SumOf16x16SingleBlock_neon
    vld1.64     {q0}, [r0], r1
    vpaddl.u8   q0, q0                  // row 0 -> 8 x u16 pair sums
.rept 15
    vld1.64     {q1}, [r0], r1
    vpadal.u8   q0, q1                  // accumulate rows 1..15
.endr
    vpaddl.u16  q0, q0                  // 8 x u16 -> 4 x u32
    vpadd.i32   d0, d1
    vpadd.i32   d0, d0                  // both lanes now hold the total
    vmov        r0, r1, d0              // r0 = total
WELS_ASM_FUNC_END


//----------------------------------------------------------------------
// SumOf8x8BlockOfFrame_neon
// (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
//  const int32_t kiRefStride, uint16_t* pFeatureOfBlock,
//  uint32_t pTimesOfFeatureValue[])
//
// For every position (x, y), 0 <= x < kiWidth, 0 <= y < kiHeight, stores the
// sum of the 8x8 block whose top-left byte is pRefPicture[y * kiRefStride + x]
// into pFeatureOfBlock[y * kiWidth + x] and increments
// pTimesOfFeatureValue[sum].
//
// The first row is summed directly.  Every later row is derived from the
// entry directly above it: subtract the picture line that left the 8-line
// window and add the line that entered it.
//
// Returns immediately when kiWidth <= 0 or kiHeight <= 0.
//
// Register roles:
//   r2 = rows still to produce          r3 = kiRefStride
//   r4 = end of current pFeatureOfBlock row
//   r5 = pTimesOfFeatureValue           r6 = kiWidth
//   r7 = columns still to produce       r8 = current picture row + kiWidth
//----------------------------------------------------------------------
WELS_ASM_FUNC_BEGIN SumOf8x8BlockOfFrame_neon
    stmdb       sp!, {r4-r12}           // 9 registers = 36 bytes
    ldr         r5, [sp, #40]           // pTimesOfFeatureValue
    ldr         r4, [sp, #36]           // pFeatureOfBlock

    cmp         r1, #0                  // nothing to do for an empty area; the
    ble         _SumOf8x8BlockOfFrame_end   // count-down loops below would wrap
    cmp         r2, #0
    ble         _SumOf8x8BlockOfFrame_end

    mov         r6, r1
    add         r8, r0, r6              // r8 = row start + kiWidth
    add         r4, r4, r6, lsl #1      // r4 = feature row start + kiWidth entries

    mov         r7, r6
_width_loop8x8_1:
    sub         r0, r8, r7              // r0 = &row[x], x = kiWidth - r7
    vld1.64     {d0}, [r0], r3
    vld1.64     {d1}, [r0], r3
    vld1.64     {d2}, [r0], r3
    vld1.64     {d3}, [r0], r3
    vld1.64     {d4}, [r0], r3
    vld1.64     {d5}, [r0], r3
    vld1.64     {d6}, [r0], r3
    vld1.64     {d7}, [r0]

    vpaddl.u8   q0, q0
    vpadal.u8   q0, q1
    vpadal.u8   q0, q2
    vpadal.u8   q0, q3
    vpaddl.u16  q0, q0
    vpadd.i32   d0, d1
    vpadd.i32   d0, d0                  // d0[0] = block sum

    sub         r1, r4, r7, lsl #1      // r1 = &pFeatureOfBlock[i]
    vst1.16     {d0[0]}, [r1]           // sum -> pFeatureOfBlock[i]
    vmov        r0, r1, d0              // r0 = sum
    add         r1, r5, r0, lsl #2      // r1 = &pTimesOfFeatureValue[sum]
    ldr         r0, [r1]
    add         r0, #1
    str         r0, [r1]                // pTimesOfFeatureValue[sum]++

    subs        r7, #1
    bne         _width_loop8x8_1

    add         r8, r3                  // next picture row
    add         r4, r4, r6, lsl #1      // next feature row
    subs        r2, #1
    beq         _SumOf8x8BlockOfFrame_end


_height_loop8x8:
    mov         r7, r6
_width_loop8x8_2:
    sub         r0, r8, r7              // r0 = top-left byte of this block
    sub         r1, r4, r7, lsl #1      // r1 = &pFeatureOfBlock[i]

    sub         r9, r1, r6, lsl #1      // last line of pFeatureOfBlock[i]
    ldrh        r10, [r9]               // sum of last line of pFeatureOfBlock[i]

    sub         r11, r0, r3             // picture line that left the window
    vld1.64     {d1}, [r11]
    add         r0, r11, r3, lsl #3     // picture line that entered the window
    vld1.64     {d0}, [r0]

    vpaddl.u8   q0, q0                  // d0 = new line, d1 = old line (4 x u16 each)
    vpadd.u16   d0, d0, d1
    vpaddl.u16  d0, d0                  // d0 = { new line sum, old line sum }
    vmov        r11, r12, d0            // r11 = new, r12 = old
    sub         r10, r10, r12
    add         r0, r10, r11            // r0 = above - old + new

    strh        r0, [r1]                // sum -> pFeatureOfBlock[i]

    add         r1, r5, r0, lsl #2
    ldr         r0, [r1]
    add         r0, #1
    str         r0, [r1]                // pTimesOfFeatureValue[sum]++
    subs        r7, #1
    bne         _width_loop8x8_2

    add         r8, r3
    add         r4, r4, r6, lsl #1
    subs        r2, #1
    bne         _height_loop8x8
_SumOf8x8BlockOfFrame_end:
    ldmia       sp!, {r4-r12}
WELS_ASM_FUNC_END


//----------------------------------------------------------------------
// SumOf16x16BlockOfFrame_neon
// (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
//  const int32_t kiRefStride, uint16_t* pFeatureOfBlock,
//  uint32_t pTimesOfFeatureValue[])
//
// Same scheme as SumOf8x8BlockOfFrame_neon with a 16x16 window: the first row
// is summed directly, later rows are updated from the entry above by
// subtracting the outgoing 16-byte line and adding the incoming one.
//
// Returns immediately when kiWidth <= 0 or kiHeight <= 0.
//
// Register roles:
//   r2 = rows still to produce          r3 = kiRefStride
//   r4 = end of current pFeatureOfBlock row
//   r5 = pTimesOfFeatureValue           r6 = kiWidth
//   r7 = columns still to produce       r8 = current picture row + kiWidth
//----------------------------------------------------------------------
WELS_ASM_FUNC_BEGIN SumOf16x16BlockOfFrame_neon
    stmdb       sp!, {r4-r12}           // 9 registers = 36 bytes
    ldr         r5, [sp, #40]           // pTimesOfFeatureValue
    ldr         r4, [sp, #36]           // pFeatureOfBlock

    cmp         r1, #0                  // nothing to do for an empty area; the
    ble         _SumOf16x16BlockOfFrame_neon_end    // count-down loops would wrap
    cmp         r2, #0
    ble         _SumOf16x16BlockOfFrame_neon_end

    mov         r6, r1
    add         r8, r0, r6              // r8 = row start + kiWidth
    add         r4, r4, r6, lsl #1      // r4 = feature row start + kiWidth entries

    mov         r7, r6
_width_loop16x16_1:
    sub         r0, r8, r7              // r0 = &row[x], x = kiWidth - r7
    vld1.64     {q0}, [r0], r3
    vpaddl.u8   q0, q0
.rept 15
    vld1.64     {q1}, [r0], r3
    vpadal.u8   q0, q1
.endr
    vpaddl.u16  q0, q0
    vpadd.i32   d0, d1
    vpadd.i32   d0, d0                  // d0[0] = block sum

    sub         r1, r4, r7, lsl #1      // r1 = &pFeatureOfBlock[i]
    vst1.16     {d0[0]}, [r1]           // sum -> pFeatureOfBlock[i]
    vmov        r0, r1, d0              // r0 = sum
    add         r1, r5, r0, lsl #2      // r1 = &pTimesOfFeatureValue[sum]
    ldr         r0, [r1]
    add         r0, #1
    str         r0, [r1]                // pTimesOfFeatureValue[sum]++

    subs        r7, #1
    bne         _width_loop16x16_1
    add         r8, r3                  // next picture row
    add         r4, r4, r6, lsl #1      // next feature row
    subs        r2, #1
    beq         _SumOf16x16BlockOfFrame_neon_end

_height_loop16x16:
    mov         r7, r6
_width_loop16x16_2:
    sub         r0, r8, r7              // r0 = top-left byte of this block
    sub         r1, r4, r7, lsl #1      // r1 = &pFeatureOfBlock[i]
    sub         r9, r1, r6, lsl #1      // last line of pFeatureOfBlock[i]
    ldrh        r10, [r9]               // sum of last line of pFeatureOfBlock[i]

    sub         r11, r0, r3             // picture line that left the window
    vld1.64     {q1}, [r11]
    add         r0, r11, r3, lsl #4     // picture line that entered the window
    vld1.64     {q0}, [r0]

    vpaddl.u8   q0, q0                  // new line -> 8 x u16
    vpaddl.u8   q1, q1                  // old line -> 8 x u16
    vpadd.u16   d0, d0, d1              // new line -> 4 x u16
    vpadd.u16   d1, d2, d3              // old line -> 4 x u16
    vpadd.u16   d0, d0, d1              // { new, new, old, old }
    vpaddl.u16  d0, d0                  // d0 = { new line sum, old line sum }

    vmov        r11, r12, d0            // r11 = new, r12 = old
    sub         r10, r10, r12
    add         r0, r10, r11            // r0 = above - old + new

    strh        r0, [r1]                // sum -> pFeatureOfBlock[i]
    add         r1, r5, r0, lsl #2
    ldr         r0, [r1]
    add         r0, #1
    str         r0, [r1]                // pTimesOfFeatureValue[sum]++

    subs        r7, #1
    bne         _width_loop16x16_2

    add         r8, r3
    add         r4, r4, r6, lsl #1
    subs        r2, #1
    bne         _height_loop16x16
_SumOf16x16BlockOfFrame_neon_end:
    ldmia       sp!, {r4-r12}
WELS_ASM_FUNC_END


//----------------------------------------------------------------------
// InitializeHashforFeature_neon
// (uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize,
//  uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList)
//
// Walks the kiListSize counters in pTimesOfFeatureValue and writes, for each
// index i, the running position inside pBuf to both pLocationOfFeature[i] and
// pFeatureValuePointerList[i].  The position starts at pBuf and advances by
// (pTimesOfFeatureValue[i] << 2) bytes after each entry.
//
// Entries are handled four at a time with a NEON prefix sum; the remaining
// 0..3 entries (or the whole list when kiListSize < 4) use the scalar loop.
// Returns immediately when kiListSize <= 0.
//
// Register roles:
//   r0 = read cursor in pTimesOfFeatureValue   r1 = running position
//   r3 = write cursor in pLocationOfFeature
//   r4 = write cursor in pFeatureValuePointerList
//   r5 = entries left in the current loop
// Only caller-saved NEON registers (q0-q3, q8) are used.
//----------------------------------------------------------------------
WELS_ASM_FUNC_BEGIN InitializeHashforFeature_neon
    stmdb       sp!, {r4-r7}            // 4 registers = 16 bytes
    ldr         r4, [sp, #16]           // pFeatureValuePointerList
    cmp         r2, #0
    ble         _hash_assign_end        // empty list: nothing to write
    bic         r5, r2, #3              // entries covered by the x4 loop
    cmp         r5, #0
    beq         _hash_assign_rem        // fewer than 4 entries: scalar only
_hash_assign_loop_x4:
    vld1.64     {q0}, [r0]!
    vshl.u32    q0, q0, #2              // counts -> byte steps {c0, c1, c2, c3}
    vceq.u32    q1, q0, #0
    vand.i32    d2, d2, d3
    vmov        r6, r7, d2
    and         r6, r6, r7
    cmp         r6, #0xffffffff         // all four steps zero?
    beq         _hash_assign_with_copy_x4

    veor        q1, q1                  // q1 = 0, used to shift zeros in
    vext.32     q2, q1, q0, #3          // { 0, c0, c1, c2 }
    vext.32     q3, q1, q0, #2          // { 0,  0, c0, c1 }
    vext.32     q8, q1, q0, #1          // { 0,  0,  0, c0 }
    vadd.u32    q0, q0, q2
    vadd.u32    q0, q0, q3
    vadd.u32    q0, q0, q8              // q0 = inclusive prefix sums
    vext.32     q2, q1, q0, #3          // q2 = exclusive prefix sums
    vdup.32     q3, r1
    vadd.u32    q2, q2, q3              // four consecutive positions
    vst1.64     {q2}, [r3]!
    vst1.64     {q2}, [r4]!
    vmov.32     r6, d1[1]               // c0 + c1 + c2 + c3
    add         r1, r1, r6
    b           _assign_next

_hash_assign_with_copy_x4:
    vdup.32     q2, r1                  // position does not move: same value x4
    vst1.64     {q2}, [r3]!
    vst1.64     {q2}, [r4]!

_assign_next:
    subs        r5, r5, #4
    bne         _hash_assign_loop_x4

_hash_assign_rem:
    and         r5, r2, #3              // 0..3 entries left
    cmp         r5, #0
    beq         _hash_assign_end
_hash_assign_loop_x4_rem:
    str         r1, [r3], #4
    str         r1, [r4], #4
    ldr         r7, [r0], #4
    lsl         r7, r7, #2
    add         r1, r1, r7
    subs        r5, r5, #1
    bne         _hash_assign_loop_x4_rem
_hash_assign_end:

    ldmia       sp!, {r4-r7}
WELS_ASM_FUNC_END

// Constant tables for FillQpelLocationByFeatureValue_neon.  Only the first
// four 16-bit lanes of each table are non-zero.
//   mv_x_inc_x4    : added to the x values after each group of four columns
//   mv_y_inc_x4    : added to the y values after each row
//   mx_x_offset_x4 : x values of the first four columns of a row
.align 4
mv_x_inc_x4:    .short 0x10, 0x10, 0x10, 0x10, 0x00, 0x00, 0x00, 0x00
mv_y_inc_x4:    .short 0x04, 0x04, 0x04, 0x04, 0x00, 0x00, 0x00, 0x00
mx_x_offset_x4: .short 0x00, 0x04, 0x08, 0x0c, 0x00, 0x00, 0x00, 0x00

//----------------------------------------------------------------------
// FillQpelLocationByFeatureValue_neon
// void (uint16_t* pFeatureOfBlock, const int32_t kiWidth,
//       const int32_t kiHeight, uint16_t** pFeatureValuePointerList)
//
// Reads kiWidth * kiHeight consecutive 16-bit feature values.  For the value
// f found at column x of row y it stores the 32-bit word (x * 4) | ((y * 4) << 16)
// at the address held in pFeatureValuePointerList[f], then advances that
// pointer by 4 bytes.
//
// NOTE: columns are consumed four at a time and the column loop only stops
// when its counter reaches exactly zero, so kiWidth must be a multiple of 4.
// Returns immediately when kiWidth <= 0 or kiHeight <= 0.
//
// Register roles:
//   r0 = read cursor in pFeatureOfBlock   r2 = rows left   r7 = columns left
//   q2 = x values of the current group    q3 = y values of the current row
//   q5/q6/q7 = offset / y-increment / x-increment tables
//   q8 = pFeatureValuePointerList replicated in all lanes
//----------------------------------------------------------------------
WELS_ASM_FUNC_BEGIN FillQpelLocationByFeatureValue_neon
    stmdb       sp!, {r4-r7}
    vpush       {q4-q7}                 // callee-saved NEON registers used below
    cmp         r1, #0                  // nothing to do for an empty area; the
    ble         _hash_fill_end          // count-down loops below would wrap
    cmp         r2, #0
    ble         _hash_fill_end

    adr         r7, mv_x_inc_x4
    vld1.64     {q7}, [r7]
    adr         r7, mv_y_inc_x4
    vld1.64     {q6}, [r7]
    adr         r7, mx_x_offset_x4
    vld1.64     {q5}, [r7]
    veor        q4, q4
    veor        q3, q3                  // y values start at 0
    vdup.32     q8, r3
_hash_height_loop:
    mov         r7, r1
    vmov        q2, q5                  // mx_x_offset_x4
_hash_width_loop:
    vld1.64     {d0}, [r0]!             // four feature values
    vshll.u16   q0, d0, #2              // feature * 4 = byte offset in the list
    vadd.u32    q0, q8                  // q0 = &pFeatureValuePointerList[feature]
    vmov        q1, q2
    vmov        q4, q3
    vzip.16     q1, q4                  // q1 lanes = x | (y << 16), one per column

    vmov.32     r4, d0[0]               // column 0
    ldr         r5, [r4]                // current write position for this feature
    vmov.32     r6, d2[0]
    str         r6, [r5]
    add         r5, r5, #4
    pld         [r5]                    // prefetch the next slot
    str         r5, [r4]                // advance the stored position

    vmov.32     r4, d0[1]               // column 1
    ldr         r5, [r4]
    vmov.32     r6, d2[1]
    str         r6, [r5]
    add         r5, r5, #4
    pld         [r5]
    str         r5, [r4]

    vmov.32     r4, d1[0]               // column 2
    ldr         r5, [r4]
    vmov.32     r6, d3[0]
    str         r6, [r5]
    add         r5, r5, #4
    pld         [r5]
    str         r5, [r4]

    vmov.32     r4, d1[1]               // column 3
    ldr         r5, [r4]
    vmov.32     r6, d3[1]
    str         r6, [r5]
    add         r5, r5, #4
    pld         [r5]
    str         r5, [r4]

    vadd.u16    q2, q2, q7              // x values of the next four columns
    subs        r7, #4
    bne         _hash_width_loop

    vadd.u16    q3, q3, q6              // y values of the next row
    subs        r2, #1
    bne         _hash_height_loop

_hash_fill_end:
    vpop        {q4-q7}
    ldmia       sp!, {r4-r7}
WELS_ASM_FUNC_END
#endif