;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    EXPORT  |vp8_mse16x16_neon|
    EXPORT  |vp8_get4x4sse_cs_neon|

    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2
;============================
; r0    unsigned char *src_ptr
; r1    int source_stride
; r2    unsigned char *ref_ptr
; r3    int recon_stride
; stack unsigned int *sse
;note: in this function, sum is never used, so that part of the
;vp8_variance() calculation has been removed.

|vp8_mse16x16_neon| PROC
    vmov.i8     q7, #0                  ;q7, q8, q9, q10 - sse accumulators
    vmov.i8     q8, #0
    vmov.i8     q9, #0
    vmov.i8     q10, #0

    mov         r12, #8                 ;loop counter: 8 iterations x 2 rows = 16 rows

mse16x16_neon_loop
    vld1.8      {q0}, [r0], r1          ;Load up source and reference, 2 rows each
    vld1.8      {q2}, [r2], r3
    vld1.8      {q1}, [r0], r1
    vld1.8      {q3}, [r2], r3

    vsubl.u8    q11, d0, d4             ;widen byte differences to 16 bits
    vsubl.u8    q12, d1, d5
    vsubl.u8    q13, d2, d6
    vsubl.u8    q14, d3, d7

    vmlal.s16   q7, d22, d22            ;multiply-accumulate squared differences
    vmlal.s16   q8, d23, d23

    subs        r12, r12, #1            ;decrement loop counter

    vmlal.s16   q9, d24, d24
    vmlal.s16   q10, d25, d25
    vmlal.s16   q7, d26, d26
    vmlal.s16   q8, d27, d27
    vmlal.s16   q9, d28, d28
    vmlal.s16   q10, d29, d29

    bne         mse16x16_neon_loop

    vadd.u32    q7, q7, q8              ;fold the four accumulators together
    vadd.u32    q9, q9, q10

    ldr         r12, [sp]               ;load *sse from stack

    vadd.u32    q10, q7, q9
    vpaddl.u32  q1, q10                 ;pairwise add down to a single total
    vadd.u64    d0, d2, d3

    vst1.32     {d0[0]}, [r12]          ;store sse
    vmov.32     r0, d0[0]               ;also return sse in r0

    bx          lr

    ENDP


;=============================
; r0    unsigned char *src_ptr,
; r1    int source_stride,
; r2    unsigned char *ref_ptr,
; r3    int recon_stride
|vp8_get4x4sse_cs_neon| PROC
    vld1.8      {d0}, [r0], r1          ;Load up source and reference, 4 rows each
    vld1.8      {d4}, [r2], r3
    vld1.8      {d1}, [r0], r1
    vld1.8      {d5}, [r2], r3
    vld1.8      {d2}, [r0], r1
    vld1.8      {d6}, [r2], r3
    vld1.8      {d3}, [r0], r1
    vld1.8      {d7}, [r2], r3

    vsubl.u8    q11, d0, d4             ;widen byte differences to 16 bits
    vsubl.u8    q12, d1, d5
    vsubl.u8    q13, d2, d6
    vsubl.u8    q14, d3, d7

    vmull.s16   q7, d22, d22            ;square only the low four differences
    vmull.s16   q8, d24, d24            ;(each row loads 8 bytes but the block is 4 wide)
    vmull.s16   q9, d26, d26
    vmull.s16   q10, d28, d28

    vadd.u32    q7, q7, q8              ;fold the four products together
    vadd.u32    q9, q9, q10
    vadd.u32    q9, q7, q9

    vpaddl.u32  q1, q9                  ;pairwise add down to a single total
    vadd.u64    d0, d2, d3

    vmov.32     r0, d0[0]               ;return sse in r0
    bx          lr

    ENDP

    END
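
;------------------------------------------------------------------
; Reference note (placed after END, so it is not assembled): a rough
; scalar C sketch of what |vp8_mse16x16_neon| computes, for readers
; following the NEON code above. The name mse16x16_scalar is made up
; for illustration; this is not the project's actual C reference code.
;
;   unsigned int mse16x16_scalar(const unsigned char *src_ptr,
;                                int source_stride,
;                                const unsigned char *ref_ptr,
;                                int recon_stride, unsigned int *sse) {
;       unsigned int total = 0;
;       for (int i = 0; i < 16; i++) {
;           for (int j = 0; j < 16; j++) {
;               int diff = src_ptr[j] - ref_ptr[j];
;               total += (unsigned int)(diff * diff);
;           }
;           src_ptr += source_stride;
;           ref_ptr += recon_stride;
;       }
;       *sse = total;   /* stored through the pointer taken from the stack */
;       return total;   /* returned in r0 */
;   }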
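;
; Likewise for |vp8_get4x4sse_cs_neon|: it returns the sum of squared
; differences over a 4x4 block (only the first four bytes of each
; 8-byte row load are squared). Again an illustrative sketch under the
; same caveats, with a made-up name.
;
;   unsigned int get4x4sse_cs_scalar(const unsigned char *src_ptr,
;                                    int source_stride,
;                                    const unsigned char *ref_ptr,
;                                    int recon_stride) {
;       unsigned int total = 0;
;       for (int i = 0; i < 4; i++) {
;           for (int j = 0; j < 4; j++) {
;               int diff = src_ptr[j] - ref_ptr[j];
;               total += (unsigned int)(diff * diff);
;           }
;           src_ptr += source_stride;
;           ref_ptr += recon_stride;
;       }
;       return total;   /* returned in r0 */
;   }
;------------------------------------------------------------------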