;
;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    EXPORT  |vp8_mse16x16_armv6|

    ARM

    AREA ||.text||, CODE, READONLY, ALIGN=2

;-----------------------------------------------------------------------
; unsigned int vp8_mse16x16_armv6(unsigned char *src_ptr,
;                                 int            source_stride,
;                                 unsigned char *ref_ptr,
;                                 int            recon_stride,
;                                 unsigned int  *sse)
;
; Sum of squared differences between a 16x16 block of source pixels and
; a 16x16 block of reference pixels.
;
; In:    r0      unsigned char *src_ptr
;        r1      int  source_stride   (bytes added to src_ptr per row)
;        r2      unsigned char *ref_ptr
;        r3      int  recon_stride    (bytes added to ref_ptr per row)
;        [sp]    unsigned int *sse    (fifth argument, on the stack)
; Out:   r0      accumulated sum of squared differences
;        *sse    same value as r0
; Saved: r4-r9, lr are pushed on entry and restored on exit.
; Uses:  r12 as the row counter; lr holds a constant zero in the loop.
;
; note: Based on vp8_variance16x16_armv6. The pixel sum is never used
;       here, so only the squared-difference accumulation is performed.
;
; NOTE(review): pixels are fetched with word loads (ldr) straight from
; src_ptr / ref_ptr; confirm the alignment guarantees (or unaligned
; access configuration) that callers rely on.
;
; Per group of 4 pixels:
;   usub8 a-b sets GE[i] for every byte lane where a >= b, so the two
;   usub8/sel pairs keep (src-ref) in lanes where src >= ref and
;   (ref-src) in lanes where ref >= src, zero otherwise. OR-ing both
;   yields |src-ref| in each byte. uxtb16 widens bytes 0/2 and 1/3 to
;   halfword pairs and smlad adds both squared halfwords to the total.
;-----------------------------------------------------------------------

|vp8_mse16x16_armv6| PROC

    push    {r4-r9, lr}             ; 7 registers = 28 bytes of stack
    mov     r12, #16                ; row counter = block height

    mov     r4, #0                  ; sse accumulator = 0
    mov     lr, #0                  ; constant zero for sel; never written
                                    ; inside the loop, so set it up once

loop
    ; 1st 4 pixels
    ldr     r5, [r0, #0x0]          ; 4 src pixels
    ldr     r6, [r2, #0x0]          ; 4 ref pixels

    usub8   r8, r5, r6              ; src - ref per byte, sets GE lanes
    sel     r7, r8, lr              ; keep lanes where src >= ref
    usub8   r9, r6, r5              ; ref - src per byte, sets GE lanes
    sel     r8, r9, lr              ; keep lanes where ref >= src
    orr     r8, r8, r7              ; |src - ref| for all 4 pixels

    ldr     r5, [r0, #0x4]          ; preload 4 src pixels of next group

    uxtb16  r6, r8                  ; bytes 0 and 2 -> halfwords
    uxtb16  r7, r8, ror #8          ; bytes 1 and 3 -> halfwords
    smlad   r4, r6, r6, r4          ; sse += d0*d0 + d2*d2

    ; 2nd 4 pixels
    ldr     r6, [r2, #0x4]          ; 4 ref pixels
    smlad   r4, r7, r7, r4          ; sse += d1*d1 + d3*d3 (previous group)

    usub8   r8, r5, r6              ; src - ref per byte, sets GE lanes
    sel     r7, r8, lr              ; keep lanes where src >= ref
    usub8   r9, r6, r5              ; ref - src per byte, sets GE lanes
    sel     r8, r9, lr              ; keep lanes where ref >= src
    orr     r8, r8, r7              ; |src - ref| for all 4 pixels

    ldr     r5, [r0, #0x8]          ; preload 4 src pixels of next group

    uxtb16  r6, r8                  ; bytes 0 and 2 -> halfwords
    uxtb16  r7, r8, ror #8          ; bytes 1 and 3 -> halfwords
    smlad   r4, r6, r6, r4          ; sse += d0*d0 + d2*d2

    ; 3rd 4 pixels
    ldr     r6, [r2, #0x8]          ; 4 ref pixels
    smlad   r4, r7, r7, r4          ; sse += d1*d1 + d3*d3 (previous group)

    usub8   r8, r5, r6              ; src - ref per byte, sets GE lanes
    sel     r7, r8, lr              ; keep lanes where src >= ref
    usub8   r9, r6, r5              ; ref - src per byte, sets GE lanes
    sel     r8, r9, lr              ; keep lanes where ref >= src
    orr     r8, r8, r7              ; |src - ref| for all 4 pixels

    ldr     r5, [r0, #0xc]          ; preload 4 src pixels of next group

    uxtb16  r6, r8                  ; bytes 0 and 2 -> halfwords
    uxtb16  r7, r8, ror #8          ; bytes 1 and 3 -> halfwords
    smlad   r4, r6, r6, r4          ; sse += d0*d0 + d2*d2

    ; 4th 4 pixels
    ldr     r6, [r2, #0xc]          ; 4 ref pixels
    smlad   r4, r7, r7, r4          ; sse += d1*d1 + d3*d3 (previous group)

    usub8   r8, r5, r6              ; src - ref per byte, sets GE lanes
    add     r0, r0, r1              ; src_ptr -> next row
    sel     r7, r8, lr              ; keep lanes where src >= ref
    usub8   r9, r6, r5              ; ref - src per byte, sets GE lanes
    add     r2, r2, r3              ; ref_ptr -> next row
    sel     r8, r9, lr              ; keep lanes where ref >= src
    orr     r8, r8, r7              ; |src - ref| for all 4 pixels

    subs    r12, r12, #1            ; one row done; Z is consumed by bne
                                    ; (nothing in between writes NZCV)

    uxtb16  r6, r8                  ; bytes 0 and 2 -> halfwords
    uxtb16  r7, r8, ror #8          ; bytes 1 and 3 -> halfwords
    smlad   r4, r6, r6, r4          ; sse += d0*d0 + d2*d2
    smlad   r4, r7, r7, r4          ; sse += d1*d1 + d3*d3

    bne     loop

    ; return stuff
    ldr     r1, [sp, #28]           ; sse pointer sits above the 28 bytes
                                    ; pushed on entry
    mov     r0, r4                  ; return sse
    str     r4, [r1]                ; *sse = sse

    pop     {r4-r9, pc}

    ENDP

    END