;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    EXPORT  |vp8_variance16x16_neon|
    EXPORT  |vp8_variance16x8_neon|
    EXPORT  |vp8_variance8x16_neon|
    EXPORT  |vp8_variance8x8_neon|

    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

; r0    unsigned char *src_ptr
; r1    int source_stride
; r2    unsigned char *ref_ptr
; r3    int  recon_stride
; stack unsigned int *sse
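;
; All four routines below compute the same quantity for different block
; sizes. A C sketch of the computation (a reference approximation, not the
; exact vp8_variance*_c source; W and H are the block width and height):
;
;     int sum = 0;
;     unsigned int sse_acc = 0;
;     for (i = 0; i < H; i++, src_ptr += source_stride, ref_ptr += recon_stride)
;         for (j = 0; j < W; j++) {
;             int diff = src_ptr[j] - ref_ptr[j];
;             sum     += diff;
;             sse_acc += diff * diff;
;         }
;     *sse = sse_acc;
;     return sse_acc - ((unsigned int)(sum * sum) >> shift); /* shift = log2(W*H) */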
|vp8_variance16x16_neon| PROC
    vmov.i8         q8, #0                      ;q8 - sum
    vmov.i8         q9, #0                      ;q9, q10 - sse
    vmov.i8         q10, #0

    mov             r12, #8

variance16x16_neon_loop
    vld1.8          {q0}, [r0], r1              ;Load up source and reference
    vld1.8          {q2}, [r2], r3
    vld1.8          {q1}, [r0], r1
    vld1.8          {q3}, [r2], r3

    vsubl.u8        q11, d0, d4                 ;calculate diff
    vsubl.u8        q12, d1, d5
    vsubl.u8        q13, d2, d6
    vsubl.u8        q14, d3, d7

    ;VPADAL adds adjacent pairs of elements of a vector and accumulates
    ;the results into the elements of the destination vector. The explanation
    ;in the ARM guide is wrong.
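    ;In scalar terms (a sketch of the accumulation below): for each 32-bit
    ;lane i of q8,  q8[i] += q11[2*i] + q11[2*i+1]   (pairwise add-accumulate)
    ;and for each 32-bit lane i of q9, q9[i] += (int32)d22[i] * d22[i].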
    vpadal.s16      q8, q11                     ;calculate sum
    vmlal.s16       q9, d22, d22                ;calculate sse
    vmlal.s16       q10, d23, d23

    subs            r12, r12, #1

    vpadal.s16      q8, q12
    vmlal.s16       q9, d24, d24
    vmlal.s16       q10, d25, d25
    vpadal.s16      q8, q13
    vmlal.s16       q9, d26, d26
    vmlal.s16       q10, d27, d27
    vpadal.s16      q8, q14
    vmlal.s16       q9, d28, d28
    vmlal.s16       q10, d29, d29

    bne             variance16x16_neon_loop

    vadd.u32        q10, q9, q10                ;accumulate sse
    vpaddl.s32      q0, q8                      ;accumulate sum

    ldr             r12, [sp]                   ;load *sse from stack

    vpaddl.u32      q1, q10
    vadd.s64        d0, d0, d1
    vadd.u64        d1, d2, d3
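    ;d0 now holds the total signed sum and d1 the total sse; only the low
    ;32-bit lane of each is significant from here on.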

    ;vmov.32        r0, d0[0]                   ;this instruction costs a lot
    ;vmov.32        r1, d1[0]
    ;mul            r0, r0, r0
    ;str            r1, [r12]
    ;sub            r0, r1, r0, asr #8

    ;sum is in [-255x256, 255x256]. sum*sum is 32-bit. The right shift must
    ;have sign-bit extension, which is vshr.s. Have to use s32 to make it right.
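    ;C-equivalent of the tail below (a sketch):
    ;    *sse = sse_total;  return sse_total - ((sum * sum) >> 8);  /* >>8 == /256 */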
    vmull.s32       q5, d0, d0
    vst1.32         {d1[0]}, [r12]              ;store sse
    vshr.s32        d10, d10, #8
    vsub.s32        d0, d1, d10

    vmov.32         r0, d0[0]                   ;return
    bx              lr

    ENDP

;================================
;unsigned int vp8_variance16x8_c(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride,
;    unsigned int *sse)
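;
;Same structure as vp8_variance16x16_neon, but 8 rows of 16 pixels: the loop
;runs 4 times (two rows per pass) and the final shift is #7 (128 pixels).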
|vp8_variance16x8_neon| PROC
    vmov.i8         q8, #0                      ;q8 - sum
    vmov.i8         q9, #0                      ;q9, q10 - sse
    vmov.i8         q10, #0

    mov             r12, #4

variance16x8_neon_loop
    vld1.8          {q0}, [r0], r1              ;Load up source and reference
    vld1.8          {q2}, [r2], r3
    vld1.8          {q1}, [r0], r1
    vld1.8          {q3}, [r2], r3

    vsubl.u8        q11, d0, d4                 ;calculate diff
    vsubl.u8        q12, d1, d5
    vsubl.u8        q13, d2, d6
    vsubl.u8        q14, d3, d7

    vpadal.s16      q8, q11                     ;calculate sum
    vmlal.s16       q9, d22, d22                ;calculate sse
    vmlal.s16       q10, d23, d23

    subs            r12, r12, #1

    vpadal.s16      q8, q12
    vmlal.s16       q9, d24, d24
    vmlal.s16       q10, d25, d25
    vpadal.s16      q8, q13
    vmlal.s16       q9, d26, d26
    vmlal.s16       q10, d27, d27
    vpadal.s16      q8, q14
    vmlal.s16       q9, d28, d28
    vmlal.s16       q10, d29, d29

    bne             variance16x8_neon_loop

    vadd.u32        q10, q9, q10                ;accumulate sse
    vpaddl.s32      q0, q8                      ;accumulate sum

    ldr             r12, [sp]                   ;load *sse from stack

    vpaddl.u32      q1, q10
    vadd.s64        d0, d0, d1
    vadd.u64        d1, d2, d3

    vmull.s32       q5, d0, d0
    vst1.32         {d1[0]}, [r12]              ;store sse
    vshr.s32        d10, d10, #7
    vsub.s32        d0, d1, d10

    vmov.32         r0, d0[0]                   ;return
    bx              lr

    ENDP

;=================================
;unsigned int vp8_variance8x16_c(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride,
;    unsigned int *sse)
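;
;Same structure again, but 16 rows of 8 pixels: each row is loaded into a
;64-bit d register, the loop runs 8 times (two rows per pass), and the
;final shift is #7 (128 pixels).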

|vp8_variance8x16_neon| PROC
    vmov.i8         q8, #0                      ;q8 - sum
    vmov.i8         q9, #0                      ;q9, q10 - sse
    vmov.i8         q10, #0

    mov             r12, #8

variance8x16_neon_loop
    vld1.8          {d0}, [r0], r1              ;Load up source and reference
    vld1.8          {d4}, [r2], r3
    vld1.8          {d2}, [r0], r1
    vld1.8          {d6}, [r2], r3

    vsubl.u8        q11, d0, d4                 ;calculate diff
    vsubl.u8        q12, d2, d6

    vpadal.s16      q8, q11                     ;calculate sum
    vmlal.s16       q9, d22, d22                ;calculate sse
    vmlal.s16       q10, d23, d23

    subs            r12, r12, #1

    vpadal.s16      q8, q12
    vmlal.s16       q9, d24, d24
    vmlal.s16       q10, d25, d25

    bne             variance8x16_neon_loop

    vadd.u32        q10, q9, q10                ;accumulate sse
    vpaddl.s32      q0, q8                      ;accumulate sum

    ldr             r12, [sp]                   ;load *sse from stack

    vpaddl.u32      q1, q10
    vadd.s64        d0, d0, d1
    vadd.u64        d1, d2, d3

    vmull.s32       q5, d0, d0
    vst1.32         {d1[0]}, [r12]              ;store sse
    vshr.s32        d10, d10, #7
    vsub.s32        d0, d1, d10

    vmov.32         r0, d0[0]                   ;return
    bx              lr

    ENDP

;==================================
; r0    unsigned char *src_ptr
; r1    int source_stride
; r2    unsigned char *ref_ptr
; r3    int  recon_stride
; stack unsigned int *sse
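;
;8x8 variant: 8 rows of 8 pixels, four rows per pass over 2 passes, with a
;final shift of #6 (64 pixels).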
|vp8_variance8x8_neon| PROC
    vmov.i8         q8, #0                      ;q8 - sum
    vmov.i8         q9, #0                      ;q9, q10 - sse
    vmov.i8         q10, #0

    mov             r12, #2

variance8x8_neon_loop
    vld1.8          {d0}, [r0], r1              ;Load up source and reference
    vld1.8          {d4}, [r2], r3
    vld1.8          {d1}, [r0], r1
    vld1.8          {d5}, [r2], r3
    vld1.8          {d2}, [r0], r1
    vld1.8          {d6}, [r2], r3
    vld1.8          {d3}, [r0], r1
    vld1.8          {d7}, [r2], r3

    vsubl.u8        q11, d0, d4                 ;calculate diff
    vsubl.u8        q12, d1, d5
    vsubl.u8        q13, d2, d6
    vsubl.u8        q14, d3, d7

    vpadal.s16      q8, q11                     ;calculate sum
    vmlal.s16       q9, d22, d22                ;calculate sse
    vmlal.s16       q10, d23, d23

    subs            r12, r12, #1

    vpadal.s16      q8, q12
    vmlal.s16       q9, d24, d24
    vmlal.s16       q10, d25, d25
    vpadal.s16      q8, q13
    vmlal.s16       q9, d26, d26
    vmlal.s16       q10, d27, d27
    vpadal.s16      q8, q14
    vmlal.s16       q9, d28, d28
    vmlal.s16       q10, d29, d29

    bne             variance8x8_neon_loop

    vadd.u32        q10, q9, q10                ;accumulate sse
    vpaddl.s32      q0, q8                      ;accumulate sum

    ldr             r12, [sp]                   ;load *sse from stack

    vpaddl.u32      q1, q10
    vadd.s64        d0, d0, d1
    vadd.u64        d1, d2, d3

    vmull.s32       q5, d0, d0
    vst1.32         {d1[0]}, [r12]              ;store sse
    vshr.s32        d10, d10, #6
    vsub.s32        d0, d1, d10

    vmov.32         r0, d0[0]                   ;return
    bx              lr

    ENDP

    END