;
;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

    ; These functions are only valid when:
    ; x_step_q4 == 16
    ; w%4 == 0
    ; h%4 == 0
    ; taps == 8
    ; VP9_FILTER_WEIGHT == 128
    ; VP9_FILTER_SHIFT == 7
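    ;
    ; With VP9_FILTER_WEIGHT == 128 the eight taps of a phase sum to 128,
    ; and VP9_FILTER_SHIFT == 7 makes each output (sum + 64) >> 7.  As a
    ; worked example, the identity phase {0, 0, 0, 128, 0, 0, 0, 0} gives
    ; (128 * src + 64) >> 7 == src, a plain copy.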

    EXPORT  |vpx_convolve8_avg_horiz_neon|
    EXPORT  |vpx_convolve8_avg_vert_neon|
    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

    ; Multiply and accumulate by q0
    MACRO
    MULTIPLY_BY_Q0 $dst, $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7
    vmull.s16 $dst, $src0, d0[0]
    vmlal.s16 $dst, $src1, d0[1]
    vmlal.s16 $dst, $src2, d0[2]
    vmlal.s16 $dst, $src3, d0[3]
    vmlal.s16 $dst, $src4, d1[0]
    vmlal.s16 $dst, $src5, d1[1]
    vmlal.s16 $dst, $src6, d1[2]
    vmlal.s16 $dst, $src7, d1[3]
    MEND
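
    ; The macro computes, per 16-bit lane,
    ;   $dst = $src0 * d0[0] + $src1 * d0[1] + ... + $src7 * d1[3]
    ; i.e. an 8-tap dot product against the filter taps in q0 = {d0, d1},
    ; widened into 32-bit accumulators.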

; r0    const uint8_t *src
; r1    int src_stride
; r2    uint8_t *dst
; r3    int dst_stride
; sp[]  const int16_t *filter
; sp[]  int x0_q4
; sp[]  int x_step_q4 ; unused
; sp[]  int y0_q4
; sp[]  int y_step_q4 ; unused
; sp[]  int w
; sp[]  int h
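;
; For reference, the horizontal filter computes roughly this scalar C
; (an illustrative sketch, not code from this file; clip_pixel() is a
; hypothetical clamp-to-[0, 255] helper and `filter` is the 8-tap phase
; selected by x0_q4):
;
;   for (y = 0; y < h; ++y)
;     for (x = 0; x < w; ++x) {
;       int k, sum = 0;
;       for (k = 0; k < 8; ++k)
;         sum += src[y * src_stride + x - 3 + k] * filter[k];
;       int px = clip_pixel((sum + 64) >> 7);
;       dst[y * dst_stride + x] = (dst[y * dst_stride + x] + px + 1) >> 1;
;     }
;
; The vertical filter is analogous, with the taps applied down a column.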

|vpx_convolve8_avg_horiz_neon| PROC
    push            {r4-r10, lr}

    sub             r0, r0, #3              ; adjust for taps

    ldrd            r4, r5, [sp, #32]       ; filter, x0_q4
    add             r4, r5, lsl #4          ; filter += x0_q4 * 16 (8 s16 taps)
    ldrd            r6, r7, [sp, #52]       ; w, h

    vld1.s16        {q0}, [r4]              ; filter

    sub             r8, r1, r1, lsl #2      ; -src_stride * 3
    add             r8, r8, #4              ; -src_stride * 3 + 4

    sub             r4, r3, r3, lsl #2      ; -dst_stride * 3
    add             r4, r4, #4              ; -dst_stride * 3 + 4

    rsb             r9, r6, r1, lsl #2      ; reset src for outer loop
    sub             r9, r9, #7
    rsb             r12, r6, r3, lsl #2     ; reset dst for outer loop
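    ; a worked check of the reset constants: the strip lead-in advances src
    ; by 7 (four row loads netting +4, then "add r0, r0, #3"), and each
    ; inner iteration nets +4, so a strip ends at start + w + 7; adding
    ; r9 = src_stride * 4 - w - 7 lands on the next strip.  dst ends at
    ; start + w, matching r12 = dst_stride * 4 - w.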

    mov             r10, r6                 ; w loop counter

vpx_convolve8_avg_loop_horiz_v
    vld1.8          {d24}, [r0], r1
    vld1.8          {d25}, [r0], r1
    vld1.8          {d26}, [r0], r1
    vld1.8          {d27}, [r0], r8

    vtrn.16         q12, q13
    vtrn.8          d24, d25
    vtrn.8          d26, d27

    pld             [r0, r1, lsl #2]

    vmovl.u8        q8, d24
    vmovl.u8        q9, d25
    vmovl.u8        q10, d26
    vmovl.u8        q11, d27

    ; save a few instructions in the inner loop
    vswp            d17, d18
    vmov            d23, d21
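    ; q8-q11 now hold the strip's first eight source columns, one 4-row
    ; column per d register; the swap/copy puts them in the order the
    ; MULTIPLY_BY_Q0 calls consume so the vmov rotation at the loop tail
    ; can slide the window without extra shuffling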

    add             r0, r0, #3

vpx_convolve8_avg_loop_horiz
    add             r5, r0, #64             ; prefetch pointer: 64 bytes ahead

    vld1.32         {d28[]}, [r0], r1
    vld1.32         {d29[]}, [r0], r1
    vld1.32         {d31[]}, [r0], r1
    vld1.32         {d30[]}, [r0], r8

    pld             [r5]

    vtrn.16         d28, d31
    vtrn.16         d29, d30
    vtrn.8          d28, d29
    vtrn.8          d31, d30

    pld             [r5, r1]

    ; extract to s16
    vtrn.32         q14, q15
    vmovl.u8        q12, d28
    vmovl.u8        q13, d29

    pld             [r5, r1, lsl #1]

    ; slightly out of order load to match the existing data
    vld1.u32        {d6[0]}, [r2], r3
    vld1.u32        {d7[0]}, [r2], r3
    vld1.u32        {d6[1]}, [r2], r3
    vld1.u32        {d7[1]}, [r2], r3

    sub             r2, r2, r3, lsl #2      ; reset for store

    ; src[] * filter
    MULTIPLY_BY_Q0  q1,  d16, d17, d20, d22, d18, d19, d23, d24
    MULTIPLY_BY_Q0  q2,  d17, d20, d22, d18, d19, d23, d24, d26
    MULTIPLY_BY_Q0  q14, d20, d22, d18, d19, d23, d24, d26, d27
    MULTIPLY_BY_Q0  q15, d22, d18, d19, d23, d24, d26, d27, d25
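    ; each call yields one output column (four rows of 32-bit accumulators
    ; in a q register); the staggered operand lists step the 8-tap window
    ; one column right per call, giving a 4x4 block per iteration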

    pld             [r5, -r8]

    ; round: (sum + 64) >> 7, narrowing to u16 with saturation
    vqrshrun.s32    d2, q1, #7
    vqrshrun.s32    d3, q2, #7
    vqrshrun.s32    d4, q14, #7
    vqrshrun.s32    d5, q15, #7

    ; saturate
    vqmovn.u16      d2, q1
    vqmovn.u16      d3, q2

    ; transpose
    vtrn.16         d2, d3
    vtrn.32         d2, d3
    vtrn.8          d2, d3

    ; average the new value and the dst value: (new + dst + 1) >> 1
    vrhadd.u8       q1, q1, q3

    vst1.u32        {d2[0]}, [r2@32], r3
    vst1.u32        {d3[0]}, [r2@32], r3
    vst1.u32        {d2[1]}, [r2@32], r3
    vst1.u32        {d3[1]}, [r2@32], r4
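    ; the final store's r4 = -dst_stride * 3 + 4 steps back to the strip's
    ; first row and right four columns; the store order mirrors the
    ; out-of-order dst loads above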

    vmov            q8,  q9
    vmov            d20, d23
    vmov            q11, q12
    vmov            q9,  q13
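    ; slide the 8-column window right by four columns for the next iteration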

    subs            r6, r6, #4              ; w -= 4
    bgt             vpx_convolve8_avg_loop_horiz

    ; outer loop
    mov             r6, r10                 ; restore w counter
    add             r0, r0, r9              ; src += src_stride * 4 - w
    add             r2, r2, r12             ; dst += dst_stride * 4 - w
    subs            r7, r7, #4              ; h -= 4
    bgt             vpx_convolve8_avg_loop_horiz_v

    pop             {r4-r10, pc}

    ENDP

|vpx_convolve8_avg_vert_neon| PROC
    push            {r4-r8, lr}

    ; adjust for taps
    sub             r0, r0, r1
    sub             r0, r0, r1, lsl #1      ; net: src -= src_stride * 3

    ldr             r4, [sp, #24]           ; filter
    ldr             r5, [sp, #36]           ; y0_q4
    add             r4, r5, lsl #4          ; filter += y0_q4 * 16 (8 s16 taps)
    ldr             r6, [sp, #44]           ; w
    ldr             lr, [sp, #48]           ; h

    vld1.s16        {q0}, [r4]              ; filter

    lsl             r1, r1, #1              ; double both strides: rows will
    lsl             r3, r3, #1              ; be walked by two pointers below

vpx_convolve8_avg_loop_vert_h
    mov             r4, r0
    add             r7, r0, r1, asr #1      ; r7 = r4 + one original row
    mov             r5, r2
    add             r8, r2, r3, asr #1      ; r8 = r5 + one original row
    mov             r12, lr                 ; h loop counter
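    ; with doubled strides and r7/r8 one original row ahead of r4/r5,
    ; consecutive rows alternate between two address registers, so the
    ; loads below interleave instead of serializing on one pointer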

    ; load the seven context rows needed before the first output row
    vld1.u32        {d16[0]}, [r4], r1
    vld1.u32        {d16[1]}, [r7], r1
    vld1.u32        {d18[0]}, [r4], r1
    vld1.u32        {d18[1]}, [r7], r1
    vld1.u32        {d20[0]}, [r4], r1
    vld1.u32        {d20[1]}, [r7], r1
    vld1.u32        {d22[0]}, [r4], r1

    vmovl.u8        q8, d16
    vmovl.u8        q9, d18
    vmovl.u8        q10, d20
    vmovl.u8        q11, d22

vpx_convolve8_avg_loop_vert
    ; always process a 4x4 block at a time
    vld1.u32        {d24[0]}, [r7], r1
    vld1.u32        {d26[0]}, [r4], r1
    vld1.u32        {d26[1]}, [r7], r1
    vld1.u32        {d24[1]}, [r4], r1
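
    ; the lane order interleaves r4/r7 so that, once widened below, d24
    ; holds row n+7, d25 row n+10, d26 row n+8 and d27 row n+9, which is
    ; exactly the operand order the four MULTIPLY_BY_Q0 calls expect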

    ; extract to s16
    vmovl.u8        q12, d24
    vmovl.u8        q13, d26

    ; load the 4x4 dst block to average with
    vld1.u32        {d6[0]}, [r5@32], r3
    vld1.u32        {d6[1]}, [r8@32], r3
    vld1.u32        {d7[0]}, [r5@32], r3
    vld1.u32        {d7[1]}, [r8@32], r3

    pld             [r7]
    pld             [r4]

    ; src[] * filter: one output row per call (q1, q2, q14, q15)
    MULTIPLY_BY_Q0  q1,  d16, d17, d18, d19, d20, d21, d22, d24

    pld             [r7, r1]
    pld             [r4, r1]

    MULTIPLY_BY_Q0  q2,  d17, d18, d19, d20, d21, d22, d24, d26

    pld             [r5]
    pld             [r8]

    MULTIPLY_BY_Q0  q14, d18, d19, d20, d21, d22, d24, d26, d27

    pld             [r5, r3]
    pld             [r8, r3]

    MULTIPLY_BY_Q0  q15, d19, d20, d21, d22, d24, d26, d27, d25

    ; round: (sum + 64) >> 7, narrowing to u16 with saturation
    vqrshrun.s32    d2, q1, #7
    vqrshrun.s32    d3, q2, #7
    vqrshrun.s32    d4, q14, #7
    vqrshrun.s32    d5, q15, #7

    ; saturate
    vqmovn.u16      d2, q1
    vqmovn.u16      d3, q2

    ; average the new value and the dst value: (new + dst + 1) >> 1
    vrhadd.u8       q1, q1, q3

    sub             r5, r5, r3, lsl #1      ; reset for store
    sub             r8, r8, r3, lsl #1

    vst1.u32        {d2[0]}, [r5@32], r3
    vst1.u32        {d2[1]}, [r8@32], r3
    vst1.u32        {d3[0]}, [r5@32], r3
    vst1.u32        {d3[1]}, [r8@32], r3

    vmov            q8, q10
    vmov            d18, d22
    vmov            d19, d24
    vmov            q10, q13
    vmov            d22, d25
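    ; slide the seven-row context window down four rows for the next pass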

    subs            r12, r12, #4            ; h -= 4
    bgt             vpx_convolve8_avg_loop_vert

    ; outer loop
    add             r0, r0, #4              ; advance four columns
    add             r2, r2, #4
    subs            r6, r6, #4              ; w -= 4
    bgt             vpx_convolve8_avg_loop_vert_h

    pop             {r4-r8, pc}

    ENDP
    END