• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1/*!
2 * \copy
3 *     Copyright (c)  2013, Cisco Systems
4 *     All rights reserved.
5 *
6 *     Redistribution and use in source and binary forms, with or without
7 *     modification, are permitted provided that the following conditions
8 *     are met:
9 *
10 *        * Redistributions of source code must retain the above copyright
11 *          notice, this list of conditions and the following disclaimer.
12 *
13 *        * Redistributions in binary form must reproduce the above copyright
14 *          notice, this list of conditions and the following disclaimer in
15 *          the documentation and/or other materials provided with the
16 *          distribution.
17 *
18 *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21 *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22 *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23 *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24 *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25 *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28 *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 *     POSSIBILITY OF SUCH DAMAGE.
30 *
31 */
32
33#ifdef HAVE_NEON
34#include "arm_arch_common_macro.S"
35
36
WELS_ASM_FUNC_BEGIN SumOf8x8SingleBlock_neon
// In:   r0 = address of the first row of the block
//       r1 = byte step from one row to the next
// Out:  r0 = sum of the 8 bytes taken from each of 8 consecutive rows
//       (r1 ends up holding the same total, see the final vmov)
// Uses: q0-q3

    // Rows 0-1: seed eight 16-bit partial sums in q0.
    vld1.64     {d0}, [r0], r1
    vld1.64     {d1}, [r0], r1
    vpaddl.u8   q0, q0

    // Rows 2-3.
    vld1.64     {d2}, [r0], r1
    vld1.64     {d3}, [r0], r1
    vpadal.u8   q0, q1

    // Rows 4-5.
    vld1.64     {d4}, [r0], r1
    vld1.64     {d5}, [r0], r1
    vpadal.u8   q0, q2

    // Rows 6-7 (no pointer update after the last row).
    vld1.64     {d6}, [r0], r1
    vld1.64     {d7}, [r0]
    vpadal.u8   q0, q3

    // Fold the eight 16-bit partial sums down to a single 32-bit total.
    vpaddl.u16  q0, q0
    vpadd.i32   d0, d0, d1
    vpadd.i32   d0, d0, d0          // both lanes of d0 now hold the total
    vmov        r0, r1, d0
WELS_ASM_FUNC_END
56
57
WELS_ASM_FUNC_BEGIN SumOf16x16SingleBlock_neon
// In:   r0 = address of the first row of the block
//       r1 = byte step from one row to the next
// Out:  r0 = sum of the 16 bytes taken from each of 16 consecutive rows
//       (r1 ends up holding the same total, see the final vmov)
// Uses: q0, q1

    // Row 0 seeds eight 16-bit partial sums.
    vld1.64     {d0, d1}, [r0], r1
    vpaddl.u8   q0, q0

    // Rows 1-15: pairwise add each row into the running partial sums.
.rept 15
    vld1.64     {d2, d3}, [r0], r1
    vpadal.u8   q0, q1
.endr

    // Fold the eight 16-bit partial sums down to a single 32-bit total.
    vpaddl.u16  q0, q0
    vpadd.i32   d0, d0, d1
    vpadd.i32   d0, d0, d0          // both lanes of d0 now hold the total
    vmov        r0, r1, d0
WELS_ASM_FUNC_END
70
71
WELS_ASM_FUNC_BEGIN SumOf8x8BlockOfFrame_neon
//(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,const int32_t kiRefStride,uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[])
//
// For every x in [0, kiWidth) and y in [0, kiHeight):
//   sum = sum of the 8x8 bytes starting at pRefPicture + y * kiRefStride + x
//   pFeatureOfBlock[y * kiWidth + x] = sum (stored as 16 bits)
//   pTimesOfFeatureValue[sum] += 1
//
// Row 0 sums every block in full. Each later row starts from the entry one
// row up in pFeatureOfBlock, subtracts the 8 bytes of the picture row that
// left the window and adds the 8 bytes of the row that entered it.
//
// In:  r0 = pRefPicture, r1 = kiWidth, r2 = kiHeight, r3 = kiRefStride,
//      [sp] = pFeatureOfBlock, [sp, 4] = pTimesOfFeatureValue
// Returns immediately when kiWidth <= 0 or kiHeight <= 0.
//
// Register roles after setup:
//   r2 = rows left to process      r3 = kiRefStride
//   r4 = pFeatureOfBlock row base + kiWidth entries (one past current row)
//   r5 = pTimesOfFeatureValue      r6 = kiWidth
//   r7 = columns left in the row   r8 = current picture row + kiWidth
    stmdb sp!, {r4-r12}     // 9 registers = 36 bytes, stack arguments now start at sp + 36
    ldr r5, [sp, #40] //pTimesOfFeatureValue
    ldr r4, [sp, #36] //pFeatureOfBlock

    // Both loops below count down to exactly zero, so a non-positive width or
    // height would wrap the counter and run far past the buffers.
    cmp r1, #0
    ble _SumOf8x8BlockOfFrame_end
    cmp r2, #0
    ble _SumOf8x8BlockOfFrame_end

    mov r8, r0
    mov r6, r1
    add r8, r6
    add r4, r4, r6, lsl #1

    mov r7, r6
_width_loop8x8_1:
    // First row: full 8x8 sum for the block at column (kiWidth - r7).
    subs r0, r8, r7
    vld1.64 {d0}, [r0], r3
    vld1.64 {d1}, [r0], r3
    vld1.64 {d2}, [r0], r3
    vld1.64 {d3}, [r0], r3
    vld1.64 {d4}, [r0], r3
    vld1.64 {d5}, [r0], r3
    vld1.64 {d6}, [r0], r3
    vld1.64 {d7}, [r0]

    vpaddl.u8 q0, q0
    vpadal.u8 q0, q1
    vpadal.u8 q0, q2
    vpadal.u8 q0, q3
    vpaddl.u16 q0, q0
    vpadd.i32 d0, d1
    vpadd.i32 d0, d0

    subs r1, r4, r7, lsl #1         // r1 = address of pFeatureOfBlock[i]
    vst1.16 {d0[0]}, [r1] // sum -> pFeatureOfBlock[i]
    vmov    r0, r1, d0              // r0 = sum
    add r1, r5, r0, lsl #2          // r1 = address of pTimesOfFeatureValue[sum]
    ldr r0, [r1]
    add r0, #1
    str r0, [r1]

    subs r7, #1
    bne _width_loop8x8_1

    // Step to the next picture row and the next row of pFeatureOfBlock.
    add r8, r3
    add r4, r4, r6, lsl #1
    subs r2, #1
    beq _SumOf8x8BlockOfFrame_end


_height_loop8x8:
    mov r7, r6
_width_loop8x8_2:
    subs r0, r8, r7                 // r0 = top-left byte of the current block
    subs r1, r4, r7, lsl #1         // r1 = address of pFeatureOfBlock[i]

    subs r9, r1, r6, lsl #1 // last line of pFeatureOfBlock[i]
    ldrh  r10, [r9] // sum of last line of pFeatureOfBlock[i]

    subs r11, r0, r3                // picture row just above the block (leaving the window)
    vld1.64 {d1}, [r11]
    add r0, r11, r3, lsl #3         // 8 rows further down: bottom row of the block (entering)
    vld1.64 {d0}, [r0] //

    // Reduce both 8-byte rows: d0 = { sum(entering row), sum(leaving row) } as 32-bit lanes.
    vpaddl.u8 q0, q0
    vpadd.u16 d0, d0, d1
    vpaddl.u16 d0, d0
    vmov r11, r12, d0               // r11 = entering row sum, r12 = leaving row sum
    subs r10, r12
    add r0, r10, r11                // r0 = previous sum - leaving + entering

    strh r0, [r1] // sum -> pFeatureOfBlock[i]

    add r1, r5, r0, lsl #2          // r1 = address of pTimesOfFeatureValue[sum]
    ldr r0, [r1]
    add r0, #1
    str r0, [r1]
    subs r7, #1
    bne _width_loop8x8_2

    add r8, r3
    add r4, r4, r6, lsl #1
    subs r2, #1
    bne _height_loop8x8
_SumOf8x8BlockOfFrame_end:
    ldmia sp!, {r4-r12}
WELS_ASM_FUNC_END
157
WELS_ASM_FUNC_BEGIN SumOf16x16BlockOfFrame_neon
//(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,const int32_t kiRefStride,uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[])
//
// For every x in [0, kiWidth) and y in [0, kiHeight):
//   sum = sum of the 16x16 bytes starting at pRefPicture + y * kiRefStride + x
//   pFeatureOfBlock[y * kiWidth + x] = sum (stored as 16 bits)
//   pTimesOfFeatureValue[sum] += 1
//
// Row 0 sums every block in full. Each later row starts from the entry one
// row up in pFeatureOfBlock, subtracts the 16 bytes of the picture row that
// left the window and adds the 16 bytes of the row that entered it.
//
// In:  r0 = pRefPicture, r1 = kiWidth, r2 = kiHeight, r3 = kiRefStride,
//      [sp] = pFeatureOfBlock, [sp, 4] = pTimesOfFeatureValue
// Returns immediately when kiWidth <= 0 or kiHeight <= 0.
//
// Register roles after setup:
//   r2 = rows left to process      r3 = kiRefStride
//   r4 = pFeatureOfBlock row base + kiWidth entries (one past current row)
//   r5 = pTimesOfFeatureValue      r6 = kiWidth
//   r7 = columns left in the row   r8 = current picture row + kiWidth
    stmdb sp!, {r4-r12}     // 9 registers = 36 bytes, stack arguments now start at sp + 36
    ldr r5, [sp, #40] //pTimesOfFeatureValue
    ldr r4, [sp, #36] //pFeatureOfBlock

    // Both loops below count down to exactly zero, so a non-positive width or
    // height would wrap the counter and run far past the buffers.
    cmp r1, #0
    ble _SumOf16x16BlockOfFrame_neon_end
    cmp r2, #0
    ble _SumOf16x16BlockOfFrame_neon_end

    mov r8, r0
    mov r6, r1
    add r8, r6
    add r4, r4, r6, lsl #1

    mov r7, r6
_width_loop16x16_1:
    // First row: full 16x16 sum for the block at column (kiWidth - r7).
    subs r0, r8, r7
    vld1.64 {q0}, [r0], r3
    vpaddl.u8 q0, q0
.rept 15
    vld1.64 {q1}, [r0], r3
    vpadal.u8 q0, q1
.endr
    vpaddl.u16 q0, q0
    vpadd.i32 d0, d1
    vpadd.i32 d0, d0

    subs r1, r4, r7, lsl #1         // r1 = address of pFeatureOfBlock[i]
    vst1.16 {d0[0]}, [r1] // sum -> pFeatureOfBlock[i]
    vmov    r0, r1, d0              // r0 = sum
    add r1, r5, r0, lsl #2          // r1 = address of pTimesOfFeatureValue[sum]
    ldr r0, [r1]
    add r0, #1
    str r0, [r1]

    subs r7, #1
    bne _width_loop16x16_1

    // Step to the next picture row and the next row of pFeatureOfBlock.
    add r8, r3
    add r4, r4, r6, lsl #1
    subs r2, #1
    beq _SumOf16x16BlockOfFrame_neon_end

_height_loop16x16:
    mov r7, r6
_width_loop16x16_2:
    subs r0, r8, r7                 // r0 = top-left byte of the current block
    subs r1, r4, r7, lsl #1         // r1 = address of pFeatureOfBlock[i]
    subs r9, r1, r6, lsl #1 // last line of pFeatureOfBlock[i]
    ldrh  r10, [r9] // sum of last line of pFeatureOfBlock[i]

    subs r11, r0, r3                // picture row just above the block (leaving the window)
    vld1.64 {q1}, [r11]
    add r0, r11, r3, lsl #4         // 16 rows further down: bottom row of the block (entering)
    vld1.64 {q0}, [r0] //

    // Reduce both 16-byte rows: d0 = { sum(entering row), sum(leaving row) } as 32-bit lanes.
    vpaddl.u8 q0, q0
    vpaddl.u8 q1, q1
    vpadd.u16 d0, d0, d1
    vpadd.u16 d1, d2, d3
    vpadd.u16 d0, d0, d1
    vpaddl.u16 d0, d0

    vmov r11, r12, d0               // r11 = entering row sum, r12 = leaving row sum
    subs r10, r12
    add r0, r10, r11                // r0 = previous sum - leaving + entering

    strh r0, [r1] // sum -> pFeatureOfBlock[i]
    add r1, r5, r0, lsl #2          // r1 = address of pTimesOfFeatureValue[sum]
    ldr r0, [r1]
    add r0, #1
    str r0, [r1]

    subs r7, #1
    bne _width_loop16x16_2

    add r8, r3
    add r4, r4, r6, lsl #1
    subs r2, #1
    bne _height_loop16x16
_SumOf16x16BlockOfFrame_neon_end:
    ldmia sp!, {r4-r12}
WELS_ASM_FUNC_END
237
WELS_ASM_FUNC_BEGIN InitializeHashforFeature_neon
// (uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize, uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList);
//
// Walks the kiListSize counters in pTimesOfFeatureValue and hands each one a
// slice of pBuf. For entry i, in order:
//   pLocationOfFeature[i] = pFeatureValuePointerList[i] = current pBuf position
//   pBuf position += pTimesOfFeatureValue[i] * 4 bytes
//
// In:  r0 = pTimesOfFeatureValue, r1 = pBuf, r2 = kiListSize,
//      r3 = pLocationOfFeature, [sp] = pFeatureValuePointerList
// Returns immediately when kiListSize <= 0.
//
// Entries are handled four at a time with NEON, then a scalar loop finishes
// the remaining kiListSize & 3 entries.
// Only q0-q3 and q8 are used as NEON scratch; q4-q7 are left untouched so no
// vpush/vpop is required here.
    stmdb sp!, {r4-r7}              // 4 registers = 16 bytes, stack argument now at sp + 16
    ldr r4, [sp, #16] //pFeatureValuePointerList

    cmp r2, #0
    ble _hash_assign_end            // empty or negative list: nothing to assign

    bic r5, r2, #3                  // r5 = entry count rounded down to a multiple of 4
    cmp r5, #0
    beq _hash_assign_tail           // fewer than 4 entries: the x4 loop must not run at all
_hash_assign_loop_x4:
    vld1.64 {q0}, [r0]!             // q0 = four counters {a, b, c, d}
    vshl.u32 q0, q0, #2             // counters -> byte sizes (4 bytes per occurrence)
    vceq.u32 q1, q0, #0
    vand.i32 d2, d2, d3
    vmov r6, r7, d2
    and r6, r6, r7
    cmp r6, #0xffffffff             // all four sizes zero?
    beq _hash_assign_with_copy_x4

    // Inclusive prefix sum of the four sizes:
    // q0 = {a, a+b, a+b+c, a+b+c+d}
    veor q1, q1
    vext.32 q2, q1, q0, #3          // {0, a, b, c}
    vext.32 q3, q1, q0, #2          // {0, 0, a, b}
    vext.32 q8, q1, q0, #1          // {0, 0, 0, a}
    vadd.u32 q0, q0, q2
    vadd.u32 q0, q0, q3
    vadd.u32 q0, q0, q8
    vext.32 q2, q1, q0, #3          // exclusive prefix sum {0, a, a+b, a+b+c}
    vdup.32  q3, r1
    vadd.u32 q2, q2, q3             // four slice start addresses
    vst1.64 {q2}, [r3]!
    vst1.64 {q2}, [r4]!
    vmov.32 r6, d1[1]               // total size of the four slices
    add r1, r1, r6
    b _assign_next

_hash_assign_with_copy_x4:
    // All four slices are empty: every pointer is the unchanged buffer position.
    vdup.32  q2, r1
    vst1.64 {q2}, [r3]!
    vst1.64 {q2}, [r4]!

_assign_next:
    subs r5, r5, #4
    bne _hash_assign_loop_x4

_hash_assign_tail:
    and r5, r2, #3                  // entries left over after the x4 loop
    cmp r5, #0
    beq _hash_assign_end
_hash_assign_loop_x4_rem:
    str r1, [r3], #4
    str r1, [r4], #4
    ldr r7, [r0], #4
    lsl r7, r7, #2
    add r1, r1, r7
    subs r5, r5, #1
    bne _hash_assign_loop_x4_rem
_hash_assign_end:

    ldmia sp!, {r4-r7}
WELS_ASM_FUNC_END
293
// Constant tables for FillQpelLocationByFeatureValue_neon. Each is eight
// halfwords; only the low four are non-zero.
// NOTE(review): on ARM GNU as, .align takes a power of two, so this should be
// 16-byte alignment - confirm for the other assemblers this file is built with.
.align 4
mv_x_inc_x4: .short 0x10, 0x10, 0x10, 0x10, 0x00, 0x00, 0x00, 0x00
mv_y_inc_x4: .short 0x04, 0x04, 0x04, 0x04, 0x00, 0x00, 0x00, 0x00
mx_x_offset_x4: .short 0x00, 0x04, 0x08, 0x0c, 0x00, 0x00, 0x00, 0x00

WELS_ASM_FUNC_BEGIN FillQpelLocationByFeatureValue_neon
// void  (uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight, uint16_t** pFeatureValuePointerList)
//
// Reads pFeatureOfBlock sequentially, kiWidth entries per row for kiHeight
// rows. For each entry with value v it:
//   1. loads the write cursor stored in pFeatureValuePointerList[v],
//   2. stores one 32-bit word there: x coordinate in the low halfword,
//      y coordinate in the high halfword,
//   3. advances the cursor by 4 bytes and stores it back.
// Coordinates advance by 4 per column and 4 per row (quarter-pel units,
// going by the function name).
//
// In:  r0 = pFeatureOfBlock, r1 = kiWidth, r2 = kiHeight,
//      r3 = pFeatureValuePointerList
// Precondition: kiWidth is a multiple of 4. The width loop consumes four
// entries per pass and stops only when its counter reaches exactly zero.
// Returns immediately when kiWidth <= 0 or kiHeight <= 0.
//
// NEON register roles:
//   q2 = x coordinates of the current four columns   q3 = y coordinate of the current row
//   q5 = x coordinates at the start of a row         q6 = per-row y step
//   q7 = per-pass x step                             q8 = pFeatureValuePointerList in every lane
    stmdb sp!, {r4-r8}
    vpush {q4-q7}

    // Both loops count down to exactly zero, so a non-positive dimension
    // would wrap the counter and run far past the buffers.
    cmp r1, #0
    ble _hash_fill_end
    cmp r2, #0
    ble _hash_fill_end

    adr r7, mv_x_inc_x4
    vld1.64 {q7}, [r7]
    adr r7, mv_y_inc_x4
    vld1.64 {q6}, [r7]
    adr r7, mx_x_offset_x4
    vld1.64 {q5}, [r7]
    veor q3, q3                     // y starts at 0
    vdup.32 q8, r3
_hash_height_loop:
    mov r7, r1
    vmov q2, q5 //mx_x_offset_x4
_hash_width_loop:
    vld1.64 {d0}, [r0]!             // four 16-bit feature values
    vshll.u16 q0, d0, #2            // widen to 32 bits and scale by the 4-byte pointer size
    vadd.u32 q0, q8                 // q0 = addresses of pFeatureValuePointerList[v] for the four values
    vmov q1, q2
    vmov q4, q3
    vzip.16 q1, q4                  // q1 = four 32-bit words, each x (low half) and y (high half)

    // Column 0
    vmov.32 r4, d0[0]
    ldr r5, [r4]                    // r5 = write cursor for this feature value
    vmov.32 r6, d2[0]
    str r6, [r5]
    add r5, r5, #4
    pld [r5] // cache miss?
    str r5, [r4]

    // Column 1
    vmov.32 r4, d0[1]
    ldr r5, [r4]
    vmov.32 r6, d2[1]
    str r6, [r5]
    add r5, r5, #4
    pld [r5] // cache miss?
    str r5, [r4]

    // Column 2
    vmov.32 r4, d1[0]
    ldr r5, [r4]
    vmov.32 r6, d3[0]
    str r6, [r5]
    add r5, r5, #4
    pld [r5] // cache miss?
    str r5, [r4]

    // Column 3
    vmov.32 r4, d1[1]
    ldr r5, [r4]
    vmov.32 r6, d3[1]
    str r6, [r5]
    add r5, r5, #4
    pld [r5] // cache miss?
    str r5, [r4]

    vadd.u16 q2, q2, q7             // move x on by four columns
    subs r7, #4
    bne _hash_width_loop

    vadd.u16 q3, q3, q6             // move y on by one row
    subs r2, #1
    bne _hash_height_loop

_hash_fill_end:
    vpop {q4-q7}
    ldmia sp!, {r4-r8}
WELS_ASM_FUNC_END
366#endif
367