@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_weighted_pred_bi_default.s
@*
@* @brief
@*  contains function definitions for weighted prediction used in inter
@*  prediction
@*
@* @author
@*  parthiban v
@*
@* @par list of functions:
@*  - ihevc_weighted_pred_bi_default()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*  does default bi-weighted prediction on the arrays pointed to by pi2_src1
@*  and pi2_src2 and stores the result at the location pointed to by pu1_dst.
@*
@*  assumptions: the function is optimized considering the fact that width
@*  and height are multiples of 2.
@*
@* @par description:
@*  dst = ( (src1 + lvl_shift1) + (src2 + lvl_shift2) + (1 << (shift - 1)) )
@*        >> shift,  where shift = 15 - bitdepth
@*
@* @param[in] pi2_src1
@*  pointer to source 1
@*
@* @param[in] pi2_src2
@*  pointer to source 2
@*
@* @param[out] pu1_dst
@*  pointer to destination
@*
@* @param[in] src_strd1
@*  source stride 1
@*
@* @param[in] src_strd2
@*  source stride 2
@*
@* @param[in] dst_strd
@*  destination stride
@*
@* @param[in] lvl_shift1
@*  added before shift and offset
@*
@* @param[in] lvl_shift2
@*  added before shift and offset
@*
@* @param[in] ht
@*  height of the source
@*
@* @param[in] wd
@*  width of the source
@*
@* @returns
@*  none
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@void ihevc_weighted_pred_bi_default(word16 *pi2_src1,
@                                    word16 *pi2_src2,
@                                    uword8 *pu1_dst,
@                                    word32 src_strd1,
@                                    word32 src_strd2,
@                                    word32 dst_strd,
@                                    word32 lvl_shift1,
@                                    word32 lvl_shift2,
@                                    word32 ht,
@                                    word32 wd)

@**************variables vs registers*****************************************
@   r0 => *pi2_src1
@   r1 => *pi2_src2
@   r2 => *pu1_dst
@   r3 =>  src_strd1
@   r4 =>  src_strd2        (loaded from the stack)
@   r5 =>  dst_strd         (loaded from the stack)
@   r6 =>  lvl_shift1       (loaded from the stack)
@   r7 =>  lvl_shift2       (loaded from the stack)
@   r8 =>  ht               (loaded from the stack)
@   r9 =>  wd               (loaded from the stack)

@ Byte offsets from sp to the stack-passed arguments. They are valid only after
@ the function entry has pushed r4-r12 and r14 (10 core registers) and then
@ d8-d15 (8 doubleword registers); the first stack argument then sits directly
@ above those two save areas.
.equ    saved_core_regs_size,   40                      @ 10 registers * 4 bytes
.equ    saved_neon_regs_size,   64                      @  8 registers * 8 bytes
.equ    stack_args_base,        saved_core_regs_size + saved_neon_regs_size

.equ    src_strd2_offset,       stack_args_base + 0     @ 104
.equ    dst_strd_offset,        stack_args_base + 4     @ 108
.equ    lvl_shift1_offset,      stack_args_base + 8     @ 112
.equ    lvl_shift2_offset,      stack_args_base + 12    @ 116
.equ    ht_offset,              stack_args_base + 16    @ 120
.equ    wd_offset,              stack_args_base + 20    @ 124

.text
.syntax unified
.align 4



.globl ihevc_weighted_pred_bi_default_a9q

.type ihevc_weighted_pred_bi_default_a9q, %function

@------------------------------------------------------------------------------
@ ihevc_weighted_pred_bi_default_a9q
@
@ Default bi-prediction average of two 16-bit sample planes into an 8-bit plane.
@ Every output byte is
@     sat_u8( sat_s16( sat_s16(src1 + src2) + round ) >> 7 )
@ where round = 0x40 + lvl_shift1 + lvl_shift2 (formed with wrapping 16-bit adds).
@
@ In:    r0 = pi2_src1, r1 = pi2_src2, r2 = pu1_dst, r3 = src_strd1
@        stack: src_strd2, dst_strd, lvl_shift1, lvl_shift2, ht, wd
@        (source strides are scaled by 2 here, i.e. they are counted in
@        16-bit samples; dst_strd is used unscaled)
@ Out:   nothing; wd x ht bytes are written through pu1_dst
@ Saved: r4-r12, lr and d8-d15 are pushed on entry and restored on exit
@
@ Path selection (first match wins):
@     (ht | wd) == 10  -> core_loop_chroma_8x2   8 samples x 2 rows per step
@     (ht | wd) ==  6  -> core_loop_chroma_4x2   4 samples x 2 rows per step
@     wd == 24         -> core_loop_8            8 samples x 4 rows per step
@     wd >= 16         -> core_loop_16          16 samples x 2 rows per step
@     wd == 12         -> core_loop_4            4 samples x 4 rows per step
@     wd >=  8         -> core_loop_8
@     otherwise        -> core_loop_4
@
@ NOTE(review): the 16-wide path advances pi2_src2 with offsets derived from
@ src_strd1 (r11, r12, r7), so it is only correct when src_strd1 == src_strd2.
@ Confirm that every caller passes equal source strides.
@------------------------------------------------------------------------------
ihevc_weighted_pred_bi_default_a9q:

    stmfd       sp!, {r4-r12, r14}          @save core registers and lr (40 bytes)
    vpush       {d8  -  d15}                @save d8-d15 (64 bytes)
    ldr         r4,[sp,#src_strd2_offset]   @r4 = src_strd2
    lsl         r3,r3,#1                    @r3 = src_strd1 in bytes (2 bytes per sample)
    ldr         r5,[sp,#dst_strd_offset]    @r5 = dst_strd
    ldr         r6,[sp,#lvl_shift1_offset]  @r6 = lvl_shift1
    lsl         r4,r4,#1                    @r4 = src_strd2 in bytes
    ldr         r7,[sp,#lvl_shift2_offset]  @r7 = lvl_shift2
    ldr         r8,[sp,#ht_offset]          @r8 = ht (rows still to produce)
    ldr         r9,[sp,#wd_offset]          @r9 = wd (samples left in the current band)
    vdup.16     q2,r6                       @q2 = lvl_shift1 in every 16-bit lane
    vdup.16     q3,r7                       @q3 = lvl_shift2 in every 16-bit lane
    vmov.i16    q0,#0x40                    @rounding term 1 << 6 for the >> 7 narrowing
    vadd.i16    q2,q2,q3                    @lvl_shift1 + lvl_shift2
    vadd.s16    q0,q0,q2                    @q0 = 0x40 + lvl_shift1 + lvl_shift2
    lsl         r6,r9,#1                    @r6 = 2*wd = bytes of source consumed per row
    rsb         r7,r6,r3,lsl #2             @r7  = 4 src1 rows - 2*wd (bytes), 4-row paths
    rsb         r10,r6,r4,lsl #2            @r10 = 4 src2 rows - 2*wd (bytes), 4-row paths

    cmp         r8,#0                       @nothing to do when ht == 0
    beq         end_loops

chroma_decision:
    orr         r14,r8,r9                   @r14 = ht | wd
    cmp         r14,#10
    beq         outer_loop_chroma_8x2

    cmp         r14,#6
    beq         outer_loop_chroma_4x2


luma_decision:
    cmp         r9,#24
    beq         outer_loop_8

    cmp         r9,#16
    bge         outer_loop_16

    cmp         r9,#12
    beq         outer_loop_4

    cmp         r9,#8
    bge         outer_loop_8
                                            @anything else falls through to the 4-wide path


@ 4 samples x 4 rows per inner step
outer_loop_4:
    cmp         r9,#0                       @nothing to do when wd == 0
    beq         end_loops

core_loop_4:
    add         r11,r0,r3                   @r11 = src1, row 1 of this band
    add         r12,r1,r4                   @r12 = src2, row 1 of this band
    vld1.s16    {d6},[r0]!                  @row 0: 4 samples of src1, r0 += 8
    add         r14,r2,r5                   @r14 = dst, row 1 of this band
    vld1.s16    {d7},[r1]!                  @row 0: 4 samples of src2, r1 += 8
    vld1.s16    {d8},[r11],r3               @row 1: src1, r11 -> row 2
    vqadd.s16   d18,d6,d7                   @row 0: src1 + src2 (saturating)
    vqadd.s16   d18,d18,d0                  @row 0: + rounding/level term
    vld1.s16    {d9},[r12],r4               @row 1: src2, r12 -> row 2
    vqadd.s16   d20,d8,d9                   @row 1: src1 + src2
    vqadd.s16   d19,d20,d0                  @row 1: + rounding/level term
    vqshrun.s16 d20,q9,#7                   @rows 0,1: >> 7, saturate to u8
    vld1.s16    {d22},[r11],r3              @row 2: src1, r11 -> row 3
    vld1.s16    {d23},[r12],r4              @row 2: src2, r12 -> row 3
    vqadd.s16   d30,d22,d23                 @row 2: src1 + src2
    vqadd.s16   d30,d30,d0                  @row 2: + rounding/level term
    vld1.s16    {d24},[r11],r3              @row 3: src1
    vld1.s16    {d25},[r12],r4              @row 3: src2
    vqadd.s16   d18,d24,d25                 @row 3: src1 + src2
    vqadd.s16   d31,d18,d0                  @row 3: + rounding/level term
    vst1.32     {d20[0]},[r2]!              @store row 0 (4 bytes), r2 += 4
    vst1.32     {d20[1]},[r14],r5           @store row 1, r14 -> row 2
    vqshrun.s16 d30,q15,#7                  @rows 2,3: >> 7, saturate to u8
    vst1.32     {d30[0]},[r14],r5           @store row 2, r14 -> row 3
    subs        r9,r9,#4                    @4 samples of the band done
    vst1.32     {d30[1]},[r14],r5           @store row 3
    bgt         core_loop_4                 @more samples left in this band

end_core_loop_4:

    subs        r8,r8,#4                    @4 rows done; flags are consumed by the bgt below

    add         r0,r0,r7                    @src1 -> start of the next 4-row band
    asr         r9,r6,#1                    @reload r9 = wd
    add         r1,r1,r10                   @src2 -> start of the next 4-row band
    rsb         r14,r9,r5,lsl #2            @r14 = 4*dst_strd - wd
    add         r2,r2,r14                   @dst  -> start of the next 4-row band
    bgt         core_loop_4                 @more rows left

    b           end_loops


@ 4 samples x 2 rows per inner step; taken when (ht | wd) == 6
outer_loop_chroma_4x2:
    cmp         r9,#0                       @nothing to do when wd == 0
    beq         end_loops
    rsb         r7,r6,r3,lsl #1             @r7  = 2 src1 rows - 2*wd (bytes)
    rsb         r10,r6,r4,lsl #1            @r10 = 2 src2 rows - 2*wd (bytes)
core_loop_chroma_4x2:
    add         r11,r0,r3                   @r11 = src1, row 1 of this band
    add         r12,r1,r4                   @r12 = src2, row 1 of this band
    vld1.s16    {d6},[r0]!                  @row 0: 4 samples of src1, r0 += 8
    add         r14,r2,r5                   @r14 = dst, row 1 of this band
    vld1.s16    {d7},[r1]!                  @row 0: 4 samples of src2, r1 += 8
    vld1.s16    {d8},[r11],r3               @row 1: src1
    vqadd.s16   d18,d6,d7                   @row 0: src1 + src2 (saturating)
    vqadd.s16   d18,d18,d0                  @row 0: + rounding/level term
    vld1.s16    {d9},[r12],r4               @row 1: src2
    vqadd.s16   d20,d8,d9                   @row 1: src1 + src2
    vqadd.s16   d19,d20,d0                  @row 1: + rounding/level term
    vqshrun.s16 d20,q9,#7                   @rows 0,1: >> 7, saturate to u8
    vst1.32     {d20[0]},[r2]!              @store row 0 (4 bytes), r2 += 4
    vst1.32     {d20[1]},[r14],r5           @store row 1

    subs        r9,r9,#4                    @4 samples of the band done

    bgt         core_loop_chroma_4x2        @more samples left in this band

end_core_loop_chroma_4x2:

    subs        r8,r8,#2                    @2 rows done; flags are consumed by the bgt below

    add         r0,r0,r7                    @src1 -> start of the next 2-row band
    asr         r9,r6,#1                    @reload r9 = wd
    add         r1,r1,r10                   @src2 -> start of the next 2-row band
    rsb         r14,r9,r5,lsl #1            @r14 = 2*dst_strd - wd
    add         r2,r2,r14                   @dst  -> start of the next 2-row band
    bgt         core_loop_chroma_4x2        @more rows left

    b           end_loops



@ 8 samples x 4 rows per inner step
outer_loop_8:
    cmp         r9,#0                       @nothing to do when wd == 0
    beq         end_loops
    add         r11,r0,r3                   @r11 = src1, row 1 of this band
    add         r12,r1,r4                   @r12 = src2, row 1 of this band
core_loop_8:

    vld1.s16    {q12},[r0]!                 @row 0: 8 samples of src1, r0 += 16
    add         r14,r2,r5                   @r14 = dst, row 1 of this band
    vld1.s16    {q13},[r1]!                 @row 0: 8 samples of src2, r1 += 16
    vqadd.s16   q12,q12,q13                 @row 0: src1 + src2 (saturating)
    vld1.s16    {q14},[r11],r3              @row 1: src1, r11 -> row 2
    vqadd.s16   q12,q12,q0                  @row 0: + rounding/level term
    vld1.s16    {q15},[r12],r4              @row 1: src2, r12 -> row 2
    vld1.s16    {q8},[r11],r3               @row 2: src1, r11 -> row 3
    vqadd.s16   q11,q14,q15                 @row 1: src1 + src2
    vld1.s16    {q9},[r12],r4               @row 2: src2, r12 -> row 3
    vqadd.s16   q11,q11,q0                  @row 1: + rounding/level term
    vqshrun.s16 d20,q12,#7                  @row 0: >> 7, saturate to u8
    vld1.s16    {q6},[r11],r3               @row 3: src1
    vqadd.s16   q15,q8,q9                   @row 2: src1 + src2
    vqshrun.s16 d21,q11,#7                  @row 1: >> 7, saturate to u8
    vld1.s16    {q7},[r12],r4               @row 3: src2
    vqadd.s16   q15,q15,q0                  @row 2: + rounding/level term
    vst1.32     {d20},[r2]!                 @store row 0 (8 bytes), r2 += 8
    vqadd.s16   q4,q6,q7                    @row 3: src1 + src2
    vst1.32     {d21},[r14],r5              @store row 1, r14 -> row 2
    vqadd.s16   q4,q4,q0                    @row 3: + rounding/level term
    vqshrun.s16 d30,q15,#7                  @row 2: >> 7, saturate to u8
    vqshrun.s16 d31,q4,#7                   @row 3: >> 7, saturate to u8
    add         r11,r0,r3                   @r11 = src1 row 1 for the next 8 samples
    add         r12,r1,r4                   @r12 = src2 row 1 for the next 8 samples
    vst1.32     {d30},[r14],r5              @store row 2, r14 -> row 3
    subs        r9,r9,#8                    @8 samples of the band done
    vst1.32     {d31},[r14],r5              @store row 3
    bgt         core_loop_8                 @more samples left in this band

end_core_loop_8:

    subs        r8,r8,#4                    @4 rows done; flags are consumed by the bgt below

    add         r0,r0,r7                    @src1 -> start of the next 4-row band
    asr         r9,r6,#1                    @reload r9 = wd
    add         r1,r1,r10                   @src2 -> start of the next 4-row band
    rsb         r14,r9,r5,lsl #2            @r14 = 4*dst_strd - wd
    add         r2,r2,r14                   @dst  -> start of the next 4-row band
    add         r11,r0,r3                   @r11 = src1, row 1 of the next band
    add         r12,r1,r4                   @r12 = src2, row 1 of the next band

    bgt         core_loop_8                 @more rows left
    b           end_loops



@ 8 samples x 2 rows per inner step; taken when (ht | wd) == 10
outer_loop_chroma_8x2:
    cmp         r9,#0                       @nothing to do when wd == 0
    beq         end_loops
    add         r11,r0,r3                   @r11 = src1, row 1 of this band
    add         r12,r1,r4                   @r12 = src2, row 1 of this band
    rsb         r7,r6,r3,lsl #1             @r7  = 2 src1 rows - 2*wd (bytes)
    rsb         r10,r6,r4,lsl #1            @r10 = 2 src2 rows - 2*wd (bytes)
core_loop_chroma_8x2:

    vld1.s16    {q12},[r0]!                 @row 0: 8 samples of src1, r0 += 16
    add         r14,r2,r5                   @r14 = dst, row 1 of this band
    vld1.s16    {q13},[r1]!                 @row 0: 8 samples of src2, r1 += 16
    vqadd.s16   q12,q12,q13                 @row 0: src1 + src2 (saturating)
    vld1.s16    {q14},[r11],r3              @row 1: src1
    vqadd.s16   q12,q12,q0                  @row 0: + rounding/level term
    vld1.s16    {q15},[r12],r4              @row 1: src2
                                            @(only two rows are read per step: a third-row
                                            @ load here would fetch data outside the band)
    vqadd.s16   q11,q14,q15                 @row 1: src1 + src2
    vqadd.s16   q11,q11,q0                  @row 1: + rounding/level term
    vqshrun.s16 d20,q12,#7                  @row 0: >> 7, saturate to u8
    vqshrun.s16 d21,q11,#7                  @row 1: >> 7, saturate to u8
    vst1.32     {d20},[r2]!                 @store row 0 (8 bytes), r2 += 8
    vst1.32     {d21},[r14],r5              @store row 1

    add         r11,r0,r3                   @r11 = src1 row 1 for the next 8 samples
    add         r12,r1,r4                   @r12 = src2 row 1 for the next 8 samples
    subs        r9,r9,#8                    @8 samples of the band done

    bgt         core_loop_chroma_8x2        @more samples left in this band

end_core_loop_chroma_8x2:

    subs        r8,r8,#2                    @2 rows done; flags are consumed by the bgt below

    add         r0,r0,r7                    @src1 -> start of the next 2-row band
    asr         r9,r6,#1                    @reload r9 = wd
    add         r1,r1,r10                   @src2 -> start of the next 2-row band
    rsb         r14,r9,r5,lsl #1            @r14 = 2*dst_strd - wd
    add         r2,r2,r14                   @dst  -> start of the next 2-row band
    add         r11,r0,r3                   @r11 = src1, row 1 of the next band
    add         r12,r1,r4                   @r12 = src2, row 1 of the next band

    bgt         core_loop_chroma_8x2        @more rows left

    b           end_loops




@ 16 samples x 2 rows per step, software pipelined: while one 16x2 tile is being
@ narrowed and stored, the sources of the following tile are already loaded.
@ Register roles inside this path:
@     r7  = 2 source rows - 2*wd (bytes): end of a band -> start of the next band
@     r10 = 16 - dst_strd:  dst row 1 of a tile -> row 0 of the tile to its right
@     r11 = src_strd1 (bytes) - 16: right half of row 0 -> left half of row 1
@     r12 = 16 - src_strd1 (bytes): right half of row 1 -> row 0 of the next tile
@     r14 = 2*dst_strd - wd: end of a dst band -> start of the next dst band
@ NOTE(review): r7, r11 and r12 are built from src_strd1 only but are also
@ applied to r1 (pi2_src2); this path assumes src_strd1 == src_strd2.
outer_loop_16:
    cmp         r9,#0                       @nothing to do when wd == 0
    beq         end_loops
    rsb         r7,r6,r3,lsl #1             @r7 = 2 src1 rows - 2*wd (bytes)
    mov         r14,#16
    sub         r10,r14,r5                  @r10 = 16 - dst_strd
    sub         r11,r3,r14                  @r11 = src_strd1 bytes - 16
    sub         r12,r14,r3                  @r12 = 16 - src_strd1 bytes

    rsb         r14,r9,r5,lsl #1            @r14 = 2*dst_strd - wd



prolog_16:


    vld1.s16    {q1},[r0]!                  @tile 0, row 0, samples 0-7 of src1
    vld1.s16    {q2},[r1]!                  @tile 0, row 0, samples 0-7 of src2
    vld1.s16    {q5},[r0],r11               @tile 0, row 0, samples 8-15 of src1; r0 -> row 1
    vld1.s16    {q6},[r1],r11               @tile 0, row 0, samples 8-15 of src2; r1 -> row 1
    vld1.s16    {q3},[r0]!                  @tile 0, row 1, samples 0-7 of src1
    subs        r9,r9,#16                   @16 samples of the band consumed
    vld1.s16    {q4},[r1]!                  @tile 0, row 1, samples 0-7 of src2
    subeq       r8,r8,#2                    @band finished: 2 rows done
    vqadd.s16   q11,q1,q2                   @row 0 left:  src1 + src2 (saturating)
    vld1.s16    {q7},[r0],r12               @tile 0, row 1, samples 8-15 of src1; r0 -> next tile
    vqadd.s16   q14,q5,q6                   @row 0 right: src1 + src2
    vld1.s16    {q8},[r1],r12               @tile 0, row 1, samples 8-15 of src2; r1 -> next tile
    addeq       r0,r0,r7                    @band finished: src1 -> next band
    addeq       r1,r1,r7                    @band finished: src2 -> next band
    vqadd.s16   q12,q3,q4                   @row 1 left:  src1 + src2
    vqadd.s16   q13,q7,q8                   @row 1 right: src1 + src2
@ a block of exactly 16x2 is complete after the prolog: go straight to the
@ epilog without touching any further source data
    cmp         r8,#0
    beq         epilog_16

    vld1.s16    {q1},[r0]!                  @next tile, row 0, samples 0-7 of src1
    vld1.s16    {q2},[r1]!                  @next tile, row 0, samples 0-7 of src2
    vqadd.s16   q11,q11,q0                  @row 0 left:  + rounding/level term
    vld1.s16    {q5},[r0],r11               @next tile, row 0, samples 8-15 of src1
    vqadd.s16   q14,q14,q0                  @row 0 right: + rounding/level term
    vld1.s16    {q6},[r1],r11               @next tile, row 0, samples 8-15 of src2
    vqadd.s16   q12,q12,q0                  @row 1 left:  + rounding/level term
    vld1.s16    {q3},[r0]!                  @next tile, row 1, samples 0-7 of src1
    vqadd.s16   q15,q13,q0                  @row 1 right: + rounding/level term
    vqshrun.s16 d20,q11,#7                  @q10 = finished row 0 (d20 left, d21 right)
    vld1.s16    {q4},[r1]!                  @next tile, row 1, samples 0-7 of src2
    vqshrun.s16 d21,q14,#7
    vld1.s16    {q7},[r0],r12               @next tile, row 1, samples 8-15 of src1
    vqshrun.s16 d26,q12,#7                  @q13 = finished row 1 (d26 left, d27 right)
    vld1.s16    {q8},[r1],r12               @next tile, row 1, samples 8-15 of src2
    vqshrun.s16 d27,q15,#7



core_loop_16:

    cmp         r9,#0                       @did the tile held in q10/q13 end its band?
    vqadd.s16   q11,q1,q2                   @loaded tile, row 0 left:  src1 + src2
    asreq       r9,r6,#1                    @yes: reload r9 = wd for the new band
    vst1.32     {q10},[r2],r5               @store finished row 0, r2 -> row 1
    vqadd.s16   q14,q5,q6                   @loaded tile, row 0 right: src1 + src2
    vst1.32     {q13},[r2],r10              @store finished row 1, r2 -> next tile on row 0
    addeq       r2,r2,r14                   @band ended: dst -> start of the next band
    vqadd.s16   q12,q3,q4                   @loaded tile, row 1 left:  src1 + src2
    subs        r9,r9,#16                   @account for the loaded tile
    addeq       r0,r0,r7                    @it ends the band: src1 -> next band
    vqadd.s16   q13,q7,q8                   @loaded tile, row 1 right: src1 + src2

    addeq       r1,r1,r7                    @it ends the band: src2 -> next band
    subseq      r8,r8,#2                    @it ends the band: 2 rows done
    beq         epilog_16                   @taken only when the band ended and ht reached 0


    vqadd.s16   q11,q11,q0                  @row 0 left:  + rounding/level term
    vld1.s16    {q1},[r0]!                  @next tile, row 0, samples 0-7 of src1
    vqadd.s16   q14,q14,q0                  @row 0 right: + rounding/level term
    vld1.s16    {q2},[r1]!                  @next tile, row 0, samples 0-7 of src2
    vqadd.s16   q12,q12,q0                  @row 1 left:  + rounding/level term
    vld1.s16    {q5},[r0],r11               @next tile, row 0, samples 8-15 of src1
    vqadd.s16   q15,q13,q0                  @row 1 right: + rounding/level term
    vld1.s16    {q6},[r1],r11               @next tile, row 0, samples 8-15 of src2
    vqshrun.s16 d20,q11,#7                  @q10 = finished row 0
    vld1.s16    {q3},[r0]!                  @next tile, row 1, samples 0-7 of src1
    vqshrun.s16 d21,q14,#7
    vld1.s16    {q4},[r1]!                  @next tile, row 1, samples 0-7 of src2
    vqshrun.s16 d26,q12,#7                  @q13 = finished row 1
    vld1.s16    {q7},[r0],r12               @next tile, row 1, samples 8-15 of src1
    vqshrun.s16 d27,q15,#7
    vld1.s16    {q8},[r1],r12               @next tile, row 1, samples 8-15 of src2


    b           core_loop_16


@ drain the pipeline: finish and store the last summed tile, no further loads
epilog_16:

    vqadd.s16   q11,q11,q0                  @row 0 left:  + rounding/level term
    vqadd.s16   q14,q14,q0                  @row 0 right: + rounding/level term
    vqadd.s16   q12,q12,q0                  @row 1 left:  + rounding/level term
    vqadd.s16   q15,q13,q0                  @row 1 right: + rounding/level term
    vqshrun.s16 d20,q11,#7                  @q10 = finished row 0
    vqshrun.s16 d21,q14,#7
    vqshrun.s16 d26,q12,#7                  @q13 = finished row 1
    vqshrun.s16 d27,q15,#7
    vst1.32     {q10},[r2],r5               @store row 0, r2 -> row 1
    vst1.32     {q13},[r2]                  @store row 1



end_core_loop_16:


end_loops:
    vpop        {d8  -  d15}                @restore d8-d15
    ldmfd       sp!,{r4-r12,r15}            @restore core registers; loading pc returns to the caller
