@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_intra_pred_luma_mode_3_to_9.s
@*
@* @brief
@*  contains function definitions for luma intra prediction for angular
@*  modes 3 to 9. functions are coded using neon assembly and can be
@*  assembled using rvct.
@*
@* @author
@*  parthiban v
@*
@* @par list of functions:
@*
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*    luma intra prediction filter for angular modes 3 to 9
@*
@* @par description:
@*
@* @param[in] pu1_ref
@*  uword8 pointer to the source
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] nt
@*  size of transform block
@*
@* @param[in] mode
@*  intra prediction mode (3 to 9)
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_intra_pred_luma_mode_3_to_9(uword8* pu1_ref,
@                               word32 src_strd,
@                               uword8* pu1_dst,
@                               word32 dst_strd,
@                               word32 nt,
@                               word32 mode)
@
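@
@ reading aid (sketch, not part of the original source): with
@ ang = gai4_ihevc_ang_table[mode], the code below computes per column
@     prod  = (col + 1) * ang
@     idx   = prod >> 5        @ integer step into the reference array
@     fract = prod & 31        @ 1/32 sub-sample position
@ and forms every output pixel as the standard two-tap interpolation
@     pred = (ref[i] * (32 - fract) + ref[i + 1] * fract + 16) >> 5
@ where i is derived from idx, the row and the per-block least index.
@ the vand #31, vshr #5, vmull/vmlal and vrshrn #5 instructions below are
@ the vectorised forms of these steps.
@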
@**************variables vs registers*****************************************
@r0 => *pu1_ref
@r1 => src_strd
@r2 => *pu1_dst
@r3 => dst_strd

@stack contents from #104
@   nt
@   mode

.equ    nt_offset,      104
.equ    mode_offset,    108
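@ note: #104 is the size of the prologue save area: stmfd {r4-r12, r14}
@ stores 10 words (40 bytes) and vpush {d8 - d15} stores 8 doublewords
@ (64 bytes), so the first stack argument (nt) is found at sp + 104 and
@ mode at sp + 108.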

.text
.align 4



.globl ihevc_intra_pred_luma_mode_3_to_9_a9q
.extern gai4_ihevc_ang_table
.extern gai4_ihevc_inv_ang_table
.extern col_for_intra_luma
.extern idx_neg_idx_3_9

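@ note: each .long below holds "symbol - ulblx - 8"; the matching
@ "add rX, rX, pc" at ulblx adds the current pc (that instruction's
@ address + 8 in arm state), which reconstructs the absolute address of
@ the table while keeping the code position independent.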
gai4_ihevc_ang_table_addr:
.long gai4_ihevc_ang_table - ulbl1 - 8

gai4_ihevc_inv_ang_table_addr:
.long gai4_ihevc_inv_ang_table - ulbl2 - 8

idx_neg_idx_3_9_addr_1:
.long idx_neg_idx_3_9 - ulbl3_1 - 8

idx_neg_idx_3_9_addr_2:
.long idx_neg_idx_3_9 - ulbl3_2 - 8

col_for_intra_luma_addr_1:
.long col_for_intra_luma - ulbl4_1 - 8

col_for_intra_luma_addr_2:
.long col_for_intra_luma - ulbl4_2 - 8

col_for_intra_luma_addr_3:
.long col_for_intra_luma - ulbl4_3 - 8

.type ihevc_intra_pred_luma_mode_3_to_9_a9q, %function

ihevc_intra_pred_luma_mode_3_to_9_a9q:

    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
    vpush       {d8 - d15}
    ldr         r4,[sp,#nt_offset]          @loads nt
    ldr         r7, gai4_ihevc_ang_table_addr
ulbl1:
    add         r7,r7,pc

    ldr         r5,[sp,#mode_offset]        @mode (3 to 9)
    ldr         r8, gai4_ihevc_inv_ang_table_addr
ulbl2:
    add         r8,r8,pc

    add         r7, r7, r5, lsl #2          @gai4_ihevc_ang_table[mode]
    ldr         r7, [r7]                    @intra_pred_ang
    vdup.8      d30, r7                     @intra_pred_ang

    ldr         r14, col_for_intra_luma_addr_1
ulbl4_1:
    add         r14,r14,pc
    cmp         r4, #4

    beq         sz_4_proc
    b           prologue_8_16_32

prologue_8_16_32:
    lsr         r10, r4, #3
    vld1.8      d31, [r14]!
    mul         r10, r4, r10                @block counter (dec by #8)

    mov         r11, r4                     @col counter to be inc/dec by #8
    vmull.s8    q11, d30, d31               @(col+1)*intra_pred_angle [0:7](col)

    sub         r7, r5, #3
    vmov.i8     d2,#1                       @contains #1 for adding to get ref_main_idx + 1
    ldr         r12, idx_neg_idx_3_9_addr_1 @load least idx table
ulbl3_1:
    add         r12,r12,pc

    vmov.i8     d3, #2

    add         r12, r12, r7, lsl #4
    mov         r8, r12

    mov         r7, #8
    sub         r7, r7, r3, lsl #3          @r7 = 8-8r3
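@ r7 = 8 - 8*dst_strd: once eight rows have been written below (pu1_dst has
@ advanced by 8*dst_strd), adding r7 moves the destination pointer back to
@ the top row and across by 8, i.e. to the next 8-column strip.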

    ldr         r9, [r8]
    add         r1, r0, r4, lsl #1          @pu1_ref + 2nt

    vmovn.s16   d6, q11
    vdup.8      d26, r9                     @least idx added to final idx values
    sub         r1, r1, #9                  @ref_main_idx + 2nt - (8 + 1)(two_nt - idx - row ) for 8 & 8 - 1row

    sub         r6, r1, r9

    vld1.8      {d0,d1}, [r6]               @stores the 32 values reqd based on indices values (from least idx)
    vshr.s16    q11, q11, #5

    vmov.i8     d29, #31                    @contains #31 for vand operation

    vmov.i8     d28, #32

    vqmovn.s16  d8, q11

    vand        d6, d6, d29                 @fract values in d1/ idx values in d0

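@ here d6 holds fract = ((col+1)*ang) & 31, d8 holds the integer part
@ ((col+1)*ang) >> 5, and d0/d1 hold the reference bytes loaded from the
@ least index this block can reach; the vtbl lookups below pick the two
@ neighbouring reference samples for each column out of d0/d1.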
    mov         r0, #1

    vmov.i8     d27, #7                     @row 0 to 7

    vsub.s8     d8, d8, d2                  @ref_main_idx (sub row)
    vsub.s8     d8, d26, d8                 @ref_main_idx (row 0)
    vadd.s8     d8, d8, d27                 @to compensate the pu1_src idx incremented by 8
    vsub.s8     d9, d8, d2                  @ref_main_idx + 1 (row 0)
    vtbl.8      d12, {d0,d1}, d8            @load from ref_main_idx (row 0)
    vsub.s8     d7, d28, d6                 @32-fract

    vtbl.8      d13, {d0,d1}, d9            @load from ref_main_idx + 1 (row 0)
    vsub.s8     d4, d8, d2                  @ref_main_idx (row 1)
    vsub.s8     d5, d9, d2                  @ref_main_idx + 1 (row 1)

    vtbl.8      d16, {d0,d1}, d4            @load from ref_main_idx (row 1)
    vmull.u8    q12, d12, d7                @mul (row 0)
    vmlal.u8    q12, d13, d6                @mul (row 0)

    vtbl.8      d17, {d0,d1}, d5            @load from ref_main_idx + 1 (row 1)
    vsub.s8     d8, d8, d3                  @ref_main_idx (row 2)
    vsub.s8     d9, d9, d3                  @ref_main_idx + 1 (row 2)

    vrshrn.i16  d24, q12, #5                @round shft (row 0)

    vtbl.8      d14, {d0,d1}, d8            @load from ref_main_idx (row 2)
    vmull.u8    q11, d16, d7                @mul (row 1)
    vmlal.u8    q11, d17, d6                @mul (row 1)

    vtbl.8      d15, {d0,d1}, d9            @load from ref_main_idx + 1 (row 2)
    vsub.s8     d4, d4, d3                  @ref_main_idx (row 3)
    vsub.s8     d5, d5, d3                  @ref_main_idx + 1 (row 3)

    vst1.8      d24, [r2], r3               @st (row 0)
    vrshrn.i16  d22, q11, #5                @round shft (row 1)

    vtbl.8      d10, {d0,d1}, d4            @load from ref_main_idx (row 3)
    vmull.u8    q10, d14, d7                @mul (row 2)
    vmlal.u8    q10, d15, d6                @mul (row 2)

    vtbl.8      d11, {d0,d1}, d5            @load from ref_main_idx + 1 (row 3)
    vsub.s8     d8, d8, d3                  @ref_main_idx (row 4)
    vsub.s8     d9, d9, d3                  @ref_main_idx + 1 (row 4)

    vst1.8      d22, [r2], r3               @st (row 1)
    vrshrn.i16  d20, q10, #5                @round shft (row 2)

    vtbl.8      d12, {d0,d1}, d8            @load from ref_main_idx (row 4)
    vmull.u8    q9, d10, d7                 @mul (row 3)
    vmlal.u8    q9, d11, d6                 @mul (row 3)

    vtbl.8      d13, {d0,d1}, d9            @load from ref_main_idx + 1 (row 4)
    vsub.s8     d4, d4, d3                  @ref_main_idx (row 5)
    vsub.s8     d5, d5, d3                  @ref_main_idx + 1 (row 5)

    vst1.8      d20, [r2], r3               @st (row 2)
    vrshrn.i16  d18, q9, #5                 @round shft (row 3)

    vtbl.8      d16, {d0,d1}, d4            @load from ref_main_idx (row 5)
    vmull.u8    q12, d12, d7                @mul (row 4)
    vmlal.u8    q12, d13, d6                @mul (row 4)

    vtbl.8      d17, {d0,d1}, d5            @load from ref_main_idx + 1 (row 5)
    vsub.s8     d8, d8, d3                  @ref_main_idx (row 6)
    vsub.s8     d9, d9, d3                  @ref_main_idx + 1 (row 6)

    vst1.8      d18, [r2], r3               @st (row 3)
    vrshrn.i16  d24, q12, #5                @round shft (row 4)

    vtbl.8      d14, {d0,d1}, d8            @load from ref_main_idx (row 6)
    vmull.u8    q11, d16, d7                @mul (row 5)
    vmlal.u8    q11, d17, d6                @mul (row 5)

    vtbl.8      d15, {d0,d1}, d9            @load from ref_main_idx + 1 (row 6)
    vsub.s8     d4, d4, d3                  @ref_main_idx (row 7)
    vsub.s8     d5, d5, d3                  @ref_main_idx + 1 (row 7)

    vst1.8      d24, [r2], r3               @st (row 4)
    vrshrn.i16  d22, q11, #5                @round shft (row 5)

    vtbl.8      d10, {d0,d1}, d4            @load from ref_main_idx (row 7)
    vmull.u8    q10, d14, d7                @mul (row 6)
    vmlal.u8    q10, d15, d6                @mul (row 6)

    vtbl.8      d11, {d0,d1}, d5            @load from ref_main_idx + 1 (row 7)
    vmull.u8    q9, d10, d7                 @mul (row 7)
    vmlal.u8    q9, d11, d6                 @mul (row 7)

    vst1.8      d22, [r2], r3               @st (row 5)
    vrshrn.i16  d20, q10, #5                @round shft (row 6)
    vrshrn.i16  d18, q9, #5                 @round shft (row 7)

    vst1.8      d20, [r2], r3               @st (row 6)

    subs        r10, r10, #8                @subtract 8 and go to end if 8x8

    vst1.8      d18, [r2], r3               @st (row 7)

    beq         end_func

    subs        r11, r11, #8
    addgt       r8, r8, #4
    addgt       r2, r2, r7
    movle       r8, r12
    suble       r2, r2, r4
    addle       r2, r2, #8
    movle       r11, r4
    ldrle       r14, col_for_intra_luma_addr_2
ulbl4_2:
    addle       r14,r14,pc
    addle       r0, r0, #8

    mov         r5,r2
    vld1.8      d31, [r14]!
    vmull.s8    q6, d30, d31                @(col+1)*intra_pred_angle [0:7](col)
    vmovn.s16   d10, q6
    vshr.s16    q6, q6, #5
    vqmovn.s16  d11, q6
    ldr         r9, [r8]
    add         r9, r0, r9
    sub         r9, r9, #1
    vdup.8      d26, r9
    vmov.i8     d16,#8

    sub         r4,r4,#8

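@ the kernel below is software pipelined across 8x8 blocks: the stores and
@ rounding shifts tagged "(from previous loop)" finish the last rows of the
@ previous block while the table lookups and multiplies for the next block
@ are already under way, and the next block's indices are recomputed from
@ col_for_intra_luma.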
kernel_8_16_32:

    vsub.s8     d8, d26, d11                @ref_main_idx
    vmov        d26,d10

    subs        r11, r11, #8
    sub         r6, r1, r9
    vtbl.8      d10, {d0,d1}, d4            @load from ref_main_idx (row 7)
    vadd.s8     d8, d8, d16                 @to compensate the pu1_src idx incremented by 8

    vmull.u8    q10, d14, d7                @mul (row 6)
    vtbl.8      d11, {d0,d1}, d5            @load from ref_main_idx - 1 (row 7)
    vmlal.u8    q10, d15, d6                @mul (row 6)

    vsub.s8     d9, d8, d2                  @ref_main_idx - 1
    addle       r0, r0, #8
    addgt       r8, r8, #4
    vld1.8      {d0,d1}, [r6]               @stores the 32 values reqd based on indices values (from least idx)

    vst1.8      d24, [r5], r3               @st (row 4)
    vrshrn.i16  d22, q11, #5                @round shft (row 5)

    ldrle       r14, col_for_intra_luma_addr_3
ulbl4_3:
    addle       r14,r14,pc

    movle       r8, r12
    vdup.8      d27, r0                     @row value inc or reset accordingly

    vsub.s8     d4, d8, d2                  @ref_main_idx (row 1)
    vtbl.8      d12, {d0,d1}, d8            @load from ref_main_idx (row 0)
    vsub.s8     d5, d9, d2                  @ref_main_idx - 1 (row 1)


    vmull.u8    q9, d10, d7                 @mul (row 7)
    vtbl.8      d13, {d0,d1}, d9            @load from ref_main_idx + 1 (row 0)
    vmlal.u8    q9, d11, d6                 @mul (row 7)

    vld1.8      d31, [r14]!
    vand        d6, d29, d26                @fract values in d1/ idx values in d0

    vst1.8      d22, [r5], r3               @(from previous loop)st (row 5)
    vrshrn.i16  d20, q10, #5                @(from previous loop)round shft (row 6)

    vsub.s8     d8, d8, d3                  @ref_main_idx (row 2)
    vtbl.8      d10, {d0,d1}, d4            @load from ref_main_idx (row 1)
    vsub.s8     d9, d9, d3                  @ref_main_idx - 1 (row 2)

    addle       r11, r4, #8
    ldr         r9, [r8]
    vsub.s8     d7, d28, d6                 @32-fract

    vmull.u8    q12, d12, d7                @mul (row 0)
    vtbl.8      d17, {d0,d1}, d5            @load from ref_main_idx + 1 (row 1)
    vmlal.u8    q12, d13, d6                @mul (row 0)

    vst1.8      d20, [r5], r3               @(from previous loop)st (row 6)
    vrshrn.i16  d18, q9, #5                 @(from previous loop)round shft (row 7)

    vsub.s8     d4, d4, d3                  @ref_main_idx (row 3)
    vtbl.8      d14, {d0,d1}, d8            @load from ref_main_idx (row 2)
    vsub.s8     d5, d5, d3                  @ref_main_idx - 1 (row 3)

    vmull.u8    q11, d10, d7                @mul (row 1)
    vtbl.8      d15, {d0,d1}, d9            @load from ref_main_idx + 1 (row 2)
    vmlal.u8    q11, d17, d6                @mul (row 1)

    vrshrn.i16  d24, q12, #5                @round shft (row 0)
    vst1.8      d18, [r5], r3               @(from previous loop)st (row 7)

    vsub.s8     d8, d8, d3                  @ref_main_idx (row 4)
    vtbl.8      d10, {d0,d1}, d4            @load from ref_main_idx (row 3)
    vsub.s8     d9, d9, d3                  @ref_main_idx - 1 (row 4)

    vmull.u8    q10, d14, d7                @mul (row 2)
    vtbl.8      d11, {d0,d1}, d5            @load from ref_main_idx + 1 (row 3)
    vmlal.u8    q10, d15, d6                @mul (row 2)

    vmull.s8    q7, d30, d31                @(col+1)*intra_pred_angle [0:7](col)
    add         r5,r2,r3,lsl#2
    add         r9, r0, r9

    vst1.8      d24, [r2], r3               @st (row 0)
    vrshrn.i16  d22, q11, #5                @round shft (row 1)

    vsub.s8     d4, d4, d3                  @ref_main_idx (row 5)
    vtbl.8      d12, {d0,d1}, d8            @load from ref_main_idx (row 4)
    vsub.s8     d5, d5, d3                  @ref_main_idx - 1 (row 5)

    vmull.u8    q9, d10, d7                 @mul (row 3)
    vtbl.8      d13, {d0,d1}, d9            @load from ref_main_idx + 1 (row 4)
    vmlal.u8    q9, d11, d6                 @mul (row 3)

    vst1.8      d22, [r2], r3               @st (row 1)
    vrshrn.i16  d20, q10, #5                @round shft (row 2)

    vmovn.s16   d10, q7
    vshr.s16    q7, q7, #5

    vsub.s8     d8, d8, d3                  @ref_main_idx (row 6)
    vtbl.8      d21, {d0,d1}, d4            @load from ref_main_idx (row 5)
    vsub.s8     d9, d9, d3                  @ref_main_idx - 1 (row 6)

    vmull.u8    q12, d12, d7                @mul (row 4)
    vtbl.8      d17, {d0,d1}, d5            @load from ref_main_idx + 1 (row 5)
    vmlal.u8    q12, d13, d6                @mul (row 4)

    vst1.8      d20, [r2], r3               @st (row 2)
    vrshrn.i16  d18, q9, #5                 @round shft (row 3)

    sub         r9, r9, #1
    vqmovn.s16  d11, q7

    vsub.s8     d4, d4, d3                  @ref_main_idx (row 7)
    vtbl.8      d14, {d0,d1}, d8            @load from ref_main_idx (row 6)
    vsub.s8     d5, d5, d3                  @ref_main_idx - 1 (row 7)

    vmull.u8    q11, d21, d7                @mul (row 5)
    vtbl.8      d15, {d0,d1}, d9            @load from ref_main_idx + 1 (row 6)
    vmlal.u8    q11, d17, d6                @mul (row 5)

    vadd.s8     d11, d27, d11               @ref_main_idx (add row)
    vdup.8      d26, r9

    vst1.8      d18, [r2], r3               @st (row 3)
    vrshrn.i16  d24, q12, #5                @round shft (row 4)

    add         r2,r3, lsl #2
    vsub.s8     d11, d11, d2                @ref_main_idx -1 (sub 1)
    addgt       r2, r7, r2

    suble       r2, r2, r4

    subs        r10, r10, #8                @subtract 8 and go to end if 8x8

    bne         kernel_8_16_32

epil_8_16_32:
    vtbl.8      d10, {d0,d1}, d4            @load from ref_main_idx (row 7)

    vmull.u8    q10, d14, d7                @mul (row 6)
    vtbl.8      d11, {d0,d1}, d5            @load from ref_main_idx + 1 (row 7)
    vmlal.u8    q10, d15, d6                @mul (row 6)

    vst1.8      d24, [r5], r3               @st (row 4)
    vrshrn.i16  d24, q11, #5                @round shft (row 5)

    vmull.u8    q9, d10, d7                 @mul (row 7)
    vmlal.u8    q9, d11, d6                 @mul (row 7)

    vst1.8      d24, [r5], r3               @(from previous loop)st (row 5)
    vrshrn.i16  d20, q10, #5                @(from previous loop)round shft (row 6)

    vst1.8      d20, [r5], r3               @(from previous loop)st (row 6)
    vrshrn.i16  d18, q9, #5                 @(from previous loop)round shft (row 7)

    vst1.8      d18, [r5], r3               @st (row 7)

    b           end_func

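@ nt == 4 path: a single pass produces the four output rows, so no block
@ loop is needed; only the low 32 bits of each result register are stored
@ (vst1.32 d..[0]).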
sz_4_proc:
    vld1.8      d31, [r14]
    vmov.i8     d2, #1                      @contains #1 for adding to get ref_main_idx - 1

    vmov.i8     d3, #2
    ldr         r12, idx_neg_idx_3_9_addr_2 @load least idx table
ulbl3_2:
    add         r12,r12,pc

    vmull.s8    q11, d30, d31               @(col+1)*intra_pred_angle [0:7](col)
    sub         r7, r5, #3

    add         r12, r12, r7, lsl #4
    mov         r8, r12

    ldr         r9, [r8]

    vdup.8      d26, r9                     @least idx added to final idx values
    add         r6, r0, r4, lsl #1          @pu1_ref + 2nt

    vmovn.s16   d6, q11
    sub         r6, r6, #9                  @ref_main_idx + 2nt - (8 + 1)(two_nt - idx - row ) for 8 & 8 - 1row
    sub         r6, r6, r9

    vld1.8      {d0,d1}, [r6]               @stores the 32 values reqd based on indices values (from least idx)

    vmov.i8     d29, #31                    @contains #31 for vand operation

    vmov.i8     d28, #32

    vshr.s16    q11, q11, #5
    vqmovn.s16  d8, q11

    vand        d6, d6, d29                 @fract values in d1/ idx values in d0
    vsub.s8     d7, d28, d6                 @32-fract

    vmov.i8     d27, #7                     @row 0 to 7(row-1)
    vsub.s8     d8, d8, d2                  @ref_main_idx (add 1)
    vsub.s8     d8, d26, d8                 @ref_main_idx
    vadd.s8     d8, d8, d27                 @to compensate the pu1_src idx incremented by 8
    vsub.s8     d9, d8, d2                  @ref_main_idx - 1

    vsub.s8     d4, d8, d2                  @row 1 ref_main_idx
    vsub.s8     d5, d9, d2

    vtbl.8      d12, {d0,d1}, d8            @load from ref_main_idx (row 0)
    vtbl.8      d13, {d0,d1}, d9            @load from ref_main_idx + 1 (row 0)


    vmull.u8    q12, d12, d7                @mul (row 0)
    vtbl.8      d16, {d0,d1}, d4            @load from ref_main_idx (row 1)
    vmlal.u8    q12, d13, d6                @mul (row 0)

    vsub.s8     d8, d8, d3                  @idx (row 2)
    vtbl.8      d17, {d0,d1}, d5            @load from ref_main_idx + 1 (row 1)
    vsub.s8     d9, d9, d3                  @idx+1 (row 2)

    vmull.u8    q11, d16, d7                @mul (row 1)
    vtbl.8      d12, {d0,d1}, d8            @load from ref_main_idx (row 2)
    vmlal.u8    q11, d17, d6                @mul (row 1)

    vrshrn.i16  d24, q12, #5                @round shift (row 0)

    vsub.s8     d4, d4, d3                  @idx (row 3)
    vtbl.8      d13, {d0,d1}, d9            @load from ref_main_idx + 1 (row 2)
    vsub.s8     d5, d5, d3                  @idx+1 (row 3)

    vmull.u8    q10, d12, d7                @mul (row 2)
    vtbl.8      d16, {d0,d1}, d4            @load from ref_main_idx (row 3)
    vmlal.u8    q10, d13, d6                @mul (row 2)

    vst1.32     d24[0], [r2], r3            @st row 0
    vrshrn.i16  d22, q11, #5                @round shift (row 1)

    vtbl.8      d17, {d0,d1}, d5            @load from ref_main_idx + 1 (row 3)

    vmull.u8    q9, d16, d7                 @mul (row 3)
    vmlal.u8    q9, d17, d6                 @mul (row 3)

    vst1.32     d22[0], [r2], r3            @st row 1
    vrshrn.i16  d20, q10, #5                @round shift (row 2)

    vst1.32     d20[0], [r2], r3            @st row 2

    vrshrn.i16  d18, q9, #5                 @round shift (row 3)

    vst1.32     d18[0], [r2], r3            @st (row 3)

end_func:
    vpop        {d8 - d15}
    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp