@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_intra_pred_luma_mode_27_to_33.s
@*
@* @brief
@*  contains function definition for intra prediction interpolation filters
@*
@*
@* @author
@*  parthiban v
@*
@* @par list of functions:
@*  - ihevc_intra_pred_luma_mode_27_to_33()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@
@/**
@*******************************************************************************
@*
@* @brief
@*    intra prediction interpolation filter for luma mode 27 to mode 33
@*
@* @par description:
@*    intraprediction for mode 27 to 33 (positive angle, vertical mode) with
@*    reference to neighboring samples location pointed by 'pu1_ref' to the tu
@*    block location pointed by 'pu1_dst'
@*
@* @param[in] pu1_ref
@*  uword8 pointer to the source
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] nt
@*  integer transform block size
@*
@* @param[in] mode
@*  integer intraprediction mode
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
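@/**
@* reference model (illustrative only, not part of the build): a c sketch of
@* the computation carried out by the neon code below, reconstructed from the
@* inline comments in this file. names follow those comments; treat it as an
@* assumption, not the canonical c implementation.
@*
@*     for(row = 0; row < nt; row++)
@*     {
@*         pos   = (row + 1) * intra_pred_ang;  /* gai4_ihevc_ang_table[mode] */
@*         idx   = pos >> 5;                    /* integer step into pu1_ref  */
@*         fract = pos & 31;                    /* 1/32-pel fractional part   */
@*         for(col = 0; col < nt; col++)
@*         {
@*             ref_main_idx   = pu1_ref[two_nt + 1 + col + idx];
@*             ref_main_idx_1 = pu1_ref[two_nt + 1 + col + idx + 1];
@*             pu1_dst[row * dst_strd + col] =
@*                 (uword8)(((32 - fract) * ref_main_idx
@*                           + fract * ref_main_idx_1 + 16) >> 5);
@*         }
@*     }
@*/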

@void ihevc_intra_pred_luma_mode_27_to_33(uword8 *pu1_ref,
@                                       word32 src_strd,
@                                       uword8 *pu1_dst,
@                                       word32 dst_strd,
@                                       word32 nt,
@                                       word32 mode)
@**************variables vs registers*****************************************
@r0 => *pu1_ref
@r1 =>  src_strd
@r2 => *pu1_dst
@r3 =>  dst_strd
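@r4 =>  nt    (loaded inside the function from the stack at nt_offset)
@r5 =>  mode  (loaded inside the function from the stack at mode_offset)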

.equ    nt_offset,      104
.equ    mode_offset,    108

.text
.align 4


.globl ihevc_intra_pred_luma_mode_27_to_33_a9q
.extern gai4_ihevc_ang_table
.extern gau1_ihevc_planar_factor

gai4_ihevc_ang_table_addr:
.long gai4_ihevc_ang_table - ulbl1 - 8

gau1_ihevc_planar_factor_addr:
.long gau1_ihevc_planar_factor - ulbl2 - 8


.type ihevc_intra_pred_luma_mode_27_to_33_a9q, %function

ihevc_intra_pred_luma_mode_27_to_33_a9q:

    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
    vpush       {d8 - d15}
    ldr         r4,[sp,#nt_offset]          @loads nt
    ldr         r5,[sp,#mode_offset]        @loads mode
    ldr         r6,gai4_ihevc_ang_table_addr @loads word32 gai4_ihevc_ang_table[35]
ulbl1:
    add         r6,r6,pc

    lsl         r7,r4,#1                    @two_nt

    add         r8,r6,r5,lsl #2             @*gai4_ihevc_ang_table[mode]
    ldr         r9,[r8]                     @intra_pred_ang = gai4_ihevc_ang_table[mode]
    ldr         r1,gau1_ihevc_planar_factor_addr @used for ((row + 1) * intra_pred_ang) row values
ulbl2:
    add         r1,r1,pc
    add         r6,r1,#1

    tst         r4,#7
    add         r8,r0,r7                    @pu1_ref + two_nt
    mov         lr,#0                       @row
    mov         r12,r4
    bne         core_loop_4

core_loop_8:
    add         r8,r8,#1                    @pu1_ref_main_idx += (two_nt + 1)
    vdup.8      d0,r9                       @intra_pred_ang
    mov         r12,r4,lsr #3               @divide by 8

    vmov.i8     d1,#32
    mul         r7,r4,r12

    vmov.i16    q3,#31
    @lsl            r12,r3,#3

    mov         r1,r8
    @sub            r12,r12,r4
    mov         r5,r4
    mov         r11,#1

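@ the 8x8 path below is software pipelined: the prologue computes fract and
@ idx for eight rows in one shot (row values come from gau1_ihevc_planar_factor)
@ and starts their loads and multiplies, kernel_8_rows keeps eight rows
@ (i)-(viii) in flight per iteration, and the epilogue drains the last results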
prologue:
    vld1.8      {d3},[r6]                   @loads the row value
    vmull.u8    q1,d3,d0                    @pos = ((row + 1) * intra_pred_ang)
    vand        q2,q1,q3                    @dup_const_fract(fract = pos & (31))
    vmovn.i16   d4,q2
    vshrn.u16   d5,q1,#5                    @idx = pos >> 5

    vdup.8      d31,d4[0]
    add         r0,r2,r3

    vmov.u32    lr,d5[0]                    @(i row)extract idx to the r register

    vdup.8      d29,d4[1]                   @(ii)
    and         r9,lr,#0xff                 @(i row) get the last byte

    add         r10,r8,r9                   @(i row)*pu1_ref[ref_main_idx]

    asr         lr,lr,#8                    @(ii)shift by 8
    vld1.8      {d8},[r10],r11              @(i row)ref_main_idx
    and         r9,lr,#0xff                 @(ii)get the last byte

    asr         lr,lr,#8                    @(iii)
    vld1.8      {d9},[r10]                  @(i row)ref_main_idx_1
    add         r12,r8,r9                   @(ii)*pu1_ref[ref_main_idx]

    and         r9,lr,#0xff                 @(iii)
    vsub.u8     d30,d1,d31                  @32-fract(dup_const_32_fract)
    add         r10,r8,r9                   @(iii)*pu1_ref[ref_main_idx]

    vld1.8      {d12},[r12],r11             @(ii)ref_main_idx
    vmull.u8    q5,d8,d30                   @(i row)vmull_u8(ref_main_idx, dup_const_32_fract)

    vld1.8      {d13},[r12]                 @(ii)ref_main_idx_1
    vmlal.u8    q5,d9,d31                   @(i row)vmull_u8(ref_main_idx_1, dup_const_fract)
    asr         lr,lr,#8                    @(iv)

    vdup.8      d27,d4[2]                   @(iii)
    vsub.u8     d28,d1,d29                  @(ii)32-fract(dup_const_32_fract)
    and         r9,lr,#0xff                 @(iv)

    vdup.8      d25,d4[3]                   @(iv)
    vmull.u8    q7,d12,d28                  @(ii)vmull_u8(ref_main_idx, dup_const_32_fract)
    add         r12,r8,r9                   @(iv)*pu1_ref[ref_main_idx]

    vld1.8      {d16},[r10],r11             @(iii)ref_main_idx
    vmlal.u8    q7,d13,d29                  @(ii)vmull_u8(ref_main_idx_1, dup_const_fract)

    vld1.8      {d17},[r10]                 @(iii)ref_main_idx_1
    vrshrn.i16  d10,q5,#5                   @(i row)shift_res = vrshrn_n_u16(add_res, 5)

    vld1.8      {d20},[r12],r11             @(iv)ref_main_idx
    vsub.u8     d26,d1,d27                  @(iii)32-fract(dup_const_32_fract)

    vld1.8      {d21},[r12]                 @(iv)ref_main_idx_1

    vdup.8      d31,d4[4]                   @(v)
    vmull.u8    q9,d16,d26                  @(iii)vmull_u8(ref_main_idx, dup_const_32_fract)

    vmov.u32    lr,d5[1]                    @extract idx to the r register
    vmlal.u8    q9,d17,d27                  @(iii)vmull_u8(ref_main_idx_1, dup_const_fract)

    vst1.8      {d10},[r2]!                 @(i row)
    vrshrn.i16  d14,q7,#5                   @(ii)shift_res = vrshrn_n_u16(add_res, 5)

    and         r9,lr,#0xff                 @(v)
    vdup.8      d29,d4[5]                   @(vi)
    add         r10,r8,r9                   @(v)*pu1_ref[ref_main_idx]

    vld1.8      {d8},[r10],r11              @(v)ref_main_idx
    vsub.u8     d24,d1,d25                  @(iv)32-fract(dup_const_32_fract)

    asr         lr,lr,#8                    @(vi)
    vmull.u8    q11,d20,d24                 @(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
    and         r9,lr,#0xff                 @(vi)

    vld1.8      {d9},[r10]                  @(v)ref_main_idx_1
    vmlal.u8    q11,d21,d25                 @(iv)vmull_u8(ref_main_idx_1, dup_const_fract)

    vst1.8      {d14},[r0],r3               @(ii)
    vrshrn.i16  d18,q9,#5                   @(iii)shift_res = vrshrn_n_u16(add_res, 5)

    add         r12,r8,r9                   @(vi)*pu1_ref[ref_main_idx]
    vdup.8      d27,d4[6]                   @(vii)
    asr         lr,lr,#8                    @(vii)

    and         r9,lr,#0xff                 @(vii)
    vsub.u8     d30,d1,d31                  @(v)32-fract(dup_const_32_fract)
    add         r10,r8,r9                   @(vii)*pu1_ref[ref_main_idx]

    vld1.8      {d12},[r12],r11             @(vi)ref_main_idx
    vmull.u8    q5,d8,d30                   @(v)vmull_u8(ref_main_idx, dup_const_32_fract)

    vld1.8      {d13},[r12]                 @(vi)ref_main_idx_1
    vmlal.u8    q5,d9,d31                   @(v)vmull_u8(ref_main_idx_1, dup_const_fract)

    vst1.8      {d18},[r0],r3               @(iii)
    vrshrn.i16  d22,q11,#5                  @(iv)shift_res = vrshrn_n_u16(add_res, 5)

    asr         lr,lr,#8                    @(viii)
    vdup.8      d25,d4[7]                   @(viii)
    and         r9,lr,#0xff                 @(viii)

    vld1.8      {d16},[r10],r11             @(vii)ref_main_idx
    vsub.u8     d28,d1,d29                  @(vi)32-fract(dup_const_32_fract)

    vld1.8      {d17},[r10]                 @(vii)ref_main_idx_1
    vmull.u8    q7,d12,d28                  @(vi)vmull_u8(ref_main_idx, dup_const_32_fract)

    add         r12,r8,r9                   @(viii)*pu1_ref[ref_main_idx]
    vmlal.u8    q7,d13,d29                  @(vi)vmull_u8(ref_main_idx_1, dup_const_fract)
    subs        r4,r4,#8

    vst1.8      {d22},[r0],r3               @(iv)
    vrshrn.i16  d10,q5,#5                   @(v)shift_res = vrshrn_n_u16(add_res, 5)

    vld1.8      {d20},[r12],r11             @(viii)ref_main_idx
    vsub.u8     d26,d1,d27                  @(vii)32-fract(dup_const_32_fract)

    vld1.8      {d21},[r12]                 @(viii)ref_main_idx_1
    vmull.u8    q9,d16,d26                  @(vii)vmull_u8(ref_main_idx, dup_const_32_fract)

    addgt       r8,r8,#8
    vmlal.u8    q9,d17,d27                  @(vii)vmull_u8(ref_main_idx_1, dup_const_fract)
    subgt       r7,r7,#8

    vst1.8      {d10},[r0],r3               @(v)
    vrshrn.i16  d14,q7,#5                   @(vi)shift_res = vrshrn_n_u16(add_res, 5)

    beq         epilogue

    vld1.8      {d5},[r6]                   @loads the row value
    vmull.u8    q1,d5,d0                    @pos = ((row + 1) * intra_pred_ang)
    vand        q2,q1,q3                    @dup_const_fract(fract = pos & (31))
    vmovn.i16   d4,q2
    vshrn.u16   d3,q1,#5                    @idx = pos >> 5
    vmov.u32    lr,d3[0]                    @(i)extract idx to the r register
    and         r9,lr,#0xff                 @(i)
    add         r10,r8,r9                   @(i)*pu1_ref[ref_main_idx]

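@ steady state: each iteration stores the rows still pending from the previous
@ batch while issuing loads and interpolation for rows (i)-(viii) of the next
@ one; r7 is decremented by 8 per 8x8 block and the loop exits when it hits 0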
kernel_8_rows:
    asr         lr,lr,#8                    @(ii)
    vdup.8      d31,d4[0]
    subs        r4,r4,#8

    vld1.8      {d8},[r10],r11              @(i)ref_main_idx
    vsub.u8     d24,d1,d25                  @(viii)32-fract(dup_const_32_fract)
    and         r9,lr,#0xff                 @(ii)
    addle       r6,r6,#8                    @increment the row value

    vld1.8      {d9},[r10]                  @(i)ref_main_idx_1
    vmull.u8    q11,d20,d24                 @(viii)vmull_u8(ref_main_idx, dup_const_32_fract)
    add         r12,r8,r9                   @(ii)*pu1_ref[ref_main_idx]

    vld1.8      {d5},[r6]                   @loads the row value
    vmlal.u8    q11,d21,d25                 @(viii)vmull_u8(ref_main_idx_1, dup_const_fract)
    asr         lr,lr,#8                    @(iii)

    vdup.8      d29,d4[1]                   @(ii)
    vrshrn.i16  d18,q9,#5                   @(vii)shift_res = vrshrn_n_u16(add_res, 5)
    and         r9,lr,#0xff                 @(iii)

    vst1.8      {d14},[r0],r3               @(vi)
    vsub.u8     d30,d1,d31                  @(i)32-fract(dup_const_32_fract)
    add         r10,r8,r9                   @(iii)*pu1_ref[ref_main_idx]

    vld1.8      {d12},[r12],r11             @(ii)ref_main_idx
    vmull.u8    q5,d8,d30                   @(i)vmull_u8(ref_main_idx, dup_const_32_fract)
    asr         lr,lr,#8                    @(iv)

    vld1.8      {d13},[r12]                 @(ii)ref_main_idx_1
    vmlal.u8    q5,d9,d31                   @(i)vmull_u8(ref_main_idx_1, dup_const_fract)
    and         r9,lr,#0xff                 @(iv)

    vmov.u32    lr,d3[1]                    @extract idx to the r register
    vrshrn.i16  d22,q11,#5                  @(viii)shift_res = vrshrn_n_u16(add_res, 5)

    vdup.8      d27,d4[2]                   @(iii)
    vsub.u8     d28,d1,d29                  @(ii)32-fract(dup_const_32_fract)
    movle       r4,r5                       @reload nt

    vld1.8      {d16},[r10],r11             @(iii)ref_main_idx
    vmull.u8    q7,d12,d28                  @(ii)vmull_u8(ref_main_idx, dup_const_32_fract)
    add         r12,r8,r9                   @(iv)*pu1_ref[ref_main_idx]

    vst1.8      {d18},[r0],r3               @(vii)
    vmlal.u8    q7,d13,d29                  @(ii)vmull_u8(ref_main_idx_1, dup_const_fract)

    vld1.8      {d17},[r10]                 @(iii)ref_main_idx_1
    vrshrn.i16  d10,q5,#5                   @(i)shift_res = vrshrn_n_u16(add_res, 5)

    vdup.8      d25,d4[3]                   @(iv)
    vmull.u8    q1,d5,d0                    @pos = ((row + 1) * intra_pred_ang)

    vst1.8      {d22},[r0]                  @(viii)
    vsub.u8     d26,d1,d27                  @(iii)32-fract(dup_const_32_fract)

    vld1.8      {d20},[r12],r11             @(iv)ref_main_idx
    vmull.u8    q9,d16,d26                  @(iii)vmull_u8(ref_main_idx, dup_const_32_fract)
    add         r0,r2,r3

    vld1.8      {d21},[r12]                 @(iv)ref_main_idx_1
    vmlal.u8    q9,d17,d27                  @(iii)vmull_u8(ref_main_idx_1, dup_const_fract)
    and         r9,lr,#0xff                 @(v)

    vdup.8      d31,d4[4]                   @(v)
    vrshrn.i16  d14,q7,#5                   @(ii)shift_res = vrshrn_n_u16(add_res, 5)
    add         r10,r8,r9                   @(v)*pu1_ref[ref_main_idx]

    vst1.8      {d10},[r2]!                 @(i)
    vsub.u8     d24,d1,d25                  @(iv)32-fract(dup_const_32_fract)
    asr         lr,lr,#8                    @(vi)

    vdup.8      d29,d4[5]                   @(vi)
    vmull.u8    q11,d20,d24                 @(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
    and         r9,lr,#0xff                 @(vi)

    vdup.8      d27,d4[6]                   @(vii)
    vmlal.u8    q11,d21,d25                 @(iv)vmull_u8(ref_main_idx_1, dup_const_fract)
    add         r12,r8,r9                   @(vi)*pu1_ref[ref_main_idx]

    vdup.8      d25,d4[7]                   @(viii)
    vrshrn.i16  d18,q9,#5                   @(iii)shift_res = vrshrn_n_u16(add_res, 5)
    asr         lr,lr,#8                    @(vii)

    vld1.8      {d8},[r10],r11              @(v)ref_main_idx
    vand        q2,q1,q3                    @dup_const_fract(fract = pos & (31))
    and         r9,lr,#0xff                 @(vii)

    vld1.8      {d9},[r10]                  @(v)ref_main_idx_1
    vshrn.u16   d3,q1,#5                    @idx = pos >> 5
    asr         lr,lr,#8                    @(viii)

    vst1.8      {d14},[r0],r3               @(ii)
    vrshrn.i16  d22,q11,#5                  @(iv)shift_res = vrshrn_n_u16(add_res, 5)
    add         r10,r8,r9                   @(vii)*pu1_ref[ref_main_idx]

    vld1.8      {d12},[r12],r11             @(vi)ref_main_idx
    vsub.u8     d30,d1,d31                  @(v)32-fract(dup_const_32_fract)
    and         r9,lr,#0xff                 @(viii)

    vld1.8      {d13},[r12]                 @(vi)ref_main_idx_1
    vmull.u8    q5,d8,d30                   @(v)vmull_u8(ref_main_idx, dup_const_32_fract)

    vmov.u32    lr,d3[0]                    @(i)extract idx to the r register
    vmlal.u8    q5,d9,d31                   @(v)vmull_u8(ref_main_idx_1, dup_const_fract)
    add         r12,r8,r9                   @(viii)*pu1_ref[ref_main_idx]

    vld1.8      {d16},[r10],r11             @(vii)ref_main_idx
    vsub.u8     d28,d1,d29                  @(vi)32-fract(dup_const_32_fract)

    vst1.8      {d18},[r0],r3               @(iii)
    vmull.u8    q7,d12,d28                  @(vi)vmull_u8(ref_main_idx, dup_const_32_fract)
    movle       r8,r1                       @reload the source pointer (pu1_ref + two_nt + 1)

    vld1.8      {d17},[r10]                 @(vii)ref_main_idx_1
    vmlal.u8    q7,d13,d29                  @(vi)vmull_u8(ref_main_idx_1, dup_const_fract)
    addgt       r8,r8,#8                    @increment the source next set 8 columns in same row

    vld1.8      {d20},[r12],r11             @(viii)ref_main_idx
    vrshrn.i16  d10,q5,#5                   @(v)shift_res = vrshrn_n_u16(add_res, 5)

    vld1.8      {d21},[r12]                 @(viii)ref_main_idx_1
    vsub.u8     d26,d1,d27                  @(vii)32-fract(dup_const_32_fract)
    lslle       r12,r3,#3

    vst1.8      {d22},[r0],r3               @(iv)
    vmull.u8    q9,d16,d26                  @(vii)vmull_u8(ref_main_idx, dup_const_32_fract)
    suble       r12,r12,r5

    vst1.8      {d10},[r0],r3               @(v)
    vmlal.u8    q9,d17,d27                  @(vii)vmull_u8(ref_main_idx_1, dup_const_fract)
    addle       r2,r2,r12                   @increment the dst pointer to 8*dst_strd - nt

    vmovn.i16   d4,q2
    vrshrn.i16  d14,q7,#5                   @(vi)shift_res = vrshrn_n_u16(add_res, 5)
    and         r9,lr,#0xff                 @(i)

    subs        r7,r7,#8
    add         r10,r8,r9                   @(i)*pu1_ref[ref_main_idx]

    bne         kernel_8_rows

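@ drain the pipeline: rows (vi)-(viii) of the final batch are still in flight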
epilogue:
    vst1.8      {d14},[r0],r3               @(vi)
    vrshrn.i16  d18,q9,#5                   @(vii)shift_res = vrshrn_n_u16(add_res, 5)

    vsub.u8     d24,d1,d25                  @(viii)32-fract(dup_const_32_fract)
    vmull.u8    q11,d20,d24                 @(viii)vmull_u8(ref_main_idx, dup_const_32_fract)
    vmlal.u8    q11,d21,d25                 @(viii)vmull_u8(ref_main_idx_1, dup_const_fract)

    vst1.8      {d18},[r0],r3               @(vii)
    vrshrn.i16  d22,q11,#5                  @(viii)shift_res = vrshrn_n_u16(add_res, 5)

    vst1.8      {d22},[r0],r3               @(viii)
    b           end_loops

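@ 4-pixel path, taken when nt is not a multiple of 8 (i.e. nt == 4): four rows
@ of four pixels, fully unrolled, with scalar fract/idx bookkeeping per row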
core_loop_4:
    add         r10,r8,#1                   @pu1_ref_main_idx += (two_nt + 1)
    add         r11,r8,#2                   @pu1_ref_main_idx_1 += (two_nt + 2)
    mov         r8,#0                       @r8 is reused as the row counter

    add         r5,r8,#1                    @row + 1
    mul         r5,r5,r9                    @pos = ((row + 1) * intra_pred_ang)
    and         r5,r5,#31                   @fract = pos & (31)
    cmp         lr,r5                       @if(fract_prev > fract)
    addgt       r10,r10,#1                  @pu1_ref_main_idx += 1
    add         r11,r10,#1                  @pu1_ref_main_idx_1 += 1
    vdup.8      d0,r5                       @dup_const_fract
    rsb         r4,r5,#32
    vdup.8      d1,r4                       @dup_const_32_fract

@inner_loop_4
    vld1.32     {d2[0]},[r10]               @ref_main_idx
    add         r8,r8,#1
    mov         lr,r5                       @fract_prev = fract

    vld1.32     {d3[0]},[r11]               @ref_main_idx_1
    add         r5,r8,#1                    @row + 1
    mul         r5,r5,r9                    @pos = ((row + 1) * intra_pred_ang)
    and         r5,r5,#31                   @fract = pos & (31)
    cmp         lr,r5                       @if(fract_prev > fract)
    addgt       r10,r10,#1                  @pu1_ref_main_idx += 1
    add         r11,r10,#1                  @pu1_ref_main_idx_1 += 1

    vdup.8      d6,r5                       @dup_const_fract
    vmull.u8    q2,d2,d1                    @vmull_u8(ref_main_idx, dup_const_32_fract)

    rsb         r4,r5,#32
    vdup.8      d7,r4                       @dup_const_32_fract
    vmlal.u8    q2,d3,d0                    @vmull_u8(ref_main_idx_1, dup_const_fract)

    vld1.32     {d8[0]},[r10]               @ref_main_idx
    add         r8,r8,#1

    vld1.32     {d9[0]},[r11]               @ref_main_idx_1
    vrshrn.i16  d4,q2,#5                    @shift_res = vrshrn_n_u16(add_res, 5)

    mov         lr,r5                       @fract_prev = fract
    add         r5,r8,#1                    @row + 1
    mul         r5,r5,r9                    @pos = ((row + 1) * intra_pred_ang)
    and         r5,r5,#31                   @fract = pos & (31)
    cmp         lr,r5                       @if(fract_prev > fract)
    addgt       r10,r10,#1                  @pu1_ref_main_idx += 1
    add         r11,r10,#1                  @pu1_ref_main_idx_1 += 1

    vdup.8      d12,r5                      @dup_const_fract
    vmull.u8    q5,d8,d7                    @vmull_u8(ref_main_idx, dup_const_32_fract)

    rsb         r4,r5,#32
    vdup.8      d13,r4                      @dup_const_32_fract
    vmlal.u8    q5,d9,d6                    @vmull_u8(ref_main_idx_1, dup_const_fract)

    vld1.32     {d14[0]},[r10]              @ref_main_idx
    add         r8,r8,#1

    vst1.32     {d4[0]},[r2],r3
    vrshrn.i16  d10,q5,#5                   @shift_res = vrshrn_n_u16(add_res, 5)

    vld1.32     {d15[0]},[r11]              @ref_main_idx_1
    mov         lr,r5                       @fract_prev = fract
    add         r5,r8,#1                    @row + 1
    mul         r5,r5,r9                    @pos = ((row + 1) * intra_pred_ang)
    and         r5,r5,#31                   @fract = pos & (31)
    cmp         lr,r5                       @if(fract_prev > fract)
    addgt       r10,r10,#1                  @pu1_ref_main_idx += 1
    add         r11,r10,#1                  @pu1_ref_main_idx_1 += 1

    vdup.8      d18,r5                      @dup_const_fract
    vmull.u8    q8,d14,d13                  @vmull_u8(ref_main_idx, dup_const_32_fract)

    rsb         r4,r5,#32
    vdup.8      d19,r4                      @dup_const_32_fract
    vmlal.u8    q8,d15,d12                  @vmull_u8(ref_main_idx_1, dup_const_fract)

    vld1.32     {d20[0]},[r10]              @ref_main_idx

    vst1.32     {d10[0]},[r2],r3
    vrshrn.i16  d16,q8,#5                   @shift_res = vrshrn_n_u16(add_res, 5)
    vld1.32     {d21[0]},[r11]              @ref_main_idx_1

    vmull.u8    q11,d20,d19                 @vmull_u8(ref_main_idx, dup_const_32_fract)
    vmlal.u8    q11,d21,d18                 @vmull_u8(ref_main_idx_1, dup_const_fract)

    vst1.32     {d16[0]},[r2],r3
    vrshrn.i16  d22,q11,#5                  @shift_res = vrshrn_n_u16(add_res, 5)

    vst1.32     {d22[0]},[r2],r3

end_loops:
    vpop        {d8 - d15}
    ldmfd       sp!,{r4-r12,r15}            @restore the registers and return (pc is popped)