@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* ,:file
@*  ihevc_sao_edge_offset_class2.s
@*
@* ,:brief
@*  Contains function definitions for sample adaptive offset (SAO) edge
@* offset, class 2 (135 degree diagonal). Functions are coded using NEON
@* instructions and can be compiled using ARM RVCT
@*
@* ,:author
@*  Parthiban V
@*
@* ,:par List of Functions:
@*
@*
@* ,:remarks
@*  None
@*
@*******************************************************************************
@*/
@void ihevc_sao_edge_offset_class2(UWORD8 *pu1_src,
@                              WORD32 src_strd,
@                              UWORD8 *pu1_src_left,
@                              UWORD8 *pu1_src_top,
@                              UWORD8 *pu1_src_top_left,
@                              UWORD8 *pu1_src_top_right,
@                              UWORD8 *pu1_src_bot_left,
@                              UWORD8 *pu1_avail,
@                              WORD8 *pi1_sao_offset,
@                              WORD32 wd,
@                              WORD32 ht)
@**************Variables Vs Registers*****************************************
@r0 =>  *pu1_src
@r1 =>  src_strd
@r2 =>  *pu1_src_left
@r3 =>  *pu1_src_top
@r4 =>  *pu1_src_top_left
@r5 =>  *pu1_avail
@r6 =>  *pi1_sao_offset
@r7 =>  wd
@r8 =>  ht
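@
@ Editor's note: the scalar reference below is an illustrative sketch, not
@ part of the original source. It shows the per-pixel class 2 (135 degree)
@ edge-offset operation that the NEON code vectorises, assuming 8-bit pixels
@ and the usual five-entry remap table gi1_table_edge_idx = {1, 2, 0, 3, 4}:
@
@     for(row = 0; row < ht; row++)
@         for(col = 0; col < wd; col++)
@         {
@             WORD32 sign_up   = SIGN(pu1_src[col] - pu1_src[col - 1 - src_strd]);
@             WORD32 sign_down = SIGN(pu1_src[col] - pu1_src[col + 1 + src_strd]);
@             WORD32 edge_idx  = gi1_table_edge_idx[2 + sign_up + sign_down];
@             if(0 != edge_idx)
@                 pu1_src[col] = CLIP3(pu1_src[col] + pi1_sao_offset[edge_idx],
@                                      0, (1 << bit_depth) - 1);
@         }
@
@ sign_down of one row, negated and shifted by one lane, is reused as the
@ sign_up of the next row, so each row costs only one new diagonal compare.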

.text
.syntax unified
.p2align 2

.extern gi1_table_edge_idx
.globl ihevc_sao_edge_offset_class2_a9q

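@ PC-relative offsets to the shared edge-index table; the extra -8 in each
@ offset accounts for the ARM-state PC bias (PC reads as the current
@ instruction address + 8) at the "add rX,rX,pc" following each ulblN label.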
gi1_table_edge_idx_addr_1:
.long gi1_table_edge_idx - ulbl1 - 8

gi1_table_edge_idx_addr_2:
.long gi1_table_edge_idx - ulbl2 - 8

gi1_table_edge_idx_addr_3:
.long gi1_table_edge_idx - ulbl3 - 8

ihevc_sao_edge_offset_class2_a9q:


    STMFD       sp!,{r4-r12,r14}            @stack stores the values of the arguments
    LDR         r7,[sp,#0x3C]               @Loads wd

    LDR         r8,[sp,#0x40]               @Loads ht
    SUB         r9,r7,#1                    @wd - 1

    LDR         r4,[sp,#0x28]               @Loads pu1_src_top_left
    LDRB        r10,[r3,r9]                 @pu1_src_top[wd - 1]

    STR         r0,[sp,#0x2C]               @Store pu1_src in sp
    MOV         r9,r7                       @Move width to r9 for loop count

    STR         r2,[sp,#0x30]               @Store pu1_src_left in sp
    LDR         r5,[sp,#0x34]               @Loads pu1_avail
    LDR         r6,[sp,#0x38]               @Loads pi1_sao_offset
    STR         r3,[sp,#0x38]               @Store pu1_src_top in sp

    SUB         sp,sp,#0x94                 @Decrement the stack pointer to store some temp arr values

    STRB        r10,[sp]                    @u1_src_top_left_tmp = pu1_src_top[wd - 1]
    SUB         r10,r8,#1                   @ht-1
    MLA         r11,r10,r1,r0               @pu1_src[(ht - 1) * src_strd + col]
    ADD         r12,sp,#0x02                @temp array

AU1_SRC_TOP_LOOP:
    VLD1.8      D0,[r11]!                   @pu1_src[(ht - 1) * src_strd + col]
    SUBS        r9,r9,#8                    @Decrement the loop count by 8
    VST1.8      D0,[r12]!                   @au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col]
    BNE         AU1_SRC_TOP_LOOP
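@ The bottom row of the block is now saved in au1_src_top_tmp on the stack;
@ after filtering it is copied back to pu1_src_top (see SRC_TOP_LOOP) so the
@ block below sees unfiltered top neighbours.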

PU1_AVAIL_4_LOOP:
    LDRB        r10,[r5,#4]                 @pu1_avail[4]
    CMP         r10,#0
    LDRB        r9,[r0]                     @u1_pos_0_0_tmp = pu1_src[0]
    BEQ         PU1_AVAIL_7_LOOP

    LDRB        r11,[r4]                    @pu1_src_top_left[0]
    ADD         r14,r0,r1                   @pu1_src + src_strd

    SUBS        r12,r9,r11                  @pu1_src[0] - pu1_src_top_left[0]
    LDRB        r4,[r14,#1]                 @pu1_src[1 + src_strd]

    MVNLT       r12,#0
    MOVGT       r12,#1                      @SIGN(pu1_src[0] - pu1_src_top_left[0])

    LDR         r14, gi1_table_edge_idx_addr_1 @table pointer
ulbl1:
    add         r14,r14,pc
    SUBS        r11,r9,r4                   @pu1_src[0] - pu1_src[1 + src_strd]

    MVNLT       r11,#0
    MOVGT       r11,#1                      @SIGN(pu1_src[0] - pu1_src[1 + src_strd])
    ADD         r4,r12,r11                  @SIGN(pu1_src[0] - pu1_src_top_left[0]) + SIGN(pu1_src[0] - pu1_src[1 + src_strd])
    ADD         r4,r4,#2                    @edge_idx

    LDRSB       r12,[r14,r4]                @edge_idx = gi1_table_edge_idx[edge_idx]
    CMP         r12,#0                      @0 != edge_idx
    BEQ         PU1_AVAIL_7_LOOP
    LDRSB       r10,[r6,r12]                @pi1_sao_offset[edge_idx]
    ADD         r9,r9,r10                   @pu1_src[0] + pi1_sao_offset[edge_idx]
    USAT        r9,#8,r9                    @u1_pos_0_0_tmp = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
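@ r9 now holds the filtered value of pixel (0, 0); it is written back only at
@ the end (RE_ASSINING_LOOP) so the vector loops keep reading the original
@ pixel values.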

PU1_AVAIL_7_LOOP:
    LDRB        r14,[r5,#7]                 @pu1_avail[7]
    CMP         r14,#0
    SUB         r10,r7,#1                   @wd - 1
    SUB         r11,r8,#1                   @ht - 1
    MLA         r12,r11,r1,r10              @wd - 1 + (ht - 1) * src_strd
    ADD         r12,r12,r0                  @pu1_src[wd - 1 + (ht - 1) * src_strd]
    LDRB        r10,[r12]                   @u1_pos_wd_ht_tmp = pu1_src[wd - 1 + (ht - 1) * src_strd]
    BEQ         PU1_AVAIL

    SUB         r4,r12,r1                   @pu1_src[(wd - 1 + (ht - 1) * src_strd) - src_strd]
    LDRB        r11,[r4,#-1]                @Load pu1_src[wd - 1 + (ht - 1) * src_strd - 1 - src_strd]
    ADD         r14,r12,r1                  @pu1_src[(wd - 1 + (ht - 1) * src_strd) + src_strd]

    SUBS        r11,r10,r11                 @pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 1 - src_strd]
    LDRB        r4,[r14,#1]                 @Load pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd]

    MVNLT       r11,#0
    MOVGT       r11,#1                      @SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 1 - src_strd])

    SUBS        r4,r10,r4                   @pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd]
    MVNLT       r4,#0
    MOVGT       r4,#1                       @SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd])

    ADD         r11,r11,r4                  @Add the 2 sign values
    ADD         r11,r11,#2                  @edge_idx
    LDR         r14, gi1_table_edge_idx_addr_2 @table pointer
ulbl2:
    add         r14,r14,pc

    LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
    CMP         r12,#0
    BEQ         PU1_AVAIL
    LDRSB       r11,[r6,r12]                @pi1_sao_offset[edge_idx]
    ADD         r10,r10,r11                 @pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
    USAT        r10,#8,r10                  @u1_pos_wd_ht_tmp = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)

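@ r10 likewise holds the filtered value of the bottom-right pixel
@ (wd - 1, ht - 1), deferred to RE_ASSINING_LOOP.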
PU1_AVAIL:
    MOV         r12,r8                      @Move ht
    VMOV.I8     Q0,#2                       @const_2 = vdupq_n_s8(2)
    LDRB        r11,[r5,#3]                 @pu1_avail[3]

    MOV         r14,r2                      @Move pu1_src_left to pu1_src_left_cpy
    VMOV.I16    Q1,#0                       @const_min_clip = vdupq_n_s16(0)
    CMP         r11,#0

    LDRB        r5,[r5,#2]                  @pu1_avail[2]
    VMOV.I16    Q2,#255                     @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
    SUBEQ       r12,r12,#1                  @ht_tmp--

    CMP         r5,#0
    VLD1.8      D7,[r6]                     @offset_tbl = vld1_s8(pi1_sao_offset)
    LDR         r11, gi1_table_edge_idx_addr_3 @table pointer
ulbl3:
    add         r11,r11,pc

    ADDEQ       r0,r0,r1                    @pu1_src += src_strd
    VLD1.8      D6,[r11]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    SUBEQ       r12,r12,#1                  @ht_tmp--

    MOV         r6,r7                       @move wd to r6 loop_count
    VMOV.S8     Q4,#0xFF                    @au1_mask = vdupq_n_s8(-1)
    ADDEQ       r14,r14,#1                  @pu1_src_left_cpy += 1

    STR         r0,[sp,#0x90]               @Store pu1_src in sp
    CMP         r7,#16                      @Compare wd with 16

    BLT         WIDTH_RESIDUE               @If wd < 16, jump to WIDTH_RESIDUE, where the loop is unrolled for the 8-pixel case
    CMP         r8,#4                       @Compare ht with 4
    BLE         WD_16_HT_4_LOOP             @If ht <= 4, jump to WD_16_HT_4_LOOP
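@ Three paths from here: WIDTH_LOOP_16 filters 16-pixel-wide strips of tall
@ blocks, WD_16_HT_4_LOOP filters 16-pixel-wide strips when ht <= 4, and
@ WIDTH_RESIDUE filters a final 8-pixel-wide strip.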

WIDTH_LOOP_16:
    LDR         r7,[sp,#0xD0]               @Loads wd

    LDR         r5,[sp,#0xC8]               @Loads pu1_avail
    CMP         r6,r7                       @col == wd
    LDRBEQ      r8,[r5]                     @pu1_avail[0]
    MOVNE       r8,#-1                      @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)

    VMOV.8      d8[0],r8                    @au1_mask = vsetq_lane_s8((-1||pu1_avail[0]), au1_mask, 0)
    CMP         r6,#16                      @if(col == 16)
    BNE         SKIP_AU1_MASK_VAL
    LDRB        r8,[r5,#1]                  @pu1_avail[1]
    VMOV.8      d9[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)

SKIP_AU1_MASK_VAL:
    LDRB        r11,[r5,#2]                 @pu1_avail[2]
    CMP         r11,#0

    SUBEQ       r8,r0,r1                    @pu1_src - src_strd
    MOVNE       r8,r3                       @pu1_src_top_cpy
    SUB         r8,r8,#1                    @pu1_src_top_cpy - 1 || pu1_src - src_strd - 1

    LDR         r7,[sp,#0xD0]               @Loads wd
    VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1)
    VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1)
    SUB         r8,#8
    ADD         r3,r3,#16

    ADD         r5,sp,#0x42                 @*au1_src_left_tmp
    VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
    VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
    SUB         r0,#8
    LDR         r4,[sp,#0xD4]               @Loads ht

    SUB         r7,r7,r6                    @(wd - col)
    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
    LDR         r8,[sp,#0xC0]               @Loads *pu1_src

    ADD         r7,r7,#15                   @15 + (wd - col)
    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + 15 + (wd - col)]

    SUB         r5,r5,#1
    VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

AU1_SRC_LEFT_LOOP:
    LDRB        r8,[r7],r1                  @load the value and increment by src_strd
    STRB        r8,[r5,#1]!                 @store it in the stack pointer
    SUBS        r4,r4,#1                    @decrement the loop count
    BNE         AU1_SRC_LEFT_LOOP
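@ The unfiltered rightmost column of this strip has been saved to
@ au1_src_left_tmp; it is copied back to pu1_src_left at SRC_LEFT_LOOP for
@ use when the neighbouring block to the right is filtered.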

    ADD         r8,r0,r1                    @I Iteration *pu1_src + src_strd
    VMOV.I8     Q9,#0
    LDR         r4,[sp,#0xC8]               @I Loads pu1_avail

    MOV         r7,r12                      @row count, move ht_tmp to r7
    VLD1.8      D16,[r8]!                   @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r8]                    @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r8,#8
    LDRB        r4,[r4,#2]                  @I pu1_avail[2]

    LDRB        r5,[r8,#16]                 @I pu1_src_cpy[src_strd + 16]
    VMOV.8      D18[0],r5                   @I pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)

    VEXT.8      Q9,Q8,Q9,#1                 @I pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
    CMP         r4,#0                       @I
    BNE         SIGN_UP_CHANGE_DONE         @I

SIGN_UP_CHANGE:
    SUB         r2,r12,r7                   @I ht_tmp - row
    LDRB        r11,[r0]                    @I pu1_src_cpy[0]
    ADD         r2,r14,r2                   @I pu1_src_left_cpy[ht_tmp - row]

    LDRB        r5,[r2,#-1]                 @I load the value
    SUBS        r4,r11,r5                   @I pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
    MVNLT       r4,#0                       @I
    MOVGT       r4,#1                       @I SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
    VMOV.8      D14[0],r4                   @I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)

SIGN_UP_CHANGE_DONE:
    VCGT.U8     Q5,Q6,Q9                    @I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    VADD.I8     Q12,Q0,Q7                   @I edge_idx = vaddq_s8(const_2, sign_up)

    VCLT.U8     Q9,Q6,Q9                    @I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VSUB.U8     Q5,Q9,Q5                    @I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8     Q12,Q12,Q5                  @I edge_idx = vaddq_s8(edge_idx, sign_down)
    VTBL.8      D18,{D6},D24                @I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VTBL.8      D19,{D6},D25                @I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    VAND        Q9,Q9,Q4                    @I edge_idx = vandq_s8(edge_idx, au1_mask)

    VNEG.S8     Q7,Q5                       @I sign_up = vnegq_s8(sign_down)
    VTBL.8      D10,{D7},D18                @I offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VEXT.8      Q7,Q7,Q7,#15                @I sign_up = vextq_s8(sign_up, sign_up, 15)

    VMOVL.U8    Q10,D12                     @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VTBL.8      D11,{D7},D19                @I offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    VADDW.S8    Q10,Q10,D10                 @I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    VMAX.S16    Q10,Q10,Q1                  @I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMOVL.U8    Q11,D13                     @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))

    VMIN.U16    Q10,Q10,Q2                  @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    VMOV        Q6,Q8                       @I pu1_cur_row = pu1_next_row

    VADDW.S8    Q11,Q11,D11                 @I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    VMOVN.I16   D20,Q10                     @I vmovn_s16(pi2_tmp_cur_row.val[0])

    VMAX.S16    Q11,Q11,Q1                  @I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    SUB         r7,r7,#1                    @I Decrement the ht_tmp loop count by 1

    VMIN.U16    Q11,Q11,Q2                  @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    VMOVN.I16   D21,Q11                     @I vmovn_s16(pi2_tmp_cur_row.val[1])

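@ Main row loop: software-pipelined to process two rows (II and III) per
@ iteration, interleaving the loads, sign computation and table lookups of
@ the two rows to hide instruction latencies; each row's store is issued at
@ the top of the following iteration.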
PU1_SRC_LOOP:

    VST1.8      {Q10},[r0],r1               @I vst1q_u8(pu1_src_cpy, pu1_cur_row)
    ADD         r8,r0,r1                    @II iteration *pu1_src + src_strd

    VLD1.8      D16,[r8]!                   @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r8]                    @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r8,#8
    ADD         r11,r8,r1                   @III iteration *pu1_src + src_strd

    LDRB        r5,[r8,#16]                 @II pu1_src_cpy[src_strd + 16]
    VLD1.8      D30,[r11]!                  @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D31,[r11]                   @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r11,#8
    LDRB        r4,[r0]                     @II pu1_src_cpy[0]

    LDRB        r8,[r11,#16]                @III pu1_src_cpy[src_strd + 16]
    VMOV.8      D28[0],r5                   @II pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)

    SUB         r5,r12,r7                   @II ht_tmp - row
    VEXT.8      Q11,Q8,Q14,#1               @II pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
    ADD         r5,r14,r5                   @II pu1_src_left_cpy[ht_tmp - row]

    LDRB        r5,[r5,#-1]                 @II load the value
    VMOV.8      D18[0],r8                   @III pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    SUB         r7,r7,#1                    @II Decrement the ht_tmp loop count by 1

    SUBS        r4,r4,r5                    @II pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
    VEXT.8      Q9,Q15,Q9,#1                @III pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
    LDRB        r2,[r0,r1]                  @III pu1_src_cpy[0]

    VCGT.U8     Q12,Q6,Q11                  @II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    SUB         r5,r12,r7                   @III ht_tmp - row

    MVNLT       r4,#0                       @II
    VCLT.U8     Q11,Q6,Q11                  @II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    ADD         r5,r14,r5                   @III pu1_src_left_cpy[ht_tmp - row]

    MOVGT       r4,#1                       @II SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
    VSUB.U8     Q12,Q11,Q12                 @II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    LDRB        r5,[r5,#-1]                 @III load the value

    SUBS        r2,r2,r5                    @III pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
    VMOV.8      D14[0],r4                   @II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)

    MVNLT       r2,#0                       @III
    VCGT.U8     Q5,Q8,Q9                    @III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    MOVGT       r2,#1                       @III SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])

    VADD.I8     Q11,Q0,Q7                   @II edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8     Q11,Q11,Q12                 @II edge_idx = vaddq_s8(edge_idx, sign_down)

    VCLT.U8     Q9,Q8,Q9                    @III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VTBL.8      D22,{D6},D22                @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VNEG.S8     Q7,Q12                      @II sign_up = vnegq_s8(sign_down)

    VSUB.U8     Q5,Q9,Q5                    @III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    VTBL.8      D23,{D6},D23                @II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VEXT.8      Q7,Q7,Q7,#15                @II sign_up = vextq_s8(sign_up, sign_up, 15)

    VAND        Q11,Q11,Q4                  @II edge_idx = vandq_s8(edge_idx, au1_mask)
    VMOV.8      D14[0],r2                   @III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)

    VADD.I8     Q9,Q0,Q7                    @III edge_idx = vaddq_s8(const_2, sign_up)
    VTBL.8      D24,{D7},D22                @II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VADD.I8     Q9,Q9,Q5                    @III edge_idx = vaddq_s8(edge_idx, sign_down)

    VMOVL.U8    Q13,D12                     @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VTBL.8      D18,{D6},D18                @III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VNEG.S8     Q7,Q5                       @III sign_up = vnegq_s8(sign_down)

    VADDW.S8    Q13,Q13,D24                 @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VTBL.8      D19,{D6},D19                @III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VEXT.8      Q7,Q7,Q7,#15                @III sign_up = vextq_s8(sign_up, sign_up, 15)

    VAND        Q9,Q9,Q4                    @III edge_idx = vandq_s8(edge_idx, au1_mask)
    VMOVL.U8    Q10,D16                     @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))

    VMAX.S16    Q13,Q13,Q1                  @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VTBL.8      D10,{D7},D18                @III offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VADDW.S8    Q10,Q10,D10                 @III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    VMIN.U16    Q13,Q13,Q2                  @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    VTBL.8      D25,{D7},D23                @II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    VMAX.S16    Q10,Q10,Q1                  @III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)

    VMOVL.U8    Q14,D13                     @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VMIN.U16    Q10,Q10,Q2                  @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VADDW.S8    Q14,Q14,D25                 @II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    VTBL.8      D11,{D7},D19                @III offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    VMAX.S16    Q14,Q14,Q1                  @II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)

    VMIN.U16    Q14,Q14,Q2                  @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    VMOVL.U8    Q9,D17                      @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))

    VMOV        Q6,Q15                      @III pu1_cur_row = pu1_next_row
    VMOVN.I16   D26,Q13                     @II vmovn_s16(pi2_tmp_cur_row.val[0])

    VMOVN.I16   D27,Q14                     @II vmovn_s16(pi2_tmp_cur_row.val[1])
    VADDW.S8    Q9,Q9,D11                   @III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)

    VMAX.S16    Q9,Q9,Q1                    @III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    VMOVN.I16   D20,Q10                     @III vmovn_s16(pi2_tmp_cur_row.val[0])

    SUB         r7,r7,#1                    @III Decrement the ht_tmp loop count by 1
    VMIN.U16    Q9,Q9,Q2                    @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    CMP         r7,#1                       @III

    VST1.8      {Q13},[r0],r1               @II vst1q_u8(pu1_src_cpy, pu1_cur_row)
    VMOVN.I16   D21,Q9                      @III vmovn_s16(pi2_tmp_cur_row.val[1])

    BGT         PU1_SRC_LOOP                @III If more than one row remains, loop again
    BLT         INNER_LOOP_DONE

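@ Fall-through: exactly one row remains (r7 == 1); filter it outside the
@ pipelined loop.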
    VST1.8      {Q10},[r0],r1               @III vst1q_u8(pu1_src_cpy, pu1_cur_row)
    ADD         r8,r0,r1                    @*pu1_src + src_strd

    LDRB        r2,[r0]                     @pu1_src_cpy[0]
    VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r8,#8
    LDRB        r5,[r8,#16]                 @pu1_src_cpy[src_strd + 16]

    SUB         r11,r12,r7                  @ht_tmp - row
    VMOV.8      D18[0],r5                   @pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    ADD         r11,r14,r11                 @pu1_src_left_cpy[ht_tmp - row]

    LDRB        r5,[r11,#-1]                @load the value
    VEXT.8      Q9,Q8,Q9,#1                 @pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
    SUBS        r4,r2,r5                    @pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]

    VCGT.U8     Q5,Q6,Q9                    @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    MVNLT       r4,#0

    MOVGT       r4,#1                       @SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
    VCLT.U8     Q9,Q6,Q9                    @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)

    VMOV.8      D14[0],r4                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
    VSUB.U8     Q5,Q9,Q5                    @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8     Q9,Q0,Q7                    @edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8     Q9,Q9,Q5                    @edge_idx = vaddq_s8(edge_idx, sign_down)

    VTBL.8      D18,{D6},D18                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VNEG.S8     Q7,Q5                       @sign_up = vnegq_s8(sign_down)

    VTBL.8      D19,{D6},D19                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VEXT.8      Q7,Q7,Q7,#15                @sign_up = vextq_s8(sign_up, sign_up, 15)

    VAND        Q9,Q9,Q4                    @edge_idx = vandq_s8(edge_idx, au1_mask)

    VTBL.8      D10,{D7},D18                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))

    VMOVL.U8    Q10,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VTBL.8      D11,{D7},D19                @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    VADDW.S8    Q10,Q10,D10                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    VMAX.S16    Q10,Q10,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMOVL.U8    Q6,D13                      @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))

    VMIN.U16    Q10,Q10,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    VADDW.S8    Q6,Q6,D11                   @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)

    VMAX.S16    Q6,Q6,Q1                    @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    VMOVN.I16   D20,Q10                     @vmovn_s16(pi2_tmp_cur_row.val[0])

    VMIN.U16    Q6,Q6,Q2                    @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    VMOVN.I16   D21,Q6                      @vmovn_s16(pi2_tmp_cur_row.val[1])

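@ All rows of this strip are done: store the last filtered row and copy the
@ saved left column out to pu1_src_left.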
INNER_LOOP_DONE:
    ADD         r5,sp,#0x42                 @*au1_src_left_tmp
    VST1.8      {Q10},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)
    LDR         r2,[sp,#0xC4]               @Loads *pu1_src_left

    LDR         r8,[sp,#0xD4]               @Loads ht
    SUB         r5,r5,#1

    SUB         r2,r2,#1
SRC_LEFT_LOOP:
    LDRB        r7,[r5,#1]!                 @au1_src_left_tmp[row]
    SUBS        r8,r8,#1
    STRB        r7,[r2,#1]!                 @pu1_src_left[row] = au1_src_left_tmp[row]
    BNE         SRC_LEFT_LOOP

    SUB         r6,r6,#16                   @Decrement the wd loop count by 16
    CMP         r6,#8                       @Check whether residue remains
    BLT         RE_ASSINING_LOOP            @Jump to the re-assigning loop
    LDR         r7,[sp,#0xD0]               @Loads wd
    LDR         r0,[sp,#0x90]               @Loads *pu1_src
    SUB         r7,r7,r6
    ADD         r0,r0,r7
    BGT         WIDTH_LOOP_16               @If full 16-pixel columns remain, jump to WIDTH_LOOP_16
    BEQ         WIDTH_RESIDUE               @If an 8-pixel residue remains, jump to WIDTH_RESIDUE

WD_16_HT_4_LOOP:
    LDR         r7,[sp,#0xD0]               @Loads wd
    LDR         r5,[sp,#0xC8]               @Loads pu1_avail
    CMP         r6,r7                       @col == wd
    LDRBEQ      r8,[r5]                     @pu1_avail[0]
    MOVNE       r8,#-1                      @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)

    VMOV.8      d8[0],r8                    @au1_mask = vsetq_lane_s8((-1||pu1_avail[0]), au1_mask, 0)
    CMP         r6,#16                      @if(col == 16)
    BNE         SKIP_AU1_MASK_VAL_WD_16_HT_4
    LDRB        r8,[r5,#1]                  @pu1_avail[1]
    VMOV.8      d9[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)

SKIP_AU1_MASK_VAL_WD_16_HT_4:
    LDRB        r8,[r5,#2]                  @pu1_avail[2]
    CMP         r8,#0

    SUBEQ       r8,r0,r1                    @pu1_src - src_strd
    MOVNE       r8,r3
    SUB         r8,r8,#1                    @pu1_src_top_cpy - 1 || pu1_src - src_strd - 1

    LDR         r7,[sp,#0xD0]               @Loads wd
    VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1)
    VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1)
    SUB         r8,#8
    ADD         r3,r3,#16

    ADD         r5,sp,#0x42                 @*au1_src_left_tmp
    VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
    VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
    SUB         r0,#8
    LDR         r4,[sp,#0xD4]               @Loads ht

    SUB         r7,r7,r6                    @(wd - col)
    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
    LDR         r8,[sp,#0xC0]               @Loads *pu1_src

    ADD         r7,r7,#15                   @15 + (wd - col)
    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + 15 + (wd - col)]

    SUB         r5,r5,#1
    VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

AU1_SRC_LEFT_LOOP_WD_16_HT_4:
    LDRB        r8,[r7],r1                  @load the value and increment by src_strd
    SUBS        r4,r4,#1                    @decrement the loop count
    STRB        r8,[r5,#1]!                 @store it in the stack pointer
    BNE         AU1_SRC_LEFT_LOOP_WD_16_HT_4

    VMOV.I8     Q9,#0
    MOV         r7,r12                      @row count, move ht_tmp to r7

PU1_SRC_LOOP_WD_16_HT_4:
    ADD         r8,r0,r1                    @*pu1_src + src_strd
    VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r8,#8

    LDRB        r5,[r8,#16]                 @pu1_src_cpy[src_strd + 16]
    VMOV.8      D18[0],r5                   @pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    VEXT.8      Q9,Q8,Q9,#1                 @pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)

    CMP         r7,r12
    BLT         SIGN_UP_CHANGE_WD_16_HT_4
    LDR         r5,[sp,#0xC8]               @Loads pu1_avail
    LDRB        r5,[r5,#2]                  @pu1_avail[2]
    CMP         r5,#0
    BNE         SIGN_UP_CHANGE_DONE_WD_16_HT_4

SIGN_UP_CHANGE_WD_16_HT_4:
    LDRB        r8,[r0]                     @pu1_src_cpy[0]
    SUB         r5,r12,r7                   @ht_tmp - row
    ADD         r5,r14,r5                   @pu1_src_left_cpy[ht_tmp - row]
    LDRB        r5,[r5,#-1]                 @load the value
    SUBS        r8,r8,r5                    @pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
    MVNLT       r8,#0
    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
    VMOV.8      d14[0],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)

SIGN_UP_CHANGE_DONE_WD_16_HT_4:
    VCGT.U8     Q10,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    VCLT.U8     Q11,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VSUB.U8     Q12,Q11,Q10                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)
    VTBL.8      D26,{D6},D26                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VTBL.8      D27,{D6},D27                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)

    VNEG.S8     Q7,Q12                      @sign_up = vnegq_s8(sign_down)
    VEXT.8      Q7,Q7,Q7,#15                @sign_up = vextq_s8(sign_up, sign_up, 15)

    VTBL.8      D24,{D7},D26                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VADDW.S8    Q14,Q14,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VTBL.8      D25,{D7},D27                @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    VMOVL.U8    Q15,D13                     @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VADDW.S8    Q15,Q15,D25                 @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    VMAX.S16    Q15,Q15,Q1                  @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    VMIN.U16    Q15,Q15,Q2                  @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    VMOVN.I16   D28,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])
    VMOVN.I16   D29,Q15                     @vmovn_s16(pi2_tmp_cur_row.val[1])

    VST1.8      {Q14},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)

    VMOV        Q6,Q8                       @pu1_cur_row = pu1_next_row
    SUBS        r7,r7,#1                    @Decrement the ht_tmp loop count by 1
    BNE         PU1_SRC_LOOP_WD_16_HT_4     @If not equal jump to PU1_SRC_LOOP_WD_16_HT_4

    LDR         r8,[sp,#0xD4]               @Loads ht
    ADD         r5,sp,#0x42                 @*au1_src_left_tmp
    LDR         r2,[sp,#0xC4]               @Loads *pu1_src_left
    SUB         r5,r5,#1
    SUB         r2,r2,#1

SRC_LEFT_LOOP_WD_16_HT_4:
    LDRB        r7,[r5,#1]!                 @au1_src_left_tmp[row]
    STRB        r7,[r2,#1]!                 @pu1_src_left[row] = au1_src_left_tmp[row]
    SUBS        r8,r8,#1
    BNE         SRC_LEFT_LOOP_WD_16_HT_4

    SUBS        r6,r6,#16                   @Decrement the wd loop count by 16
    BLE         RE_ASSINING_LOOP            @Jump to re-assigning loop

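@ WIDTH_RESIDUE: same algorithm for the final 8 columns; comparisons still
@ run on full Q registers, but only the low 8 filtered pixels (D30) are
@ stored per row.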
WIDTH_RESIDUE:
    LDR         r7,[sp,#0xD0]               @Loads wd
    LDR         r5,[sp,#0xC8]               @Loads pu1_avail
    CMP         r6,r7                       @wd_residue == wd
    LDRBEQ      r8,[r5]                     @pu1_avail[0]

    MOVNE       r8,#-1
    VMOV.8      d8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)

    LDRB        r8,[r5,#1]                  @pu1_avail[1]
    VMOV.8      d8[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 7)

PU1_AVAIL_2_RESIDUE:
    LDRB        r11,[r5,#2]                 @pu1_avail[2]
    VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
    VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
    SUB         r0,#8
    CMP         r11,#0

    SUBEQ       r8,r0,r1                    @pu1_src - src_strd
    MOVNE       r8,r3

    SUB         r8,r8,#1

    ADD         r5,sp,#0x42                 @*au1_src_left_tmp
    VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src_top_cpy - 1)
    VLD1.8      D11,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src_top_cpy - 1)
    LDR         r7,[sp,#0xD0]               @Loads wd

    LDR         r4,[sp,#0xD4]               @Loads ht
    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
    SUB         r7,r7,#1                    @(wd - 1)

    LDR         r8,[sp,#0xC0]               @Loads *pu1_src
    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
    SUB         r5,r5,#1

    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + (wd - 1)]
    VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))


AU1_SRC_LEFT_LOOP_RESIDUE:
    LDRB        r8,[r7],r1                  @load the value and increment by src_strd
    SUBS        r4,r4,#1                    @decrement the loop count
    STRB        r8,[r5,#1]!                 @store it in the stack pointer
    BNE         AU1_SRC_LEFT_LOOP_RESIDUE


    MOV         r7,r12                      @row count, move ht_tmp to r7

PU1_SRC_LOOP_RESIDUE:
    VMOV.I8     Q9,#0
    ADD         r8,r0,r1                    @*pu1_src + src_strd
    VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r8,#8

    LDRB        r8,[r8,#16]                 @pu1_src_cpy[src_strd + 16]
    VMOV.8      d18[0],r8                   @pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    VEXT.8      Q9,Q8,Q9,#1                 @pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)

    CMP         r7,r12
    BLT         SIGN_UP_CHANGE_RESIDUE
    LDR         r5,[sp,#0xC8]               @Loads pu1_avail
    LDRB        r5,[r5,#2]                  @pu1_avail[2]
    CMP         r5,#0
    BNE         SIGN_UP_CHANGE_DONE_RESIDUE

SIGN_UP_CHANGE_RESIDUE:
    LDRB        r8,[r0]                     @pu1_src_cpy[0]
    SUB         r5,r12,r7                   @ht_tmp - row

    ADD         r5,r14,r5
    LDRB        r5,[r5,#-1]                 @load the value
    SUBS        r8,r8,r5                    @pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
    MVNLT       r8,#0
    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
    VMOV.8      d14[0],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)

SIGN_UP_CHANGE_DONE_RESIDUE:
    VCGT.U8     Q10,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    VCLT.U8     Q11,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VSUB.U8     Q12,Q11,Q10                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)
    VTBL.8      D26,{D6},D26                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VTBL.8      D27,{D6},D27                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)

    VNEG.S8     Q7,Q12                      @sign_up = vnegq_s8(sign_down)
    VEXT.8      Q7,Q7,Q7,#15                @sign_up = vextq_s8(sign_up, sign_up, 15)

    VTBL.8      D24,{D7},D26                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VADDW.S8    Q14,Q14,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VMOVN.I16   D30,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])

    VST1.8      {D30},[r0],r1               @vst1_u8(pu1_src_cpy, pu1_cur_row)
    VMOV        Q6,Q8                       @pu1_cur_row = pu1_next_row
    SUBS        r7,r7,#1
    BNE         PU1_SRC_LOOP_RESIDUE

    LDR         r8,[sp,#0xD4]               @Loads ht
    ADD         r5,sp,#0x42                 @*au1_src_left_tmp

    LDR         r2,[sp,#0xC4]               @Loads *pu1_src_left
    SUB         r5,r5,#1

    SUB         r2,r2,#1

SRC_LEFT_LOOP_RESIDUE:
    LDRB        r7,[r5,#1]!                 @au1_src_left_tmp[row]
    SUBS        r8,r8,#1
    STRB        r7,[r2,#1]!                 @pu1_src_left[row] = au1_src_left_tmp[row]
    BNE         SRC_LEFT_LOOP_RESIDUE

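@ Write back the deferred corner pixels, then restore the saved top-left
@ pixel and the saved top row for use by the blocks below.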
RE_ASSINING_LOOP:
    LDR         r8,[sp,#0xD4]               @Loads ht
    LDR         r7,[sp,#0xD0]               @Loads wd

    LDR         r0,[sp,#0xC0]               @Loads *pu1_src
    SUB         r8,r8,#1                    @ht - 1

    MLA         r6,r8,r1,r7                 @wd + (ht - 1) * src_strd
    STRB        r9,[r0]                     @pu1_src_org[0] = u1_pos_0_0_tmp

    LDR         r4,[sp,#0xBC]               @Loads pu1_src_top_left
    ADD         r6,r0,r6                    @pu1_src[wd + (ht - 1) * src_strd]

    ADD         r12,sp,#0x02
    STRB        r10,[r6,#-1]                @pu1_src_org[wd - 1 + (ht - 1) * src_strd] = u1_pos_wd_ht_tmp

    LDRB        r11,[sp]                    @load u1_src_top_left_tmp from stack pointer
    LDR         r3,[sp,#0xCC]               @Loads pu1_src_top

    STRB        r11,[r4]                    @*pu1_src_top_left = u1_src_top_left_tmp

SRC_TOP_LOOP:
    VLD1.8      D0,[r12]!                   @au1_src_top_tmp[col]
    SUBS        r7,r7,#8                    @Decrement the width
    VST1.8      D0,[r3]!                    @pu1_src_top[col] = au1_src_top_tmp[col]
    BNE         SRC_TOP_LOOP

END_LOOPS:
    ADD         sp,sp,#0x94
    LDMFD       sp!,{r4-r12,r15}            @Restore the registers from the stack and return