@ NOTE(review): removed code-viewer navigation chrome ("Home / Line# / Scopes /
@ Navigate / Raw / Download") that was captured with this scrape - not part of
@ the source file.
@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_inter_pred_chroma_copy_w16out_neon.s
@*
@* @brief
@*  contains function definitions for inter prediction interpolation.
@* functions are coded using neon intrinsics and can be compiled using
@* rvct
@*
@* @author
@*  yogeswaran rs
@*
@* @par list of functions:
@*
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*   chroma inter prediction filter for copy
@*
@* @par description:
@*    copies the array of width 'wd' and height 'ht' from the location pointed
@*    by 'src' to the location pointed by 'dst', widening each 8-bit sample to
@*    16 bits shifted left by 6
@*
@* @param[in] pu1_src
@*  uword8 pointer to the source
@*
@* @param[out] pi2_dst
@*  word16 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] pi1_coeff
@*  word8 pointer to the filter coefficients (unused in the copy case)
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_inter_pred_chroma_copy_w16out(uword8 *pu1_src,
@                                           word16 *pi2_dst,
@                                           word32 src_strd,
@                                           word32 dst_strd,
@                                           word8 *pi1_coeff,
@                                           word32 ht,
@                                           word32 wd)
@**************variables vs registers*****************************************
@r0 => *pu1_src
@r1 => *pi2_dst
@r2 =>  src_strd
@r3 =>  dst_strd
@r4 => *pi1_coeff
@r5 =>  ht
@r6 =>  wd

@ Stack-argument offsets: 5th/6th/7th C args sit at [sp,#0/4/8] on entry;
@ after stmfd (10 regs = 40 bytes) + vpush d8-d15 (64 bytes) they move up by 104.
.equ    coeff_offset,   104
.equ    ht_offset,      108
.equ    wd_offset,      112


.text
.align 4




.globl ihevc_inter_pred_chroma_copy_w16out_a9q

.type ihevc_inter_pred_chroma_copy_w16out_a9q, %function

@ Copies a wd x ht chroma block from pu1_src (u8) to pi2_dst (s16), with each
@ sample widened to 16 bits and shifted left by 6 (the "w16out" intermediate
@ format).  Two paths: a 4-byte-wide column loop for narrow/odd sizes, and a
@ software-pipelined 8-byte-wide loop (prolog / steady state / epilog) for
@ sizes where 2*wd is a multiple of 8.  pi1_coeff is ignored (pure copy).
ihevc_inter_pred_chroma_copy_w16out_a9q:

    stmfd       sp!, {r4-r12, r14}          @save callee-saved core registers + lr (40 bytes)
    vpush        {d8 - d15}                 @save callee-saved NEON registers (64 bytes)

    ldr         r12,[sp,#wd_offset]                @loads wd
    lsl         r12,r12,#1                  @r12 = 2*wd = bytes per row (chroma samples counted per component)
    ldr         r7,[sp,#ht_offset]                 @loads ht
    cmp         r7,#0                       @ht condition(ht == 0)
    ble         end_loops                   @nothing to copy for ht <= 0
    and         r8,r7,#3                    @r8 = ht % 4: leftover rows after the 4-row tiles
    sub         r9,r7,r8                    @r9 = ht rounded down to a multiple of 4
    and         r11,r7,#6
    cmp         r11,#6                      @ht % 8 in {6,7} takes the 4-byte-wide path
    beq         loop_ht_6
    tst         r12,#7                      @2*wd not a multiple of 8 also takes the 4-byte path
    beq         core_loop_wd_8

loop_ht_6:
    sub         r11,r12,#4                  @r11 = 2*wd - 4: rewind from end of row back to next tile start
    lsls        r6,r3,#1                    @r6 = dst stride in bytes (dst elements are 16-bit)
    cmp         r9,#0
    beq         outer_loop_wd_4_ht_2        @no full 4-row tile: only the short-height tail

@ 4 bytes (2 chroma pairs) x 4 rows per inner iteration
outer_loop_wd_4:
    subs        r4,r12,#0                   @r4 = bytes remaining in the current row
    ble         end_inner_loop_wd_4

inner_loop_wd_4:
    vld1.8      {d0},[r0]                   @vld1_u8(pu1_src_tmp)  row 0
    add         r5,r0,r2                    @pu1_src +src_strd (row 1 pointer)
    vmovl.u8    q0,d0                       @vmovl_u8(vld1_u8(pu1_src_tmp)
    add         r10,r1,r6                   @dst pointer for row 1
    subs        r4,r4,#4                    @wd - 4
    vshl.i64    q0,q0,#6                    @vshlq_n_s64(temp, 6): inputs are u8 (<=255), so <<6 stays
                                            @within each 16-bit lane - no bleed despite the .i64 shift
    vld1.8      {d22},[r5],r2               @vld1_u8(pu1_src_tmp)  row 1
    add         r0,r0,#4                    @pu1_src += 4
    vst1.64     {d0},[r1]                   @vst1q_lane_s64(pi2_dst_tmp, temp, 0): 4 shorts of row 0
    add         r1,r1,#8
    vmovl.u8    q11,d22                     @vmovl_u8(vld1_u8(pu1_src_tmp)
    vld1.8      {d24},[r5],r2               @vld1_u8(pu1_src_tmp)  row 2
    vshl.i64    q11,q11,#6                  @vshlq_n_s64(temp, 6)
    vmovl.u8    q12,d24                     @vmovl_u8(vld1_u8(pu1_src_tmp)
    vst1.64     {d22},[r10],r6              @vst1q_lane_s64(pi2_dst_tmp, temp, 0): row 1
    vshl.i64    q12,q12,#6                  @vshlq_n_s64(temp, 6)
    vld1.8      {d26},[r5],r2               @vld1_u8(pu1_src_tmp)  row 3
    vst1.64     {d24},[r10],r6              @vst1q_lane_s64(pi2_dst_tmp, temp, 0): row 2
    vmovl.u8    q13,d26                     @vmovl_u8(vld1_u8(pu1_src_tmp)
    vshl.i64    q13,q13,#6                  @vshlq_n_s64(temp, 6)
    vst1.64     {d26},[r10],r6              @vst1q_lane_s64(pi2_dst_tmp, temp, 0): row 3
    bgt         inner_loop_wd_4

end_inner_loop_wd_4:
    subs        r9,r9,#4                    @ht - 4
    sub         r0,r5,r11                   @rewind src to the start of the next 4-row tile
    sub         r1,r10,r11,lsl #1           @rewind dst likewise (<<1: 16-bit elements)
    bgt         outer_loop_wd_4
    cmp         r8,#0
    bgt         outer_loop_wd_4_ht_2        @handle the ht % 4 leftover rows


end_loops:
    vpop         {d8 - d15}                 @restore callee-saved NEON registers
    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp and return (pc <- saved lr)


@ tail: 4 bytes wide, 2 rows stored per inner iteration (ht % 4 leftover)
outer_loop_wd_4_ht_2:
    subs        r4,r12,#0                   @wd conditional subtract
    ble         end_inner_loop_wd_4

inner_loop_wd_4_ht_2:
    vld1.8      {d0},[r0]                   @vld1_u8(pu1_src_tmp)  row 0
    add         r5,r0,r2                    @pu1_src +src_strd
    vmovl.u8    q0,d0                       @vmovl_u8(vld1_u8(pu1_src_tmp)
    add         r10,r1,r6
    subs        r4,r4,#4                    @wd - 4
    vshl.i64    q0,q0,#6                    @vshlq_n_s64(temp, 6)
    vld1.8      {d22},[r5],r2               @vld1_u8(pu1_src_tmp)  row 1
    add         r0,r0,#4                    @pu1_src += 4
    vst1.64     {d0},[r1]                   @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
    add         r1,r1,#8
    vmovl.u8    q11,d22                     @vmovl_u8(vld1_u8(pu1_src_tmp)
    vld1.8      {d24},[r5],r2               @NOTE(review): third-row load/widen below is never
                                            @stored - appears to be dead work; confirm intent
    vshl.i64    q11,q11,#6                  @vshlq_n_s64(temp, 6)
    vmovl.u8    q12,d24                     @vmovl_u8(vld1_u8(pu1_src_tmp)
    vst1.64     {d22},[r10],r6              @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
    bgt         inner_loop_wd_4_ht_2
    b           end_loops


@ main path: 8 bytes wide x 4 rows per tile, software-pipelined
@ (loads for iteration i+1 issued while stores for iteration i retire)
core_loop_wd_8:
    @sub            r11,r12,#8
    lsls        r5,r3,#1                    @r5 = dst stride in bytes
    rsb         r11,r12,r3, lsl #2          @ r11 = (dst_strd * 4) - width
    rsb         r8,r12,r2,lsl #2            @r2->src_strd: r8 = (src_strd * 4) - width, src tile step
    mov         r4,r12, lsr #3              @ divide by 8: 8-byte groups per row
    mov         r7,r9
    mul         r7, r4                      @r7 = groups * rounded ht = 4 * number of tiles
    sub         r4,r12,#0                   @wd conditional check
    sub         r7,r7,#4                    @subtract one for epilog
    cmp         r9,#0
    beq         core_loop_wd_8_ht_2         @ht < 4: 2-row-only variant

prolog:
    add         r6,r0,r2                    @pu1_src_tmp += src_strd
    add         r10,r1,r5
    vld1.8      {d8},[r0]!                  @vld1_u8(pu1_src_tmp)  row 0
    vld1.8      {d10},[r6],r2               @vld1_u8(pu1_src_tmp)  row 1
    vld1.8      {d12},[r6],r2               @vld1_u8(pu1_src_tmp)  row 2
    vld1.8      {d14},[r6],r2               @vld1_u8(pu1_src_tmp)  row 3
    vmovl.u8    q8,d8                       @vmovl_u8(vld1_u8(pu1_src_tmp))
    vmovl.u8    q9,d10                      @vmovl_u8(vld1_u8(pu1_src_tmp)
    vmovl.u8    q10,d12                     @vmovl_u8(vld1_u8(pu1_src_tmp)
    vmovl.u8    q11,d14                     @vmovl_u8(vld1_u8(pu1_src_tmp)
    subs        r4,r4,#8                    @wd decrements by 8
    vshl.i16    q0,q8,#6                    @vshlq_n_s16(tmp, 6)
    vshl.i16    q1,q9,#6                    @vshlq_n_s16(tmp, 6)
    vshl.i16    q2,q10,#6                   @vshlq_n_s16(tmp, 6)
    vshl.i16    q3,q11,#6                   @vshlq_n_s16(tmp, 6)
    addle       r0,r0,r8                    @row done: step src down to the next 4-row tile
    add         r6,r0,r2                    @pu1_src_tmp += src_strd
    vld1.8      {d8},[r0]!                  @preload next tile's 4 rows (pipeline fill)
    vld1.8      {d10},[r6],r2               @vld1_u8(pu1_src_tmp)
    vld1.8      {d12},[r6],r2               @vld1_u8(pu1_src_tmp)
    vld1.8      {d14},[r6],r2               @vld1_u8(pu1_src_tmp)

    vst1.16     {d0,d1},[r1]!               @vst1q_s16(pi2_dst_tmp, tmp)  row 0
    addle       r1,r1,r11,lsl #1            @row done: step dst down to the next 4-row tile (bytes)
    suble       r4,r12,#0                   @row done: reset width counter

    subs        r7,r7,#4                    @ht - 4

    blt         epilog_end                  @jumps to epilog_end
    beq         epilog                      @jumps to epilog



@ steady state: store previous tile (rows 1-3 + next row 0) while loading the next
outer_loop_wd_8:

    vst1.16     {d2,d3},[r10],r5            @vst1q_s16(pi2_dst_tmp, tmp)  row 1
    vmovl.u8    q8,d8                       @vmovl_u8(vld1_u8(pu1_src_tmp))

    vst1.16     {d4,d5},[r10],r5            @vst1q_s16(pi2_dst_tmp, tmp)  row 2
    vmovl.u8    q9,d10                      @vmovl_u8(vld1_u8(pu1_src_tmp)

    vst1.16     {d6,d7},[r10],r5            @vst1q_s16(pi2_dst_tmp, tmp)  row 3
    vmovl.u8    q10,d12                     @vmovl_u8(vld1_u8(pu1_src_tmp)

    vmovl.u8    q11,d14                     @vmovl_u8(vld1_u8(pu1_src_tmp)

    subs        r4,r4,#8                    @wd decrements by 8
    addle       r0,r0,r8                    @row done: step src to next tile

    add         r6,r0,r2                    @pu1_src_tmp += src_strd

    vld1.8      {d8},[r0]!                  @vld1_u8(pu1_src_tmp)
    vshl.i16    q0,q8,#6                    @vshlq_n_s16(tmp, 6)

    vld1.8      {d10},[r6],r2               @vld1_u8(pu1_src_tmp)
    vshl.i16    q1,q9,#6                    @vshlq_n_s16(tmp, 6)

    vld1.8      {d12},[r6],r2               @vld1_u8(pu1_src_tmp)
    vshl.i16    q2,q10,#6                   @vshlq_n_s16(tmp, 6)

    vld1.8      {d14},[r6],r2               @vld1_u8(pu1_src_tmp)
    add         r10,r1,r5

    vshl.i16    q3,q11,#6                   @vshlq_n_s16(tmp, 6)

    vst1.16     {d0,d1},[r1]!               @vst1q_s16(pi2_dst_tmp, tmp)  row 0

    addle       r1,r1,r11,lsl #1            @row done: step dst to next tile
    suble       r4,r12,#0                   @wd conditional check

    subs        r7,r7,#4                    @ht - 4
    bgt         outer_loop_wd_8

@ pipeline drain: one tile's worth of data is still in flight
epilog:
    vst1.16     {d2,d3},[r10],r5            @vst1q_s16(pi2_dst_tmp, tmp)
    vmovl.u8    q8,d8                       @vmovl_u8(vld1_u8(pu1_src_tmp))

    vst1.16     {d4,d5},[r10],r5            @vst1q_s16(pi2_dst_tmp, tmp)
    vmovl.u8    q9,d10                      @vmovl_u8(vld1_u8(pu1_src_tmp)

    vst1.16     {d6,d7},[r10],r5            @vst1q_s16(pi2_dst_tmp, tmp)
    vmovl.u8    q10,d12                     @vmovl_u8(vld1_u8(pu1_src_tmp)

    vmovl.u8    q11,d14                     @vmovl_u8(vld1_u8(pu1_src_tmp)
    @add        r6,r0,r2                @pu1_src_tmp += src_strd

    vshl.i16    q0,q8,#6                    @vshlq_n_s16(tmp, 6)
    vshl.i16    q1,q9,#6                    @vshlq_n_s16(tmp, 6)
    vshl.i16    q2,q10,#6                   @vshlq_n_s16(tmp, 6)
    add         r10,r1,r5
    vshl.i16    q3,q11,#6                   @vshlq_n_s16(tmp, 6)

    vst1.16     {d0,d1},[r1]!               @vst1q_s16(pi2_dst_tmp, tmp)
epilog_end:
    vst1.16     {d2,d3},[r10],r5            @vst1q_s16(pi2_dst_tmp, tmp)
    vst1.16     {d4,d5},[r10],r5            @vst1q_s16(pi2_dst_tmp, tmp)
    vst1.16     {d6,d7},[r10],r5            @vst1q_s16(pi2_dst_tmp, tmp)
    b           end_loops

@ 8 bytes wide x 2 rows: taken when ht has no full 4-row tile (ht == 2)
core_loop_wd_8_ht_2:
    add         r6,r0,r2                    @pu1_src_tmp += src_strd
    add         r10,r1,r5
    vld1.8      {d8},[r0]!                  @vld1_u8(pu1_src_tmp)  row 0
    vld1.8      {d10},[r6],r2               @vld1_u8(pu1_src_tmp)  row 1
    vmovl.u8    q8,d8                       @vmovl_u8(vld1_u8(pu1_src_tmp))
    vmovl.u8    q9,d10                      @vmovl_u8(vld1_u8(pu1_src_tmp)
    subs        r12,r12,#8                  @wd decrements by 8
    vshl.i16    q0,q8,#6                    @vshlq_n_s16(tmp, 6)
    vshl.i16    q1,q9,#6                    @vshlq_n_s16(tmp, 6)
    vst1.16     {d0,d1},[r1]!               @vst1q_s16(pi2_dst_tmp, tmp)
    vst1.16     {d2,d3},[r10],r5            @vst1q_s16(pi2_dst_tmp, tmp)
    bgt         core_loop_wd_8_ht_2

    vpop         {d8 - d15}                 @restore callee-saved NEON registers
    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp and return (pc <- saved lr)



