@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_intra_pred_chroma_dc_neon.s
@*
@* @brief
@*  contains function definitions for chroma intra prediction dc filtering.
@*  functions are coded in arm neon assembly using gnu assembler syntax.
@*
@* @author
@*  yogeswaran rs
@*
@* @par list of functions:
@*  - ihevc_intra_pred_chroma_dc_a9q()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*    chroma intra prediction filter for dc input
@*
@* @par description:
@*    pu1_ref and pu1_dst are treated as arrays of byte pairs (two interleaved
@*    components, one byte each - presumably cb/cr, confirm against caller).
@*    for each of the two components independently the routine computes
@*
@*        dc = (sum_a + sum_b + nt) >> (32 - clz(nt))
@*
@*    where sum_a adds the nt pairs starting at pu1_ref + 2*nt and sum_b adds
@*    the nt pairs starting at pu1_ref + 4*nt + 2 (presumably the left and the
@*    top neighbours of the block). for power-of-two nt the shift equals
@*    log2(nt) + 1, i.e. a rounded division by 2*nt. the resulting pair is
@*    written to every position of the nt x nt destination block.
@*
@* @param[in] pu1_ref  (r0)
@*  uword8 pointer to the interleaved reference samples
@*
@* @param[in] src_strd  (r1)
@*  integer source stride; not used by this routine (r1 is reused as scratch)
@*
@* @param[out] pu1_dst  (r2)
@*  uword8 pointer to the destination block
@*
@* @param[in] dst_strd  (r3)
@*  integer destination stride in bytes
@*
@* @param[in] nt  (stack)
@*  size of transform block; the code paths below cover nt = 4, 8 and 16 only
@*  (the summation and store sequences are unrolled and never loop back)
@*
@* @param[in] mode  (stack)
@*  type of filtering; never read by this routine
@*
@* @returns
@*  none
@*
@* @remarks
@*  arm mode, gnu assembler syntax. r4-r12 and lr are saved on entry and
@*  restored on exit. neon registers written: d0, d2, d3, d16-d18, d26-d31
@*  (none of d8-d15). the nt = 4 path loads 16 bytes from each reference
@*  position although only the first 4 pairs contribute to the sums.
@*
@*******************************************************************************
@*/

@void ihevc_intra_pred_chroma_dc(uword8 *pu1_ref,
@                                word32 src_strd,
@                                uword8 *pu1_dst,
@                                word32 dst_strd,
@                                word32 nt,
@                                word32 mode)
@
@**************variables vs registers*****************************************
@r0  => *pu1_ref
@r1  => src_strd (unused, later scratch)
@r2  => *pu1_dst (row 0 pointer)
@r3  => dst_strd
@r4  => nt
@r12 => shift amount, 32 - clz(nt)
@d17 => running sum of the first byte of each pair  (d16 = its dc when storing)
@d18 => running sum of the second byte of each pair (d17 = its dc when storing)

@stack contents after the register push, from sp + nt_offset
@   nt
@   mode (not read)

.equ    nt_offset,      40                  @ 10 registers pushed * 4 bytes

.text
.align 4

.globl ihevc_intra_pred_chroma_dc_a9q

.type ihevc_intra_pred_chroma_dc_a9q, %function

ihevc_intra_pred_chroma_dc_a9q:

    stmfd       sp!, {r4-r12, r14}          @ save r4-r12 and lr (40 bytes)

    ldr         r4, [sp, #nt_offset]        @ r4 = nt (first stack argument)
    mov         r9, #0
    vmov        d17, r9, r9                 @ d17 = 0, first-byte accumulator

    clz         r5, r4                      @ r5 = number of leading zeros in nt

    add         r6, r0, r4, lsl #1          @ r6 = pu1_ref + 2*nt
    vmov        d18, r9, r9                 @ d18 = 0, second-byte accumulator
    rsb         r5, r5, #32                 @ r5 = 32 - clz(nt) = log2(nt) + 1
    add         r7, r0, r4, lsl #2          @ r7 = pu1_ref + 4*nt
    mov         r12, r5                     @ r12 = final shift amount
    add         r8, r7, #2                  @ r8 = pu1_ref + 4*nt + 2

    cmp         r4, #4
    beq         dc_4                        @ nt = 4 has its own path


@ first 8 pairs from each reference position (all of them when nt = 8)
add_loop:
    vld2.s8     {d30,d31}, [r6]!            @ 8 pairs at r6: d30 = first bytes, d31 = second bytes
    lsl         r10, r4, #1                 @ r10 = 2*nt

    vpaddl.u8   d2, d30                     @ 8 x u8 -> 4 x u16 partial sums (first bytes)
    subs        r10, #0x10                  @ r10 = 2*nt - 16, Z set when nt = 8;
                                            @ flags stay live until the beq below

    vld2.s8     {d26,d27}, [r8]!            @ 8 pairs at r8: d26 = first bytes, d27 = second bytes

    vpaddl.u8   d3, d31                     @ 4 x u16 partial sums (second bytes)
    vpaddl.u16  d2, d2                      @ 4 x u16 -> 2 x u32
    vpaddl.u16  d3, d3

    vpadal.u32  d17, d2                     @ d17 += both u32 lanes (first bytes)

    vpadal.u32  d18, d3                     @ d18 += both u32 lanes (second bytes)

    vpaddl.u8   d2, d26                     @ same reduction for the block loaded from r8
    vpaddl.u8   d3, d27

    vpaddl.u16  d2, d2
    vpaddl.u16  d3, d3

    vpadal.u32  d17, d2
    vpadal.u32  d18, d3

    beq         epil_add_loop               @ nt = 8: sums are complete

@ second 8 pairs from each reference position (nt = 16); falls through, no loop back
core_loop_add:
    vld2.s8     {d30,d31}, [r6]!            @ next 8 pairs at r6
    vpaddl.u8   d28, d30
    vpaddl.u8   d3, d31

    vld2.s8     {d26,d27}, [r8]!            @ next 8 pairs at r8

    vpaddl.u16  d3, d3
    vpaddl.u16  d29, d28

    vpadal.u32  d18, d3                     @ second bytes
    vpadal.u32  d17, d29                    @ first bytes

    vpaddl.u8   d3, d27
    vpaddl.u8   d28, d26

    vpaddl.u16  d3, d3
    vpaddl.u16  d29, d28

    vpadal.u32  d18, d3
    vpadal.u32  d17, d29


epil_add_loop:

    vmov.32     r1, d18[0]                  @ r1  = second-byte sum (low 32 bits)
    vmov.32     r11, d17[0]                 @ r11 = first-byte sum

    add         r1, r1, r4                  @ + nt before the shift (rounding term)
    add         r11, r11, r4

    lsr         r1, r1, r12                 @ >> (32 - clz(nt))
    lsr         r11, r11, r12

    vdup.8      d17, r1                     @ d17 = second-byte dc in all 8 lanes
    vdup.8      d16, r11                    @ d16 = first-byte dc in all 8 lanes
                                            @ each vst2.8 {d16,d17} below writes 8 pairs (16 bytes)

prologue_cpy_32:

    add         r5, r2, r3                  @ r5  = row 1
    subs        r9, r4, #8                  @ Z set when nt = 8; flags stay live until movne below
    lsl         r6, r3, #2                  @ r6  = 4 * dst_strd
    moveq       r11, r6                     @ nt = 8: epilogue steps down 4 rows
    add         r8, r5, r3                  @ r8  = row 2
    add         r10, r8, r3                 @ r10 = row 3

    beq         epilogue_copy               @ nt = 8: rows are 16 bytes wide, epilogue writes all 8

    @ nt = 16: rows are 32 bytes wide, written as two 16-byte halves
    vst2.8      {d16,d17}, [r2]!            @ rows 0-3, first half
    sub         r6, r6, #16                 @ r6 = 4 * dst_strd - 16: from the second half of a
                                            @ row to the start of the row 4 lines further down

    vst2.8      {d16,d17}, [r5]!
    vst2.8      {d16,d17}, [r8]!
    movne       r11, #16                    @ nt = 16: epilogue steps to the second half of the row
    vst2.8      {d16,d17}, [r10]!


    vst2.8      {d16,d17}, [r2], r6         @ rows 0-3, second half; pointers move to rows 4-7
    vst2.8      {d16,d17}, [r5], r6
    vst2.8      {d16,d17}, [r8], r6
    vst2.8      {d16,d17}, [r10], r6

kernel_copy:
    vst2.8      {d16,d17}, [r2]!            @ rows 4-7, first half
    vst2.8      {d16,d17}, [r5]!
    vst2.8      {d16,d17}, [r8]!
    vst2.8      {d16,d17}, [r10]!

    vst2.8      {d16,d17}, [r2], r6         @ rows 4-7, second half; pointers move to rows 8-11
    vst2.8      {d16,d17}, [r5], r6
    vst2.8      {d16,d17}, [r8], r6
    vst2.8      {d16,d17}, [r10], r6

    vst2.8      {d16,d17}, [r2]!            @ rows 8-11, first half
    vst2.8      {d16,d17}, [r5]!
    vst2.8      {d16,d17}, [r8]!
    vst2.8      {d16,d17}, [r10]!

    vst2.8      {d16,d17}, [r2], r6         @ rows 8-11, second half; pointers move to rows 12-15
    vst2.8      {d16,d17}, [r5], r6
    vst2.8      {d16,d17}, [r8], r6
    vst2.8      {d16,d17}, [r10], r6

@ nt = 8 : r11 = 4 * dst_strd -> writes rows 0-3, then rows 4-7
@ nt = 16: r11 = 16           -> writes both halves of rows 12-15
epilogue_copy:
    vst2.8      {d16,d17}, [r2], r11
    vst2.8      {d16,d17}, [r5], r11
    vst2.8      {d16,d17}, [r8], r11
    vst2.8      {d16,d17}, [r10], r11

    vst2.8      {d16,d17}, [r2]
    vst2.8      {d16,d17}, [r5]
    vst2.8      {d16,d17}, [r8]
    vst2.8      {d16,d17}, [r10]
    b           end_func

@ nt = 4: only the first 4 pairs of each 8-pair load may contribute
dc_4:
    vld2.s8     {d30,d31}, [r6]             @ 8 pairs at pu1_ref + 2*nt
    vshl.i64    d3, d30, #32                @ first bytes: keep the 4 low bytes, discard the rest

    vld2.s8     {d26,d27}, [r8]             @ 8 pairs at pu1_ref + 4*nt + 2
    vshl.i64    d2, d31, #32                @ second bytes: keep the 4 low bytes

    vpaddl.u8   d3, d3
    vpaddl.u8   d2, d2
    vpaddl.u16  d3, d3
    vpaddl.u16  d2, d2
    vpadal.u32  d17, d3                     @ d17 += first-byte sum
    vpadal.u32  d18, d2                     @ d18 += second-byte sum

    vshl.i64    d3, d26, #32                @ same masking and reduction for the r8 block
    vshl.i64    d2, d27, #32
    vpaddl.u8   d3, d3
    vpaddl.u8   d2, d2
    vpaddl.u16  d3, d3
    vpaddl.u16  d2, d2
    vpadal.u32  d17, d3
    vpadal.u32  d18, d2

    vmov.32     r10, d17[0]                 @ r10 = first-byte sum
    vmov.32     r11, d18[0]                 @ r11 = second-byte sum

    add         r10, r10, r4                @ + nt before the shift (rounding term)
    add         r11, r11, r4
    lsr         r10, r10, r12               @ >> (32 - clz(nt))
    lsr         r11, r11, r12
    orr         r10, r10, r11, lsl #8       @ pack the pair: first dc in bits 0-7, second in bits 8-15
    vdup.16     d0, r10                     @ d0 = 4 copies of the pair (one 8-byte row)

    vst1.8      {d0}, [r2], r3              @ rows 0-2, stepping by dst_strd
    vst1.8      {d0}, [r2], r3
    vst1.8      {d0}, [r2], r3
    vst1.8      {d0}, [r2]                  @ row 3

end_func:
    ldmfd       sp!, {r4-r12, r15}          @ restore r4-r12 and return (saved lr is loaded into pc)