• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1//******************************************************************************
2//*
3//* Copyright (C) 2015 The Android Open Source Project
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************
18//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19//*/
20
21///*
22////----------------------------------------------------------------------------
23//// File Name            : impeg2_format_conv.s
24////
25//// Description          : This file has the YUV420P to YUV420SP format
26////                        conversion implementations on neon platform.
27////
28//// Reference Document   :
29////
30//// Revision History     :
31////      Date            Author                  Detail Description
32////   ------------    ----------------    ----------------------------------
33////   Jul 07, 2008     Naveen Kumar T                Created
34////
35////-------------------------------------------------------------------------
36//*/
37
38///*
39//// ----------------------------------------------------------------------------
40//// Include Files
41//// ----------------------------------------------------------------------------
42//*/
43.set log2_16                    ,      4    // log2(16); not referenced by the code visible in this file
44.set log2_2                     ,      1    // log2(2); not referenced by the code visible in this file
45
46.text
47.include "impeg2_neon_macros.s"    // presumably supplies push_v_regs / pop_v_regs used below - not visible here
48///*
49//// ----------------------------------------------------------------------------
50//// Struct/Union Types and Define
51//// ----------------------------------------------------------------------------
52//*/
53
54///*
55//// ----------------------------------------------------------------------------
56//// Static Global Data section variables
57//// ----------------------------------------------------------------------------
58//*/
59////--------------------------- NONE --------------------------------------------
60
61///*
62//// ----------------------------------------------------------------------------
63//// Static Prototype Functions
64//// ----------------------------------------------------------------------------
65//*/
66//// -------------------------- NONE --------------------------------------------
67
68///*
69//// ----------------------------------------------------------------------------
70//// Exported functions
71//// ----------------------------------------------------------------------------
72//*/
73
74
///*****************************************************************************
//*
//*  Function Name    : impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_av8()
//*
//*  Description      : Converts an image from planar YUV420P to semi-planar
//*                     YUV420SP with the chroma bytes interleaved as U,V.
//*                     The luma plane is copied row by row unless
//*                     convert_uv_only is 1, in which case only the chroma
//*                     planes are processed.
//*
//*  Arguments        : x0          pu1_y
//*                     x1          pu1_u
//*                     x2          pu1_v
//*                     x3          pu1_dest_y
//*                     x4          pu1_dest_uv
//*                     x5          u2_height
//*                     x6          u2_width
//*                     x7          u2_stridey
//*                     sp, #80     u2_strideu
//*                     sp, #88     u2_stridev
//*                     sp, #96     u2_dest_stride_y
//*                     sp, #104    u2_dest_stride_uv
//*                     sp, #112    convert_uv_only
//*                     (stack offsets are relative to sp after the prologue,
//*                      see Stack Usage)
//*
//*  Values Returned  : None
//*
//*  Register Usage   : x0-x10, x14, x16, x20, v0, v1
//*
//*  Stack Usage      : 80 Bytes. 16 bytes are pushed here for x19/x20; the
//*                     remaining 64 are assumed to be pushed by push_v_regs,
//*                     whose definition is not visible in this file.
//*
//*  Interruptibility : Interruptible
//*
//*  Known Limitations
//*       Assumptions: Image Width:  Must be at least 16. Each luma row is
//*                     moved in 16 byte pieces and each chroma row in 8 byte
//*                     pieces per plane. A width that is not a multiple of
//*                     the piece size is finished with one extra piece that
//*                     overlaps the previous one and ends exactly at the end
//*                     of the row, so a smaller width would access memory
//*                     before the start of the row.
//*                     Image Height: Must be at least 1 (the row loops test
//*                     their counter at the bottom). Chroma rows processed
//*                     = (height + 1) / 2.
//*       NOTE(review): x5, x6 and x7 are used as full 64 bit registers
//*                     without extension; this assumes the caller passes
//*                     them with clean upper bits - confirm against the C
//*                     prototype.
//*
//*  Revision History :
//*         DD MM YYYY   Author(s)       Changes (Describe the changes made)
//*         07 06 2010   Varshita        Draft
//*         07 06 2010   Naveen Kr T     Completed
//*
//*****************************************************************************/
.global impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_av8
impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_av8:

    // Prologue: after these two pushes the first stack argument is at
    // [sp, #80] (see Stack Usage in the header).
    push_v_regs
    stp             x19, x20, [sp, #-16]!

    ldr             w14, [sp, #112]         // w14 = convert_uv_only

    cmp             w14, #1
    mov             x9, x5                  // keep height for the chroma pass (mov leaves the flags intact)
    beq             yuv420sp_uv_chroma      // convert_uv_only == 1: skip the luma copy

    // ------------------------------------------------------------------
    // Luma: straight copy, u2_width bytes per row
    // ------------------------------------------------------------------
    ldr             w8, [sp, #96]           // w8 = u2_dest_stride_y
    uxtw            x8, w8

    sub             x7, x7, x6              // x7 = source step from end of row to next row
    sub             x8, x8, x6              // x8 = destination step from end of row to next row

yuv420sp_uv_row_loop_y:
    mov             x16, x6                 // x16 = bytes left in this row

yuv420sp_uv_col_loop_y:
    prfm            pldl1keep, [x0, #128]
    ld1             {v0.8b, v1.8b}, [x0], #16
    st1             {v0.8b, v1.8b}, [x3], #16
    sub             x16, x16, #16
    cmp             x16, #15
    bgt             yuv420sp_uv_col_loop_y  // loop while at least 16 bytes remain

    cmp             x16, #0
    beq             yuv420sp_uv_row_loop__y // width was a multiple of 16

    // 1..15 bytes remain. Step both pointers back by (16 - remaining) so
    // that one more 16 byte copy ends exactly at the end of the row.
    // Example: width 162 -> the loop copied 160 bytes, the pointers are
    // moved back to byte 146 and bytes 146..161 are copied.
    sub             x20, x16, #16
    neg             x16, x20                // x16 = 16 - remaining
    sub             x0, x0, x16
    sub             x3, x3, x16

    ld1             {v0.8b, v1.8b}, [x0], #16
    st1             {v0.8b, v1.8b}, [x3], #16

yuv420sp_uv_row_loop__y:
    add             x0, x0, x7              // next source row
    add             x3, x3, x8              // next destination row
    subs            x5, x5, #1
    bgt             yuv420sp_uv_row_loop_y

    // ------------------------------------------------------------------
    // Chroma: interleave one U row and one V row into one UV row
    // ------------------------------------------------------------------
yuv420sp_uv_chroma:
    // Each source plane is advanced with its own stride. Previously the
    // value at [sp, #88] (u2_stridev) was applied to both planes.
    ldr             w7, [sp, #80]           // w7 = u2_strideu
    sxtw            x7, w7

    ldr             w10, [sp, #88]          // w10 = u2_stridev
    sxtw            x10, w10

    ldr             w8, [sp, #104]          // w8 = u2_dest_stride_uv
    sxtw            x8, w8

    add             x6, x6, #1
    bic             x6, x6, #1              // x6 = width rounded up to even = bytes per interleaved row

    add             x9, x9, #1              // height + 1, halved below

    sub             x7, x7, x6, lsr #1      // x7  = U step from end of row to next row
    sub             x10, x10, x6, lsr #1    // x10 = V step from end of row to next row

    sub             x8, x8, x6              // x8  = destination step from end of row to next row

    lsr             x6, x6, #1              // x6 = chroma samples per row
    lsr             x5, x9, #1              // x5 = chroma rows = (height + 1) / 2

yuv420sp_uv_row_loop_uv:
    mov             x16, x6                 // x16 = chroma samples left in this row

yuv420sp_uv_col_loop_uv:
    prfm            pldl1keep, [x1, #128]
    prfm            pldl1keep, [x2, #128]

    ld1             {v0.8b}, [x1], #8       // v0 = 8 U samples
    ld1             {v1.8b}, [x2], #8       // v1 = 8 V samples
    st2             {v0.8b, v1.8b}, [x4], #16   // store U0 V0 U1 V1 ...

    sub             x16, x16, #8
    cmp             x16, #7
    bgt             yuv420sp_uv_col_loop_uv // loop while at least 8 samples remain

    cmp             x16, #0
    beq             yuv420sp_uv_row_loop__uv    // chroma width was a multiple of 8

    // 1..7 samples remain. Step the sources back by (8 - remaining) and
    // the destination by twice that, so that one more 8 sample group ends
    // exactly at the end of the row.
    sub             x20, x16, #8
    neg             x16, x20                // x16 = 8 - remaining
    sub             x1, x1, x16
    sub             x2, x2, x16
    sub             x4, x4, x16, lsl #1

    ld1             {v0.8b}, [x1], #8
    ld1             {v1.8b}, [x2], #8
    st2             {v0.8b, v1.8b}, [x4], #16

yuv420sp_uv_row_loop__uv:
    add             x1, x1, x7              // next U row
    add             x2, x2, x10             // next V row
    add             x4, x4, x8              // next destination row
    subs            x5, x5, #1
    bgt             yuv420sp_uv_row_loop_uv

    // Epilogue: undo the prologue in reverse order
    ldp             x19, x20, [sp], #16
    pop_v_regs
    ret
243
244
245
246
247
///*****************************************************************************
//*
//*  Function Name    : impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_av8()
//*
//*  Description      : Converts an image from planar YUV420P to semi-planar
//*                     YUV420SP with the chroma bytes interleaved as V,U.
//*                     Same flow as
//*                     impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_av8; the only
//*                     difference is that U and V are loaded into swapped
//*                     vector registers so that V is stored first.
//*                     The luma plane is copied row by row unless
//*                     convert_uv_only is 1, in which case only the chroma
//*                     planes are processed.
//*
//*  Arguments        : x0          pu1_y
//*                     x1          pu1_u
//*                     x2          pu1_v
//*                     x3          pu1_dest_y
//*                     x4          pu1_dest_uv
//*                     x5          u2_height
//*                     x6          u2_width
//*                     x7          u2_stridey
//*                     sp, #80     u2_strideu
//*                     sp, #88     u2_stridev
//*                     sp, #96     u2_dest_stride_y
//*                     sp, #104    u2_dest_stride_uv
//*                     sp, #112    convert_uv_only
//*                     (stack offsets are relative to sp after the prologue,
//*                      see Stack Usage)
//*
//*  Values Returned  : None
//*
//*  Register Usage   : x0-x10, x14, x16, x20, v0, v1
//*
//*  Stack Usage      : 80 Bytes. 16 bytes are pushed here for x19/x20; the
//*                     remaining 64 are assumed to be pushed by push_v_regs,
//*                     whose definition is not visible in this file.
//*
//*  Interruptibility : Interruptible
//*
//*  Known Limitations
//*       Assumptions: Image Width:  Must be at least 16. Each luma row is
//*                     moved in 16 byte pieces and each chroma row in 8 byte
//*                     pieces per plane. A width that is not a multiple of
//*                     the piece size is finished with one extra piece that
//*                     overlaps the previous one and ends exactly at the end
//*                     of the row, so a smaller width would access memory
//*                     before the start of the row.
//*                     Image Height: Must be at least 1 (the row loops test
//*                     their counter at the bottom). Chroma rows processed
//*                     = (height + 1) / 2.
//*       NOTE(review): x5, x6 and x7 are used as full 64 bit registers
//*                     without extension; this assumes the caller passes
//*                     them with clean upper bits - confirm against the C
//*                     prototype.
//*
//*  Revision History :
//*         DD MM YYYY   Author(s)       Changes (Describe the changes made)
//*         07 06 2010   Varshita        Draft
//*         07 06 2010   Naveen Kr T     Completed
//*
//*****************************************************************************/

.global impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_av8
impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_av8:

    // Prologue: after these two pushes the first stack argument is at
    // [sp, #80] (see Stack Usage in the header).
    push_v_regs
    stp             x19, x20, [sp, #-16]!

    ldr             w14, [sp, #112]         // w14 = convert_uv_only

    cmp             w14, #1
    mov             x9, x5                  // keep height for the chroma pass (mov leaves the flags intact)
    beq             yuv420sp_vu_chroma      // convert_uv_only == 1: skip the luma copy

    // ------------------------------------------------------------------
    // Luma: straight copy, u2_width bytes per row
    // ------------------------------------------------------------------
    ldr             w8, [sp, #96]           // w8 = u2_dest_stride_y
    uxtw            x8, w8

    sub             x7, x7, x6              // x7 = source step from end of row to next row
    sub             x8, x8, x6              // x8 = destination step from end of row to next row

yuv420sp_vu_row_loop_y:
    mov             x16, x6                 // x16 = bytes left in this row

yuv420sp_vu_col_loop_y:
    prfm            pldl1keep, [x0, #128]
    ld1             {v0.8b, v1.8b}, [x0], #16
    st1             {v0.8b, v1.8b}, [x3], #16
    sub             x16, x16, #16
    cmp             x16, #15
    bgt             yuv420sp_vu_col_loop_y  // loop while at least 16 bytes remain

    cmp             x16, #0
    beq             yuv420sp_vu_row_loop__y // width was a multiple of 16

    // 1..15 bytes remain. Step both pointers back by (16 - remaining) so
    // that one more 16 byte copy ends exactly at the end of the row.
    // Example: width 162 -> the loop copied 160 bytes, the pointers are
    // moved back to byte 146 and bytes 146..161 are copied.
    sub             x20, x16, #16
    neg             x16, x20                // x16 = 16 - remaining
    sub             x0, x0, x16
    sub             x3, x3, x16

    ld1             {v0.8b, v1.8b}, [x0], #16
    st1             {v0.8b, v1.8b}, [x3], #16

yuv420sp_vu_row_loop__y:
    add             x0, x0, x7              // next source row
    add             x3, x3, x8              // next destination row
    subs            x5, x5, #1
    bgt             yuv420sp_vu_row_loop_y

    // ------------------------------------------------------------------
    // Chroma: interleave one V row and one U row into one VU row
    // ------------------------------------------------------------------
yuv420sp_vu_chroma:
    // Each source plane is advanced with its own stride. Previously
    // u2_strideu was applied to both planes and u2_stridev was never read.
    ldr             w7, [sp, #80]           // w7 = u2_strideu
    sxtw            x7, w7

    ldr             w10, [sp, #88]          // w10 = u2_stridev
    sxtw            x10, w10

    ldr             w8, [sp, #104]          // w8 = u2_dest_stride_uv
    sxtw            x8, w8

    add             x6, x6, #1
    bic             x6, x6, #1              // x6 = width rounded up to even = bytes per interleaved row

    add             x9, x9, #1              // height + 1, halved below

    sub             x7, x7, x6, lsr #1      // x7  = U step from end of row to next row
    sub             x10, x10, x6, lsr #1    // x10 = V step from end of row to next row

    sub             x8, x8, x6              // x8  = destination step from end of row to next row

    lsr             x6, x6, #1              // x6 = chroma samples per row
    lsr             x5, x9, #1              // x5 = chroma rows = (height + 1) / 2

yuv420sp_vu_row_loop_uv:
    mov             x16, x6                 // x16 = chroma samples left in this row

yuv420sp_vu_col_loop_uv:
    prfm            pldl1keep, [x1, #128]
    prfm            pldl1keep, [x2, #128]
    ld1             {v1.8b}, [x1], #8       // v1 = 8 U samples
    ld1             {v0.8b}, [x2], #8       // v0 = 8 V samples
    st2             {v0.8b, v1.8b}, [x4], #16   // store V0 U0 V1 U1 ...
    sub             x16, x16, #8
    cmp             x16, #7
    bgt             yuv420sp_vu_col_loop_uv // loop while at least 8 samples remain

    cmp             x16, #0
    beq             yuv420sp_vu_row_loop__uv    // chroma width was a multiple of 8

    // 1..7 samples remain. Step the sources back by (8 - remaining) and
    // the destination by twice that, so that one more 8 sample group ends
    // exactly at the end of the row.
    sub             x20, x16, #8
    neg             x16, x20                // x16 = 8 - remaining
    sub             x1, x1, x16
    sub             x2, x2, x16
    sub             x4, x4, x16, lsl #1

    ld1             {v1.8b}, [x1], #8
    ld1             {v0.8b}, [x2], #8
    st2             {v0.8b, v1.8b}, [x4], #16

yuv420sp_vu_row_loop__uv:
    add             x1, x1, x7              // next U row
    add             x2, x2, x10             // next V row
    add             x4, x4, x8              // next destination row
    subs            x5, x5, #1
    bgt             yuv420sp_vu_row_loop_uv

    // Epilogue: undo the prologue in reverse order
    ldp             x19, x20, [sp], #16
    pop_v_regs
    ret
420
421