///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
///*******************************************************************************
//* //file
//*  ihevcd_fmt_conv_420sp_to_420p.s
//*
//* //brief
//*  contains function definitions for format conversions
//*
//* //author
//*  ittiam
//*
//* //par list of functions:
//*  - ihevcd_fmt_conv_420sp_to_420p_av8()
//*
//* //remarks
//*  none
//*
//*******************************************************************************/

.text

.include "ihevc_neon_macros.s"

///*****************************************************************************
//*
//*  Function Name : ihevcd_fmt_conv_420sp_to_420p_av8()
//*
//*  Description   : Converts a YUV 4:2:0 semi-planar image (chroma samples
//*                  interleaved in one plane) to YUV 4:2:0 planar:
//*                    - luma rows are copied from pu1_src_y to pu1_dest_y
//*                      unless disable_luma_copy is non-zero;
//*                    - the interleaved chroma plane is de-interleaved: the
//*                      even bytes of every pair go to pu1_dest_u and the odd
//*                      bytes to pu1_dest_v. When is_u_first is zero the two
//*                      destination pointers are swapped.
//*
//*  Arguments     : x0          pu1_src_y
//*                  x1          pu1_src_uv
//*                  x2          pu1_dest_y
//*                  x3          pu1_dest_u
//*                  x4          pu1_dest_v
//*                  w5          u2_width
//*                  w6          u2_height
//*                  w7          u2_stridey
//*                  [sp, #80]   u2_strideuv
//*                  [sp, #88]   u2_dest_stridey
//*                  [sp, #96]   u2_dest_strideuv
//*                  [sp, #104]  is_u_first
//*                  [sp, #112]  disable_luma_copy
//*                  Stack offsets are relative to sp AFTER the prologue
//*                  (push_v_regs followed by one 16-byte register pair).
//*                  All integer arguments are treated as signed 32-bit values.
//*
//*  Values Returned : None
//*
//*  Register Usage  : x0 - x11, x15, x20, v0, v1
//*
//*  Stack Usage     : 16 bytes (x19, x20) plus whatever push_v_regs saves
//*                    (macro defined in ihevc_neon_macros.s, not shown here).
//*
//*  Interruptibility : Interruptible
//*
//*  Known Limitations
//*  Assumptions: Image Width : Assumed to be a multiple of 2. Every row is
//*                             processed in 16-byte vectors and a partial
//*                             last vector is handled by stepping back and
//*                             re-copying an overlapping 16 bytes, so rows
//*                             narrower than 16 bytes are not supported.
//*               Image Height: Assumed to be even. The row loops are
//*                             bottom-tested, so the chroma loop always
//*                             processes at least one row.
//*
//*  Revision History :
//*     DD MM YYYY   Author(s)   Changes (Describe the changes made)
//*     16 05 2012   Naveen SR   draft
//*
//*****************************************************************************/

.globl ihevcd_fmt_conv_420sp_to_420p_av8

.type ihevcd_fmt_conv_420sp_to_420p_av8, %function

ihevcd_fmt_conv_420sp_to_420p_av8:
    // Prologue. The stack-argument offsets used below depend on this exact
    // pair of pushes; do not change one without the other.
    push_v_regs
    stp         x19, x20, [sp, #-16]!

    mov         x15, x4                     // x15 = pu1_dest_v (x4/x5 are reused below)

    // The upper 32 bits of a register that carries a 32-bit argument are
    // unspecified on entry, so widen explicitly before any 64-bit arithmetic.
    // This matches how the stack arguments are loaded.
    sxtw        x8, w5                      // x8 = u2_width
    sxtw        x9, w6                      // x9 = u2_height
    sxtw        x7, w7                      // x7 = u2_stridey

    // Nothing to convert for an empty image. Without this check the
    // bottom-tested loops below would still touch memory once.
    cmp         x8, #0
    ble         exit
    cmp         x9, #0
    ble         exit

    ldrsw       x5, [sp, #88]               // x5 = u2_dest_stridey

    sub         x10, x7, x8                 // x10 = src Y increment (stride - width)
    sub         x11, x5, x8                 // x11 = dst Y increment (stride - width)

    ldr         w5, [sp, #112]              // w5 = disable_luma_copy
    cbnz        w5, uv_copy_start           // skip luma if the flag is non-zero

    ///* Copy Y */

    mov         x4, x9                      // x4 = luma rows remaining
y_row_loop:
    mov         x6, x8                      // x6 = bytes remaining in this row

y_col_loop:
    sub         x6, x6, #16
    ld1         {v0.8b, v1.8b}, [x0], #16
    st1         {v0.8b, v1.8b}, [x2], #16
    cmp         x6, #16
    bge         y_col_loop
    cmp         x6, #0
    beq         y_col_loop_end

    // Width is not a multiple of 16: x6 (1..15) bytes are left. Step both
    // pointers back by (16 - x6) so that one more full 16-byte copy ends
    // exactly at the end of the row. Example: for width 162 the loop above
    // handles 160 bytes, the pointers move back to byte 146, and bytes
    // 146..161 are copied.
    sub         x20, x6, #16
    neg         x6, x20                     // x6 = 16 - bytes remaining
    sub         x0, x0, x6
    sub         x2, x2, x6
    ld1         {v0.8b, v1.8b}, [x0], #16
    st1         {v0.8b, v1.8b}, [x2], #16

y_col_loop_end:
    add         x0, x0, x10                 // advance to the next source row
    add         x2, x2, x11                 // advance to the next destination row
    subs        x4, x4, #1
    bgt         y_row_loop

    ///* Copy UV */
uv_copy_start:

    ldrsw       x5, [sp, #96]               // x5 = u2_dest_strideuv
    ldrsw       x7, [sp, #80]               // x7 = u2_strideuv

    lsr         x9, x9, #1                  // x9 = chroma rows = height / 2

    sub         x10, x7, x8                 // x10 = src UV increment (stride - width)
    lsr         x11, x8, #1                 // each destination row holds width / 2 bytes
    sub         x11, x5, x11                // x11 = dst U and V increment

    mov         x5, x15                     // x5 = pu1_dest_v

    // Swap the U and V destinations when is_u_first is zero. x4 is used as
    // the temporary; csel does not modify the flags set by cmp.
    ldr         w4, [sp, #104]              // w4 = is_u_first
    cmp         w4, #0
    csel        x4, x5, x4, eq              // tmp   = dest_v
    csel        x5, x3, x5, eq              // x5    = dest_u
    csel        x3, x4, x3, eq              // x3    = tmp

    mov         x4, x9                      // x4 = chroma rows remaining
uv_row_loop:
    mov         x6, x8                      // x6 = interleaved bytes remaining in this row

uv_col_loop:
    sub         x6, x6, #16

    prfm        PLDL1KEEP, [x1, #128]
    ld2         {v0.8b, v1.8b}, [x1], #16   // v0 = even bytes, v1 = odd bytes
    st1         {v0.8b}, [x3], #8
    st1         {v1.8b}, [x5], #8
    cmp         x6, #16
    bge         uv_col_loop
    cmp         x6, #0
    beq         uv_col_loop_end

    // Same overlapping-tail scheme as for luma. The source steps back by
    // (16 - remaining) interleaved bytes, each destination by half of that.
    sub         x20, x6, #16
    neg         x6, x20                     // x6 = 16 - bytes remaining
    sub         x1, x1, x6
    sub         x3, x3, x6, lsr #1
    sub         x5, x5, x6, lsr #1
    ld2         {v0.8b, v1.8b}, [x1], #16
    st1         {v0.8b}, [x3], #8
    st1         {v1.8b}, [x5], #8

uv_col_loop_end:
    add         x1, x1, x10                 // advance to the next source row
    add         x3, x3, x11                 // advance both destination rows
    add         x5, x5, x11
    subs        x4, x4, #1
    bgt         uv_row_loop

exit:
    // Epilogue: mirror of the prologue.
    ldp         x19, x20, [sp], #16
    pop_v_regs
    ret