1//****************************************************************************** 2//* 3//* Copyright (C) 2015 The Android Open Source Project 4//* 5//* Licensed under the Apache License, Version 2.0 (the "License"); 6//* you may not use this file except in compliance with the License. 7//* You may obtain a copy of the License at: 8//* 9//* http://www.apache.org/licenses/LICENSE-2.0 10//* 11//* Unless required by applicable law or agreed to in writing, software 12//* distributed under the License is distributed on an "AS IS" BASIS, 13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14//* See the License for the specific language governing permissions and 15//* limitations under the License. 16//* 17//***************************************************************************** 18//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore 19//*/ 20 21///* 22////---------------------------------------------------------------------------- 23//// File Name : impeg2_format_conv.s 24//// 25//// Description : This file has the format conversion (YUV420P to YUV420SP) 26//// implementations for the MPEG4 SP decoder on neon platform. 
27//// 28//// Reference Document : 29//// 30//// Revision History : 31//// Date Author Detail Description 32//// ------------ ---------------- ---------------------------------- 33//// Jul 07, 2008 Naveen Kumar T Created 34//// 35////------------------------------------------------------------------------- 36//*/ 37 38///* 39//// ---------------------------------------------------------------------------- 40//// Include Files 41//// ---------------------------------------------------------------------------- 42//*/ 43.set log2_16 , 4 44.set log2_2 , 1 45 46.text 47.include "impeg2_neon_macros.s" 48///* 49//// ---------------------------------------------------------------------------- 50//// Struct/Union Types and Define 51//// ---------------------------------------------------------------------------- 52//*/ 53 54///* 55//// ---------------------------------------------------------------------------- 56//// Static Global Data section variables 57//// ---------------------------------------------------------------------------- 58//*/ 59////--------------------------- NONE -------------------------------------------- 60 61///* 62//// ---------------------------------------------------------------------------- 63//// Static Prototype Functions 64//// ---------------------------------------------------------------------------- 65//*/ 66//// -------------------------- NONE -------------------------------------------- 67 68///* 69//// ---------------------------------------------------------------------------- 70//// Exported functions 71//// ---------------------------------------------------------------------------- 72//*/ 73 74 75///***************************************************************************** 76//* * 77//* Function Name : impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_av8() * 78//* * 79//* Description : This function converts the image from YUV420P color * 80//* space to 420SP color space(UV interleaved). 
//*
//* Arguments       : x0          pu1_y
//*                   x1          pu1_u
//*                   x2          pu1_v
//*                   x3          pu1_dest_y
//*                   x4          pu1_dest_uv
//*                   x5          u2_height
//*                   x6          u2_width
//*                   x7          u2_stridey
//*                   sp, #80     u2_strideu
//*                   sp, #88     u2_stridev
//*                   sp, #96     u2_dest_stride_y
//*                   sp, #104    u2_dest_stride_uv
//*                   sp, #112    convert_uv_only
//*                   (stack offsets are as seen after the register save
//*                    done at function entry)
//*
//* Values Returned : None
//*
//* Register Usage  : x7, x8, x9, x10, x14, x16, x20, v0, v1
//*                   (x0-x6 are also modified while copying)
//*
//* Stack Usage     : 80 Bytes
//*
//* Interruptibility : Interruptible
//*
//* Known Limitations
//*   Assumptions: Image Width:  Assumed to be multiple of 16 and
//*                greater than or equal to 16
//*                Image Height: Assumed to be even.
//*
//* Revision History :
//*   DD MM YYYY   Author(s)     Changes (Describe the changes made)
//*   07 06 2010   Varshita      Draft
//*   07 06 2010   Naveen Kr T   Completed
//*
//*****************************************************************************/
.global impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_av8
impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_av8:

    // Register arguments on entry:
    //   x0 = pu1_y        x1 = pu1_u        x2 = pu1_v
    //   x3 = pu1_dest_y   x4 = pu1_dest_uv
    //   x5 = u2_height    x6 = u2_width     x7 = u2_stridey
    // Stack arguments (offsets valid after the two saves below):
    //   [sp, #80]  u2_strideu          [sp, #88]  u2_stridev
    //   [sp, #96]  u2_dest_stride_y    [sp, #104] u2_dest_stride_uv
    //   [sp, #112] convert_uv_only
    //
    // NOTE(review): x5, x6 and x7 are consumed as full 64-bit values. If the
    // C prototype declares them as 32-bit (or narrower) integers the upper
    // register bits are not guaranteed by the procedure call standard -
    // confirm against the prototype / callers.

    // push_v_regs / pop_v_regs are macros from impeg2_neon_macros.s.
    push_v_regs
    stp             x19, x20, [sp, #-16]!       // x20 is used as scratch below

    ldr             w14, [sp, #112]             // w14 = convert_uv_only

    cmp             w14, #1
    mov             x9, x5                      // x9 = height, kept for the chroma pass
    beq             yuv420sp_uv_chroma          // convert_uv_only == 1: skip the luma copy

    //// Luma pass: plain row-by-row copy, 16 bytes per iteration
    ldr             w8, [sp, #96]               // x8 = u2_dest_stride_y (32-bit load zero-extends)

    sub             x7, x7, x6                  // x7 = source row increment (stride - width)
    sub             x8, x8, x6                  // x8 = destination row increment

yuv420sp_uv_row_loop_y:
    mov             x16, x6                     // x16 = bytes left to copy in this row

yuv420sp_uv_col_loop_y:
    prfm            pldl1keep, [x0, #128]
    ld1             {v0.8b, v1.8b}, [x0], #16
    st1             {v0.8b, v1.8b}, [x3], #16
    sub             x16, x16, #16
    cmp             x16, #15
    bgt             yuv420sp_uv_col_loop_y      // at least 16 bytes left

    cbz             x16, yuv420sp_uv_row_loop__y

    //// Width is not a multiple of 16: step both pointers back so that one
    //// final 16-byte copy ends exactly at the end of the row (some bytes are
    //// copied twice). Ex: for width 162 the loop above copies 160 bytes, the
    //// pointers are moved back by 14 to byte 146 and 16 bytes are copied.
    sub             x20, x16, #16
    neg             x16, x20                    // x16 = 16 - remaining bytes
    sub             x0, x0, x16
    sub             x3, x3, x16

    ld1             {v0.8b, v1.8b}, [x0], #16
    st1             {v0.8b, v1.8b}, [x3], #16

yuv420sp_uv_row_loop__y:
    add             x0, x0, x7                  // next source row
    add             x3, x3, x8                  // next destination row
    subs            x5, x5, #1
    bgt             yuv420sp_uv_row_loop_y

    //// Chroma pass: interleave one U row and one V row into one UV row
yuv420sp_uv_chroma:
    ldrsw           x7, [sp, #80]               // x7  = u2_strideu (sign-extended)
    ldrsw           x10, [sp, #88]              // x10 = u2_stridev (sign-extended)
    ldrsw           x8, [sp, #104]              // x8  = u2_dest_stride_uv (sign-extended)

    add             x6, x6, #1
    bic             x6, x6, #1                  // x6 = width rounded up to even

    add             x9, x9, #1                  // height + 1, halved (rounding up) below

    sub             x7, x7, x6, lsr #1          // x7  = U row increment (stride - chroma width)
    sub             x10, x10, x6, lsr #1        // x10 = V row increment (stride - chroma width)
    sub             x8, x8, x6                  // x8  = destination row increment

    lsr             x6, x6, #1                  // x6 = chroma width in samples
    lsr             x5, x9, #1                  // x5 = number of chroma rows

yuv420sp_uv_row_loop_uv:
    mov             x16, x6                     // x16 = chroma samples left in this row

yuv420sp_uv_col_loop_uv:
    prfm            pldl1keep, [x1, #128]
    prfm            pldl1keep, [x2, #128]

    ld1             {v0.8b}, [x1], #8           // v0 = 8 U samples
    ld1             {v1.8b}, [x2], #8           // v1 = 8 V samples
    st2             {v0.8b, v1.8b}, [x4], #16   // store U0 V0 U1 V1 ...

    sub             x16, x16, #8
    cmp             x16, #7
    bgt             yuv420sp_uv_col_loop_uv     // at least 8 samples left

    cbz             x16, yuv420sp_uv_row_loop__uv

    //// Chroma width is not a multiple of 8: step the sources back by
    //// (8 - remaining) samples and the destination by twice that, then do
    //// one final 8-sample load / 16-byte interleaved store that ends exactly
    //// at the end of the row.
    sub             x20, x16, #8
    neg             x16, x20                    // x16 = 8 - remaining samples
    sub             x1, x1, x16
    sub             x2, x2, x16
    sub             x4, x4, x16, lsl #1

    ld1             {v0.8b}, [x1], #8
    ld1             {v1.8b}, [x2], #8
    st2             {v0.8b, v1.8b}, [x4], #16

yuv420sp_uv_row_loop__uv:
    add             x1, x1, x7                  // next U row
    add             x2, x2, x10                 // next V row
    add             x4, x4, x8                  // next destination row
    subs            x5, x5, #1
    bgt             yuv420sp_uv_row_loop_uv

    //// Restore the registers and return
    ldp             x19, x20, [sp], #16
    pop_v_regs
    ret





///*****************************************************************************
//*
//* Function Name   : impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_av8()
//*
//* Description     : This function converts the image from YUV420P color
//*                   space to 420SP color space(VU interleaved).
//*                   This function is similar to the function above,
//*                   impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_av8, with a
//*                   difference in the chroma loads - the order of the
//*                   registers is different
//*
//* Arguments       : x0          pu1_y
//*                   x1          pu1_u
//*                   x2          pu1_v
//*                   x3          pu1_dest_y
//*                   x4          pu1_dest_uv
//*                   x5          u2_height
//*                   x6          u2_width
//*                   x7          u2_stridey
//*                   sp, #80     u2_strideu
//*                   sp, #88     u2_stridev
//*                   sp, #96     u2_dest_stride_y
//*                   sp, #104    u2_dest_stride_uv
//*                   sp, #112    convert_uv_only
//*
//* Values Returned : None
//*
//* Register Usage  : x8, x14, x16, x20, v0, v1
//*
//* Stack Usage     : 80 Bytes
//*
//* Interruptibility : Interruptible
//*
//* Known Limitations
//*   Assumptions: Image Width:  Assumed to be multiple of 16 and
//*                greater than or equal to 16
//*                Image Height: Assumed to be even.
//*
//* Revision History :
//*   DD MM YYYY   Author(s)     Changes (Describe the changes made)
//*   07 06 2010   Varshita      Draft
//*   07 06 2010   Naveen Kr T   Completed
//*
//*****************************************************************************/

.global impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_av8
impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_av8:

    // Copies the luma plane (unless convert_uv_only == 1) and interleaves the
    // planar U and V planes into a single plane in V,U byte order.
    //
    // Register arguments on entry:
    //   x0 = pu1_y        x1 = pu1_u        x2 = pu1_v
    //   x3 = pu1_dest_y   x4 = pu1_dest_uv
    //   x5 = u2_height    x6 = u2_width     x7 = u2_stridey
    // Stack arguments (offsets valid after the two saves below):
    //   [sp, #80]  u2_strideu          [sp, #88]  u2_stridev
    //   [sp, #96]  u2_dest_stride_y    [sp, #104] u2_dest_stride_uv
    //   [sp, #112] convert_uv_only
    // Scratch registers: x7, x8, x9, x10, x14, x16, x20, v0, v1
    //
    // NOTE(review): x5, x6 and x7 are consumed as full 64-bit values. If the
    // C prototype declares them as 32-bit (or narrower) integers the upper
    // register bits are not guaranteed by the procedure call standard -
    // confirm against the prototype / callers.

    // push_v_regs / pop_v_regs are macros from impeg2_neon_macros.s.
    push_v_regs
    stp             x19, x20, [sp, #-16]!       // x20 is used as scratch below

    ldr             w14, [sp, #112]             // w14 = convert_uv_only

    cmp             w14, #1
    mov             x9, x5                      // x9 = height, kept for the chroma pass
    beq             yuv420sp_vu_chroma          // convert_uv_only == 1: skip the luma copy

    //// Luma pass: plain row-by-row copy, 16 bytes per iteration
    ldr             w8, [sp, #96]               // x8 = u2_dest_stride_y (32-bit load zero-extends)

    sub             x7, x7, x6                  // x7 = source row increment (stride - width)
    sub             x8, x8, x6                  // x8 = destination row increment

yuv420sp_vu_row_loop_y:
    mov             x16, x6                     // x16 = bytes left to copy in this row

yuv420sp_vu_col_loop_y:
    prfm            pldl1keep, [x0, #128]
    ld1             {v0.8b, v1.8b}, [x0], #16
    st1             {v0.8b, v1.8b}, [x3], #16
    sub             x16, x16, #16
    cmp             x16, #15
    bgt             yuv420sp_vu_col_loop_y      // at least 16 bytes left

    cbz             x16, yuv420sp_vu_row_loop__y

    //// Width is not a multiple of 16: step both pointers back so that one
    //// final 16-byte copy ends exactly at the end of the row (some bytes are
    //// copied twice). Ex: for width 162 the loop above copies 160 bytes, the
    //// pointers are moved back by 14 to byte 146 and 16 bytes are copied.
    sub             x20, x16, #16
    neg             x16, x20                    // x16 = 16 - remaining bytes
    sub             x0, x0, x16
    sub             x3, x3, x16

    ld1             {v0.8b, v1.8b}, [x0], #16
    st1             {v0.8b, v1.8b}, [x3], #16

yuv420sp_vu_row_loop__y:
    add             x0, x0, x7                  // next source row
    add             x3, x3, x8                  // next destination row
    subs            x5, x5, #1
    bgt             yuv420sp_vu_row_loop_y

    //// Chroma pass: interleave one V row and one U row into one VU row
yuv420sp_vu_chroma:
    ldrsw           x7, [sp, #80]               // x7  = u2_strideu (sign-extended)
    ldrsw           x10, [sp, #88]              // x10 = u2_stridev (sign-extended)
    ldrsw           x8, [sp, #104]              // x8  = u2_dest_stride_uv (sign-extended)

    add             x6, x6, #1
    bic             x6, x6, #1                  // x6 = width rounded up to even

    add             x9, x9, #1                  // height + 1, halved (rounding up) below

    sub             x7, x7, x6, lsr #1          // x7  = U row increment (stride - chroma width)
    sub             x10, x10, x6, lsr #1        // x10 = V row increment (stride - chroma width)
    sub             x8, x8, x6                  // x8  = destination row increment

    lsr             x6, x6, #1                  // x6 = chroma width in samples
    lsr             x5, x9, #1                  // x5 = number of chroma rows

yuv420sp_vu_row_loop_uv:
    mov             x16, x6                     // x16 = chroma samples left in this row

yuv420sp_vu_col_loop_uv:
    prfm            pldl1keep, [x1, #128]
    prfm            pldl1keep, [x2, #128]
    ld1             {v1.8b}, [x1], #8           // v1 = 8 U samples
    ld1             {v0.8b}, [x2], #8           // v0 = 8 V samples
    st2             {v0.8b, v1.8b}, [x4], #16   // store V0 U0 V1 U1 ...
    sub             x16, x16, #8
    cmp             x16, #7
    bgt             yuv420sp_vu_col_loop_uv     // at least 8 samples left

    cbz             x16, yuv420sp_vu_row_loop__uv

    //// Chroma width is not a multiple of 8: step the sources back by
    //// (8 - remaining) samples and the destination by twice that, then do
    //// one final 8-sample load / 16-byte interleaved store that ends exactly
    //// at the end of the row.
    sub             x20, x16, #8
    neg             x16, x20                    // x16 = 8 - remaining samples
    sub             x1, x1, x16
    sub             x2, x2, x16
    sub             x4, x4, x16, lsl #1

    ld1             {v1.8b}, [x1], #8
    ld1             {v0.8b}, [x2], #8
    st2             {v0.8b, v1.8b}, [x4], #16

yuv420sp_vu_row_loop__uv:
    add             x1, x1, x7                  // next U row
    add             x2, x2, x10                 // next V row
    add             x4, x4, x8                  // next destination row
    subs            x5, x5, #1
    bgt             yuv420sp_vu_row_loop_uv

    //// Restore the registers and return
    ldp             x19, x20, [sp], #16
    pop_v_regs
    ret
