@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ *      http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/

@/*
@//----------------------------------------------------------------------------
@// File Name            : impeg2_format_conv.s
@//
@// Description          : This file has the YUV420P to YUV420SP color format
@//                        conversion implementations (UV and VU interleaved
@//                        chroma) for the neon platform.
@//
@// Reference Document   :
@//
@// Revision History     :
@//      Date            Author                  Detail Description
@//   ------------    ----------------    ----------------------------------
@//   Jul 07, 2008     Naveen Kumar T                Created
@//
@//-------------------------------------------------------------------------
@*/

@/*
@// ----------------------------------------------------------------------------
@// Include Files
@// ----------------------------------------------------------------------------
@*/
@// Code goes into the text section, aligned to a 2^2 = 4 byte boundary.
.text
.p2align 2
@// NOTE(review): the two constants below are not referenced by the routines
@// in this file as far as can be seen here; confirm before removing them.
.equ log2_16 , 4
.equ log2_2 , 1
@/*
@// ----------------------------------------------------------------------------
@// Struct/Union Types and Define
@// ----------------------------------------------------------------------------
@*/

@/*
@// ----------------------------------------------------------------------------
@// Static Global Data section variables
@// ----------------------------------------------------------------------------
@*/
@//--------------------------- NONE --------------------------------------------

@/*
@// ----------------------------------------------------------------------------
@// Static Prototype Functions
@// ----------------------------------------------------------------------------
@*/
@// -------------------------- NONE --------------------------------------------

@/*
@// ----------------------------------------------------------------------------
@// Exported functions
@// ----------------------------------------------------------------------------
@*/

@/*****************************************************************************
@*
@*  Function Name    : impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_a9q()
@*
@*  Description      : This function converts the image from YUV420P color
@*                     space to 420SP color space(UV interleaved).
@*
@*  Arguments        : R0                pu1_y
@*                     R1                pu1_u
@*                     R2                pu1_v
@*                     R3                pu1_dest_y
@*                     [R13 #24]         pu1_dest_uv
@*                     [R13 #28]         u2_height
@*                     [R13 #32]         u2_width
@*                     [R13 #36]         u2_stridey
@*                     [R13 #40]         u2_strideu
@*                     [R13 #44]         u2_stridev
@*                     [R13 #48]         u2_dest_stride_y
@*                     [R13 #52]         u2_dest_stride_uv
@*                     [R13 #56]         convert_uv_only
@*                     (stack offsets are relative to R13 after the 24 byte
@*                      register save done in the prologue)
@*
@*  Values Returned  : None
@*
@*  Register Usage   : R0 - R8, R12, R14, Q0
@*
@*  Stack Usage      : 24 Bytes
@*
@*  Interruptibility : Interruptible
@*
@*  Known Limitations
@*       Assumptions: Image Width:  Rows of 16 or more pixels use the vector
@*                                  path; a remainder that is not a multiple
@*                                  of 16 (8 for chroma) is finished with one
@*                                  overlapping vector. Narrower rows are
@*                                  copied byte by byte.
@*                    Image Height: Odd heights are rounded up for chroma,
@*                                  (u2_height + 1) / 2 chroma rows are
@*                                  written.
@*                    Nothing is copied for a plane whose height or width
@*                    is zero.
@*                    The luma copy is skipped only when convert_uv_only
@*                    is exactly 1.
@*
@*  Revision History :
@*         DD MM YYYY   Author(s)       Changes (Describe the changes made)
@*         07 06 2010   Varshita        Draft
@*         07 06 2010   Naveen Kr T     Completed
@*
@*****************************************************************************/
    .global impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_a9q
impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_a9q:

    @// Save the callee-saved registers used below and the return address.
    @// From here on the first stack argument lives at [sp, #24].
    stmfd       sp!, {r4-r8, lr}

    ldr         r4, [sp, #56]               @// r4 = convert_uv_only
    cmp         r4, #1
    beq         yuv420sp_uv_chroma          @// Chroma only: skip the luma copy

    @/* Do the preprocessing before the main loops start */
    ldr         r4, [sp, #28]               @// r4 = u2_height (rows left)
    ldr         r5, [sp, #32]               @// r5 = u2_width
    ldr         r7, [sp, #36]               @// r7 = u2_stridey
    ldr         r8, [sp, #48]               @// r8 = u2_dest_stride_y

    @// An empty luma plane must not touch memory at all (the row loop below
    @// is a do-while and would otherwise copy one row).
    cmp         r4, #0
    ble         yuv420sp_uv_chroma
    cmp         r5, #0
    ble         yuv420sp_uv_chroma

    sub         r7, r7, r5                  @// Source increment      = stride - width
    sub         r8, r8, r5                  @// Destination increment = stride - width

yuv420sp_uv_row_loop_y:
    mov         r6, r5                      @// r6 = bytes left in this row
    cmp         r6, #16
    blt         yuv420sp_uv_narrow_row_y    @// Less than one vector: scalar copy

yuv420sp_uv_col_loop_y:
    pld         [r0, #128]
    vld1.8      {q0}, [r0]!
    vst1.8      {q0}, [r3]!
    sub         r6, r6, #16
    cmp         r6, #15
    bgt         yuv420sp_uv_col_loop_y

    cmp         r6, #0
    beq         yuv420sp_uv_row_loop_end_y
    @// If non-multiple of 16, then go back by few bytes to ensure 16 bytes can
    @// be read. Ex if width is 162, above loop will process 160 pixels. Both
    @// source and destination are moved back to the 146th pixel and the last
    @// 16 bytes of the row are copied again using VLD1 and VST1.
    @// This is safe only because at least 16 bytes were already copied.
    rsb         r6, r6, #16
    sub         r0, r0, r6
    sub         r3, r3, r6

    vld1.8      {q0}, [r0]!
    vst1.8      {q0}, [r3]!
    b           yuv420sp_uv_row_loop_end_y

yuv420sp_uv_narrow_row_y:
    @// Row shorter than 16 bytes: stepping back as above would read and write
    @// in front of the row, so copy it one byte at a time (r6 is 1..15 here).
    ldrb        r12, [r0], #1
    strb        r12, [r3], #1
    subs        r6, r6, #1
    bgt         yuv420sp_uv_narrow_row_y

yuv420sp_uv_row_loop_end_y:
    add         r0, r0, r7                  @// Next source row
    add         r3, r3, r8                  @// Next destination row
    subs        r4, r4, #1
    bgt         yuv420sp_uv_row_loop_y

yuv420sp_uv_chroma:

    ldr         r3, [sp, #24]               @// r3  = pu1_dest_uv
    ldr         r4, [sp, #28]               @// r4  = u2_height
    ldr         r5, [sp, #32]               @// r5  = u2_width
    ldr         r7, [sp, #40]               @// r7  = u2_strideu
    ldr         r12, [sp, #44]              @// r12 = u2_stridev
    ldr         r8, [sp, #52]               @// r8  = u2_dest_stride_uv

    add         r4, r4, #1
    mov         r4, r4, lsr #1              @// r4 = chroma rows  = (height + 1) / 2
    add         r5, r5, #1
    mov         r5, r5, lsr #1              @// r5 = chroma width = (width + 1) / 2

    @// Nothing to do for an empty chroma plane.
    cmp         r4, #0
    beq         yuv420sp_uv_exit
    cmp         r5, #0
    beq         yuv420sp_uv_exit

    sub         r7, r7, r5                  @// U source increment
    sub         r12, r12, r5                @// V source increment (own stride)
    sub         r8, r8, r5, lsl #1          @// Destination increment (2 bytes per sample pair)

yuv420sp_uv_row_loop_uv:
    mov         r6, r5                      @// r6 = U/V samples left in this row
    cmp         r6, #8
    blt         yuv420sp_uv_narrow_row_uv   @// Less than one vector: scalar copy

yuv420sp_uv_col_loop_uv:
    pld         [r1, #128]
    pld         [r2, #128]
    vld1.8      {d0}, [r1]!                 @// d0 = 8 U samples
    vld1.8      {d1}, [r2]!                 @// d1 = 8 V samples
    vst2.8      {d0, d1}, [r3]!             @// Store interleaved U0 V0 U1 V1 ...
    sub         r6, r6, #8
    cmp         r6, #7
    bgt         yuv420sp_uv_col_loop_uv

    cmp         r6, #0
    beq         yuv420sp_uv_row_loop_end_uv
    @// If non-multiple of 8, then go back by few samples so that 8 samples of
    @// U and of V can be read. The sources move back by (8 - remainder) bytes
    @// and the destination by twice that, then the last 8 sample pairs of the
    @// row are written again using VLD1 and VST2.
    rsb         r6, r6, #8
    sub         r1, r1, r6
    sub         r2, r2, r6
    sub         r3, r3, r6, lsl #1

    vld1.8      {d0}, [r1]!
    vld1.8      {d1}, [r2]!
    vst2.8      {d0, d1}, [r3]!
    b           yuv420sp_uv_row_loop_end_uv

yuv420sp_uv_narrow_row_uv:
    @// Fewer than 8 samples per row: interleave byte by byte, U first.
    @// lr is free as scratch because it was saved in the prologue.
    ldrb        lr, [r1], #1
    strb        lr, [r3], #1
    ldrb        lr, [r2], #1
    strb        lr, [r3], #1
    subs        r6, r6, #1
    bgt         yuv420sp_uv_narrow_row_uv

yuv420sp_uv_row_loop_end_uv:
    add         r1, r1, r7                  @// Next U row
    add         r2, r2, r12                 @// Next V row
    add         r3, r3, r8                  @// Next destination row
    subs        r4, r4, #1
    bgt         yuv420sp_uv_row_loop_uv

yuv420sp_uv_exit:
    @//POP THE REGISTERS
    ldmfd       sp!, {r4-r8, pc}





@/*****************************************************************************
@*
@*  Function Name    : impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_a9q()
@*
@*  Description      : This function converts the image from YUV420P color
@*                     space to 420SP color space(VU interleaved).
@*                     This function is similar to above function
@*                     impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_a9q with a
@*                     difference in VLD1.8 for chroma - order of registers
@*                     is different
@*
@*  Arguments        : R0                pu1_y
@*                     R1                pu1_u
@*                     R2                pu1_v
@*                     R3                pu1_dest_y
@*                     [R13 #24]         pu1_dest_uv
@*                     [R13 #28]         u2_height
@*                     [R13 #32]         u2_width
@*                     [R13 #36]         u2_stridey
@*                     [R13 #40]         u2_strideu
@*                     [R13 #44]         u2_stridev
@*                     [R13 #48]         u2_dest_stride_y
@*                     [R13 #52]         u2_dest_stride_uv
@*                     [R13 #56]         convert_uv_only
@*                     (stack offsets are relative to R13 after the 24 byte
@*                      register save done in the prologue)
@*
@*  Values Returned  : None
@*
@*  Register Usage   : R0 - R8, Q0 (R4 - R8 and the return address are
@*                     preserved on the stack)
@*
@*  Stack Usage      : 24 Bytes
@*
@*  Interruptibility : Interruptible
@*
@*  Known Limitations
@*       Assumptions: Image Width:  The vector path expects at least 16
@*                                  pixels; a remainder that is not a
@*                                  multiple of 16 is finished with one
@*                                  overlapping vector.
@*                    Image Height: Odd heights are rounded up for chroma.
@*
@*  Notes            : u2_strideu advances the U source and u2_stridev the
@*                     V source. Rows narrower than one vector (16 luma
@*                     bytes / 8 chroma samples) are copied byte by byte.
@*                     Nothing is copied for a plane whose height or width
@*                     is zero. The luma copy is skipped only when
@*                     convert_uv_only is exactly 1. R12 and R14 are used
@*                     as scratch in addition to R0 - R8 and Q0.
@*
@*  Revision History :
@*         DD MM YYYY   Author(s)       Changes (Describe the changes made)
@*         07 06 2010   Varshita        Draft
@*         07 06 2010   Naveen Kr T     Completed
@*
@*****************************************************************************/

    .global impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_a9q
impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_a9q:

    @// Save the callee-saved registers used below and the return address.
    @// From here on the first stack argument lives at [sp, #24].
    stmfd       sp!, {r4-r8, lr}

    ldr         r4, [sp, #56]               @// r4 = convert_uv_only
    cmp         r4, #1
    beq         yuv420sp_vu_chroma          @// Chroma only: skip the luma copy

    @/* Do the preprocessing before the main loops start */
    ldr         r4, [sp, #28]               @// r4 = u2_height (rows left)
    ldr         r5, [sp, #32]               @// r5 = u2_width
    ldr         r7, [sp, #36]               @// r7 = u2_stridey
    ldr         r8, [sp, #48]               @// r8 = u2_dest_stride_y

    @// An empty luma plane must not touch memory at all (the row loop below
    @// is a do-while and would otherwise copy one row).
    cmp         r4, #0
    ble         yuv420sp_vu_chroma
    cmp         r5, #0
    ble         yuv420sp_vu_chroma

    sub         r7, r7, r5                  @// Source increment      = stride - width
    sub         r8, r8, r5                  @// Destination increment = stride - width

yuv420sp_vu_row_loop_y:
    mov         r6, r5                      @// r6 = bytes left in this row
    cmp         r6, #16
    blt         yuv420sp_vu_narrow_row_y    @// Less than one vector: scalar copy

yuv420sp_vu_col_loop_y:
    pld         [r0, #128]
    vld1.8      {q0}, [r0]!
    vst1.8      {q0}, [r3]!
    sub         r6, r6, #16
    cmp         r6, #15
    bgt         yuv420sp_vu_col_loop_y

    cmp         r6, #0
    beq         yuv420sp_vu_row_loop_end_y
    @// If non-multiple of 16, then go back by few bytes to ensure 16 bytes can
    @// be read. Ex if width is 162, above loop will process 160 pixels. Both
    @// source and destination are moved back to the 146th pixel and the last
    @// 16 bytes of the row are copied again using VLD1 and VST1.
    @// This is safe only because at least 16 bytes were already copied.
    rsb         r6, r6, #16
    sub         r0, r0, r6
    sub         r3, r3, r6

    vld1.8      {q0}, [r0]!
    vst1.8      {q0}, [r3]!
    b           yuv420sp_vu_row_loop_end_y

yuv420sp_vu_narrow_row_y:
    @// Row shorter than 16 bytes: stepping back as above would read and write
    @// in front of the row, so copy it one byte at a time (r6 is 1..15 here).
    ldrb        r12, [r0], #1
    strb        r12, [r3], #1
    subs        r6, r6, #1
    bgt         yuv420sp_vu_narrow_row_y

yuv420sp_vu_row_loop_end_y:
    add         r0, r0, r7                  @// Next source row
    add         r3, r3, r8                  @// Next destination row
    subs        r4, r4, #1
    bgt         yuv420sp_vu_row_loop_y

yuv420sp_vu_chroma:

    ldr         r3, [sp, #24]               @// r3  = pu1_dest_uv
    ldr         r4, [sp, #28]               @// r4  = u2_height
    ldr         r5, [sp, #32]               @// r5  = u2_width
    ldr         r7, [sp, #40]               @// r7  = u2_strideu
    ldr         r12, [sp, #44]              @// r12 = u2_stridev
    ldr         r8, [sp, #52]               @// r8  = u2_dest_stride_uv

    add         r4, r4, #1
    mov         r4, r4, lsr #1              @// r4 = chroma rows  = (height + 1) / 2
    add         r5, r5, #1
    mov         r5, r5, lsr #1              @// r5 = chroma width = (width + 1) / 2

    @// Nothing to do for an empty chroma plane.
    cmp         r4, #0
    beq         yuv420sp_vu_exit
    cmp         r5, #0
    beq         yuv420sp_vu_exit

    sub         r7, r7, r5                  @// U source increment
    sub         r12, r12, r5                @// V source increment (own stride)
    sub         r8, r8, r5, lsl #1          @// Destination increment (2 bytes per sample pair)

yuv420sp_vu_row_loop_uv:
    mov         r6, r5                      @// r6 = U/V samples left in this row
    cmp         r6, #8
    blt         yuv420sp_vu_narrow_row_uv   @// Less than one vector: scalar copy

yuv420sp_vu_col_loop_uv:
    pld         [r1, #128]
    pld         [r2, #128]
    vld1.8      {d1}, [r1]!                 @// d1 = 8 U samples
    vld1.8      {d0}, [r2]!                 @// d0 = 8 V samples
    vst2.8      {d0, d1}, [r3]!             @// Store interleaved V0 U0 V1 U1 ...
    sub         r6, r6, #8
    cmp         r6, #7
    bgt         yuv420sp_vu_col_loop_uv

    cmp         r6, #0
    beq         yuv420sp_vu_row_loop_end_uv
    @// If non-multiple of 8, then go back by few samples so that 8 samples of
    @// U and of V can be read. The sources move back by (8 - remainder) bytes
    @// and the destination by twice that, then the last 8 sample pairs of the
    @// row are written again using VLD1 and VST2.
    rsb         r6, r6, #8
    sub         r1, r1, r6
    sub         r2, r2, r6
    sub         r3, r3, r6, lsl #1

    vld1.8      {d1}, [r1]!
    vld1.8      {d0}, [r2]!
    vst2.8      {d0, d1}, [r3]!
    b           yuv420sp_vu_row_loop_end_uv

yuv420sp_vu_narrow_row_uv:
    @// Fewer than 8 samples per row: interleave byte by byte, V first.
    @// lr is free as scratch because it was saved in the prologue.
    ldrb        lr, [r2], #1
    strb        lr, [r3], #1
    ldrb        lr, [r1], #1
    strb        lr, [r3], #1
    subs        r6, r6, #1
    bgt         yuv420sp_vu_narrow_row_uv

yuv420sp_vu_row_loop_end_uv:
    add         r1, r1, r7                  @// Next U row
    add         r2, r2, r12                 @// Next V row
    add         r3, r3, r8                  @// Next destination row
    subs        r4, r4, #1
    bgt         yuv420sp_vu_row_loop_uv

yuv420sp_vu_exit:
    @//POP THE REGISTERS
    ldmfd       sp!, {r4-r8, pc}




