@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@/*****************************************************************************
@* @file
@*  ihevcd_fmt_conv_420sp_to_rgba8888.s
@*
@* @brief
@*  contains function definitions for format conversions
@*
@* @author
@*  ittiam
@*
@* @par list of functions:
@*
@*
@* @remarks
@*  none
@*
@*****************************************************************************/
    .equ DO1STROUNDING, 0

    @ ARM
    @
    @ PRESERVE8

.text
.p2align 2




@/*****************************************************************************
@*                                                                            *
@*  Function Name    : ihevcd_fmt_conv_420sp_to_rgba8888()                    *
@*                                                                            *
@*  Description      : This function converts the image from YUV 420SP       *
@*                     (interleaved UV) color space to RGBA8888 color        *
@*                     space. The function can be invoked at the MB          *
@*                     level.                                                 *
@*                                                                            *
@*  Arguments        : R0          pubY        (luma pointer)                 *
@*                     R1          pubUV       (interleaved UV pointer)       *
@*                     R2          pusRGB      (RGBA output pointer)          *
@*                     R3          usWidth                                    *
@*                     [R13 #40]   usHeight                                   *
@*                     [R13 #44]   usStrideY                                  *
@*                     [R13 #48]   usStrideUV                                 *
@*                     [R13 #52]   usStrideRGB (in pixels)                    *
@*                                                                            *
@*  Values Returned  : None                                                   *
@*                                                                            *
@*  Register Usage   : R0 - R14                                               *
@*                                                                            *
@*  Stack Usage      : 40 Bytes                                               *
@*                                                                            *
@*  Interruptibility : Interruptible                                          *
@*                                                                            *
@*  Known Limitations                                                         *
@*       Assumptions: Image Width:     Assumed to be multiple of 16 and       *
@*                                     greater than or equal to 16            *
@*                    Image Height:    Assumed to be even.                    *
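@*                                                                            *
@*  Note             : The NEON code below applies BT.601-style weights in    *
@*                     Q13 fixed point, computed once per 2x2 luma block      *
@*                     (sat8() below denotes saturation to [0..255]):         *
@*                        R = sat8(Y + ((C1*(V-128)) >> 13))                  *
@*                        G = sat8(Y + ((C2*(U-128) + C3*(V-128)) >> 13))     *
@*                        B = sat8(Y + ((C4*(U-128)) >> 13))                  *
@*                     with C1=0x3311 (~1.596), C2=0xF379 (~-0.391),          *
@*                     C3=0xE5F8 (~-0.813), C4=0x4092 (~2.018); the fourth    *
@*                     (alpha) byte of every output pixel is written as 0.    *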
@*                                                                            *
@*  Revision History :                                                        *
@*      DD MM YYYY    Author(s)          Changes (Describe the changes made)  *
@*      07 06 2010    Varshita           Draft                                *
@*      07 06 2010    Naveen Kr T        Completed                            *
@*      05 08 2013    Naveen K P         Modified for HEVC                    *
@*****************************************************************************/

    .global ihevcd_fmt_conv_420sp_to_rgba8888_a9q
.type ihevcd_fmt_conv_420sp_to_rgba8888_a9q, function
ihevcd_fmt_conv_420sp_to_rgba8888_a9q:

    @// push the registers on the stack
    STMFD       SP!,{R4-R12,LR}


    @//R0 - Y PTR
    @//R1 - UV PTR
    @//R2 - RGB PTR
    @//R3 - PIC WIDTH
    @//R5 - PIC HT
    @//R6 - STRIDE Y
    @//R7 - STRIDE UV  (later reused as the luma next-row pointer)
    @//R9 - STRIDE RGB (R8 later holds the rgb next-row pointer)

    @//ONE ROW PAIR IS PROCESSED AT A TIME

    @//THE FOUR CONSTANTS ARE:
    @//C1=0x3311,C2=0xF379,C3=0xE5F8,C4=0x4092

    @PLD        [R0]
    @PLD        [R1]
    @PLD        [R2]


    @/* can be loaded from a defined const type */
    MOVW        R10,#0x3311
    VMOV.16     D0[0],R10               @//C1

    MOVW        R10,#0xF379
    VMOV.16     D0[1],R10               @//C2

    MOVW        R10,#0xE5F8
    VMOV.16     D0[2],R10               @//C3

    MOVW        R10,#0x4092
    VMOV.16     D0[3],R10               @//C4

    @//LOAD CONSTANT 128 INTO A CORTEX REGISTER
    MOV         R10,#128
    VDUP.8      D1,R10

    @//D0 HAS C1-C2-C3-C4
    @// load other parameters from stack
    LDR         R5,[sp,#40]             @// height
    @LDR        R4,[sp,#44]
    LDR         R6,[sp,#44]             @// luma stride
    LDR         R7,[sp,#48]             @// chroma (UV) stride
    @LDR        R8,[sp,#52]
    LDR         R9,[sp,#52]             @// rgb stride (in pixels)

    @// calculate offsets, offset = stride - width
    SUB         R10,R6,R3               @// luma offset
    SUB         R11,R7,R3               @// uv offset
    @, LSR #1
    @SUB        R12,R8,R3, LSR #1       @// v offset
    SUB         R14,R9,R3               @// rgb offset in pixels

    @// calculate height loop count
    MOV         R5,R5, LSR #1           @// height_cnt = height / 2

    @// create next row pointers for rgb and luma data
    ADD         R7,R0,R6                @// luma_next_row = luma + luma_stride
    ADD         R8,R2,R9,LSL #2         @// rgb_next_row = rgb + rgb_stride

LABEL_YUV420SP_TO_RGB8888_HEIGHT_LOOP:

    @//LOAD VALUES OF U&V AND COMPUTE THE R,G,B WEIGHT VALUES.
    VLD1.8      {D2,D3},[R1]!           @//LOAD 8 PAIRS OF INTERLEAVED UV
    @//VLD1.8   {D3},[R2]!              @//LOAD 8 VALUES OF V

    @// calculate width loop count
    MOV         R6,R3, LSR #4           @// width_cnt = width / 16

    @//COMPUTE THE ACTUAL RGB VALUES, WE CAN DO TWO ROWS AT A TIME
    @//LOAD VALUES OF Y 8-BIT VALUES
    VLD2.8      {D30,D31},[R0]!         @//D30 - Y0,Y2,Y4,Y6,Y8,Y10,Y12,Y14 row 1
                                        @//D31 - Y1,Y3,Y5,Y7,Y9,Y11,Y13,Y15
    VLD2.8      {D28,D29},[R7]!         @//D28 - Y0,Y2,Y4,Y6,Y8,Y10,Y12,Y14 row 2
                                        @//D29 - Y1,Y3,Y5,Y7,Y9,Y11,Y13,Y15

    SUBS        R6,R6,#1
    BEQ         LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP

LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP:
    @VMOV.I8    Q1,#128
    VUZP.8      D2,D3


    @//NEED TO SUBTRACT (U-128) AND (V-128)
    @//(D2-D1),(D3-D1)
    VSUBL.U8    Q2,D2,D1                @//(U-128)
    VSUBL.U8    Q3,D3,D1                @//(V-128)

    @//LOAD VALUES OF U&V FOR THE NEXT 16 PIXELS
    VLD1.8      {D2,D3},[R1]!           @//LOAD 8 PAIRS OF INTERLEAVED UV
    @//VLD1.8   {D3},[R2]!              @//LOAD 8 VALUES OF V
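
    @// The block below is the chroma weight computation in Q13 fixed point.
    @// As a scalar pseudo-C sketch of one weight set (variable names are
    @// illustrative only, not taken from this library):
    @//     b_wt = ((u - 128) * C4) >> 13;                  /* ~ 2.018*(U-128) */
    @//     r_wt = ((v - 128) * C1) >> 13;                  /* ~ 1.596*(V-128) */
    @//     g_wt = ((u - 128) * C2 + (v - 128) * C3) >> 13; /* ~-0.391*(U-128) - 0.813*(V-128) */
    @// Each weight is shared by a 2x2 block of luma samples (two columns in
    @// each of the two rows) and added with unsigned saturation further down.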

    @PLD        [R0]
    PLD         [R1]

    @//NEED TO MULTIPLY Q2,Q3 WITH THE COEFFICIENTS
    VMULL.S16   Q4,D4,D0[3]             @//(U-128)*C4 FOR B
    VMULL.S16   Q5,D5,D0[3]             @//(U-128)*C4 FOR B

    VMULL.S16   Q10,D6,D0[0]            @//(V-128)*C1 FOR R
    VMULL.S16   Q11,D7,D0[0]            @//(V-128)*C1 FOR R

    VMULL.S16   Q6,D4,D0[1]             @//(U-128)*C2 FOR G
    VMLAL.S16   Q6,D6,D0[2]             @//Q6 = (U-128)*C2 + (V-128)*C3
    VMULL.S16   Q7,D5,D0[1]             @//(U-128)*C2 FOR G
    VMLAL.S16   Q7,D7,D0[2]             @//Q7 = (U-128)*C2 + (V-128)*C3

    @//NARROW RIGHT SHIFT BY 13 FOR R&B
    VQSHRN.S32  D8,Q4,#13               @//D8 = (U-128)*C4>>13, 4 16-BIT VALUES
    VQSHRN.S32  D9,Q5,#13               @//D9 = (U-128)*C4>>13, 4 16-BIT VALUES
                                        @//Q4 - WEIGHT FOR B

    @//NARROW RIGHT SHIFT BY 13 FOR R&B
    VQSHRN.S32  D10,Q10,#13             @//D10 = (V-128)*C1>>13, 4 16-BIT VALUES
    VQSHRN.S32  D11,Q11,#13             @//D11 = (V-128)*C1>>13, 4 16-BIT VALUES
                                        @//Q5 - WEIGHT FOR R

    @//NARROW RIGHT SHIFT BY 13 FOR G
    VQSHRN.S32  D12,Q6,#13              @//D12 = [(U-128)*C2 + (V-128)*C3]>>13, 4 16-BIT VALUES
    VQSHRN.S32  D13,Q7,#13              @//D13 = [(U-128)*C2 + (V-128)*C3]>>13, 4 16-BIT VALUES
                                        @//Q6 - WEIGHT FOR G

    VADDW.U8    Q7,Q4,D30               @//Q7 - HAS Y + B
    VADDW.U8    Q8,Q5,D30               @//Q8 - HAS Y + R
    VADDW.U8    Q9,Q6,D30               @//Q9 - HAS Y + G

    VADDW.U8    Q10,Q4,D31              @//Q10 - HAS Y + B
    VADDW.U8    Q11,Q5,D31              @//Q11 - HAS Y + R
    VADDW.U8    Q12,Q6,D31              @//Q12 - HAS Y + G

    VQMOVUN.S16 D14,Q7
    VQMOVUN.S16 D15,Q9
    VQMOVUN.S16 D16,Q8
    VMOV.I8     D17,#0

    VZIP.8      D14,D15
    VZIP.8      D16,D17
    VZIP.16     Q7,Q8


    VQMOVUN.S16 D20,Q10
    VQMOVUN.S16 D21,Q12
    VQMOVUN.S16 D22,Q11
    VMOV.I8     D23,#0

    VZIP.8      D20,D21
    VZIP.8      D22,D23
    VZIP.16     Q10,Q11

    VZIP.32     Q7,Q10
    VZIP.32     Q8,Q11

    VST1.32     D14,[R2]!
    VST1.32     D15,[R2]!
    VST1.32     D20,[R2]!
    VST1.32     D21,[R2]!
    VST1.32     D16,[R2]!
    VST1.32     D17,[R2]!
    VST1.32     D22,[R2]!
    VST1.32     D23,[R2]!

    @//D14-D23 ABOVE HELD THE 16 RGBA PIXELS OF ROW 1
    @//NOW APPLY THE SAME CHROMA WEIGHTS TO THE SECOND LUMA ROW
    VADDW.U8    Q7,Q4,D28               @//Q7 - HAS Y + B
    VADDW.U8    Q8,Q5,D28               @//Q8 - HAS Y + R
    VADDW.U8    Q9,Q6,D28               @//Q9 - HAS Y + G

    VADDW.U8    Q10,Q4,D29              @//Q10 - HAS Y + B
    VADDW.U8    Q11,Q5,D29              @//Q11 - HAS Y + R
    VADDW.U8    Q12,Q6,D29              @//Q12 - HAS Y + G

    @//COMPUTE THE ACTUAL RGB VALUES, WE CAN DO TWO ROWS AT A TIME
    @//LOAD VALUES OF Y 8-BIT VALUES FOR THE NEXT 16 PIXELS
    VLD2.8      {D30,D31},[R0]!         @//D30 - Y0,Y2,Y4,Y6,Y8,Y10,Y12,Y14 row 1
                                        @//D31 - Y1,Y3,Y5,Y7,Y9,Y11,Y13,Y15
    VLD2.8      {D28,D29},[R7]!         @//D28 - Y0,Y2,Y4,Y6,Y8,Y10,Y12,Y14 row 2
                                        @//D29 - Y1,Y3,Y5,Y7,Y9,Y11,Y13,Y15

    PLD         [R0]
    PLD         [R7]

    VQMOVUN.S16 D14,Q7
    VQMOVUN.S16 D15,Q9
    VQMOVUN.S16 D16,Q8
    VMOV.I8     D17,#0

    VZIP.8      D14,D15
    VZIP.8      D16,D17
    VZIP.16     Q7,Q8


    VQMOVUN.S16 D20,Q10
    VQMOVUN.S16 D21,Q12
    VQMOVUN.S16 D22,Q11
    VMOV.I8     D23,#0

    VZIP.8      D20,D21
    VZIP.8      D22,D23
    VZIP.16     Q10,Q11

    VZIP.32     Q7,Q10
    VZIP.32     Q8,Q11

    VST1.32     D14,[R8]!
    VST1.32     D15,[R8]!
    VST1.32     D20,[R8]!
    VST1.32     D21,[R8]!
    VST1.32     D16,[R8]!
    VST1.32     D17,[R8]!
    VST1.32     D22,[R8]!
    VST1.32     D23,[R8]!
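
    @// Packing note: for each 8-pixel group, VQMOVUN saturates the B, G and
    @// R sums to 8 bits, VZIP.8/VZIP.16 interleave the channel bytes (plus a
    @// zero alpha byte) into 32-bit pixels, and VZIP.32 re-interleaves the
    @// even/odd columns that VLD2.8 had separated back into raster order
    @// before the 32-bit stores.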

    SUBS        R6,R6,#1                @// width_cnt -= 1
    BNE         LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP

LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP:
    @VMOV.I8    Q1,#128
    VUZP.8      D2,D3


    @//NEED TO SUBTRACT (U-128) AND (V-128)
    @//(D2-D1),(D3-D1)
    VSUBL.U8    Q2,D2,D1                @//(U-128)
    VSUBL.U8    Q3,D3,D1                @//(V-128)


    @//NEED TO MULTIPLY Q2,Q3 WITH THE COEFFICIENTS
    VMULL.S16   Q4,D4,D0[3]             @//(U-128)*C4 FOR B
    VMULL.S16   Q5,D5,D0[3]             @//(U-128)*C4 FOR B

    VMULL.S16   Q10,D6,D0[0]            @//(V-128)*C1 FOR R
    VMULL.S16   Q11,D7,D0[0]            @//(V-128)*C1 FOR R

    VMULL.S16   Q6,D4,D0[1]             @//(U-128)*C2 FOR G
    VMLAL.S16   Q6,D6,D0[2]             @//Q6 = (U-128)*C2 + (V-128)*C3
    VMULL.S16   Q7,D5,D0[1]             @//(U-128)*C2 FOR G
    VMLAL.S16   Q7,D7,D0[2]             @//Q7 = (U-128)*C2 + (V-128)*C3

    @//NARROW RIGHT SHIFT BY 13 FOR R&B
    VQSHRN.S32  D8,Q4,#13               @//D8 = (U-128)*C4>>13, 4 16-BIT VALUES
    VQSHRN.S32  D9,Q5,#13               @//D9 = (U-128)*C4>>13, 4 16-BIT VALUES
                                        @//Q4 - WEIGHT FOR B

    @//NARROW RIGHT SHIFT BY 13 FOR R&B
    VQSHRN.S32  D10,Q10,#13             @//D10 = (V-128)*C1>>13, 4 16-BIT VALUES
    VQSHRN.S32  D11,Q11,#13             @//D11 = (V-128)*C1>>13, 4 16-BIT VALUES
                                        @//Q5 - WEIGHT FOR R

    @//NARROW RIGHT SHIFT BY 13 FOR G
    VQSHRN.S32  D12,Q6,#13              @//D12 = [(U-128)*C2 + (V-128)*C3]>>13, 4 16-BIT VALUES
    VQSHRN.S32  D13,Q7,#13              @//D13 = [(U-128)*C2 + (V-128)*C3]>>13, 4 16-BIT VALUES
                                        @//Q6 - WEIGHT FOR G

    VADDW.U8    Q7,Q4,D30               @//Q7 - HAS Y + B
    VADDW.U8    Q8,Q5,D30               @//Q8 - HAS Y + R
    VADDW.U8    Q9,Q6,D30               @//Q9 - HAS Y + G

    VADDW.U8    Q10,Q4,D31              @//Q10 - HAS Y + B
    VADDW.U8    Q11,Q5,D31              @//Q11 - HAS Y + R
    VADDW.U8    Q12,Q6,D31              @//Q12 - HAS Y + G

    VQMOVUN.S16 D14,Q7
    VQMOVUN.S16 D15,Q9
    VQMOVUN.S16 D16,Q8
    VMOV.I8     D17,#0

    VZIP.8      D14,D15
    VZIP.8      D16,D17
    VZIP.16     Q7,Q8


    VQMOVUN.S16 D20,Q10
    VQMOVUN.S16 D21,Q12
    VQMOVUN.S16 D22,Q11
    VMOV.I8     D23,#0

    VZIP.8      D20,D21
    VZIP.8      D22,D23
    VZIP.16     Q10,Q11

    VZIP.32     Q7,Q10
    VZIP.32     Q8,Q11

    VST1.32     D14,[R2]!
    VST1.32     D15,[R2]!
    VST1.32     D20,[R2]!
    VST1.32     D21,[R2]!
    VST1.32     D16,[R2]!
    VST1.32     D17,[R2]!
    VST1.32     D22,[R2]!
    VST1.32     D23,[R2]!

    @//D14-D23 ABOVE HELD THE 16 RGBA PIXELS OF ROW 1
    @//NOW APPLY THE SAME CHROMA WEIGHTS TO THE SECOND LUMA ROW
    VADDW.U8    Q7,Q4,D28               @//Q7 - HAS Y + B
    VADDW.U8    Q8,Q5,D28               @//Q8 - HAS Y + R
    VADDW.U8    Q9,Q6,D28               @//Q9 - HAS Y + G

    VADDW.U8    Q10,Q4,D29              @//Q10 - HAS Y + B
    VADDW.U8    Q11,Q5,D29              @//Q11 - HAS Y + R
    VADDW.U8    Q12,Q6,D29              @//Q12 - HAS Y + G


    VQMOVUN.S16 D14,Q7
    VQMOVUN.S16 D15,Q9
    VQMOVUN.S16 D16,Q8
    VMOV.I8     D17,#0

    VZIP.8      D14,D15
    VZIP.8      D16,D17
    VZIP.16     Q7,Q8


    VQMOVUN.S16 D20,Q10
    VQMOVUN.S16 D21,Q12
    VQMOVUN.S16 D22,Q11
    VMOV.I8     D23,#0

    VZIP.8      D20,D21
    VZIP.8      D22,D23
    VZIP.16     Q10,Q11

    VZIP.32     Q7,Q10
    VZIP.32     Q8,Q11

    VST1.32     D14,[R8]!
    VST1.32     D15,[R8]!
    VST1.32     D20,[R8]!
    VST1.32     D21,[R8]!
    VST1.32     D16,[R8]!
    VST1.32     D17,[R8]!
    VST1.32     D22,[R8]!
    VST1.32     D23,[R8]!
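
    @// Note: this tail block appears to be the peeled final width iteration.
    @// It repeats the conversion arithmetic of the width loop without the
    @// software-pipelined UV/Y loads and prefetches, so the last 16 pixels of
    @// the row pair are converted without reading past the end of the rows.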

    @// Adjust the address pointers
    ADD         R0,R7,R10               @// luma = luma_next + offset
    ADD         R2,R8,R14,LSL #2        @// rgb = rgb_next + offset

    ADD         R7,R0,R3                @// luma_next = luma + width
    ADD         R8,R2,R3,LSL #2         @// rgb_next_row = rgb + width

    ADD         R1,R1,R11               @// adjust the uv pointer
    @ADD        R2,R2,R12               @// adjust v pointer

    ADD         R7,R7,R10               @// luma_next = luma + width + offset (because of register crunch)
    ADD         R8,R8,R14,LSL #2        @// rgb_next_row = rgb + width + offset

    SUBS        R5,R5,#1                @// height_cnt -= 1

    BNE         LABEL_YUV420SP_TO_RGB8888_HEIGHT_LOOP

    @//POP THE REGISTERS
    LDMFD       SP!,{R4-R12,PC}




    .section .note.GNU-stack,"",%progbits