@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@/*******************************************************************************
@* @file
@*  ihevcd_fmt_conv_420sp_to_rgba8888.s
@*
@* @brief
@*  Contains the ARM NEON implementation of the YUV 4:2:0 semi-planar
@*  (interleaved UV) to RGBA8888 format conversion.
@*
@* @author
@*  ittiam
@*
@* @par list of functions:
@*  ihevcd_fmt_conv_420sp_to_rgba8888_a9q
@*
@* @remarks
@*  none
@*
@*******************************************************************************/
    .equ DO1STROUNDING, 0

    @ ARM
    @
    @ PRESERVE8

.text
.p2align 2




@/*****************************************************************************
@*                                                                           *
@*  Function Name    : ihevcd_fmt_conv_420sp_to_rgba8888()                   *
@*                                                                           *
@*  Description      : Converts an image from YUV 4:2:0 semi-planar          *
@*                     (one luma plane + one interleaved UV plane) colour    *
@*                     space to RGBA8888. The function can be invoked at     *
@*                     the MB level.                                         *
@*                                                                           *
@*  Arguments        : R0          pubY       (luma plane pointer)           *
@*                     R1          pubUV      (interleaved UV pointer)       *
@*                     R2          pusRGB     (RGBA output pointer)          *
@*                     R3          width      (in pixels)                    *
@*                     [SP #104]   usHeight   (after reg save, see below)    *
@*                     [SP #108]   usStrideY                                 *
@*                     [SP #112]   usStrideUV                                *
@*                     [SP #116]   usStrideRGB (in pixels)                   *
@*                     NOTE(review): the offsets above are as read by the    *
@*                     code after the 104-byte register save (10 core regs   *
@*                     + d8-d15); older header text gave pre-save offsets.   *
@*                                                                           *
@*  Values Returned  : None                                                  *
@*                                                                           *
@*  Register Usage   : R0 - R14                                              *
@*                                                                           *
@*  Stack Usage      : 104 Bytes                                             *
@*                                                                           *
@*  Interruptibility : Interruptible                                         *
@*                                                                           *
@*  Known Limitations                                                        *
@*       Assumptions: Image Width:  Assumed to be multiple of 16 and         *
@*                                  greater than or equal to 16              *
@*                    Image Height: Assumed to be even.                      *
@*                                                                           *
@*  Revision History :                                                       *
@*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  *
@*         07 06 2010   Varshita        Draft                                *
@*         07 06 2010   Naveen Kr T     Completed                            *
@*         05 08 2013   Naveen K P      Modified for HEVC                    *
@*         30 10 2018   Saurabh Sood    Store D registers to stack           *
@*****************************************************************************/
    .global ihevcd_fmt_conv_420sp_to_rgba8888_a9q
.type ihevcd_fmt_conv_420sp_to_rgba8888_a9q, function
ihevcd_fmt_conv_420sp_to_rgba8888_a9q:

    @// Save callee-saved core registers and d8-d15 (AAPCS requires d8-d15
    @// be preserved).  Total save = 10*4 + 8*8 = 104 bytes, which is why
    @// the stack arguments are read at [sp,#104..116] below.
    STMFD  SP!,{R4-R12,LR}
    VPUSH  {d8-d15}

    @// Register roles after argument loading:
    @//R0  - Y pointer (current row)
    @//R1  - interleaved UV pointer
    @//R2  - RGBA pointer (current row)
    @//R3  - picture width in pixels
    @//R5  - picture height (then height/2 loop counter)
    @//R6  - Y stride (later reused as the width loop counter)
    @//R7  - UV stride (later reused as next-row Y pointer)
    @//R8  - next-row RGBA pointer
    @//R9  - RGBA stride (in pixels)

    @//Two rows are processed per height-loop iteration (they share chroma).

    @//The four Q13 fixed-point conversion coefficients are:
    @//C1=0x3311, C2=0xF379, C3=0xE5F8, C4=0x4092
    @//  R = Y + (C1*(V-128)) >> 13
    @//  G = Y + (C2*(U-128) + C3*(V-128)) >> 13
    @//  B = Y + (C4*(U-128)) >> 13

    @PLD        [R0]
    @PLD        [R1]
    @PLD        [R2]


    @/* can be loaded from a defined const type */
    @// Pack the four coefficients into D0 lanes so VMULL can use
    @// the scalar-by-element form D0[n].
    MOVW R10,#0x3311
    VMOV.16 D0[0],R10                  @//C1 (V coefficient for R)

    MOVW R10,#0xF379
    VMOV.16 D0[1],R10                  @//C2 (U coefficient for G, negative)

    MOVW R10,#0xE5F8
    VMOV.16 D0[2],R10                  @//C3 (V coefficient for G, negative)

    MOVW R10,#0x4092
    VMOV.16 D0[3],R10                  @//C4 (U coefficient for B)

    @//LOAD CONSTANT 128 INTO A CORTEX REGISTER
    @// D1 = 128 in every byte; used to centre the chroma samples.
    MOV  R10,#128
    VDUP.8 D1,R10

    @//D0 HAS C1-C2-C3-C4
    @// load the stack parameters (offsets account for the 104-byte save)
    LDR  R5,[sp,#104]                  @// height
    @LDR  R4,[sp,#44]                  @// width load disabled; R3 is used as width
    LDR  R6,[sp,#108]                  @// Y stride
    LDR  R7,[sp,#112]                  @// UV stride
    @LDR  R8,[sp,#52]
    LDR  R9,[sp,#116]                  @// RGBA stride (pixels)

    @// calculate offsets, offset = stride - width
    SUB  R10,R6,R3                     @// luma row offset
    SUB  R11,R7,R3                     @// UV row offset (UV plane is width bytes
    @, LSR #1                          @//  per row: 8 U + 8 V per 16 luma pixels)
    @SUB  R12,R8,R3, LSR #1            @// v offset (separate-plane variant, unused)
    SUB  R14,R9,R3                     @// RGBA row offset in pixels

    @// calculate height loop count
    MOV  R5,R5, LSR #1                 @// height_cnt = height / 2 (two rows/iter)

    @// create next row pointers for rgb and luma data
    ADD  R7,R0,R6                      @// luma_next_row = luma + luma_stride
    ADD  R8,R2,R9,LSL #2               @// rgb_next_row = rgb + rgb_stride*4 bytes

LABEL_YUV420SP_TO_RGB8888_HEIGHT_LOOP:

    @//Load interleaved UV for the first 16-pixel chunk of this row pair.
    VLD1.8  {D2,D3},[R1]!              @//LOAD 8 VALUES OF UV
    @//VLD1.8  {D3},[R2]!              @//LOAD 8 VALUES OF V

    @// calculate width loop count
    MOV  R6,R3, LSR #4                 @// width_cnt = width / 16

    @//Load luma for both rows, de-interleaved into even/odd samples:
    @//D30 = Y0,Y2,...,Y14 (row 1), D31 = Y1,Y3,...,Y15 (row 1)
    VLD2.8  {D30,D31},[R0]!
    @//D28/D29: same even/odd split for row 2
    VLD2.8  {D28,D29},[R7]!

    SUBS  R6,R6,#1                     @// last chunk is handled outside the loop
    BEQ   LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP

LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP:
    @VMOV.I8 Q1,#128
    VUZP.8  D2,D3                      @// split interleaved UV: D2 = U, D3 = V


    @//Centre chroma: widen to s16 and subtract 128.
    @//(D2-D1),(D3-D1)
    VSUBL.U8 Q2,D2,D1                  @//(U-128)
    VSUBL.U8 Q3,D3,D1                  @//(V-128)

    @//Pre-load UV for the next 16-pixel chunk (pipelined).
    VLD1.8  {D2,D3},[R1]!              @//LOAD 8 VALUES OF U
    @//VLD1.8  {D3},[R2]!              @//LOAD 8 VALUES OF V

    @PLD  [R0]
    PLD   [R1]

    @//Multiply centred chroma by the Q13 coefficients.
    VMULL.S16 Q4,D4,D0[3]              @//(U-128)*C4 FOR B
    VMULL.S16 Q5,D5,D0[3]              @//(U-128)*C4 FOR B

    VMULL.S16 Q10,D6,D0[0]             @//(V-128)*C1 FOR R
    VMULL.S16 Q11,D7,D0[0]             @//(V-128)*C1 FOR R

    VMULL.S16 Q6,D4,D0[1]              @//(U-128)*C2 FOR G
    VMLAL.S16 Q6,D6,D0[2]              @//Q6 = (U-128)*C2 + (V-128)*C3
    VMULL.S16 Q7,D5,D0[1]              @//(U-128)*C2 FOR G
    VMLAL.S16 Q7,D7,D0[2]              @//Q7 = (U-128)*C2 + (V-128)*C3

    @//Saturating narrow shift by 13 (Q13 -> integer) for B.
    VQSHRN.S32 D8,Q4,#13               @//D8 = (U-128)*C4>>13, 4 16-BIT VALUES
    VQSHRN.S32 D9,Q5,#13               @//D9 = (U-128)*C4>>13, 4 16-BIT VALUES
    @//Q4 - WEIGHT FOR B

    @//Saturating narrow shift by 13 for R.
    VQSHRN.S32 D10,Q10,#13             @//D10 = (V-128)*C1>>13, 4 16-BIT VALUES
    VQSHRN.S32 D11,Q11,#13             @//D11 = (V-128)*C1>>13, 4 16-BIT VALUES
    @//Q5 - WEIGHT FOR R

    @//Saturating narrow shift by 13 for G.
    VQSHRN.S32 D12,Q6,#13              @//D12 = [(U-128)*C2 + (V-128)*C3]>>13
    VQSHRN.S32 D13,Q7,#13              @//D13 = [(U-128)*C2 + (V-128)*C3]>>13
    @//Q6 - WEIGHT FOR G

    @//Row 1: add the shared chroma weights to the even luma samples (D30)
    @//and the odd luma samples (D31); each weight covers a horizontal pair.
    VADDW.U8 Q7,Q4,D30                 @//Q7  - Y + B (even pixels)
    VADDW.U8 Q8,Q5,D30                 @//Q8  - Y + R (even pixels)
    VADDW.U8 Q9,Q6,D30                 @//Q9  - Y + G (even pixels)

    VADDW.U8 Q10,Q4,D31                @//Q10 - Y + B (odd pixels)
    VADDW.U8 Q11,Q5,D31                @//Q11 - Y + R (odd pixels)
    VADDW.U8 Q12,Q6,D31                @//Q12 - Y + G (odd pixels)

    @//Saturate to u8 and interleave into 32-bit pixels.
    @//Byte order per pixel after the zips: B,G,R,0 (alpha byte = 0).
    VQMOVUN.S16 D14,Q7
    VQMOVUN.S16 D15,Q9
    VQMOVUN.S16 D16,Q8
    VMOV.I8 D17,#0                     @// alpha channel = 0

    VZIP.8  D14,D15                    @// B,G interleave (even pixels)
    VZIP.8  D16,D17                    @// R,A interleave (even pixels)
    VZIP.16 Q7,Q8                      @// -> B,G,R,A quads (even pixels)


    VQMOVUN.S16 D20,Q10
    VQMOVUN.S16 D21,Q12
    VQMOVUN.S16 D22,Q11
    VMOV.I8 D23,#0                     @// alpha channel = 0

    VZIP.8  D20,D21                    @// B,G interleave (odd pixels)
    VZIP.8  D22,D23                    @// R,A interleave (odd pixels)
    VZIP.16 Q10,Q11                    @// -> B,G,R,A quads (odd pixels)

    VZIP.32 Q7,Q10                     @// merge even/odd back into pixel order
    VZIP.32 Q8,Q11

    @//Store 16 RGBA pixels of row 1 (pixel order: D14,D15,D20,D21,D16,D17,D22,D23).
    VST1.32 D14,[R2]!
    VST1.32 D15,[R2]!
    VST1.32 D20,[R2]!
    VST1.32 D21,[R2]!
    VST1.32 D16,[R2]!
    VST1.32 D17,[R2]!
    VST1.32 D22,[R2]!
    VST1.32 D23,[R2]!

    @//Row 2: reuse the same chroma weights (Q4/Q5/Q6) with row-2 luma
    @//(D28 = even samples, D29 = odd samples).
    VADDW.U8 Q7,Q4,D28                 @//Q7  - Y + B (even pixels)
    VADDW.U8 Q8,Q5,D28                 @//Q8  - Y + R (even pixels)
    VADDW.U8 Q9,Q6,D28                 @//Q9  - Y + G (even pixels)

    VADDW.U8 Q10,Q4,D29                @//Q10 - Y + B (odd pixels)
    VADDW.U8 Q11,Q5,D29                @//Q11 - Y + R (odd pixels)
    VADDW.U8 Q12,Q6,D29                @//Q12 - Y + G (odd pixels)

    @//Pre-load luma for the next 16-pixel chunk of both rows (pipelined).
    VLD2.8  {D30,D31},[R0]!            @//row 1: even samples -> D30, odd -> D31
    VLD2.8  {D28,D29},[R7]!            @//row 2: even samples -> D28, odd -> D29

    PLD  [R0]
    PLD  [R7]

    @//Same saturate/interleave lattice as row 1.
    VQMOVUN.S16 D14,Q7
    VQMOVUN.S16 D15,Q9
    VQMOVUN.S16 D16,Q8
    VMOV.I8 D17,#0                     @// alpha channel = 0

    VZIP.8  D14,D15
    VZIP.8  D16,D17
    VZIP.16 Q7,Q8


    VQMOVUN.S16 D20,Q10
    VQMOVUN.S16 D21,Q12
    VQMOVUN.S16 D22,Q11
    VMOV.I8 D23,#0                     @// alpha channel = 0

    VZIP.8  D20,D21
    VZIP.8  D22,D23
    VZIP.16 Q10,Q11

    VZIP.32 Q7,Q10
    VZIP.32 Q8,Q11

    @//Store 16 RGBA pixels of row 2.
    VST1.32 D14,[R8]!
    VST1.32 D15,[R8]!
    VST1.32 D20,[R8]!
    VST1.32 D21,[R8]!
    VST1.32 D16,[R8]!
    VST1.32 D17,[R8]!
    VST1.32 D22,[R8]!
    VST1.32 D23,[R8]!

    SUBS R6,R6,#1                      @// width_cnt -= 1
    BNE  LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP

LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP:
    @//Epilogue for the last 16-pixel chunk of the row pair: identical to the
    @//loop body, except no next-chunk UV/luma pre-loads are issued.
    @VMOV.I8 Q1,#128
    VUZP.8  D2,D3                      @// split interleaved UV: D2 = U, D3 = V


    @//Centre chroma: widen to s16 and subtract 128.
    @//(D2-D1),(D3-D1)
    VSUBL.U8 Q2,D2,D1                  @//(U-128)
    VSUBL.U8 Q3,D3,D1                  @//(V-128)


    @//Multiply centred chroma by the Q13 coefficients.
    VMULL.S16 Q4,D4,D0[3]              @//(U-128)*C4 FOR B
    VMULL.S16 Q5,D5,D0[3]              @//(U-128)*C4 FOR B

    VMULL.S16 Q10,D6,D0[0]             @//(V-128)*C1 FOR R
    VMULL.S16 Q11,D7,D0[0]             @//(V-128)*C1 FOR R

    VMULL.S16 Q6,D4,D0[1]              @//(U-128)*C2 FOR G
    VMLAL.S16 Q6,D6,D0[2]              @//Q6 = (U-128)*C2 + (V-128)*C3
    VMULL.S16 Q7,D5,D0[1]              @//(U-128)*C2 FOR G
    VMLAL.S16 Q7,D7,D0[2]              @//Q7 = (U-128)*C2 + (V-128)*C3

    @//Saturating narrow shift by 13 (Q13 -> integer) for B.
    VQSHRN.S32 D8,Q4,#13               @//D8 = (U-128)*C4>>13, 4 16-BIT VALUES
    VQSHRN.S32 D9,Q5,#13               @//D9 = (U-128)*C4>>13, 4 16-BIT VALUES
    @//Q4 - WEIGHT FOR B

    @//Saturating narrow shift by 13 for R.
    VQSHRN.S32 D10,Q10,#13             @//D10 = (V-128)*C1>>13, 4 16-BIT VALUES
    VQSHRN.S32 D11,Q11,#13             @//D11 = (V-128)*C1>>13, 4 16-BIT VALUES
    @//Q5 - WEIGHT FOR R

    @//Saturating narrow shift by 13 for G.
    VQSHRN.S32 D12,Q6,#13              @//D12 = [(U-128)*C2 + (V-128)*C3]>>13
    VQSHRN.S32 D13,Q7,#13              @//D13 = [(U-128)*C2 + (V-128)*C3]>>13
    @//Q6 - WEIGHT FOR G

    @//Row 1: add chroma weights to even (D30) and odd (D31) luma samples.
    VADDW.U8 Q7,Q4,D30                 @//Q7  - Y + B (even pixels)
    VADDW.U8 Q8,Q5,D30                 @//Q8  - Y + R (even pixels)
    VADDW.U8 Q9,Q6,D30                 @//Q9  - Y + G (even pixels)

    VADDW.U8 Q10,Q4,D31                @//Q10 - Y + B (odd pixels)
    VADDW.U8 Q11,Q5,D31                @//Q11 - Y + R (odd pixels)
    VADDW.U8 Q12,Q6,D31                @//Q12 - Y + G (odd pixels)

    @//Saturate to u8 and interleave into B,G,R,0 pixels.
    VQMOVUN.S16 D14,Q7
    VQMOVUN.S16 D15,Q9
    VQMOVUN.S16 D16,Q8
    VMOV.I8 D17,#0                     @// alpha channel = 0

    VZIP.8  D14,D15
    VZIP.8  D16,D17
    VZIP.16 Q7,Q8


    VQMOVUN.S16 D20,Q10
    VQMOVUN.S16 D21,Q12
    VQMOVUN.S16 D22,Q11
    VMOV.I8 D23,#0                     @// alpha channel = 0

    VZIP.8  D20,D21
    VZIP.8  D22,D23
    VZIP.16 Q10,Q11

    VZIP.32 Q7,Q10                     @// merge even/odd back into pixel order
    VZIP.32 Q8,Q11

    @//Store the last 16 RGBA pixels of row 1.
    VST1.32 D14,[R2]!
    VST1.32 D15,[R2]!
    VST1.32 D20,[R2]!
    VST1.32 D21,[R2]!
    VST1.32 D16,[R2]!
    VST1.32 D17,[R2]!
    VST1.32 D22,[R2]!
    VST1.32 D23,[R2]!

    @//Row 2: reuse the same chroma weights with row-2 luma (D28/D29).
    VADDW.U8 Q7,Q4,D28                 @//Q7  - Y + B (even pixels)
    VADDW.U8 Q8,Q5,D28                 @//Q8  - Y + R (even pixels)
    VADDW.U8 Q9,Q6,D28                 @//Q9  - Y + G (even pixels)

    VADDW.U8 Q10,Q4,D29                @//Q10 - Y + B (odd pixels)
    VADDW.U8 Q11,Q5,D29                @//Q11 - Y + R (odd pixels)
    VADDW.U8 Q12,Q6,D29                @//Q12 - Y + G (odd pixels)


    VQMOVUN.S16 D14,Q7
    VQMOVUN.S16 D15,Q9
    VQMOVUN.S16 D16,Q8
    VMOV.I8 D17,#0                     @// alpha channel = 0

    VZIP.8  D14,D15
    VZIP.8  D16,D17
    VZIP.16 Q7,Q8


    VQMOVUN.S16 D20,Q10
    VQMOVUN.S16 D21,Q12
    VQMOVUN.S16 D22,Q11
    VMOV.I8 D23,#0                     @// alpha channel = 0

    VZIP.8  D20,D21
    VZIP.8  D22,D23
    VZIP.16 Q10,Q11

    VZIP.32 Q7,Q10
    VZIP.32 Q8,Q11

    @//Store the last 16 RGBA pixels of row 2.
    VST1.32 D14,[R8]!
    VST1.32 D15,[R8]!
    VST1.32 D20,[R8]!
    VST1.32 D21,[R8]!
    VST1.32 D16,[R8]!
    VST1.32 D17,[R8]!
    VST1.32 D22,[R8]!
    VST1.32 D23,[R8]!

    @// Advance all pointers past the row pair just converted.
    ADD  R0,R7,R10                     @// luma = luma_next + offset
    ADD  R2,R8,R14,LSL #2              @// rgb = rgb_next + offset (pixels*4)

    ADD  R7,R0,R3                      @// luma_next = luma + width
    ADD  R8,R2,R3,LSL #2               @// rgb_next_row = rgb + width*4

    ADD  R1,R1,R11                     @// adjust uv pointer by its row offset
    @ADD  R2,R2,R12                    @// adjust v pointer (separate-plane variant)

    ADD  R7,R7,R10                     @// luma_next += offset (register crunch:
    ADD  R8,R8,R14,LSL #2              @// rgb_next  += offset, added separately)

    SUBS R5,R5,#1                      @// height_cnt -= 1

    BNE  LABEL_YUV420SP_TO_RGB8888_HEIGHT_LOOP

    @//Restore saved registers and return.
    VPOP {d8-d15}
    LDMFD SP!,{R4-R12,PC}


    .section .note.GNU-stack,"",%progbits