1 /* ------------------------------------------------------------------ 2 * Copyright (C) 1998-2009 PacketVideo 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 13 * express or implied. 14 * See the License for the specific language governing permissions 15 * and limitations under the License. 16 * ------------------------------------------------------------------- 17 */ 18 /*********************************************************************************/ 19 /* Filename: sad_inline.h */ 20 /* Description: Implementation for in-line functions used in dct.cpp */ 21 /* Modified: */ 22 /*********************************************************************************/ 23 #ifndef _SAD_INLINE_H_ 24 #define _SAD_INLINE_H_ 25 26 #ifdef __cplusplus 27 extern "C" 28 { 29 #endif 30 31 #if !defined(PV_ARM_GCC_V5) && !defined(PV_ARM_GCC_V4) /* ARM GNU COMPILER */ 32 SUB_SAD(int32 sad,int32 tmp,int32 tmp2)33 __inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2) 34 { 35 tmp = tmp - tmp2; 36 if (tmp > 0) sad += tmp; 37 else sad -= tmp; 38 39 return sad; 40 } 41 sad_4pixel(int32 src1,int32 src2,int32 mask)42 __inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask) 43 { 44 int32 x7; 45 46 x7 = src2 ^ src1; /* check odd/even combination */ 47 if ((uint32)src2 >= (uint32)src1) 48 { 49 src1 = src2 - src1; /* subs */ 50 } 51 else 52 { 53 src1 = src1 - src2; 54 } 55 x7 = x7 ^ src1; /* only odd bytes need to add carry */ 56 x7 = mask & ((uint32)x7 >> 1); 57 x7 = (x7 << 8) - x7; 58 src1 = src1 + (x7 >> 7); /* add 0xFF to the negative byte, add back carry */ 59 src1 = src1 ^(x7 >> 7); /* take absolute value of negative byte */ 60 61 return src1; 62 } 63 64 #define NUMBER 3 65 #define SHIFT 24 66 67 #include "sad_mb_offset.h" 68 69 #undef NUMBER 70 #define NUMBER 2 71 #undef SHIFT 72 #define SHIFT 16 73 #include "sad_mb_offset.h" 74 75 #undef NUMBER 76 #define NUMBER 1 77 #undef SHIFT 78 #define SHIFT 8 79 #include "sad_mb_offset.h" 80 81 simd_sad_mb(UChar * ref,UChar * blk,Int dmin,Int lx)82 __inline int32 simd_sad_mb(UChar *ref, UChar *blk, Int dmin, Int lx) 83 { 84 int32 x4, x5, x6, x8, x9, x10, x11, x12, x14; 85 86 x9 = 0x80808080; /* const. 

#define NUMBER 3
#define SHIFT 24

#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 2
#undef SHIFT
#define SHIFT 16
#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 1
#undef SHIFT
#define SHIFT 8
#include "sad_mb_offset.h"


    __inline int32 simd_sad_mb(UChar *ref, UChar *blk, Int dmin, Int lx)
    {
        int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

        x9 = 0x80808080; /* const. */

        x8 = (uintptr_t)ref & 0x3;
        if (x8 == 3)
            goto SadMBOffset3;
        if (x8 == 2)
            goto SadMBOffset2;
        if (x8 == 1)
            goto SadMBOffset1;

        //  x5 = (x4 << 8) - x4; /* x5 = x4*255; */
        x4 = x5 = 0;

        x6 = 0xFFFF00FF;

        ref -= lx;
        blk -= 16;

        x8 = 16;

LOOP_SAD0:
        /****** process 8 pixels ******/
        x10 = *((uint32*)(ref += lx));
        x11 = *((uint32*)(ref + 4));
        x12 = *((uint32*)(blk += 16));
        x14 = *((uint32*)(blk + 4));

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;                 /* accumulate low bytes */
        x10 = x10 & (x6 << 8);         /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
        x5 = x5 + x11;                 /* accumulate low bytes */
        x11 = x11 & (x6 << 8);         /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        /****** process 8 pixels ******/
        x10 = *((uint32*)(ref + 8));
        x11 = *((uint32*)(ref + 12));
        x12 = *((uint32*)(blk + 8));
        x14 = *((uint32*)(blk + 12));

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;                 /* accumulate low bytes */
        x10 = x10 & (x6 << 8);         /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
        x5 = x5 + x11;                 /* accumulate low bytes */
        x11 = x11 & (x6 << 8);         /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        /****************/
        x10 = x5 - (x4 << 8);          /* extract low bytes */
        x10 = x10 + x4;                /* add with high bytes */
        x10 = x10 + (x10 << 16);       /* add with lower half word */

        if (((uint32)x10 >> 16) <= (uint32)dmin) /* compare with dmin */
        {
            if (--x8)
            {
                goto LOOP_SAD0;
            }

        }

        return ((uint32)x10 >> 16);

SadMBOffset3:

        return sad_mb_offset3(ref, blk, lx, dmin);

SadMBOffset2:

        return sad_mb_offset2(ref, blk, lx, dmin);

SadMBOffset1:

        return sad_mb_offset1(ref, blk, lx, dmin);

    }
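
    /* What the loop above computes, written out naively: the sum of absolute
       differences over a 16x16 block, where ref has line stride lx and blk
       is packed with a stride of 16.  This scalar sketch (sad_mb_ref is our
       own illustrative name) omits the dmin early-out; the optimized routine
       stops as soon as the running SAD exceeds dmin, since the caller is
       minimizing and will reject such a candidate anyway. */
    __inline int32 sad_mb_ref(UChar *ref, UChar *blk, Int lx)
    {
        int32 row, col, d, sum = 0;

        for (row = 0; row < 16; row++)
        {
            for (col = 0; col < 16; col++)
            {
                d = ref[col] - blk[col];
                sum += (d > 0) ? d : -d;
            }
            ref += lx;   /* next line of the reference frame */
            blk += 16;   /* next line of the packed current block */
        }
        return sum;
    }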

#elif defined(__CC_ARM)  /* only works with ARM v5 */

    __inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2)
    {
        __asm
        {
            rsbs    tmp, tmp, tmp2 ;
            rsbmi   tmp, tmp, #0 ;
            add     sad, sad, tmp ;
        }

        return sad;
    }

    __inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask)
    {
        int32 x7;

        __asm
        {
            EOR     x7, src2, src1;         /* check odd/even combination */
            SUBS    src1, src2, src1;
            EOR     x7, x7, src1;
            AND     x7, mask, x7, lsr #1;
            ORRCC   x7, x7, #0x80000000;
            RSB     x7, x7, x7, lsl #8;
            ADD     src1, src1, x7, asr #7; /* add 0xFF to the negative byte, add back carry */
            EOR     src1, src1, x7, asr #7; /* take absolute value of negative byte */
        }

        return src1;
    }

    __inline int32 sad_4pixelN(int32 src1, int32 src2, int32 mask)
    {
        int32 x7;

        __asm
        {
            EOR     x7, src2, src1;         /* check odd/even combination */
            ADDS    src1, src2, src1;
            EOR     x7, x7, src1;           /* only odd bytes need to add carry */
            ANDS    x7, mask, x7, rrx;
            RSB     x7, x7, x7, lsl #8;
            SUB     src1, src1, x7, asr #7; /* add 0xFF to the negative byte, add back carry */
            EOR     src1, src1, x7, asr #7; /* take absolute value of negative byte */
        }

        return src1;
    }

#define sum_accumulate  __asm{  SBC  x5, x5, x10;          /* accumulate low bytes */  \
                                BIC  x10, x6, x10;         /* x10 & 0xFF00FF00 */      \
                                ADD  x4, x4, x10, lsr #8;  /* accumulate high bytes */ \
                                SBC  x5, x5, x11;          /* accumulate low bytes */  \
                                BIC  x11, x6, x11;         /* x11 & 0xFF00FF00 */      \
                                ADD  x4, x4, x11, lsr #8; }/* accumulate high bytes */


#define NUMBER 3
#define SHIFT 24
#define INC_X8 0x08000001

#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 2
#undef SHIFT
#define SHIFT 16
#undef INC_X8
#define INC_X8 0x10000001
#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 1
#undef SHIFT
#define SHIFT 8
#undef INC_X8
#define INC_X8 0x08000001
#include "sad_mb_offset.h"


    __inline int32 simd_sad_mb(UChar *ref, UChar *blk, Int dmin, Int lx)
    {
        int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

        x9 = 0x80808080; /* const. */
        x4 = x5 = 0;

        __asm
        {
            MOVS    x8, ref, lsl #31 ;
            BHI     SadMBOffset3;
            BCS     SadMBOffset2;
            BMI     SadMBOffset1;

            MVN     x6, #0xFF00;
        }

LOOP_SAD0:
        /****** process 8 pixels ******/
        x11 = *((int32*)(ref + 12));
        x10 = *((int32*)(ref + 8));
        x14 = *((int32*)(blk + 12));
        x12 = *((int32*)(blk + 8));

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;                 /* accumulate low bytes */
        x10 = x10 & (x6 << 8);         /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
        x5 = x5 + x11;                 /* accumulate low bytes */
        x11 = x11 & (x6 << 8);         /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        __asm
        {
            /****** process 8 pixels ******/
            LDR     x11, [ref, #4];
            LDR     x10, [ref], lx ;
            LDR     x14, [blk, #4];
            LDR     x12, [blk], #16 ;
        }

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;                 /* accumulate low bytes */
        x10 = x10 & (x6 << 8);         /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
        x5 = x5 + x11;                 /* accumulate low bytes */
        x11 = x11 & (x6 << 8);         /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        /****************/
        x10 = x5 - (x4 << 8);          /* extract low bytes */
        x10 = x10 + x4;                /* add with high bytes */
        x10 = x10 + (x10 << 16);       /* add with lower half word */

        __asm
        {
            /****************/
            RSBS    x11, dmin, x10, lsr #16;
            ADDLSS  x8, x8, #0x10000001;
            BLS     LOOP_SAD0;
        }

        return ((uint32)x10 >> 16);

SadMBOffset3:

        return sad_mb_offset3(ref, blk, lx, dmin, x8);

SadMBOffset2:

        return sad_mb_offset2(ref, blk, lx, dmin, x8);

SadMBOffset1:

        return sad_mb_offset1(ref, blk, lx, dmin, x8);
    }
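
    /* Two tricks in the routine above are worth spelling out.  First,
       "MOVS x8, ref, lsl #31" shifts the low two bits of the ref address
       into the flags (bit 0 lands in N via bit 31, bit 1 in the carry), so
       BHI/BCS/BMI dispatch to the offset-3/2/1 paths without a separate
       compare.  Second, the loop counter is folded into the flags: for the
       aligned path x8 starts at 0, and "ADDLSS x8, x8, #0x10000001"
       produces a carry-out on the 16th addition, so BLS falls through after
       exactly 16 rows; the INC_X8 constants play the same role in the
       offset variants. */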

#elif ( defined(PV_ARM_GCC_V5) || defined(PV_ARM_GCC_V4) ) /* ARM GNU COMPILER */

    __inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2)
    {
        register int32 out;
        register int32 temp1;
        register int32 ss = sad;
        register int32 tt = tmp;
        register int32 uu = tmp2;

        asm volatile("rsbs  %1, %4, %3\n\t"
                     "rsbmi %1, %1, #0\n\t"
                     "add   %0, %2, %1"
                     : "=&r"(out),
                     "=&r"(temp1)
                     : "r"(ss),
                     "r"(tt),
                     "r"(uu));
        return out;
    }

    __inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask)
    {
        register int32 out;
        register int32 temp1;
        register int32 s1 = src1;
        register int32 s2 = src2;
        register int32 mm = mask;

        asm volatile("eor   %0, %3, %2\n\t"
                     "subs  %1, %3, %2\n\t"
                     "eor   %0, %0, %1\n\t"
                     "and   %0, %4, %0, lsr #1\n\t"
                     "orrcc %0, %0, #0x80000000\n\t"
                     "rsb   %0, %0, %0, lsl #8\n\t"
                     "add   %1, %1, %0, asr #7\n\t"
                     "eor   %1, %1, %0, asr #7"
                     : "=&r"(out),
                     "=&r"(temp1)
                     : "r"(s1),
                     "r"(s2),
                     "r"(mm));

        return temp1;
    }

    __inline int32 sad_4pixelN(int32 src1, int32 src2, int32 mask)
    {
        register int32 out;
        register int32 temp1;
        register int32 s1 = src1;
        register int32 s2 = src2;
        register int32 mm = mask;

        asm volatile("eor  %1, %3, %2\n\t"
                     "adds %0, %3, %2\n\t"
                     "eor  %1, %1, %0\n\t"
                     "ands %1, %4, %1, rrx\n\t"
                     "rsb  %1, %1, %1, lsl #8\n\t"
                     "sub  %0, %0, %1, asr #7\n\t"
                     "eor  %0, %0, %1, asr #7"
                     : "=&r"(out),
                     "=&r"(temp1)
                     : "r"(s1),
                     "r"(s2),
                     "r"(mm));

        return (out);
    }

#define sum_accumulate  asm volatile("sbc  %0, %0, %1\n\t"          \
                                     "bic  %1, %4, %1\n\t"          \
                                     "add  %2, %2, %1, lsr #8\n\t"  \
                                     "sbc  %0, %0, %3\n\t"          \
                                     "bic  %3, %4, %3\n\t"          \
                                     "add  %2, %2, %3, lsr #8"      \
                                     : "+r"(x5), "+r"(x10), "+r"(x4), "+r"(x11) \
                                     : "r"(x6));

#define NUMBER 3
#define SHIFT 24
#define INC_X8 0x08000001

#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 2
#undef SHIFT
#define SHIFT 16
#undef INC_X8
#define INC_X8 0x10000001
#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 1
#undef SHIFT
#define SHIFT 8
#undef INC_X8
#define INC_X8 0x08000001
#include "sad_mb_offset.h"
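
    /* As in the C version, the loop below keeps two packed accumulators:
       x5 sums all bytes of the per-lane differences, while x4 separately
       sums the high byte of each halfword (masked with 0xFF00FF00, then
       shifted down).  The tail arithmetic
           x10 = x5 - (x4 << 8);    -- removes the high-byte contributions,
                                       leaving the low-lane totals
           x10 = x10 + x4;          -- adds the high-lane totals
           x10 = x10 + (x10 << 16); -- folds the two halfword sums together
       leaves the final SAD in the top 16 bits of x10, which is why every
       exit path returns (uint32)x10 >> 16. */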

    __inline int32 simd_sad_mb(UChar *ref, UChar *blk, Int dmin, Int lx)
    {
        int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

        x9 = 0x80808080; /* const. */
        x4 = x5 = 0;

        x8 = (uintptr_t)ref & 0x3;
        if (x8 == 3)
            goto SadMBOffset3;
        if (x8 == 2)
            goto SadMBOffset2;
        if (x8 == 1)
            goto SadMBOffset1;

        x8 = 16; /* 16 rows, counted down by the --x8 below */

        asm volatile("mvn %0, #0xFF00": "=r"(x6));

LOOP_SAD0:
        /****** process 8 pixels ******/
        x11 = *((int32*)(ref + 12));
        x10 = *((int32*)(ref + 8));
        x14 = *((int32*)(blk + 12));
        x12 = *((int32*)(blk + 8));

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;                 /* accumulate low bytes */
        x10 = x10 & (x6 << 8);         /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
        x5 = x5 + x11;                 /* accumulate low bytes */
        x11 = x11 & (x6 << 8);         /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        asm volatile("ldr  %0, [%4, #4]\n\t"
                     "ldr  %1, [%4], %6\n\t"
                     "ldr  %2, [%5, #4]\n\t"
                     "ldr  %3, [%5], #16"
                     : "=r"(x11), "=r"(x10), "=r"(x14), "=r"(x12), "+r"(ref), "+r"(blk)
                     : "r"(lx));

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;                 /* accumulate low bytes */
        x10 = x10 & (x6 << 8);         /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
        x5 = x5 + x11;                 /* accumulate low bytes */
        x11 = x11 & (x6 << 8);         /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        /****************/
        x10 = x5 - (x4 << 8);          /* extract low bytes */
        x10 = x10 + x4;                /* add with high bytes */
        x10 = x10 + (x10 << 16);       /* add with lower half word */

        if (((uint32)x10 >> 16) <= (uint32)dmin) /* compare with dmin */
        {
            if (--x8)
            {
                goto LOOP_SAD0;
            }

        }

        return ((uint32)x10 >> 16);

SadMBOffset3:

        return sad_mb_offset3(ref, blk, lx, dmin);

SadMBOffset2:

        return sad_mb_offset2(ref, blk, lx, dmin);

SadMBOffset1:

        return sad_mb_offset1(ref, blk, lx, dmin);
    }

#endif // compiler selection

#ifdef __cplusplus
}
#endif

#endif // _SAD_INLINE_H_