;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    .globl vp8_sub_pixel_variance4x4_ppc
    .globl vp8_sub_pixel_variance8x8_ppc
    .globl vp8_sub_pixel_variance8x16_ppc
    .globl vp8_sub_pixel_variance16x8_ppc
    .globl vp8_sub_pixel_variance16x16_ppc

.macro load_c V, LABEL, OFF, R0, R1
    lis     \R0, \LABEL@ha
    la      \R1, \LABEL@l(\R0)
    lvx     \V, \OFF, \R1
.endm

.macro load_vfilter V0, V1
    load_c \V0, vfilter_b, r6, r12, r10

    addi    r6,  r6, 16
    lvx     \V1, r6, r10
.endm

.macro HProlog jump_label
    ;# load up horizontal filter
    slwi.   r5, r5, 4           ;# index into horizontal filter array

    ;# index to the next set of vectors in the row.
    li      r10, 16

    ;# downshift by 7 ( divide by 128 ) at the end
    vspltish v19, 7

    ;# If there isn't any filtering to be done for the horizontal, then
    ;#  just skip to the second pass.
    beq     \jump_label

    load_c v20, hfilter_b, r5, r12, r0

    ;# setup constants
    ;# v28 permutation value for reordering the packed filter output
    load_c v28, b_hperm_b, 0, r12, r0

    ;# index to the next set of vectors in the row.
    li      r12, 32

    ;# rounding added in on the multiply
    vspltisw v21, 8
    vspltisw v18, 3
    vslw    v18, v21, v18       ;# 0x00000040000000400000004000000040

    ;# record form sets cr0 for the vertical-filter beq in each caller
    slwi.   r6, r6, 5           ;# index into vertical filter array
.endm

;# Filters a horizontal line
;# expects:
;#  r3  src_ptr
;#  r4  pitch
;#  r10 16
;#  r12 32
;#  v17 perm input
;#  v18 rounding
;#  v19 shift
;#  v20 filter taps
;#  v21 tmp
;#  v22 tmp
;#  v23 tmp
;#  v24 tmp
;#  v25 tmp
;#  v26 tmp
;#  v27 tmp
;#  v28 perm output
;#
.macro hfilter_8 V, hp, lp, increment_counter
    lvsl    v17,  0, r3         ;# permute value for alignment

    ;# input to filter is 9 bytes wide, output is 8 bytes.
    lvx     v21,   0, r3
    lvx     v22, r10, r3

.if \increment_counter
    add     r3, r3, r4
.endif
    vperm   v21, v21, v22, v17

    vperm   v24, v21, v21, \hp  ;# v24 = 0123 1234 2345 3456
    vperm   v25, v21, v21, \lp  ;# v25 = 4567 5678 6789 789A

    vmsummbm v24, v20, v24, v18
    vmsummbm v25, v20, v25, v18

    vpkswus v24, v24, v25       ;# v24 = p0 p1 p2 p3 p4 p5 p6 p7 (16-bit)

    vsrh    v24, v24, v19       ;# divide by 128

    vpkuhus \V, v24, v24        ;# \V = 8-bit result, in order
.endm

.macro vfilter_16 P0 P1
    vmuleub v22, \P0, v20       ;# 64 + 4 positive taps
    vadduhm v22, v18, v22
    vmuloub v23, \P0, v20
    vadduhm v23, v18, v23

    vmuleub v24, \P1, v21
    vadduhm v22, v22, v24       ;# Re = evens, saturation unnecessary
    vmuloub v25, \P1, v21
    vadduhm v23, v23, v25       ;# Ro = odds

    vsrh    v22, v22, v19       ;# divide by 128
    vsrh    v23, v23, v19       ;# v22 v23 = evens, odds
    vmrghh  \P0, v22, v23       ;# merge to 16-bit results in order
    vmrglh  v23, v22, v23
    vpkuhus \P0, \P0, v23       ;# P0 = 8-bit result
.endm
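;# For reference, both filter passes above compute the same 2-tap
;#  bilinear interpolation: horizontally in hfilter_8 (and hfilter_16
;#  below), vertically in vfilter_16.  A rough scalar C equivalent
;#  (hypothetical helper, not part of the build):
;#
;#      /* t0 + t1 == 128 always; stride is 1 for the horizontal
;#         pass and the row pitch for the vertical pass. */
;#      static unsigned char bilinear_tap(const unsigned char *p,
;#                                        int stride, int t0, int t1)
;#      {
;#          /* v18 supplies the +64 rounding, v19 the >>7 shift */
;#          return (unsigned char)((t0 * p[0] + t1 * p[stride] + 64) >> 7);
;#      }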
.macro compute_sum_sse src, ref, sum, sse, t1, t2, z0
    ;# Compute sum first.  Unpack so that the signed subtract can be
    ;#  used; only a halfword signed subtract is available.  Do high,
    ;#  then low.
    vmrghb  \t1, \z0, \src
    vmrghb  \t2, \z0, \ref
    vsubshs \t1, \t1, \t2
    vsum4shs \sum, \t1, \sum

    vmrglb  \t1, \z0, \src
    vmrglb  \t2, \z0, \ref
    vsubshs \t1, \t1, \t2
    vsum4shs \sum, \t1, \sum

    ;# Now compute sse.
    vsububs \t1, \src, \ref    ;# absolute difference via saturating
    vsububs \t2, \ref, \src    ;#  subtracts in both directions
    vor     \t1, \t1, \t2

    vmsumubm \sse, \t1, \t1, \sse
.endm

.macro variance_final sum, sse, z0, DS
    vsumsws \sum, \sum, \z0
    vsumsws \sse, \sse, \z0

    stvx    \sum, 0, r1
    lwz     r3, 12(r1)

    stvx    \sse, 0, r1
    lwz     r4, 12(r1)

    stw     r4, 0(r9)           ;# sse

    mullw   r3, r3, r3          ;# sum*sum
    srawi   r3, r3, \DS         ;# (sum*sum) >> DS
    subf    r3, r3, r4          ;# sse - ((sum*sum) >> DS)
.endm

.macro compute_sum_sse_16 V, increment_counter
    load_and_align_16 v16, r7, r8, \increment_counter
    compute_sum_sse \V, v16, v18, v19, v20, v21, v23
.endm

.macro load_and_align_16 V, R, P, increment_counter
    lvsl    v17,  0, \R         ;# permute value for alignment

    ;# output is 16 bytes; the input can span two vectors
    ;#  if not aligned correctly.
    lvx     v21,   0, \R
    lvx     v22, r10, \R

.if \increment_counter
    add     \R, \R, \P
.endif

    vperm   \V, v21, v22, v17
.endm
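;# For reference, compute_sum_sse and variance_final together implement
;#  the usual one-pass variance identity, with DS = log2(pixel count):
;#  4, 6, 7, 7, 8 for the 4x4, 8x8, 8x16, 16x8 and 16x16 blocks.
;#  A rough scalar C equivalent (hypothetical, not part of the build):
;#
;#      unsigned int variance(const unsigned char *src, int src_stride,
;#                            const unsigned char *ref, int ref_stride,
;#                            int w, int h, int ds, unsigned int *sse)
;#      {
;#          int sum = 0;
;#          unsigned int sse_acc = 0;
;#          for (int i = 0; i < h; i++, src += src_stride, ref += ref_stride)
;#              for (int j = 0; j < w; j++) {
;#                  int d = src[j] - ref[j];
;#                  sum += d;              /* vsubshs / vsum4shs above */
;#                  sse_acc += d * d;      /* vsububs / vmsumubm above */
;#              }
;#          *sse = sse_acc;
;#          return sse_acc - (unsigned int)((sum * sum) >> ds);
;#      }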
    .align 2
;# r3 unsigned char *src_ptr
;# r4 int src_pixels_per_line
;# r5 int xoffset
;# r6 int yoffset
;# r7 unsigned char *dst_ptr
;# r8 int dst_pixels_per_line
;# r9 unsigned int *sse
;#
;# r3 return value
vp8_sub_pixel_variance4x4_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xf830
    ori     r12, r12, 0xfff8
    mtspr   256, r12            ;# set VRSAVE

    stwu    r1, -32(r1)         ;# create space on the stack

    HProlog second_pass_4x4_pre_copy_b

    ;# Load up permutation constants
    load_c v10, b_0123_b, 0, r12, r0
    load_c v11, b_4567_b, 0, r12, r0

    hfilter_8 v0, v10, v11, 1
    hfilter_8 v1, v10, v11, 1
    hfilter_8 v2, v10, v11, 1
    hfilter_8 v3, v10, v11, 1

    ;# Finished filtering main horizontal block.  If there is no
    ;#  vertical filtering, jump to computing the sum and sse.
    ;#  Otherwise load up and filter the additional line that is
    ;#  needed for the vertical filter.
    beq     compute_sum_sse_4x4_b

    hfilter_8 v4, v10, v11, 0

    b   second_pass_4x4_b

second_pass_4x4_pre_copy_b:
    slwi    r6, r6, 5           ;# index into vertical filter array
                                ;# (no yoffset==0 shortcut here; the
                                ;#  128,0 phase acts as an identity filter)

    load_and_align_16 v0, r3, r4, 1
    load_and_align_16 v1, r3, r4, 1
    load_and_align_16 v2, r3, r4, 1
    load_and_align_16 v3, r3, r4, 1
    load_and_align_16 v4, r3, r4, 0

second_pass_4x4_b:
    vspltish v20, 8
    vspltish v18, 3
    vslh    v18, v20, v18       ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    load_vfilter v20, v21

    vfilter_16 v0, v1
    vfilter_16 v1, v2
    vfilter_16 v2, v3
    vfilter_16 v3, v4

compute_sum_sse_4x4_b:
    vspltish v18, 0             ;# sum
    vspltish v19, 0             ;# sse
    vspltish v23, 0             ;# unpack
    li      r10, 16

    load_and_align_16 v4, r7, r8, 1
    load_and_align_16 v5, r7, r8, 1
    load_and_align_16 v6, r7, r8, 1
    load_and_align_16 v7, r7, r8, 1

    vmrghb  v0, v0, v1
    vmrghb  v1, v2, v3

    vmrghb  v2, v4, v5
    vmrghb  v3, v6, v7

    load_c v10, b_hilo_b, 0, r12, r0

    vperm   v0, v0, v1, v10
    vperm   v1, v2, v3, v10

    compute_sum_sse v0, v1, v18, v19, v20, v21, v23

    variance_final v18, v19, v23, 4

    addi    r1, r1, 32          ;# recover stack
    mtspr   256, r11            ;# reset old VRSAVE

    blr
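;# The four entry points that follow share the calling convention
;#  documented above.  The matching C prototype is assumed to be the
;#  standard vp8 one (check vp8/encoder/variance.h); shown here for
;#  the 8x8 case:
;#
;#      unsigned int vp8_sub_pixel_variance8x8_ppc(
;#          unsigned char *src_ptr, int src_pixels_per_line,
;#          int xoffset, int yoffset,
;#          unsigned char *dst_ptr, int dst_pixels_per_line,
;#          unsigned int *sse);
;#
;#  xoffset/yoffset (0..7) select a phase in hfilter_b / vfilter_b;
;#  zero means no subpixel filtering in that direction.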
    .align 2
;# r3 unsigned char *src_ptr
;# r4 int src_pixels_per_line
;# r5 int xoffset
;# r6 int yoffset
;# r7 unsigned char *dst_ptr
;# r8 int dst_pixels_per_line
;# r9 unsigned int *sse
;#
;# r3 return value
vp8_sub_pixel_variance8x8_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xfff0
    ori     r12, r12, 0xffff
    mtspr   256, r12            ;# set VRSAVE

    stwu    r1, -32(r1)         ;# create space on the stack

    HProlog second_pass_8x8_pre_copy_b

    ;# Load up permutation constants
    load_c v10, b_0123_b, 0, r12, r0
    load_c v11, b_4567_b, 0, r12, r0

    hfilter_8 v0, v10, v11, 1
    hfilter_8 v1, v10, v11, 1
    hfilter_8 v2, v10, v11, 1
    hfilter_8 v3, v10, v11, 1
    hfilter_8 v4, v10, v11, 1
    hfilter_8 v5, v10, v11, 1
    hfilter_8 v6, v10, v11, 1
    hfilter_8 v7, v10, v11, 1

    ;# Finished filtering main horizontal block.  If there is no
    ;#  vertical filtering, jump to computing the sum and sse.
    ;#  Otherwise load up and filter the additional line that is
    ;#  needed for the vertical filter.
    beq     compute_sum_sse_8x8_b

    hfilter_8 v8, v10, v11, 0

    b   second_pass_8x8_b

second_pass_8x8_pre_copy_b:
    slwi.   r6, r6, 5           ;# index into vertical filter array

    load_and_align_16 v0, r3, r4, 1
    load_and_align_16 v1, r3, r4, 1
    load_and_align_16 v2, r3, r4, 1
    load_and_align_16 v3, r3, r4, 1
    load_and_align_16 v4, r3, r4, 1
    load_and_align_16 v5, r3, r4, 1
    load_and_align_16 v6, r3, r4, 1
    load_and_align_16 v7, r3, r4, 1
    load_and_align_16 v8, r3, r4, 0

    beq     compute_sum_sse_8x8_b

second_pass_8x8_b:
    vspltish v20, 8
    vspltish v18, 3
    vslh    v18, v20, v18       ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    load_vfilter v20, v21

    vfilter_16 v0, v1
    vfilter_16 v1, v2
    vfilter_16 v2, v3
    vfilter_16 v3, v4
    vfilter_16 v4, v5
    vfilter_16 v5, v6
    vfilter_16 v6, v7
    vfilter_16 v7, v8

compute_sum_sse_8x8_b:
    vspltish v18, 0             ;# sum
    vspltish v19, 0             ;# sse
    vspltish v23, 0             ;# unpack
    li      r10, 16

    vmrghb  v0, v0, v1
    vmrghb  v1, v2, v3
    vmrghb  v2, v4, v5
    vmrghb  v3, v6, v7

    load_and_align_16 v4,  r7, r8, 1
    load_and_align_16 v5,  r7, r8, 1
    load_and_align_16 v6,  r7, r8, 1
    load_and_align_16 v7,  r7, r8, 1
    load_and_align_16 v8,  r7, r8, 1
    load_and_align_16 v9,  r7, r8, 1
    load_and_align_16 v10, r7, r8, 1
    load_and_align_16 v11, r7, r8, 0

    vmrghb  v4, v4,  v5
    vmrghb  v5, v6,  v7
    vmrghb  v6, v8,  v9
    vmrghb  v7, v10, v11

    compute_sum_sse v0, v4, v18, v19, v20, v21, v23
    compute_sum_sse v1, v5, v18, v19, v20, v21, v23
    compute_sum_sse v2, v6, v18, v19, v20, v21, v23
    compute_sum_sse v3, v7, v18, v19, v20, v21, v23

    variance_final v18, v19, v23, 6

    addi    r1, r1, 32          ;# recover stack
    mtspr   256, r11            ;# reset old VRSAVE
    blr
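;# Note on the 8-wide variants: each filtered row occupies only the
;#  low 8 bytes of a vector, so pairs of rows are interleaved with
;#  vmrghb (src and dst identically) before compute_sum_sse.  The byte
;#  order within a vector is irrelevant to a sum / sum-of-squares, so
;#  the interleave halves the number of compute_sum_sse calls.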
    .align 2
;# r3 unsigned char *src_ptr
;# r4 int src_pixels_per_line
;# r5 int xoffset
;# r6 int yoffset
;# r7 unsigned char *dst_ptr
;# r8 int dst_pixels_per_line
;# r9 unsigned int *sse
;#
;# r3 return value
vp8_sub_pixel_variance8x16_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xffff
    ori     r12, r12, 0xfffc
    mtspr   256, r12            ;# set VRSAVE

    stwu    r1, -32(r1)         ;# create space on the stack

    HProlog second_pass_8x16_pre_copy_b

    ;# Load up permutation constants
    load_c v29, b_0123_b, 0, r12, r0
    load_c v30, b_4567_b, 0, r12, r0

    hfilter_8 v0,  v29, v30, 1
    hfilter_8 v1,  v29, v30, 1
    hfilter_8 v2,  v29, v30, 1
    hfilter_8 v3,  v29, v30, 1
    hfilter_8 v4,  v29, v30, 1
    hfilter_8 v5,  v29, v30, 1
    hfilter_8 v6,  v29, v30, 1
    hfilter_8 v7,  v29, v30, 1
    hfilter_8 v8,  v29, v30, 1
    hfilter_8 v9,  v29, v30, 1
    hfilter_8 v10, v29, v30, 1
    hfilter_8 v11, v29, v30, 1
    hfilter_8 v12, v29, v30, 1
    hfilter_8 v13, v29, v30, 1
    hfilter_8 v14, v29, v30, 1
    hfilter_8 v15, v29, v30, 1

    ;# Finished filtering main horizontal block.  If there is no
    ;#  vertical filtering, jump to computing the sum and sse.
    ;#  Otherwise load up and filter the additional line that is
    ;#  needed for the vertical filter.
    beq     compute_sum_sse_8x16_b

    hfilter_8 v16, v29, v30, 0

    b   second_pass_8x16_b

second_pass_8x16_pre_copy_b:
    slwi.   r6, r6, 5           ;# index into vertical filter array

    load_and_align_16 v0,  r3, r4, 1
    load_and_align_16 v1,  r3, r4, 1
    load_and_align_16 v2,  r3, r4, 1
    load_and_align_16 v3,  r3, r4, 1
    load_and_align_16 v4,  r3, r4, 1
    load_and_align_16 v5,  r3, r4, 1
    load_and_align_16 v6,  r3, r4, 1
    load_and_align_16 v7,  r3, r4, 1
    load_and_align_16 v8,  r3, r4, 1
    load_and_align_16 v9,  r3, r4, 1
    load_and_align_16 v10, r3, r4, 1
    load_and_align_16 v11, r3, r4, 1
    load_and_align_16 v12, r3, r4, 1
    load_and_align_16 v13, r3, r4, 1
    load_and_align_16 v14, r3, r4, 1
    load_and_align_16 v15, r3, r4, 1
    load_and_align_16 v16, r3, r4, 0

    beq     compute_sum_sse_8x16_b

second_pass_8x16_b:
    vspltish v20, 8
    vspltish v18, 3
    vslh    v18, v20, v18       ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    load_vfilter v20, v21

    vfilter_16 v0,  v1
    vfilter_16 v1,  v2
    vfilter_16 v2,  v3
    vfilter_16 v3,  v4
    vfilter_16 v4,  v5
    vfilter_16 v5,  v6
    vfilter_16 v6,  v7
    vfilter_16 v7,  v8
    vfilter_16 v8,  v9
    vfilter_16 v9,  v10
    vfilter_16 v10, v11
    vfilter_16 v11, v12
    vfilter_16 v12, v13
    vfilter_16 v13, v14
    vfilter_16 v14, v15
    vfilter_16 v15, v16

compute_sum_sse_8x16_b:
    vspltish v18, 0             ;# sum
    vspltish v19, 0             ;# sse
    vspltish v23, 0             ;# unpack
    li      r10, 16

    vmrghb  v0, v0,  v1
    vmrghb  v1, v2,  v3
    vmrghb  v2, v4,  v5
    vmrghb  v3, v6,  v7
    vmrghb  v4, v8,  v9
    vmrghb  v5, v10, v11
    vmrghb  v6, v12, v13
    vmrghb  v7, v14, v15

    load_and_align_16 v8,  r7, r8, 1
    load_and_align_16 v9,  r7, r8, 1
    load_and_align_16 v10, r7, r8, 1
    load_and_align_16 v11, r7, r8, 1
    load_and_align_16 v12, r7, r8, 1
    load_and_align_16 v13, r7, r8, 1
    load_and_align_16 v14, r7, r8, 1
    load_and_align_16 v15, r7, r8, 1

    vmrghb  v8,  v8,  v9
    vmrghb  v9,  v10, v11
    vmrghb  v10, v12, v13
    vmrghb  v11, v14, v15

    compute_sum_sse v0, v8,  v18, v19, v20, v21, v23
    compute_sum_sse v1, v9,  v18, v19, v20, v21, v23
    compute_sum_sse v2, v10, v18, v19, v20, v21, v23
    compute_sum_sse v3, v11, v18, v19, v20, v21, v23

    load_and_align_16 v8,  r7, r8, 1
    load_and_align_16 v9,  r7, r8, 1
    load_and_align_16 v10, r7, r8, 1
    load_and_align_16 v11, r7, r8, 1
    load_and_align_16 v12, r7, r8, 1
    load_and_align_16 v13, r7, r8, 1
    load_and_align_16 v14, r7, r8, 1
    load_and_align_16 v15, r7, r8, 0

    vmrghb  v8,  v8,  v9
    vmrghb  v9,  v10, v11
    vmrghb  v10, v12, v13
    vmrghb  v11, v14, v15

    compute_sum_sse v4, v8,  v18, v19, v20, v21, v23
    compute_sum_sse v5, v9,  v18, v19, v20, v21, v23
    compute_sum_sse v6, v10, v18, v19, v20, v21, v23
    compute_sum_sse v7, v11, v18, v19, v20, v21, v23

    variance_final v18, v19, v23, 7

    addi    r1, r1, 32          ;# recover stack
    mtspr   256, r11            ;# reset old VRSAVE
    blr
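;# For reference, the lvsl/lvx/vperm sequence used by load_and_align_16
;#  and the hfilter macros is the standard AltiVec idiom for unaligned
;#  loads: lvx ignores the low four address bits, so two (or three)
;#  aligned vectors are loaded and vperm shifts the wanted bytes into
;#  place using the permute vector from lvsl.  Rough C-with-intrinsics
;#  sketch (hypothetical, not part of the build):
;#
;#      #include <altivec.h>
;#      static vector unsigned char load_unaligned(const unsigned char *p)
;#      {
;#          vector unsigned char perm = vec_lvsl(0, p);
;#          vector unsigned char lo   = vec_ld(0, p);
;#          vector unsigned char hi   = vec_ld(16, p);
;#          return vec_perm(lo, hi, perm);
;#      }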
;# Filters a horizontal line
;# expects:
;#  r3  src_ptr
;#  r4  pitch
;#  r10 16
;#  r12 32
;#  v17 perm input
;#  v18 rounding
;#  v19 shift
;#  v20 filter taps
;#  v21 tmp
;#  v22 tmp
;#  v23 tmp
;#  v24 tmp
;#  v25 tmp
;#  v26 tmp
;#  v27 tmp
;#  v28 perm output
;#
.macro hfilter_16 V, increment_counter

    lvsl    v17,  0, r3         ;# permute value for alignment

    ;# input to filter is 21 bytes wide, output is 16 bytes.
    ;#  input can span three vectors if not aligned correctly.
    lvx     v21,   0, r3
    lvx     v22, r10, r3
    lvx     v23, r12, r3

.if \increment_counter
    add     r3, r3, r4
.endif
    vperm   v21, v21, v22, v17
    vperm   v22, v22, v23, v17  ;# v21 v22 = 21 input pixels left-justified

    ;# set 0
    vmsummbm v24, v20, v21, v18 ;# taps times elements

    ;# set 1
    vsldoi  v23, v21, v22, 1
    vmsummbm v25, v20, v23, v18

    ;# set 2
    vsldoi  v23, v21, v22, 2
    vmsummbm v26, v20, v23, v18

    ;# set 3
    vsldoi  v23, v21, v22, 3
    vmsummbm v27, v20, v23, v18

    vpkswus v24, v24, v25       ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
    vpkswus v25, v26, v27       ;# v25 = 2 6 A E 3 7 B F

    vsrh    v24, v24, v19       ;# divide by 128
    vsrh    v25, v25, v19

    vpkuhus \V, v24, v25        ;# \V = scrambled 8-bit result
    vperm   \V, \V, v0, v28     ;# \V = correctly-ordered result
.endm
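;# For reference: each vmsummbm above filters every fourth pixel, so
;#  set 0 produces pixels 0,4,8,12, set 1 (input shifted one byte by
;#  vsldoi) produces pixels 1,5,9,13, and so on.  b_hperm_b, loaded
;#  into v28 by HProlog, is the permute vector that restores the
;#  natural 0..15 ordering after the two vpkswus packs.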
    .align 2
;# r3 unsigned char *src_ptr
;# r4 int src_pixels_per_line
;# r5 int xoffset
;# r6 int yoffset
;# r7 unsigned char *dst_ptr
;# r8 int dst_pixels_per_line
;# r9 unsigned int *sse
;#
;# r3 return value
vp8_sub_pixel_variance16x8_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xffff
    ori     r12, r12, 0xfff8
    mtspr   256, r12            ;# set VRSAVE

    stwu    r1, -32(r1)         ;# create space on the stack

    HProlog second_pass_16x8_pre_copy_b

    hfilter_16 v0, 1
    hfilter_16 v1, 1
    hfilter_16 v2, 1
    hfilter_16 v3, 1
    hfilter_16 v4, 1
    hfilter_16 v5, 1
    hfilter_16 v6, 1
    hfilter_16 v7, 1

    ;# Finished filtering main horizontal block.  If there is no
    ;#  vertical filtering, jump to computing the sum and sse.
    ;#  Otherwise load up and filter the additional line that is
    ;#  needed for the vertical filter.
    beq     compute_sum_sse_16x8_b

    hfilter_16 v8, 0

    b   second_pass_16x8_b

second_pass_16x8_pre_copy_b:
    slwi.   r6, r6, 5           ;# index into vertical filter array

    load_and_align_16 v0, r3, r4, 1
    load_and_align_16 v1, r3, r4, 1
    load_and_align_16 v2, r3, r4, 1
    load_and_align_16 v3, r3, r4, 1
    load_and_align_16 v4, r3, r4, 1
    load_and_align_16 v5, r3, r4, 1
    load_and_align_16 v6, r3, r4, 1
    load_and_align_16 v7, r3, r4, 1
    load_and_align_16 v8, r3, r4, 1

    beq     compute_sum_sse_16x8_b

second_pass_16x8_b:
    vspltish v20, 8
    vspltish v18, 3
    vslh    v18, v20, v18       ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    load_vfilter v20, v21

    vfilter_16 v0, v1
    vfilter_16 v1, v2
    vfilter_16 v2, v3
    vfilter_16 v3, v4
    vfilter_16 v4, v5
    vfilter_16 v5, v6
    vfilter_16 v6, v7
    vfilter_16 v7, v8

compute_sum_sse_16x8_b:
    vspltish v18, 0             ;# sum
    vspltish v19, 0             ;# sse
    vspltish v23, 0             ;# unpack
    li      r10, 16

    compute_sum_sse_16 v0, 1
    compute_sum_sse_16 v1, 1
    compute_sum_sse_16 v2, 1
    compute_sum_sse_16 v3, 1
    compute_sum_sse_16 v4, 1
    compute_sum_sse_16 v5, 1
    compute_sum_sse_16 v6, 1
    compute_sum_sse_16 v7, 0

    variance_final v18, v19, v23, 7

    addi    r1, r1, 32          ;# recover stack

    mtspr   256, r11            ;# reset old VRSAVE

    blr
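;# Note: the vector accumulators cannot overflow.  Worst case for the
;#  16x16 block below is 256 differences of magnitude 255, so
;#  |sum| <= 65280 and sse <= 256 * 255 * 255 = 16,646,400, both well
;#  inside the 32-bit word lanes used by vsum4shs and vmsumubm.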
    .align 2
;# r3 unsigned char *src_ptr
;# r4 int src_pixels_per_line
;# r5 int xoffset
;# r6 int yoffset
;# r7 unsigned char *dst_ptr
;# r8 int dst_pixels_per_line
;# r9 unsigned int *sse
;#
;# r3 return value
vp8_sub_pixel_variance16x16_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xffff
    ori     r12, r12, 0xfff8
    mtspr   256, r12            ;# set VRSAVE

    stwu    r1, -32(r1)         ;# create space on the stack

    HProlog second_pass_16x16_pre_copy_b

    hfilter_16 v0,  1
    hfilter_16 v1,  1
    hfilter_16 v2,  1
    hfilter_16 v3,  1
    hfilter_16 v4,  1
    hfilter_16 v5,  1
    hfilter_16 v6,  1
    hfilter_16 v7,  1
    hfilter_16 v8,  1
    hfilter_16 v9,  1
    hfilter_16 v10, 1
    hfilter_16 v11, 1
    hfilter_16 v12, 1
    hfilter_16 v13, 1
    hfilter_16 v14, 1
    hfilter_16 v15, 1

    ;# Finished filtering main horizontal block.  If there is no
    ;#  vertical filtering, jump to computing the sum and sse.
    ;#  Otherwise load up and filter the additional line that is
    ;#  needed for the vertical filter.
    beq     compute_sum_sse_16x16_b

    hfilter_16 v16, 0

    b   second_pass_16x16_b

second_pass_16x16_pre_copy_b:
    slwi.   r6, r6, 5           ;# index into vertical filter array

    load_and_align_16 v0,  r3, r4, 1
    load_and_align_16 v1,  r3, r4, 1
    load_and_align_16 v2,  r3, r4, 1
    load_and_align_16 v3,  r3, r4, 1
    load_and_align_16 v4,  r3, r4, 1
    load_and_align_16 v5,  r3, r4, 1
    load_and_align_16 v6,  r3, r4, 1
    load_and_align_16 v7,  r3, r4, 1
    load_and_align_16 v8,  r3, r4, 1
    load_and_align_16 v9,  r3, r4, 1
    load_and_align_16 v10, r3, r4, 1
    load_and_align_16 v11, r3, r4, 1
    load_and_align_16 v12, r3, r4, 1
    load_and_align_16 v13, r3, r4, 1
    load_and_align_16 v14, r3, r4, 1
    load_and_align_16 v15, r3, r4, 1
    load_and_align_16 v16, r3, r4, 0

    beq     compute_sum_sse_16x16_b

second_pass_16x16_b:
    vspltish v20, 8
    vspltish v18, 3
    vslh    v18, v20, v18       ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    load_vfilter v20, v21

    vfilter_16 v0,  v1
    vfilter_16 v1,  v2
    vfilter_16 v2,  v3
    vfilter_16 v3,  v4
    vfilter_16 v4,  v5
    vfilter_16 v5,  v6
    vfilter_16 v6,  v7
    vfilter_16 v7,  v8
    vfilter_16 v8,  v9
    vfilter_16 v9,  v10
    vfilter_16 v10, v11
    vfilter_16 v11, v12
    vfilter_16 v12, v13
    vfilter_16 v13, v14
    vfilter_16 v14, v15
    vfilter_16 v15, v16

compute_sum_sse_16x16_b:
    vspltish v18, 0             ;# sum
    vspltish v19, 0             ;# sse
    vspltish v23, 0             ;# unpack
    li      r10, 16

    compute_sum_sse_16 v0,  1
    compute_sum_sse_16 v1,  1
    compute_sum_sse_16 v2,  1
    compute_sum_sse_16 v3,  1
    compute_sum_sse_16 v4,  1
    compute_sum_sse_16 v5,  1
    compute_sum_sse_16 v6,  1
    compute_sum_sse_16 v7,  1
    compute_sum_sse_16 v8,  1
    compute_sum_sse_16 v9,  1
    compute_sum_sse_16 v10, 1
    compute_sum_sse_16 v11, 1
    compute_sum_sse_16 v12, 1
    compute_sum_sse_16 v13, 1
    compute_sum_sse_16 v14, 1
    compute_sum_sse_16 v15, 0

    variance_final v18, v19, v23, 8

    addi    r1, r1, 32          ;# recover stack

    mtspr   256, r11            ;# reset old VRSAVE

    blr
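;# The filter tables below hold the eight 2-tap bilinear phases for
;#  xoffset/yoffset = 0..7; each pair of taps sums to 128, matching
;#  the +64 rounding and >>7 shift applied in the filter macros.
;#  hfilter_b pads each pair to 4 bytes (t0, t1, 0, 0) for vmsummbm;
;#  vfilter_b splats t0 and t1 across separate 16-byte rows for
;#  vmuleub/vmuloub.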
    .data

    .align 4
hfilter_b:
    .byte   128,  0,  0,  0,128,  0,  0,  0,128,  0,  0,  0,128,  0,  0,  0
    .byte   112, 16,  0,  0,112, 16,  0,  0,112, 16,  0,  0,112, 16,  0,  0
    .byte    96, 32,  0,  0, 96, 32,  0,  0, 96, 32,  0,  0, 96, 32,  0,  0
    .byte    80, 48,  0,  0, 80, 48,  0,  0, 80, 48,  0,  0, 80, 48,  0,  0
    .byte    64, 64,  0,  0, 64, 64,  0,  0, 64, 64,  0,  0, 64, 64,  0,  0
    .byte    48, 80,  0,  0, 48, 80,  0,  0, 48, 80,  0,  0, 48, 80,  0,  0
    .byte    32, 96,  0,  0, 32, 96,  0,  0, 32, 96,  0,  0, 32, 96,  0,  0
    .byte    16,112,  0,  0, 16,112,  0,  0, 16,112,  0,  0, 16,112,  0,  0

    .align 4
vfilter_b:
    .byte   128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128
    .byte     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
    .byte   112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
    .byte    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
    .byte    96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
    .byte    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
    .byte    80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
    .byte    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
    .byte    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
    .byte    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
    .byte    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
    .byte    80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
    .byte    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
    .byte    96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
    .byte    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
    .byte   112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112

    .align 4
b_hperm_b:
    .byte     0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15

    .align 4
b_0123_b:
    .byte     0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6

    .align 4
b_4567_b:
    .byte     4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10

    .align 4
b_hilo_b:
    .byte     0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23