;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS. All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    .globl short_idct4x4llm_ppc

;# load_c V, LABEL, OFF, R0, R1
;#   Load the 16-byte vector stored at LABEL into vector register V.
;#   OFF is handed to lvx as its rA operand (0 = no extra offset).
;#   Clobbers R0 (high-adjusted half of the address) and R1 (full address).
.macro load_c V, LABEL, OFF, R0, R1
    lis     \R0, \LABEL@ha
    la      \R1, \LABEL@l(\R0)
    lvx     \V, \OFF, \R1
.endm

;# scale_q16 DST, SRC, COEF
;#   Per 32-bit lane: DST = SRC + ((lo16(SRC) * lo16(COEF)) >> 16),
;#   using a signed 16x16 multiply and a saturating add.
;#   Requires v12 = shift_16.  DST must not be the same register as SRC.
.macro scale_q16 DST, SRC, COEF
    vmulosh \DST, \SRC, \COEF
    vsraw   \DST, \DST, v12
    vaddsws \DST, \DST, \SRC
.endm

;# store_row LO, HI
;#   Copy the two words at LO(r1) and HI(r1) to 0(r4) and 4(r4),
;#   i.e. one output row of four shorts.  Clobbers r6.
.macro store_row LO, HI
    lwz     r6, \LO(r1)
    stw     r6, 0(r4)
    lwz     r6, \HI(r1)
    stw     r6, 4(r4)
.endm

;# ---------------------------------------------------------------------
;# short_idct4x4llm_ppc: 4x4 inverse DCT using AltiVec.
;#
;# r3 short *input      16 coefficients, read with two lvx loads; lvx
;#                      ignores the low four address bits, so the buffer
;#                      is expected to be 16-byte aligned
;# r4 short *output     four rows of four shorts, written with stw
;# r5 int pitch         byte distance between consecutive output rows
;#
;# Clobbers: r4, r6, r9, r10, r11, r12, v0-v12.
;# VRSAVE is saved on entry and restored on exit.
;# Stack: a temporary 416-byte frame; the vector spill slot is at
;#        32(r1) so the back chain word at 0(r1) stays intact.
;# ---------------------------------------------------------------------
    .align 2
short_idct4x4llm_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xfff8    ;# mark v0-v12 as in use
    mtspr   256, r12            ;# set VRSAVE

    load_c v8, sinpi8sqrt2, 0, r9, r10
    load_c v9, cospi8sqrt2minus1, 0, r9, r10
    load_c v10, hi_hi, 0, r9, r10
    load_c v11, lo_lo, 0, r9, r10
    load_c v12, shift_16, 0, r9, r10

    li      r10, 16
    lvx     v0, 0, r3           ;# input ip[0], ip[ 4]
    lvx     v1, r10, r3         ;# input ip[8], ip[12]

    ;# first pass: each 32-bit lane handles one column
    vupkhsh v2, v0              ;# ip[0..3]   widened to 32 bits
    vupkhsh v3, v1              ;# ip[8..11]  widened to 32 bits
    vaddsws v6, v2, v3          ;# a1 = ip[0]+ip[8]
    vsubsws v7, v2, v3          ;# b1 = ip[0]-ip[8]

    vupklsh v0, v0              ;# ip[4..7]   widened to 32 bits
    vupklsh v1, v1              ;# ip[12..15] widened to 32 bits

    ;# The sin constant 35468 does not fit in a signed halfword, so the
    ;# signed multiply sees 35468-65536; the "+ SRC" inside scale_q16
    ;# cancels that and leaves (x * 35468) >> 16.  For the cos constant
    ;# the same add supplies the "+1" of (cos(pi/8)*sqrt(2) - 1).
    scale_q16 v4, v0, v8        ;# ip[ 4] * sin(pi/8) * sqrt(2)
    scale_q16 v5, v1, v9        ;# ip[12] * cos(pi/8) * sqrt(2)
    vsubsws v4, v4, v5          ;# c1

    scale_q16 v3, v1, v8        ;# ip[12] * sin(pi/8) * sqrt(2)
    scale_q16 v5, v0, v9        ;# ip[ 4] * cos(pi/8) * sqrt(2)
    vaddsws v3, v3, v5          ;# d1

    vaddsws v0, v6, v3          ;# a1 + d1
    vsubsws v3, v6, v3          ;# a1 - d1

    vaddsws v1, v7, v4          ;# b1 + c1
    vsubsws v2, v7, v4          ;# b1 - c1

    ;# transpose input
    vmrghw  v4, v0, v1          ;# a0 b0 a1 b1
    vmrghw  v5, v2, v3          ;# c0 d0 c1 d1

    vmrglw  v6, v0, v1          ;# a2 b2 a3 b3
    vmrglw  v7, v2, v3          ;# c2 d2 c3 d3

    vperm   v0, v4, v5, v10     ;# a0 b0 c0 d0
    vperm   v1, v4, v5, v11     ;# a1 b1 c1 d1

    vperm   v2, v6, v7, v10     ;# a2 b2 c2 d2
    vperm   v3, v6, v7, v11     ;# a3 b3 c3 d3

    ;# second pass: each 32-bit lane handles one row, v0..v3 = ip[0..3]
    vaddsws v6, v0, v2          ;# a1 = ip[0]+ip[2]
    vsubsws v7, v0, v2          ;# b1 = ip[0]-ip[2]

    scale_q16 v4, v1, v8        ;# ip[1] * sin(pi/8) * sqrt(2)
    scale_q16 v5, v3, v9        ;# ip[3] * cos(pi/8) * sqrt(2)
    vsubsws v4, v4, v5          ;# c1

    scale_q16 v2, v3, v8        ;# ip[3] * sin(pi/8) * sqrt(2)
    scale_q16 v5, v1, v9        ;# ip[1] * cos(pi/8) * sqrt(2)
    vaddsws v3, v2, v5          ;# d1

    vaddsws v0, v6, v3          ;# a1 + d1
    vsubsws v3, v6, v3          ;# a1 - d1

    vaddsws v1, v7, v4          ;# b1 + c1
    vsubsws v2, v7, v4          ;# b1 - c1

    vspltish v6, 4              ;# rounding term
    vspltish v7, 3              ;# shift count

    vpkswss v0, v0, v1          ;# saturate back to 16 bits
    vpkswss v1, v2, v3

    vaddshs v0, v0, v6          ;# (x + 4) >> 3
    vaddshs v1, v1, v6

    vsrah   v0, v0, v7
    vsrah   v1, v1, v7

    ;# transpose output
    vmrghh  v2, v0, v1          ;# a0 c0 a1 c1 a2 c2 a3 c3
    vmrglh  v3, v0, v1          ;# b0 d0 b1 d1 b2 d2 b3 d3

    vmrghh  v0, v2, v3          ;# a0 b0 c0 d0 a1 b1 c1 d1
    vmrglh  v1, v2, v3          ;# a2 b2 c2 d2 a3 b3 c3 d3

    stwu    r1,-416(r1)         ;# create space on the stack
    li      r9, 32              ;# spill slot offset: 16-byte aligned and
                                ;# above the frame header, so the back
                                ;# chain written by stwu is not overwritten

    stvx    v0, r9, r1          ;# rows 0 and 1
    store_row 32, 36            ;# row 0

    add     r4, r4, r5

    store_row 40, 44            ;# row 1

    add     r4, r4, r5

    stvx    v1, r9, r1          ;# rows 2 and 3
    store_row 32, 36            ;# row 2

    add     r4, r4, r5

    store_row 40, 44            ;# row 3

    addi    r1, r1, 416         ;# recover stack

    mtspr   256, r11            ;# reset old VRSAVE

    blr

;# sin(pi/8) * sqrt(2) scaled by 65536 (wraps negative as a signed halfword)
    .align 4
sinpi8sqrt2:
    .short  35468, 35468, 35468, 35468, 35468, 35468, 35468, 35468

;# (cos(pi/8) * sqrt(2) - 1) scaled by 65536
    .align 4
cospi8sqrt2minus1:
    .short  20091, 20091, 20091, 20091, 20091, 20091, 20091, 20091

;# per-word shift count used by vsraw
    .align 4
shift_16:
    .long      16,    16,    16,    16

;# vperm selector: high 8 bytes of the first source, high 8 bytes of the second
    .align 4
hi_hi:
    .byte     0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23

;# vperm selector: low 8 bytes of the first source, low 8 bytes of the second
    .align 4
lo_lo:
    .byte     8,  9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31