;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS. All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    .globl vp8_short_fdct4x4_ppc
    .globl vp8_short_fdct8x4_ppc

;# load_c V, LABEL, OFF, R0, R1
;#   Build the absolute address of LABEL and load one vector from it.
;#     R0 <- high-adjusted upper halfword of LABEL's address
;#     R1 <- full address of LABEL
;#     V  <- the quadword at (R1 + OFF); OFF is either 0 or an index register
;#   R0 and R1 are overwritten.  Callers below keep using R1 afterwards as
;#   the table base.
.macro load_c V, LABEL, OFF, R0, R1
    lis     \R0, \LABEL@ha
    la      \R1, \LABEL@l(\R0)
    lvx     \V, \OFF, \R1
.endm

;# Forward and inverse DCTs are nearly identical; the only differences are
;# in normalization (fwd is twice unitary, inv is half unitary)
;# and that they are of course transposes of each other.
;#
;# The following three macros accomplish most of the implementation.
;# (Historical note: they were originally shared by ppc_idct.c and ppc_fdct.c.)

;# prologue
;#   Common entry sequence.  On exit:
;#     r1      lowered by 32 (frame released again by `epilogue`)
;#     r6      = 16 (index of the second vector in a 32-byte pair)
;#     r9      = address of round_tab
;#     r10     = scratch (upper half of round_tab's address)
;#     r11     = caller's VRSAVE, must stay untouched until `epilogue`
;#     r12     = VRSAVE value installed here
;#     v0-v3   = the four rows of dct_tab
;#     v4, v5  = the two rows of ppc_dctperm_tab
;#     v6      = first row of round_tab
.macro prologue
    mfspr   r11, 256                        ;# r11 = VRSAVE on entry
    oris    r12, r11, 0xfffc                ;# OR in the mask bits for v0-v13
    mtspr   256, r12                        ;# install the widened VRSAVE

    stwu    r1, -32(r1)                     ;# push a 32-byte frame

    li      r6, 16

    load_c  v0, dct_tab, 0, r9, r10         ;# r10 = &dct_tab, v0 = row 0
    lvx     v1, r6, r10                     ;# row 1 (dct_tab + 16)
    addi    r10, r10, 32
    lvx     v2, 0, r10                      ;# row 2 (dct_tab + 32)
    lvx     v3, r6, r10                     ;# row 3 (dct_tab + 48)

    load_c  v4, ppc_dctperm_tab, 0, r9, r10 ;# r10 = &ppc_dctperm_tab
    lvx     v5, r6, r10                     ;# second permute row, same base

    load_c  v6, round_tab, 0, r10, r9       ;# note swapped scratch: r9 = &round_tab
.endm

;# epilogue
;#   Undo `prologue`: release the 32-byte frame and put the VRSAVE value
;#   held in r11 back into the special-purpose register.
.macro epilogue
    addi    r1, r1, 32                      ;# pop the frame

    mtspr   256, r11                        ;# caller's VRSAVE
.endm

;# Do horiz xf on two rows of coeffs  v8 = a0 a1 a2 a3  b0 b1 b2 b3.
;#  a/A are the even rows 0,2   b/B are the odd rows 1,3
;#  For fwd transform, indices are horizontal positions, then frequencies.
;#  For inverse transform, frequencies then positions.
;#  The two resulting  A0..A3  B0..B3  are later combined
;#  and vertically transformed.

;# two_rows_horiz Dst
;#   Horizontal pass over the two rows packed in v8.
;#   In:   v8      two input rows, eight 16-bit values
;#         v0-v3   coefficient vectors, v4/v5 permute vectors
;#         v6      rounding constant (four words), v7 shift count
;#   Out:  \Dst
;#   Clobbers v9, v10, v11.
.macro two_rows_horiz Dst
    vperm   v9, v8, v8, v4          ;# v9 = a2 a3 a0 a1 b2 b3 b0 b1

    vmsumshm v10, v0, v8, v6
    vmsumshm v10, v1, v9, v10
    vsraw   v10, v10, v7            ;# v10 = A0 A1 B0 B1

    vmsumshm v11, v2, v8, v6
    vmsumshm v11, v3, v9, v11
    vsraw   v11, v11, v7            ;# v11 = A2 A3 B2 B3

    vpkuwum v10, v10, v11           ;# v10 = A0 A1 B0 B1 A2 A3 B2 B3
    vperm   \Dst, v10, v10, v5      ;# Dest = A0 B0 A1 B1 A2 B2 A3 B3
.endm

;# Vertical xf on two rows. DCT values in comments are for inverse transform;
;# forward transform uses transpose.
;#
;# two_rows_vert Ceven, Codd
;#   In:   v12, v13  results of the two horizontal passes
;#         v6        rounding constant, v7 shift count
;#   Out:  v8        two output rows (eight 16-bit values)
;#   Clobbers v9, v10.
.macro two_rows_vert Ceven, Codd
    vspltw  v8, \Ceven, 0           ;# v8 = c00 c10 or c02 c12 four times
    vspltw  v9, \Codd,  0           ;# v9 = c20 c30 or c22 c32 ""
    vmsumshm v8, v8, v12, v6
    vmsumshm v8, v9, v13, v8
    vsraw   v10, v8, v7

    vspltw  v8, \Codd,  1           ;# v8 = c01 c11 or c03 c13
    vspltw  v9, \Ceven, 1           ;# v9 = c21 c31 or c23 c33
    vmsumshm v8, v8, v12, v6
    vmsumshm v8, v9, v13, v8
    vsraw   v8, v8, v7

    vpkuwum v8, v10, v8             ;# v8 = rows 0,1 or 2,3
.endm

;# two_rows_h Dest
;#   Copy two 8-byte input rows into the 16-byte scratch buffer at r8, load
;#   them as one vector and run the horizontal pass.
;#   In:   r0   first word of the first row (loaded by the caller)
;#         r3   address of the first row
;#         r5   distance in bytes from one row to the next
;#         r8   scratch buffer; lvx ignores the low four address bits, so it
;#              has to be 16-byte aligned
;#   Out:  \Dest; r3 advanced by r5 (now addresses the second row)
;#   Clobbers r0, v8-v11.
.macro two_rows_h Dest
    stw     r0,  0(r8)
    lwz     r0,  4(r3)
    stw     r0,  4(r8)
    lwzux   r0, r3, r5              ;# step to the next row, fetch its first word
    stw     r0,  8(r8)
    lwz     r0,  4(r3)
    stw     r0, 12(r8)
    lvx     v8,  0, r8
    two_rows_horiz \Dest
.endm

;# fdct4x4_block
;#   Forward-transform one 4x4 block: two horizontal passes (shift 14)
;#   followed by two vertical passes (shift 16).
;#   In:   r3   input, r4 output, r5 row distance in bytes
;#         r6   16
;#         r9   address of round_tab
;#         v0-v5 as set up by the prologue macro
;#         v6   first row of round_tab (horizontal rounding)
;#   Out:  32 bytes stored at r4 with stvx (which ignores the low four
;#         address bits, so r4 is expected to be 16-byte aligned)
;#   Clobbers r0, r3, r8, v6-v13.
;#
;#   The scratch buffer is the UPPER half of the 32-byte frame pushed by the
;#   prologue macro's `stwu r1,-32(r1)`.  That stwu leaves the caller's stack
;#   pointer (the back chain) at 0(r1); placing the scratch at 0(r1) would
;#   overwrite it and make the frame unwalkable while the function runs.
;#   r1 + 16 has the same 16-byte alignment as r1, so lvx still reads exactly
;#   the bytes written by the preceding stw instructions.
.macro fdct4x4_block
    vspltisw v7, 14                 ;# == 14, fits in 5 signed bits
    addi    r8, r1, 16              ;# scratch = frame bytes 16..31

    lwz     r0, 0(r3)
    two_rows_h v12                  ;# v12 = H00 H10 H01 H11 H02 H12 H03 H13

    lwzux   r0, r3, r5
    two_rows_h v13                  ;# v13 = H20 H30 H21 H31 H22 H32 H23 H33

    lvx     v6, r6, r9              ;# v6 = Vround
    vspltisw v7, -16                ;# == 16 == -16, only low 5 bits matter

    two_rows_vert v0, v1
    stvx    v8, 0, r4
    two_rows_vert v2, v3
    stvx    v8, r6, r4
.endm

    .text

    .align 2
;# r3 short *input
;# r4 short *output
;# r5 int    pitch
vp8_short_fdct4x4_ppc:

    prologue

    fdct4x4_block

    epilogue

    blr

    .align 2
;# r3 short *input
;# r4 short *output
;# r5 int    pitch
;#
;# Two 4x4 blocks side by side: the second block starts 8 bytes (four
;# 16-bit values) to the right of the first in the input and 32 bytes
;# after it in the output.
vp8_short_fdct8x4_ppc:
    prologue

    addi    r10, r3, 0              ;# keep the original input pointer; the
                                    ;# row walk below advances r3

    fdct4x4_block

    ;# Next block
    addi    r3, r10, 8
    addi    r4, r4, 32
    lvx     v6, 0, r9               ;# v6 = Hround (first pass left Vround in v6)

    fdct4x4_block

    epilogue

    blr

    .data
    .align 4                        ;# tables are read with lvx
ppc_dctperm_tab:
    .byte 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11
    .byte 0,1,4,5, 2,3,6,7, 8,9,12,13, 10,11,14,15

    .align 4
dct_tab:
    .short  23170, 23170,-12540,-30274, 23170, 23170,-12540,-30274
    .short  23170, 23170, 30274, 12540, 23170, 23170, 30274, 12540

    .short  23170,-23170, 30274,-12540, 23170,-23170, 30274,-12540
    .short -23170, 23170, 12540,-30274,-23170, 23170, 12540,-30274

    .align 4
;# Row 0: rounding term for the horizontal pass (shift by 14).
;# Row 1: rounding term for the vertical pass (shift by 16).
round_tab:
    .long (1 << (14-1)), (1 << (14-1)), (1 << (14-1)), (1 << (14-1))
    .long (1 << (16-1)), (1 << (16-1)), (1 << (16-1)), (1 << (16-1))