; PowerPC optimized zoom for Goom
; © 2001-2003 Guillaume Borios
; This library is free software; you can redistribute it and/or
; modify it under the terms of the GNU Library General Public
; License as published by the Free Software Foundation; either
; version 2 of the License, or (at your option) any later version.
;
; This library is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
; Library General Public License for more details.
;
; You should have received a copy of the GNU Library General Public
; License along with this library; if not, write to the
; Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
; Boston, MA 02110-1301, USA.

; Change log :
; 21 Dec 2003 : Use of altivec is now determined with a parameter

; Target / syntax : 32-bit PowerPC, Mach-O (Darwin), Apple as syntax
;                   (';' starts a comment).

; Section definition : We use a read only section
        .text

; Names of the functions to call from a C program :
;   ppc_zoom_generic  (any 32-bit PowerPC)
;   ppc_zoom_G4       (uses dst / dcbz cache hints)
; We declare these labels as globals to extend their scope outside this file
        .globl  _ppc_zoom_generic
        .globl  _ppc_zoom_G4

; Description :
; These routines dynamically compute and apply a zoom filter.
; For each destination pixel, a source position (px,py), expressed in
; 16ths of a pixel, is interpolated between brutS and brutD :
;       px = S.x + (((D.x - S.x) * buffratio) >> 16)     (same for py)
; The destination pixel is then the sum of the 4 source pixels around
; (px>>4, py>>4), each weighted by one byte of precalccoeffs[px&15][py&15],
; the sum being divided by 256.  If (px,py) is out of range the pixel is
; set to 0 (black).

; parameters :
; r3  <=> unsigned int sizeX (in pixels)
; r4  <=> unsigned int sizeY (in pixels)
; r5  <=> unsigned int * frompixmap
; r6  <=> unsigned int * topixmap
; r7  <=> unsigned int * brutS
; r8  <=> unsigned int * brutD
; r9  <=> unsigned int buffratio
; r10 <=> int [16][16] precalccoeffs

; globals after init
; r3  <=> ax = (sizeX - 1) * 16 = x limit in 16th of pixels (replaces old r3)
; r4  <=> ay = (sizeY - 1) * 16 = y limit in 16th of pixels (replaces old r4)
; r5  <=> frompixmap (unchanged)
; r6  <=> topixmap - 4 bytes, needed for the pre-incremented store (replaces r6)
; r11 <=> precalccoeffs (r10 is reused as a scratch register)
; r12 <=> 0x00FF00FF (mask for parallel 32 bits pixs computing)
; r17 <=> dst control word (_ppc_zoom_G4 only)
; r18 <=> 0 (pixel stored when the position is out of range)
; r19 <=> row size in pixels
; r20 <=> row size in bytes
; r30 <=> brutD cursor (replaces r8)
; r31 <=> brutS cursor (replaces r7)
; ctr <=> number of pixels left to compute

; ABI notes :
; r1 is the Stack Pointer (SP) => Do not modify
; r13..r31 are non-volatile => the ones used here are saved in the red
;          zone on entry and restored before returning
; CR2..CR4 are non-volatile => only CR0, CR1 and CR7 are used here
;
; Caller notes :
; - The loop preloads the next brutS/brutD entry before testing the loop
;   counter, so 8 bytes past the last entry of each array are read (the
;   values are not used).
; - If sizeX * sizeY is 0, both routines return without touching memory.

_ppc_zoom_generic:

; Saves the used non volatile registers (r18..r31, 14 words = 56 bytes)
; in the Mach-O stack's Red-Zone
        stmw    r18,-56(r1)

; init
        li      r18,0                   ; Default value if out of range : 0 (Black)
        mr      r11,r10                 ; r11 <- precalccoeffs
        lis     r12,0xFF
        mullw   r2,r3,r4                ; Number of pixels to compute
        cmplwi  cr0,r2,0
        beq-    cr0,L3                  ; No pixel : a 0 in ctr would make bdnz loop 2^32 times
        mr      r30,r8                  ; r30 <- brutD
        slwi    r20,r3,2                ; r20 <- row size in bytes
        srawi   r19,r20,2               ; r19 <- row size in pixels
        ori     r12,r12,0xFF            ; r12 <- 0x00FF00FF
        subi    r3,r3,1
        subi    r4,r4,1
        mtctr   r2                      ; Init the loop count (one loop per pixel computed)
        mr      r31,r7                  ; r31 <- brutS
        subi    r6,r6,4                 ; Bias for the stwu pre-increment
        slwi    r3,r3,4                 ; r3 <- ax
        slwi    r4,r4,4                 ; r4 <- ay

; pre init for loop
        lwz     r2,0(r31)               ; px
        lwz     r29,4(r31)              ; py
        lwz     r8,0(r30)               ; px2
        lwz     r10,4(r30)              ; py2

        b       L1                      ; Skips the alignment padding
        .align  5
L1:

; computes dynamically the position to fetch
        sub     r8,r8,r2                ; px2 - px
        sub     r10,r10,r29             ; py2 - py
        mullw   r8,r8,r9
        addi    r31,r31,8               ; Next brutS entry
        mullw   r10,r10,r9
        addi    r30,r30,8               ; Next brutD entry

        srawi   r8,r8,16
        srawi   r10,r10,16
        add     r2,r2,r8                ; px += ((px2 - px) * buffratio) >> 16
        add     r29,r29,r10             ; py += ((py2 - py) * buffratio) >> 16

; if px>=ax or py>=ay (unsigned compares) goto outofrange
; computes the attenuation coeffs and the original point address
        rlwinm  r10,r2,6,22,25          ; r10 <- (px << 6) & 0x3C0 = (px%16)*16*4
        cmpl    cr1,0,r2,r3             ; cr1 is volatile (cr4 would have to be preserved)
        rlwimi  r10,r29,2,26,29         ; r10 <- r10 | ((py << 2) & 0x3C) = byte offset of coeffs[px%16][py%16]
        cmpl    cr7,0,r29,r4
        srawi   r29,r29,4               ; pos computing
        bge-    cr1,L4
        srawi   r2,r2,4                 ; pos computing
        mullw   r29,r29,r19             ; pos computing
        bge-    cr7,L4

; Channels notation : 00112233 (AARRVVBB)

        add     r2,r2,r29               ; pos computing
        lwzx    r10,r11,r10             ; Loads coefs
        slwi    r2,r2,2                 ; pos computing
        add     r2,r2,r5                ; pos computing
        rlwinm  r21,r10,0,24,31         ; Isolates coef1 (??????11 -> 00000011)
        lwz     r25,0(r2)               ; Loads col1 -> r25
        lwz     r26,4(r2)               ; Loads col2 -> r26
        rlwinm  r22,r10,24,24,31        ; Isolates coef2 (????22?? -> 00000022)
        rlwinm  r23,r10,16,24,31        ; Isolates coef3 (??33???? -> 00000033)
        add     r2,r2,r20               ; Adds one line for future load of col3 and col4
        and     r8,r25,r12              ; Masks col1 channels 1 & 3 : 0x00XX00XX
        rlwinm  r24,r10,8,24,31         ; Isolates coef4 (44?????? -> 00000044)
        andi.   r25,r25,0xFF00          ; Masks col1 channel 2 : 0x0000XX00
        mullw   r8,r8,r21               ; Applies coef1 on col1 channels 1 & 3

; computes final pixel color
        and     r10,r26,r12             ; Masks col2 channels 1 & 3 : 0x00XX00XX
        lwz     r27,0(r2)               ; Loads col3 -> r27
        mullw   r10,r10,r22             ; Applies coef2 on col2 channels 1 & 3
        mullw   r25,r25,r21             ; Applies coef1 on col1 channel 2
        andi.   r29,r26,0xFF00          ; Masks col2 channel 2 : 0x0000XX00
        mullw   r29,r29,r22             ; Applies coef2 on col2 channel 2
        lwz     r28,4(r2)               ; Loads col4 -> r28
        add     r8,r8,r10               ; Adds col1 & col2 channels 1 & 3
        and     r10,r27,r12             ; Masks col3 channels 1 & 3 : 0x00XX00XX
        add     r25,r25,r29             ; Adds col1 & col2 channel 2
        mullw   r10,r10,r23             ; Applies coef3 on col3 channels 1 & 3
        andi.   r29,r27,0xFF00          ; Masks col3 channel 2 : 0x0000XX00
        mullw   r29,r29,r23             ; Applies coef3 on col3 channel 2
        lwz     r2,0(r31)               ; px (preload for the next pixel)
        add     r7,r8,r10               ; Adds col3 to (col1 + col2) channels 1 & 3
        and     r10,r28,r12             ; Masks col4 channels 1 & 3 : 0x00XX00XX
        mullw   r10,r10,r24             ; Applies coef4 on col4 channels 1 & 3
        add     r25,r25,r29             ; Adds col3 to (col1 + col2) channel 2
        lwz     r8,0(r30)               ; px2 (preload for the next pixel)
        andi.   r28,r28,0xFF00          ; Masks col4 channel 2 : 0x0000XX00
        add     r7,r7,r10               ; Adds col4 to (col1 + col2 + col3) channels 1 & 3
        lwz     r10,4(r30)              ; py2 (preload for the next pixel)
        mullw   r28,r28,r24             ; Applies coef4 on col4 channel 2
        srawi   r7,r7,8                 ; (sum of channels 1 & 3) >> 8
        lwz     r29,4(r31)              ; py (preload for the next pixel)
        add     r25,r25,r28             ; Adds col4 to (col1 + col2 + col3) channel 2
        rlwimi  r7,r25,24,16,23         ; (((sum of channels 2) >> 8) & 0x0000FF00) | ((sum of channels 1 and 3) & 0xFFFF00FF)
        stwu    r7,4(r6)                ; Stores the computed pixel
        bdnz    L1                      ; Iterate again if needed
        b       L3                      ; If not, returns from the function

; if out of range
L4:
        stwu    r18,4(r6)               ; Stores a black pixel
        lwz     r8,0(r30)               ; px2
        lwz     r10,4(r30)              ; py2
        lwz     r2,0(r31)               ; px
        lwz     r29,4(r31)              ; py
        bdnz    L1

L3:

; Restore saved registers and return
        lmw     r18,-56(r1)
        blr


_ppc_zoom_G4:

; Saves the used non volatile registers (r17..r31, 15 words = 60 bytes)
; in the Mach-O stack's Red-Zone
        stmw    r17,-60(r1)

; init
        li      r18,0                   ; Default value if out of range : 0 (Black)
        mr      r11,r10                 ; r11 <- precalccoeffs
        lis     r12,0xFF
        mullw   r2,r3,r4                ; Number of pixels to compute
        cmplwi  cr0,r2,0
        beq-    cr0,L300                ; No pixel : a 0 in ctr would make bdnz loop 2^32 times
        mr      r30,r8                  ; r30 <- brutD
        slwi    r20,r3,2                ; r20 <- row size in bytes
        srawi   r19,r20,2               ; r19 <- row size in pixels
        ori     r12,r12,0xFF            ; r12 <- 0x00FF00FF
        subi    r3,r3,1
        subi    r4,r4,1
        mtctr   r2                      ; Init the loop count (one loop per pixel computed)
        mr      r31,r7                  ; r31 <- brutS
        subi    r6,r6,4                 ; Undone by the addi at the top of the loop
        slwi    r3,r3,4                 ; r3 <- ax
        slwi    r4,r4,4                 ; r4 <- ay

; pre init for loop
        lwz     r2,0(r31)               ; px
        lwz     r29,4(r31)              ; py
        lwz     r8,0(r30)               ; px2
        lwz     r10,4(r30)              ; py2

;*********************
        lis     r17,0x0F01              ; dst control word : block size 0x0F, block count 1, stride 0

        b       L100                    ; Skips the alignment padding
        .align  5
L100:

        addi    r6,r6,4                 ; r6 <- address of the pixel to write

; Optimization to ensure the destination buffer
; won't be loaded into the data cache
; NOTE(review): dcbz clears the whole cache block that contains r6, so
; this assumes the destination buffer extends to the end of that block
; and that blocks are 32 bytes long - confirm with the callers.
        rlwinm. r0,r6,0,27,31           ; r0 <- r6 & 0x1F
        bne+    L500                    ; Only at the start of a 32 bytes block
        dcbz    0,r6
;dcba 0,r6
L500:

; computes dynamically the position to fetch
;mullw r8,r8,r29
;mullw r2,r2,r29
;add r2,r8,r2
;srawi r2,r2,17

        sub     r8,r8,r2                ; px2 - px
        sub     r10,r10,r29             ; py2 - py
        mullw   r8,r8,r9
        addi    r31,r31,8               ; Next brutS entry
        mullw   r10,r10,r9
        addi    r30,r30,8               ; Next brutD entry

        dst     r30,r17,0               ; Prefetch hint on brutD (stream 0)

        srawi   r8,r8,16
        srawi   r10,r10,16
        add     r2,r2,r8                ; px += ((px2 - px) * buffratio) >> 16
        add     r29,r29,r10             ; py += ((py2 - py) * buffratio) >> 16

        dst     r31,r17,1               ; Prefetch hint on brutS (stream 1)

; if px>=ax or py>=ay (unsigned compares) goto outofrange
; computes the attenuation coeffs and the original point address
        rlwinm  r10,r2,6,22,25          ; r10 <- (px << 6) & 0x3C0 = (px%16)*16*4
        cmpl    cr1,0,r2,r3             ; cr1 is volatile (cr4 would have to be preserved)
        rlwimi  r10,r29,2,26,29         ; r10 <- r10 | ((py << 2) & 0x3C) = byte offset of coeffs[px%16][py%16]
        cmpl    cr7,0,r29,r4
        srawi   r29,r29,4               ; pos computing
        bge-    cr1,L400
        srawi   r2,r2,4                 ; pos computing
        mullw   r29,r29,r19             ; pos computing
        bge-    cr7,L400

; Channels notation : 00112233 (AARRVVBB)
; (rlwinm rX,rY,0,16,23 is used below where the generic version uses
;  andi. rX,rY,0xFF00 : same mask, but CR0 is left untouched)

        add     r2,r2,r29               ; pos computing
        lwzx    r10,r11,r10             ; Loads coefs
        slwi    r2,r2,2                 ; pos computing
        add     r2,r2,r5                ; pos computing
        rlwinm  r21,r10,0,24,31         ; Isolates coef1 (??????11 -> 00000011)
        lwz     r25,0(r2)               ; Loads col1 -> r25
        lwz     r26,4(r2)               ; Loads col2 -> r26
        rlwinm  r22,r10,24,24,31        ; Isolates coef2 (????22?? -> 00000022)
        rlwinm  r23,r10,16,24,31        ; Isolates coef3 (??33???? -> 00000033)
        add     r2,r2,r20               ; Adds one line for future load of col3 and col4
        and     r8,r25,r12              ; Masks col1 channels 1 & 3 : 0x00XX00XX
        rlwinm  r24,r10,8,24,31         ; Isolates coef4 (44?????? -> 00000044)
        dst     r2,r17,2                ; Prefetch hint on the next source line (stream 2)
        rlwinm  r25,r25,0,16,23         ; Masks col1 channel 2 : 0x0000XX00
;andi. r25,r25,0xFF00 ; Masks col1 channel 2 : 0x0000XX00
        mullw   r8,r8,r21               ; Applies coef1 on col1 channels 1 & 3

; computes final pixel color
        and     r10,r26,r12             ; Masks col2 channels 1 & 3 : 0x00XX00XX
        lwz     r27,0(r2)               ; Loads col3 -> r27
        mullw   r10,r10,r22             ; Applies coef2 on col2 channels 1 & 3
        mullw   r25,r25,r21             ; Applies coef1 on col1 channel 2
        rlwinm  r29,r26,0,16,23         ; Masks col2 channel 2 : 0x0000XX00
;andi. r29,r26,0xFF00 ; Masks col2 channel 2 : 0x0000XX00
        mullw   r29,r29,r22             ; Applies coef2 on col2 channel 2
        lwz     r28,4(r2)               ; Loads col4 -> r28
        add     r8,r8,r10               ; Adds col1 & col2 channels 1 & 3
        and     r10,r27,r12             ; Masks col3 channels 1 & 3 : 0x00XX00XX
        add     r25,r25,r29             ; Adds col1 & col2 channel 2
        mullw   r10,r10,r23             ; Applies coef3 on col3 channels 1 & 3
        rlwinm  r29,r27,0,16,23         ; Masks col3 channel 2 : 0x0000XX00
;andi. r29,r27,0xFF00 ; Masks col3 channel 2 : 0x0000XX00
        mullw   r29,r29,r23             ; Applies coef3 on col3 channel 2
        lwz     r2,0(r31)               ; px (preload for the next pixel)
        add     r7,r8,r10               ; Adds col3 to (col1 + col2) channels 1 & 3
        and     r10,r28,r12             ; Masks col4 channels 1 & 3 : 0x00XX00XX
        mullw   r10,r10,r24             ; Applies coef4 on col4 channels 1 & 3
        add     r25,r25,r29             ; Adds col3 to (col1 + col2) channel 2
        lwz     r8,0(r30)               ; px2 (preload for the next pixel)
        rlwinm  r28,r28,0,16,23         ; Masks col4 channel 2 : 0x0000XX00
;andi. r28,r28,0xFF00 ; Masks col4 channel 2 : 0x0000XX00
        add     r7,r7,r10               ; Adds col4 to (col1 + col2 + col3) channels 1 & 3
        lwz     r10,4(r30)              ; py2 (preload for the next pixel)
        mullw   r28,r28,r24             ; Applies coef4 on col4 channel 2
        srawi   r7,r7,8                 ; (sum of channels 1 & 3) >> 8
        lwz     r29,4(r31)              ; py (preload for the next pixel)
        add     r25,r25,r28             ; Adds col4 to (col1 + col2 + col3) channel 2
        rlwimi  r7,r25,24,16,23         ; (((sum of channels 2) >> 8) & 0x0000FF00) | ((sum of channels 1 and 3) & 0xFFFF00FF)
        stw     r7,0(r6)                ; Stores the computed pixel
        bdnz    L100                    ; Iterate again if needed
        b       L300                    ; If not, returns from the function

; if out of range
L400:
        stw     r18,0(r6)               ; Stores a black pixel
        lwz     r8,0(r30)               ; px2
        lwz     r10,4(r30)              ; py2
        lwz     r2,0(r31)               ; px
        lwz     r29,4(r31)              ; py
        bdnz    L100

L300:

; Restore saved registers and return
        lmw     r17,-60(r1)
        blr