;*!
;* \copy
;*     Copyright (c)  2009-2013, Cisco Systems
;*     All rights reserved.
;*
;*     Redistribution and use in source and binary forms, with or without
;*     modification, are permitted provided that the following conditions
;*     are met:
;*
;*        * Redistributions of source code must retain the above copyright
;*          notice, this list of conditions and the following disclaimer.
;*
;*        * Redistributions in binary form must reproduce the above copyright
;*          notice, this list of conditions and the following disclaimer in
;*          the documentation and/or other materials provided with the
;*          distribution.
;*
;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;*     POSSIBILITY OF SUCH DAMAGE.
;*************************************************************************/

%include "asm_inc.asm"

;in: m0, m1, m2, m3, m4, m5, m6, m7
;out: m0, m3, m5, m2, m7, m1, m6, m4
%macro TRANSPOSE_8x8B_MMX 10
    MMX_XSwap bw, %1, %2, %8
    MMX_XSwap bw, %3, %4, %2
    MMX_XSwap bw, %5, %6, %4
    movq %6, %9
    movq %10, %4
    MMX_XSwap bw, %7, %6, %4

    MMX_XSwap wd, %1, %3, %6
    MMX_XSwap wd, %8, %2, %3
    MMX_XSwap wd, %5, %7, %2
    movq %7, %10
    movq %10, %3
    MMX_XSwap wd, %7, %4, %3

    MMX_XSwap dq, %1, %5, %4
    MMX_XSwap dq, %6, %2, %5
    MMX_XSwap dq, %8, %7, %2
    movq %7, %10
    movq %10, %5
    MMX_XSwap dq, %7, %3, %5

    movq %3, %10
%endmacro
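
; The macro above is a butterfly transpose: three rounds of 2x2 interleaves at
; byte (bw), word (wd) and dword (dq) granularity. Only eight mm registers
; exist, so callers load rows 1-7 into registers, pass the eighth row as the
; memory operand %9, hand in %8 as pure scratch, and supply a one-entry memory
; spill slot in %10; that register juggling is why the outputs land in
; permuted registers.
;
; For reference, a minimal sketch of the interleave primitive this relies on.
; MMX_XSwap itself is defined in asm_inc.asm; the expansion below is an
; assumption for illustration only, and the name MMX_XSWAP_SKETCH is
; hypothetical and unused by the build.
; %1 = granularity (bw/wd/dq), %2 = src1, %3 = src2, %4 = scratch
%macro MMX_XSWAP_SKETCH 4
    movq %4, %2
    punpckh%1 %4, %3 ; %4 = high halves of %2 and %3, interleaved
    punpckl%1 %2, %3 ; %2 = low halves of %2 and %3, interleaved
%endmacro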

;in: m0, m3, m5, m2, m7, m1, m6, m4
%macro TRANSPOSE8x8_WRITE_MMX 2 ; dst, dst_stride
    movq [%1], mm0     ; result of line 1, 8 bytes
    movq [%1+%2], mm3  ; result of line 2
    lea %1, [%1+2*%2]
    movq [%1], mm5     ; result of line 3
    movq [%1+%2], mm2  ; result of line 4
    lea %1, [%1+2*%2]
    movq [%1], mm7     ; result of line 5
    movq [%1+%2], mm1  ; result of line 6
    lea %1, [%1+2*%2]
    movq [%1], mm6     ; result of line 7
    movq [%1+%2], mm4  ; result of line 8
%endmacro

;in: m0, m3, m5, m2, m7, m1, m6, m4
%macro TRANSPOSE8x8_WRITE_ALT_MMX 3 ; dst, dst_stride, reg32
    movq [%1], mm0     ; result of line 1, 8 bytes
    movq [%1+%2], mm3  ; result of line 2
    lea %3, [%1+2*%2]
    movq [%3], mm5     ; result of line 3
    movq [%3+%2], mm2  ; result of line 4
    lea %3, [%3+2*%2]
    movq [%3], mm7     ; result of line 5
    movq [%3+%2], mm1  ; result of line 6
    lea %3, [%3+2*%2]
    movq [%3], mm6     ; result of line 7
    movq [%3+%2], mm4  ; result of line 8
%endmacro ; end of TRANSPOSE8x8_WRITE_ALT_MMX

; for 16x8 transpose (8 rows of 16 bytes in, 16 rows of 8 bytes out)

;in: m0, m1, m2, m3, m4, m5, m6, m7
;out: m4, m2, m3, m7, m5, m1, m6, m0
%macro TRANSPOSE_8x16B_SSE2 10
    SSE2_XSawp bw, %1, %2, %8
    SSE2_XSawp bw, %3, %4, %2
    SSE2_XSawp bw, %5, %6, %4
    movdqa %6, %9
    movdqa %10, %4
    SSE2_XSawp bw, %7, %6, %4

    SSE2_XSawp wd, %1, %3, %6
    SSE2_XSawp wd, %8, %2, %3
    SSE2_XSawp wd, %5, %7, %2
    movdqa %7, %10
    movdqa %10, %3
    SSE2_XSawp wd, %7, %4, %3

    SSE2_XSawp dq, %1, %5, %4
    SSE2_XSawp dq, %6, %2, %5
    SSE2_XSawp dq, %8, %7, %2
    movdqa %7, %10
    movdqa %10, %5
    SSE2_XSawp dq, %7, %3, %5

    SSE2_XSawp qdq, %1, %8, %3
    SSE2_XSawp qdq, %4, %2, %8
    SSE2_XSawp qdq, %6, %7, %2
    movdqa %7, %10
    movdqa %10, %1
    SSE2_XSawp qdq, %7, %5, %1
    movdqa %5, %10
%endmacro ; end of TRANSPOSE_8x16B_SSE2
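
; After the four interleave rounds (bw, wd, dq, qdq), each output register
; holds two transposed rows: row n in its low qword and row n+8 in its high
; qword. The write macros below exploit this layout, storing the low halves
; with movq (rows 1-8) and the high halves with movhpd (rows 9-16).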

%macro TRANSPOSE8x16_WRITE_SSE2 2 ; dst, dst_stride
    movq [%1], xmm4       ; result of line 1, 8 bytes
    movq [%1+%2], xmm2    ; result of line 2
    lea %1, [%1+2*%2]
    movq [%1], xmm3       ; result of line 3
    movq [%1+%2], xmm7    ; result of line 4

    lea %1, [%1+2*%2]
    movq [%1], xmm5       ; result of line 5
    movq [%1+%2], xmm1    ; result of line 6
    lea %1, [%1+2*%2]
    movq [%1], xmm6       ; result of line 7
    movq [%1+%2], xmm0    ; result of line 8

    lea %1, [%1+2*%2]
    movhpd [%1], xmm4     ; result of line 9
    movhpd [%1+%2], xmm2  ; result of line 10
    lea %1, [%1+2*%2]
    movhpd [%1], xmm3     ; result of line 11
    movhpd [%1+%2], xmm7  ; result of line 12

    lea %1, [%1+2*%2]
    movhpd [%1], xmm5     ; result of line 13
    movhpd [%1+%2], xmm1  ; result of line 14
    lea %1, [%1+2*%2]
    movhpd [%1], xmm6     ; result of line 15
    movhpd [%1+%2], xmm0  ; result of line 16
%endmacro ; end of TRANSPOSE8x16_WRITE_SSE2

%macro TRANSPOSE8x16_WRITE_ALT_SSE2 3 ; dst, dst_stride, reg32
    movq [%1], xmm4       ; result of line 1, 8 bytes
    movq [%1+%2], xmm2    ; result of line 2
    lea %3, [%1+2*%2]
    movq [%3], xmm3       ; result of line 3
    movq [%3+%2], xmm7    ; result of line 4

    lea %3, [%3+2*%2]
    movq [%3], xmm5       ; result of line 5
    movq [%3+%2], xmm1    ; result of line 6
    lea %3, [%3+2*%2]
    movq [%3], xmm6       ; result of line 7
    movq [%3+%2], xmm0    ; result of line 8

    lea %3, [%3+2*%2]
    movhpd [%3], xmm4     ; result of line 9
    movhpd [%3+%2], xmm2  ; result of line 10
    lea %3, [%3+2*%2]
    movhpd [%3], xmm3     ; result of line 11
    movhpd [%3+%2], xmm7  ; result of line 12

    lea %3, [%3+2*%2]
    movhpd [%3], xmm5     ; result of line 13
    movhpd [%3+%2], xmm1  ; result of line 14
    lea %3, [%3+2*%2]
    movhpd [%3], xmm6     ; result of line 15
    movhpd [%3+%2], xmm0  ; result of line 16
%endmacro ; end of TRANSPOSE8x16_WRITE_ALT_SSE2


SECTION .text

WELS_EXTERN TransposeMatrixBlock16x16_sse2
; void TransposeMatrixBlock16x16_sse2( void *dst/*16x16*/, const int32_t dst_stride, void *src/*16x16*/, const int32_t src_stride );
    push r4
    push r5
    %assign push_num 2
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d

    ; align the stack pointer down to 16 bytes and reserve a 16-byte
    ; scratch slot at [r7] for the transpose macro's movdqa spill
    mov r4, r7
    and r4, 0Fh
    sub r7, 10h
    sub r7, r4
    lea r5, [r3+r3*2] ; r5 = 3 * src_stride
    ; top 8x16 block
    movdqa xmm0, [r2]
    movdqa xmm1, [r2+r3]
    movdqa xmm2, [r2+r3*2]
    movdqa xmm3, [r2+r5]
    lea r2, [r2+r3*4]
    movdqa xmm4, [r2]
    movdqa xmm5, [r2+r3]
    movdqa xmm6, [r2+r3*2]

    ;in: m0, m1, m2, m3, m4, m5, m6, m7
    ;out: m4, m2, m3, m7, m5, m1, m6, m0
    TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r5], [r7]

    TRANSPOSE8x16_WRITE_SSE2 r0, r1

    ; bottom 8x16 block
    lea r2, [r2+r3*4]
    movdqa xmm0, [r2]
    movdqa xmm1, [r2+r3]
    movdqa xmm2, [r2+r3*2]
    movdqa xmm3, [r2+r5]
    lea r2, [r2+r3*4]
    movdqa xmm4, [r2]
    movdqa xmm5, [r2+r3]
    movdqa xmm6, [r2+r3*2]

    ;in: m0, m1, m2, m3, m4, m5, m6, m7
    ;out: m4, m2, m3, m7, m5, m1, m6, m0
    TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r5], [r7]

    ; rewind dst (the write macro left r0 14 rows ahead) back to row 0,
    ; then step 8 bytes right for the second half of each output row
    mov r5, r1
    sal r5, 4
    sub r0, r5
    lea r0, [r0+r1*2+8]
    TRANSPOSE8x16_WRITE_SSE2 r0, r1

    ; release the scratch slot and restore the original stack pointer
    add r7, r4
    add r7, 10h
    POP_XMM
    LOAD_4_PARA_POP
    pop r5
    pop r4
    ret
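
; The _ALT write macros take a spare register (reg32) for row addressing, so
; the dst pointer itself survives the stores; the block loops below rely on
; that to step dst by a fixed amount per block. Each loop also keeps a second
; pointer 8 rows ahead of the block being read and issues dummy loads through
; it, touching the next block's cache lines early (a manual prefetch; the
; loaded values are discarded).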

WELS_EXTERN TransposeMatrixBlocksx16_sse2
; void TransposeMatrixBlocksx16_sse2( void *dst/*W16x16*/, const int32_t dst_stride, void *src/*16xW16*/, const int32_t src_stride, const int32_t num_blocks );
    push r5
    push r6
    %assign push_num 2
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
    ; align the stack pointer down to 16 bytes and reserve a 16-byte
    ; scratch slot at [r7] for the transpose macro's movdqa spill
    mov r5, r7
    and r5, 0Fh
    sub r7, 10h
    sub r7, r5
TRANSPOSE_LOOP_SSE2:
    ; explicitly load the next block's rows to warm the cache
    ; (r4, the loop counter, doubles as the dummy-load target, so save it)
    lea r6, [r2+r3*8]
    push r4
%rep 8
    mov r4, [r6]
    mov r4, [r6+r3]
    lea r6, [r6+r3*2]
%endrep
    pop r4
    ; top 8x16 block
    movdqa xmm0, [r2]
    movdqa xmm1, [r2+r3]
    lea r2, [r2+r3*2]
    movdqa xmm2, [r2]
    movdqa xmm3, [r2+r3]
    lea r2, [r2+r3*2]
    movdqa xmm4, [r2]
    movdqa xmm5, [r2+r3]
    lea r2, [r2+r3*2]
    movdqa xmm6, [r2]

    ;in: m0, m1, m2, m3, m4, m5, m6, m7
    ;out: m4, m2, m3, m7, m5, m1, m6, m0
    TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r3], [r7]
    TRANSPOSE8x16_WRITE_ALT_SSE2 r0, r1, r6
    lea r2, [r2+r3*2]

    ; bottom 8x16 block
    movdqa xmm0, [r2]
    movdqa xmm1, [r2+r3]
    lea r2, [r2+r3*2]
    movdqa xmm2, [r2]
    movdqa xmm3, [r2+r3]
    lea r2, [r2+r3*2]
    movdqa xmm4, [r2]
    movdqa xmm5, [r2+r3]
    lea r2, [r2+r3*2]
    movdqa xmm6, [r2]

    ;in: m0, m1, m2, m3, m4, m5, m6, m7
    ;out: m4, m2, m3, m7, m5, m1, m6, m0
    TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r3], [r7]
    TRANSPOSE8x16_WRITE_ALT_SSE2 r0+8, r1, r6
    lea r2, [r2+r3*2]
    lea r0, [r0+16] ; advance dst to the next 16x16 block
    dec r4
    jg near TRANSPOSE_LOOP_SSE2

    ; release the scratch slot and restore the original stack pointer
    add r7, r5
    add r7, 10h
    POP_XMM
    LOAD_5_PARA_POP
    pop r6
    pop r5
    ret

WELS_EXTERN TransposeMatrixBlock8x8_mmx
; void TransposeMatrixBlock8x8_mmx( void *dst/*8x8*/, const int32_t dst_stride, void *src/*8x8*/, const int32_t src_stride );
    %assign push_num 0
    LOAD_4_PARA
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    sub r7, 8 ; reserve an 8-byte scratch slot at [r7] for the movq spill

    movq mm0, [r2]
    movq mm1, [r2+r3]
    lea r2, [r2+2*r3]
    movq mm2, [r2]
    movq mm3, [r2+r3]
    lea r2, [r2+2*r3]
    movq mm4, [r2]
    movq mm5, [r2+r3]
    lea r2, [r2+2*r3]
    movq mm6, [r2]

    ;in: m0, m1, m2, m3, m4, m5, m6, m7
    ;out: m0, m3, m5, m2, m7, m1, m6, m4
    TRANSPOSE_8x8B_MMX mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, [r2+r3], [r7]

    TRANSPOSE8x8_WRITE_MMX r0, r1

    emms
    add r7, 8
    LOAD_4_PARA_POP
    ret

WELS_EXTERN TransposeMatrixBlocksx8_mmx
; void TransposeMatrixBlocksx8_mmx( void *dst/*8xW8*/, const int32_t dst_stride, void *src/*W8x8*/, const int32_t src_stride, const int32_t num_blocks );
    push r5
    push r6
    %assign push_num 2
    LOAD_5_PARA
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
    sub r7, 8 ; reserve an 8-byte scratch slot at [r7] for the movq spill

    lea r5, [r2+r3*8] ; r5 stays 8 rows ahead of r2 throughout the loop

TRANSPOSE_BLOCKS_X8_LOOP_MMX:
    ; explicitly load the next block's rows to warm the cache
    ; (the loaded values are discarded)
%rep 4
    mov r6, [r5]
    mov r6, [r5+r3]
    lea r5, [r5+r3*2]
%endrep
    movq mm0, [r2]
    movq mm1, [r2+r3]
    lea r2, [r2+2*r3]
    movq mm2, [r2]
    movq mm3, [r2+r3]
    lea r2, [r2+2*r3]
    movq mm4, [r2]
    movq mm5, [r2+r3]
    lea r2, [r2+2*r3]
    movq mm6, [r2]

    ;in: m0, m1, m2, m3, m4, m5, m6, m7
    ;out: m0, m3, m5, m2, m7, m1, m6, m4
    TRANSPOSE_8x8B_MMX mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, [r2+r3], [r7]

    TRANSPOSE8x8_WRITE_ALT_MMX r0, r1, r6
    lea r0, [r0+8] ; advance dst to the next 8x8 block
    lea r2, [r2+2*r3]
    dec r4
    jg near TRANSPOSE_BLOCKS_X8_LOOP_MMX

    emms
    add r7, 8
    LOAD_5_PARA_POP
    pop r6
    pop r5
    ret
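
; Notes (summarizing the instruction choices above, not a normative contract):
; - The SSE2 routines read source rows with movdqa, so src and src_stride are
;   expected to be 16-byte aligned; dst is written with movq/movhpd, which
;   have no alignment requirement.
; - The MMX routines execute emms before returning, so callers may use x87
;   floating point immediately afterwards.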