;*!
;* \copy
;*     Copyright (c)  2009-2013, Cisco Systems
;*     All rights reserved.
;*
;*     Redistribution and use in source and binary forms, with or without
;*     modification, are permitted provided that the following conditions
;*     are met:
;*
;*        * Redistributions of source code must retain the above copyright
;*          notice, this list of conditions and the following disclaimer.
;*
;*        * Redistributions in binary form must reproduce the above copyright
;*          notice, this list of conditions and the following disclaimer in
;*          the documentation and/or other materials provided with the
;*          distribution.
;*
;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;*     POSSIBILITY OF SUCH DAMAGE.
;*************************************************************************/

%include "asm_inc.asm"

;in:  m0, m1, m2, m3, m4, m5, m6, m7
;out: m0, m3, m5, m2, m7, m1, m6, m4
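; The macro below is the classic three-stage butterfly transpose: interleave
; at byte (bw), word (wd), then dword (dq) granularity, log2(8) = 3 passes
; for an 8x8 byte matrix. For one register pair the first stage looks like
; this (assuming the MMX_XSwap helper from asm_inc.asm leaves the low
; interleave of a pair in its first data operand and the high interleave in
; its last, scratch operand):
;     row0 = a0 a1 a2 a3 a4 a5 a6 a7
;     row1 = b0 b1 b2 b3 b4 b5 b6 b7
;     bw  ->  a0 b0 a1 b1 a2 b2 a3 b3   and   a4 b4 a5 b5 a6 b6 a7 b7
; Only seven source rows are loaded up front: the eighth arrives via the
; memory operand %9, and %10 is a qword spill slot standing in for a ninth
; register, since MMX has only eight.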
%macro TRANSPOSE_8x8B_MMX 10
    ; stage 1: byte interleave
    MMX_XSwap bw,  %1, %2, %8
    MMX_XSwap bw,  %3, %4, %2
    MMX_XSwap bw,  %5, %6, %4
    movq    %6, %9          ; load the eighth source row
    movq    %10, %4         ; spill one temporary to the scratch slot
    MMX_XSwap bw,  %7, %6, %4

    ; stage 2: word interleave
    MMX_XSwap wd,  %1, %3, %6
    MMX_XSwap wd,  %8, %2, %3
    MMX_XSwap wd,  %5, %7, %2
    movq    %7, %10         ; reload the spilled value
    movq    %10, %3         ; spill again
    MMX_XSwap wd,  %7, %4, %3

    ; stage 3: dword interleave
    MMX_XSwap dq,  %1, %5, %4
    MMX_XSwap dq,  %6, %2, %5
    MMX_XSwap dq,  %8, %7, %2
    movq    %7, %10
    movq    %10, %5
    MMX_XSwap dq,  %7, %3, %5

    movq    %3, %10         ; reload the last spilled row
%endmacro

;in: m0, m3, m5, m2, m7, m1, m6, m4
%macro TRANSPOSE8x8_WRITE_MMX 2 ; dst, dst_stride
    movq [%1], mm0          ; result of line 1, x8 bytes
    movq [%1+%2], mm3       ; result of line 2
    lea %1, [%1+2*%2]
    movq [%1], mm5          ; result of line 3
    movq [%1+%2], mm2       ; result of line 4
    lea %1, [%1+2*%2]
    movq [%1], mm7          ; result of line 5
    movq [%1+%2], mm1       ; result of line 6
    lea %1, [%1+2*%2]
    movq [%1], mm6          ; result of line 7
    movq [%1+%2], mm4       ; result of line 8
%endmacro

;in: m0, m3, m5, m2, m7, m1, m6, m4
; same as TRANSPOSE8x8_WRITE_MMX, but advances the scratch register %3
; instead of dst, so the caller's dst pointer (%1) is preserved
%macro TRANSPOSE8x8_WRITE_ALT_MMX 3 ; dst, dst_stride, reg32
    movq [%1], mm0          ; result of line 1, x8 bytes
    movq [%1+%2], mm3       ; result of line 2
    lea %3, [%1+2*%2]
    movq [%3], mm5          ; result of line 3
    movq [%3+%2], mm2       ; result of line 4
    lea %3, [%3+2*%2]
    movq [%3], mm7          ; result of line 5
    movq [%3+%2], mm1       ; result of line 6
    lea %3, [%3+2*%2]
    movq [%3], mm6          ; result of line 7
    movq [%3+%2], mm4       ; result of line 8
%endmacro   ; end of TRANSPOSE8x8_WRITE_ALT_MMX

; for transpose 16x8

;in:  m0, m1, m2, m3, m4, m5, m6, m7
;out: m4, m2, m3, m7, m5, m1, m6, m0
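; Same butterfly as the MMX version with one extra qword (qdq) stage: four
; passes (bw, wd, dq, qdq) transpose a 16x8 byte block held in eight xmm
; registers. Afterwards each register carries two transposed rows: rows 1-8
; of the result sit in the low qwords and rows 9-16 in the high qwords,
; which is what the movq/movhpd split in the write macros below relies on.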
%macro TRANSPOSE_8x16B_SSE2     10
    ; stage 1: byte interleave
    SSE2_XSawp bw,  %1, %2, %8
    SSE2_XSawp bw,  %3, %4, %2
    SSE2_XSawp bw,  %5, %6, %4
    movdqa  %6, %9          ; load the eighth source row
    movdqa  %10, %4         ; spill one temporary to the scratch slot
    SSE2_XSawp bw,  %7, %6, %4

    ; stage 2: word interleave
    SSE2_XSawp wd,  %1, %3, %6
    SSE2_XSawp wd,  %8, %2, %3
    SSE2_XSawp wd,  %5, %7, %2
    movdqa  %7, %10         ; reload the spilled value
    movdqa  %10, %3         ; spill again
    SSE2_XSawp wd,  %7, %4, %3

    ; stage 3: dword interleave
    SSE2_XSawp dq,  %1, %5, %4
    SSE2_XSawp dq,  %6, %2, %5
    SSE2_XSawp dq,  %8, %7, %2
    movdqa  %7, %10
    movdqa  %10, %5
    SSE2_XSawp dq,  %7, %3, %5

    ; stage 4: qword interleave
    SSE2_XSawp qdq,  %1, %8, %3
    SSE2_XSawp qdq,  %4, %2, %8
    SSE2_XSawp qdq,  %6, %7, %2
    movdqa  %7, %10
    movdqa  %10, %1
    SSE2_XSawp qdq,  %7, %5, %1
    movdqa  %5, %10         ; reload the last spilled row
%endmacro   ; end of TRANSPOSE_8x16B_SSE2


;in: m4, m2, m3, m7, m5, m1, m6, m0
%macro TRANSPOSE8x16_WRITE_SSE2 2   ; dst, dst_stride
    movq [%1], xmm4         ; result of line 1, x8 bytes
    movq [%1+%2], xmm2      ; result of line 2
    lea %1, [%1+2*%2]
    movq [%1], xmm3         ; result of line 3
    movq [%1+%2], xmm7      ; result of line 4

    lea %1, [%1+2*%2]
    movq [%1], xmm5         ; result of line 5
    movq [%1+%2], xmm1      ; result of line 6
    lea %1, [%1+2*%2]
    movq [%1], xmm6         ; result of line 7
    movq [%1+%2], xmm0      ; result of line 8

    lea %1, [%1+2*%2]
    movhpd [%1], xmm4       ; result of line 9
    movhpd [%1+%2], xmm2    ; result of line 10
    lea %1, [%1+2*%2]
    movhpd [%1], xmm3       ; result of line 11
    movhpd [%1+%2], xmm7    ; result of line 12

    lea %1, [%1+2*%2]
    movhpd [%1], xmm5       ; result of line 13
    movhpd [%1+%2], xmm1    ; result of line 14
    lea %1, [%1+2*%2]
    movhpd [%1], xmm6       ; result of line 15
    movhpd [%1+%2], xmm0    ; result of line 16
%endmacro   ; end of TRANSPOSE8x16_WRITE_SSE2

;in: m4, m2, m3, m7, m5, m1, m6, m0
; same as TRANSPOSE8x16_WRITE_SSE2, but advances the scratch register %3
; instead of dst, so the caller's dst pointer (%1) is preserved
%macro TRANSPOSE8x16_WRITE_ALT_SSE2 3   ; dst, dst_stride, reg32
    movq [%1], xmm4         ; result of line 1, x8 bytes
    movq [%1+%2], xmm2      ; result of line 2
    lea %3, [%1+2*%2]
    movq [%3], xmm3         ; result of line 3
    movq [%3+%2], xmm7      ; result of line 4

    lea %3, [%3+2*%2]
    movq [%3], xmm5         ; result of line 5
    movq [%3+%2], xmm1      ; result of line 6
    lea %3, [%3+2*%2]
    movq [%3], xmm6         ; result of line 7
    movq [%3+%2], xmm0      ; result of line 8

    lea %3, [%3+2*%2]
    movhpd [%3], xmm4       ; result of line 9
    movhpd [%3+%2], xmm2    ; result of line 10
    lea %3, [%3+2*%2]
    movhpd [%3], xmm3       ; result of line 11
    movhpd [%3+%2], xmm7    ; result of line 12

    lea %3, [%3+2*%2]
    movhpd [%3], xmm5       ; result of line 13
    movhpd [%3+%2], xmm1    ; result of line 14
    lea %3, [%3+2*%2]
    movhpd [%3], xmm6       ; result of line 15
    movhpd [%3+%2], xmm0    ; result of line 16
%endmacro   ; end of TRANSPOSE8x16_WRITE_ALT_SSE2


SECTION .text

WELS_EXTERN TransposeMatrixBlock16x16_sse2
; void TransposeMatrixBlock16x16_sse2( void *dst/*16x16*/, const int32_t dst_stride, void *src/*16x16*/, const int32_t src_stride );
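; Scalar C sketch of the effect (a hypothetical reference, not part of this
; file):
;
;   for (int i = 0; i < 16; i++)
;     for (int j = 0; j < 16; j++)
;       ((uint8_t*)dst)[i * dst_stride + j] = ((uint8_t*)src)[j * src_stride + i];
;
; The routine handles the block as two 8x16 halves, and carves a 16-byte-
; aligned scratch slot out of the stack (r7 aliases the stack pointer in the
; asm_inc.asm register scheme) so TRANSPOSE_8x16B_SSE2 can spill one xmm
; register with movdqa.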
    push r4
    push r5
    %assign push_num 2
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d

    ; reserve a 16-byte-aligned scratch slot on the stack for movdqa spills
    mov r4, r7
    and r4, 0Fh             ; r4 = current misalignment, restored on exit
    sub r7, 10h
    sub r7, r4
    lea r5, [r3+r3*2]       ; r5 = src_stride * 3
    ; top 8x16 block
    movdqa xmm0, [r2]
    movdqa xmm1, [r2+r3]
    movdqa xmm2, [r2+r3*2]
    movdqa xmm3, [r2+r5]
    lea r2, [r2+r3*4]
    movdqa xmm4, [r2]
    movdqa xmm5, [r2+r3]
    movdqa xmm6, [r2+r3*2]

    ;in:  m0, m1, m2, m3, m4, m5, m6, m7
    ;out: m4, m2, m3, m7, m5, m1, m6, m0
    TRANSPOSE_8x16B_SSE2    xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r5], [r7]

    TRANSPOSE8x16_WRITE_SSE2        r0, r1

    ; bottom 8x16 block
    lea r2, [r2+r3*4]
    movdqa xmm0, [r2]
    movdqa xmm1, [r2+r3]
    movdqa xmm2, [r2+r3*2]
    movdqa xmm3, [r2+r5]
    lea r2, [r2+r3*4]
    movdqa xmm4, [r2]
    movdqa xmm5, [r2+r3]
    movdqa xmm6, [r2+r3*2]

    ;in:  m0, m1, m2, m3, m4, m5, m6, m7
    ;out: m4, m2, m3, m7, m5, m1, m6, m0
    TRANSPOSE_8x16B_SSE2    xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r5], [r7]

    ; rewind dst to the first row (the write macro left r0 14 rows ahead)
    ; and step 8 columns right for the bottom half of the result
    mov r5, r1
    sal r5, 4               ; r5 = dst_stride * 16
    sub r0, r5
    lea r0, [r0+r1*2+8]
    TRANSPOSE8x16_WRITE_SSE2        r0, r1

    add r7, r4              ; release the scratch slot
    add r7, 10h
    POP_XMM
    LOAD_4_PARA_POP
    pop r5
    pop r4
    ret

WELS_EXTERN TransposeMatrixBlocksx16_sse2
; void TransposeMatrixBlocksx16_sse2( void *dst/*W16x16*/, const int32_t dst_stride, void *src/*16xW16*/, const int32_t src_stride, const int32_t num_blocks );
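; As read from the code: src holds num_blocks 16x16 blocks stacked
; vertically (r2 advances sixteen rows per iteration) and dst receives them
; side by side (r0 advances sixteen bytes per iteration). The %rep sequence
; of dummy loads at the top of the loop touches upcoming source rows so they
; are already in cache when the transpose reads them; the loaded values are
; discarded.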
    push r5
    push r6
    %assign push_num 2
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d
    ; reserve a 16-byte-aligned scratch slot on the stack, as above
    mov r5, r7
    and r5, 0Fh
    sub r7, 10h
    sub r7, r5
TRANSPOSE_LOOP_SSE2:
    ; explicitly load the next loop's data to warm the cache; the values
    ; read into r4 are discarded (r4, the block counter, is saved around it)
    lea r6, [r2+r3*8]
    push r4
%rep 8
    mov r4, [r6]
    mov r4, [r6+r3]
    lea r6, [r6+r3*2]
%endrep
    pop r4
    ; top 8x16 block
    movdqa xmm0, [r2]
    movdqa xmm1, [r2+r3]
    lea r2, [r2+r3*2]
    movdqa xmm2, [r2]
    movdqa xmm3, [r2+r3]
    lea r2, [r2+r3*2]
    movdqa xmm4, [r2]
    movdqa xmm5, [r2+r3]
    lea r2, [r2+r3*2]
    movdqa xmm6, [r2]

    ;in:  m0, m1, m2, m3, m4, m5, m6, m7
    ;out: m4, m2, m3, m7, m5, m1, m6, m0
    TRANSPOSE_8x16B_SSE2    xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r3], [r7]
    TRANSPOSE8x16_WRITE_ALT_SSE2        r0, r1, r6
    lea r2, [r2+r3*2]

    ; bottom 8x16 block
    movdqa xmm0, [r2]
    movdqa xmm1, [r2+r3]
    lea r2, [r2+r3*2]
    movdqa xmm2, [r2]
    movdqa xmm3, [r2+r3]
    lea r2, [r2+r3*2]
    movdqa xmm4, [r2]
    movdqa xmm5, [r2+r3]
    lea r2, [r2+r3*2]
    movdqa xmm6, [r2]

    ;in:  m0, m1, m2, m3, m4, m5, m6, m7
    ;out: m4, m2, m3, m7, m5, m1, m6, m0
    TRANSPOSE_8x16B_SSE2    xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r3], [r7]
    TRANSPOSE8x16_WRITE_ALT_SSE2        r0+8, r1, r6
    lea r2, [r2+r3*2]
    lea r0, [r0+16]         ; next block: dst moves 16 bytes right
    dec r4
    jg near TRANSPOSE_LOOP_SSE2

    add r7, r5              ; release the scratch slot
    add r7, 10h
    POP_XMM
    LOAD_5_PARA_POP
    pop r6
    pop r5
    ret

WELS_EXTERN TransposeMatrixBlock8x8_mmx
; void TransposeMatrixBlock8x8_mmx( void *dst/*8x8*/, const int32_t dst_stride, void *src/*8x8*/, const int32_t src_stride );
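; Scalar C sketch (hypothetical reference): the 8x8 analogue of the routine
; above, dst[i * dst_stride + j] = src[j * src_stride + i] for 0 <= i, j < 8.
; Only an 8-byte scratch slot is needed here, and movq has no alignment
; requirement, so the stack pointer is not explicitly aligned.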
    %assign push_num 0
    LOAD_4_PARA
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    sub r7, 8               ; 8-byte stack scratch slot for the spill

    movq mm0, [r2]
    movq mm1, [r2+r3]
    lea r2, [r2+2*r3]
    movq mm2, [r2]
    movq mm3, [r2+r3]
    lea r2, [r2+2*r3]
    movq mm4, [r2]
    movq mm5, [r2+r3]
    lea r2, [r2+2*r3]
    movq mm6, [r2]

    ;in:  m0, m1, m2, m3, m4, m5, m6, m7
    ;out: m0, m3, m5, m2, m7, m1, m6, m4
    TRANSPOSE_8x8B_MMX mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, [r2+r3], [r7]

    TRANSPOSE8x8_WRITE_MMX r0, r1

    emms                    ; leave MMX state before returning
    add r7, 8               ; release the scratch slot
    LOAD_4_PARA_POP
    ret

WELS_EXTERN TransposeMatrixBlocksx8_mmx
; void TransposeMatrixBlocksx8_mmx( void *dst/*8xW8*/, const int32_t dst_stride, void *src/*W8x8*/, const int32_t src_stride, const int32_t num_blocks );
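; MMX counterpart of TransposeMatrixBlocksx16_sse2: transposes num_blocks
; 8x8 blocks, src blocks stacked vertically, dst blocks written side by
; side (r0 advances eight bytes per block). The %rep dummy loads again
; pre-touch upcoming source rows to warm the cache.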
    push r5
    push r6
    %assign push_num 2
    LOAD_5_PARA
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d
    sub r7, 8               ; 8-byte stack scratch slot for the spill

    lea r5, [r2+r3*8]

TRANSPOSE_BLOCKS_X8_LOOP_MMX:
    ; explicitly load the next loop's data to warm the cache; the values
    ; read into r6 are discarded
%rep 4
    mov r6, [r5]
    mov r6, [r5+r3]
    lea r5, [r5+r3*2]
%endrep
    movq mm0, [r2]
    movq mm1, [r2+r3]
    lea r2, [r2+2*r3]
    movq mm2, [r2]
    movq mm3, [r2+r3]
    lea r2, [r2+2*r3]
    movq mm4, [r2]
    movq mm5, [r2+r3]
    lea r2, [r2+2*r3]
    movq mm6, [r2]

    ;in:  m0, m1, m2, m3, m4, m5, m6, m7
    ;out: m0, m3, m5, m2, m7, m1, m6, m4
    TRANSPOSE_8x8B_MMX mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, [r2+r3], [r7]

    TRANSPOSE8x8_WRITE_ALT_MMX r0, r1, r6
    lea r0, [r0+8]          ; next block: dst moves 8 bytes right
    lea r2, [r2+2*r3]
    dec r4
    jg near TRANSPOSE_BLOCKS_X8_LOOP_MMX

    emms                    ; leave MMX state before returning
    add r7, 8               ; release the scratch slot
    LOAD_5_PARA_POP
    pop r6
    pop r5
    ret