• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1;******************************************************************************
2;* TAK DSP SIMD optimizations
3;*
4;* Copyright (C) 2015 Paul B Mahol
5;*
6;* This file is part of FFmpeg.
7;*
8;* FFmpeg is free software; you can redistribute it and/or
9;* modify it under the terms of the GNU Lesser General Public
10;* License as published by the Free Software Foundation; either
11;* version 2.1 of the License, or (at your option) any later version.
12;*
13;* FFmpeg is distributed in the hope that it will be useful,
14;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16;* Lesser General Public License for more details.
17;*
18;* You should have received a copy of the GNU Lesser General Public
19;* License along with FFmpeg; if not, write to the Free Software
20;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21;******************************************************************************
22
23%include "libavutil/x86/x86util.asm"
24
SECTION_RODATA

; Four packed dwords, each 128. tak_decorrelate_sf adds this vector to its
; products immediately before an arithmetic right shift by 8, i.e. it is the
; round-to-nearest bias (half of 1 << 8) for that shift.
pd_128: dd 128, 128, 128, 128

SECTION .text
30
INIT_XMM sse2

;------------------------------------------------------------------------------
; tak_decorrelate_ls(p1, p2, length)
;
; Packed 32-bit add, written back to p2:
;     p2[i] = p1[i] + p2[i]
; p1 is only read.
;
; length is scaled by 4 on entry, so it is taken as a count of 32-bit
; elements. Each pass handles mmsize*2 bytes (two vectors) and the loop is
; bottom-tested: one pass always runs, and the final pass may run beyond
; `length` elements when the count is not a multiple of that chunk.
; NOTE(review): callers presumably pad/round length accordingly - confirm.
; All memory accesses use mova / memory-operand paddd, i.e. aligned forms, so
; both buffers are expected to be mmsize-aligned.
;------------------------------------------------------------------------------
cglobal tak_decorrelate_ls, 3, 3, 2, p1, p2, length
    shl         lengthd, 2                      ; elements -> bytes
    add         p1q, lengthq                    ; p1/p2 now point at the end ...
    add         p2q, lengthq
    neg         lengthq                         ; ... and lengthq is a negative offset rising to 0

.next_chunk:
    mova        m0, [p2q+lengthq]               ; addition is commutative: start from p2,
    mova        m1, [p2q+lengthq+mmsize]
    paddd       m0, [p1q+lengthq]               ; then add p1
    paddd       m1, [p1q+lengthq+mmsize]
    mova        [p2q+lengthq], m0
    mova        [p2q+lengthq+mmsize], m1
    add         lengthq, mmsize*2               ; sets flags for the loop test
    jl .next_chunk                              ; continue while the offset is still negative
    REP_RET
47
;------------------------------------------------------------------------------
; tak_decorrelate_sr(p1, p2, length)
;
; Packed 32-bit subtract, written back to p1:
;     p1[i] = p2[i] - p1[i]
; p2 is only read.
;
; length is scaled by 4 on entry, so it is taken as a count of 32-bit
; elements. Each pass handles mmsize*2 bytes and the loop is bottom-tested,
; so one pass always runs and the last pass may extend past `length`.
; NOTE(review): callers presumably pad/round length accordingly - confirm.
; Aligned access forms are used throughout (mova, memory-operand psubd).
;------------------------------------------------------------------------------
cglobal tak_decorrelate_sr, 3, 3, 2, p1, p2, length
    shl         lengthd, 2                      ; elements -> bytes
    add         p1q, lengthq                    ; bias both pointers to the end of the data
    add         p2q, lengthq
    neg         lengthq                         ; negative byte offset, counts up to 0

.next_chunk:
    ; first vector: p2 - p1
    mova        m0, [p2q+lengthq]
    psubd       m0, [p1q+lengthq]
    ; second vector: p2 - p1 (still read before anything is stored)
    mova        m1, [p2q+lengthq+mmsize]
    psubd       m1, [p1q+lengthq+mmsize]
    ; write both results back over p1
    mova        [p1q+lengthq], m0
    mova        [p1q+lengthq+mmsize], m1
    add         lengthq, mmsize*2
    jl .next_chunk                              ; offset still negative -> more data
    REP_RET
64
;------------------------------------------------------------------------------
; tak_decorrelate_sm(p1, p2, length)
;
; Per 32-bit element, with a = p1[i] and b = p2[i]:
;     a    -= b >> 1        (arithmetic shift)
;     p1[i] = a
;     p2[i] = a + b
; Both buffers are read and written.
;
; length is scaled by 4 on entry, so it is taken as a count of 32-bit
; elements. Two vectors (mmsize*2 bytes) are processed per pass; the loop is
; bottom-tested, so one pass always runs and the last may extend past
; `length`. NOTE(review): callers presumably pad length - confirm.
; Every load and store is a mova, i.e. an aligned access.
;------------------------------------------------------------------------------
cglobal tak_decorrelate_sm, 3, 3, 6, p1, p2, length
    shl         lengthd, 2                      ; elements -> bytes
    add         p1q, lengthq                    ; point both buffers at their end
    add         p2q, lengthq
    neg         lengthq                         ; negative byte offset, counts up to 0

.next_chunk:
    ; fetch everything first: m0/m3 = a (lo/hi vector), m1/m4 = b (lo/hi vector)
    mova        m0, [p1q+lengthq]
    mova        m1, [p2q+lengthq]
    mova        m3, [p1q+lengthq+mmsize]
    mova        m4, [p2q+lengthq+mmsize]

    ; low vector (x86inc turns the 3-operand shift into copy + shift on SSE2)
    psrad       m2, m1, 1                       ; m2 = b >> 1
    psubd       m0, m2                          ; a -= b >> 1
    paddd       m1, m0                          ; b += a

    ; high vector, same computation
    psrad       m5, m4, 1
    psubd       m3, m5
    paddd       m4, m3

    mova        [p1q+lengthq], m0
    mova        [p2q+lengthq], m1
    mova        [p1q+lengthq+mmsize], m3
    mova        [p2q+lengthq+mmsize], m4
    add         lengthq, mmsize*2
    jl .next_chunk                              ; offset still negative -> more data
    REP_RET
91
INIT_XMM sse4

;------------------------------------------------------------------------------
; tak_decorrelate_sf(p1, p2, length, dshift, dfactor)
;
; Per 32-bit element:
;     t     = (((p2[i] >> dshift) * dfactor + 128) >> 8) << dshift
;     p1[i] = t - p1[i]
; Right shifts are arithmetic; the multiply keeps the low 32 bits of each
; product (pmulld, hence the SSE4 requirement). p2 is only read.
;
; Only the first three arguments are loaded into registers by cglobal; dshift
; and dfactor are fetched through their dshiftm/dfactorm argument locations.
;
; length is scaled by 4 on entry, so it is taken as a count of 32-bit
; elements. One vector (mmsize bytes) is handled per pass; the loop is
; bottom-tested, so one pass always runs and the last may extend past
; `length`. NOTE(review): callers presumably pad length - confirm.
; Loads/stores are mova, i.e. aligned accesses.
;------------------------------------------------------------------------------
cglobal tak_decorrelate_sf, 3, 3, 5, p1, p2, length, dshift, dfactor
    ; loop-invariant vectors
    movd        m2, dshiftm                     ; shift count lives in the low bits of m2
    movd        m3, dfactorm
    pshufd      m3, m3, 0                       ; broadcast dfactor to all four lanes
    mova        m4, [pd_128]                    ; rounding bias for the >> 8 below

    ; turn (pointer, count) into (end pointer, negative byte offset)
    shl         lengthd, 2
    add         p1q, lengthq
    add         p2q, lengthq
    neg         lengthq

.next_vector:
    mova        m1, [p2q+lengthq]               ; b
    mova        m0, [p1q+lengthq]               ; a
    psrad       m1, m2                          ; b >> dshift
    pmulld      m1, m3                          ; * dfactor
    paddd       m1, m4                          ; + 128
    psrad       m1, 8                           ; >> 8
    pslld       m1, m2                          ; << dshift
    psubd       m1, m0                          ; - a
    mova        [p1q+lengthq], m1
    add         lengthq, mmsize
    jl .next_vector                             ; offset still negative -> more data
    REP_RET
117