/* Copyright (c) 2014, Cisco Systems, INC
   Written by XiangMingZhu WeiZhou MinPeng YanWang

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <string.h>   /* memcmp(), used by the OPUS_CHECK_ASM path below */
#include <xmmintrin.h>
#include <emmintrin.h>

#include "macros.h"
#include "celt_lpc.h"
#include "stack_alloc.h"
#include "mathops.h"
#include "pitch.h"

#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)
#include <smmintrin.h>
#include "x86cpu.h"

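/* SSE4.1 inner product of two 16-bit vectors: returns the sum over i of
   x[i]*y[i] for i = 0..N-1 in a 32-bit accumulator. The main loop handles
   16 samples per iteration with _mm_madd_epi16(); 8-, 4- and 1-sample
   tails follow. */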
opus_val32 celt_inner_prod_sse4_1(const opus_val16 *x, const opus_val16 *y,
      int N)
{
    opus_int  i, dataSize16;
    opus_int32 sum;
    __m128i inVec1_76543210, inVec1_FEDCBA98, acc1;
    __m128i inVec2_76543210, inVec2_FEDCBA98, acc2;
    __m128i inVec1_3210, inVec2_3210;

    sum = 0;
    dataSize16 = N & ~15;

    acc1 = _mm_setzero_si128();
    acc2 = _mm_setzero_si128();

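    /* Main loop: two 8-sample _mm_madd_epi16() multiply-accumulates per
       iteration, kept in separate accumulators to expose parallelism. */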
    for (i=0;i<dataSize16;i+=16) {
        inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
        inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));

        inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8]));
        inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8]));

        inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
        inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98, inVec2_FEDCBA98);

        acc1 = _mm_add_epi32(acc1, inVec1_76543210);
        acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98);
    }

    acc1 = _mm_add_epi32(acc1, acc2);

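    /* Remaining block of 8 samples, if any. */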
    if (N - i >= 8)
    {
        inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
        inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));

        inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);

        acc1 = _mm_add_epi32(acc1, inVec1_76543210);
        i += 8;
    }

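    /* Remaining block of 4 samples: sign-extend to 32 bits and multiply
       with _mm_mullo_epi32() (SSE4.1). */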
    if (N - i >= 4)
    {
        inVec1_3210 = OP_CVTEPI16_EPI32_M64(&x[i + 0]);
        inVec2_3210 = OP_CVTEPI16_EPI32_M64(&y[i + 0]);

        inVec1_3210 = _mm_mullo_epi32(inVec1_3210, inVec2_3210);

        acc1 = _mm_add_epi32(acc1, inVec1_3210);
        i += 4;
    }

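    /* Horizontal sum: fold the upper 64 bits onto the lower, then add
       the second 32-bit lane onto the first. */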
    acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64(acc1, acc1));
    acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16(acc1, 0x0E));

    sum += _mm_cvtsi128_si32(acc1);

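    /* Scalar tail for the last 0..3 samples. */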
    for (;i<N;i++)
    {
        sum = silk_SMLABB(sum, x[i], y[i]);
    }

    return sum;
}

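/* SSE4.1 correlation kernel: accumulates sum[k] += sum over j of
   x[j]*y[j+k] for the four lags k = 0..3 over len samples (len >= 3). */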
void xcorr_kernel_sse4_1(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len)
{
    int j;

    __m128i vecX, vecX0, vecX1, vecX2, vecX3;
    __m128i vecY0, vecY1, vecY2, vecY3;
    __m128i sum0, sum1, sum2, sum3, vecSum;
    __m128i initSum;

#ifdef OPUS_CHECK_ASM
    opus_val32 sum_c[4];
    for (j=0;j<4;j++) {
      sum_c[j] = sum[j];
    }
    xcorr_kernel_c(x, y, sum_c, len);
#endif

    celt_assert(len >= 3);

    sum0 = _mm_setzero_si128();
    sum1 = _mm_setzero_si128();
    sum2 = _mm_setzero_si128();
    sum3 = _mm_setzero_si128();

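    /* Main loop: 8 samples per iteration, one _mm_madd_epi16() per lag. */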
    for (j=0;j<(len-7);j+=8)
    {
        vecX = _mm_loadu_si128((__m128i *)(&x[j + 0]));
        vecY0 = _mm_loadu_si128((__m128i *)(&y[j + 0]));
        vecY1 = _mm_loadu_si128((__m128i *)(&y[j + 1]));
        vecY2 = _mm_loadu_si128((__m128i *)(&y[j + 2]));
        vecY3 = _mm_loadu_si128((__m128i *)(&y[j + 3]));

        sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(vecX, vecY0));
        sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(vecX, vecY1));
        sum2 = _mm_add_epi32(sum2, _mm_madd_epi16(vecX, vecY2));
        sum3 = _mm_add_epi32(sum3, _mm_madd_epi16(vecX, vecY3));
    }

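    /* Reduce each lag's accumulator to a single 32-bit total in lane 0. */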
    sum0 = _mm_add_epi32(sum0, _mm_unpackhi_epi64(sum0, sum0));
    sum0 = _mm_add_epi32(sum0, _mm_shufflelo_epi16(sum0, 0x0E));

    sum1 = _mm_add_epi32(sum1, _mm_unpackhi_epi64(sum1, sum1));
    sum1 = _mm_add_epi32(sum1, _mm_shufflelo_epi16(sum1, 0x0E));

    sum2 = _mm_add_epi32(sum2, _mm_unpackhi_epi64(sum2, sum2));
    sum2 = _mm_add_epi32(sum2, _mm_shufflelo_epi16(sum2, 0x0E));

    sum3 = _mm_add_epi32(sum3, _mm_unpackhi_epi64(sum3, sum3));
    sum3 = _mm_add_epi32(sum3, _mm_shufflelo_epi16(sum3, 0x0E));

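    /* Pack the four totals into one vector: {sum0, sum1, sum2, sum3}. */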
    vecSum = _mm_unpacklo_epi64(_mm_unpacklo_epi32(sum0, sum1),
          _mm_unpacklo_epi32(sum2, sum3));

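    /* 4 samples at a time: broadcast each x value and multiply it by the
       four consecutive y values that align with lags 0..3, in 32-bit math. */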
    for (;j<(len-3);j+=4)
    {
        vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]);
        vecX0 = _mm_shuffle_epi32(vecX, 0x00);
        vecX1 = _mm_shuffle_epi32(vecX, 0x55);
        vecX2 = _mm_shuffle_epi32(vecX, 0xaa);
        vecX3 = _mm_shuffle_epi32(vecX, 0xff);

        vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
        vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]);
        vecY2 = OP_CVTEPI16_EPI32_M64(&y[j + 2]);
        vecY3 = OP_CVTEPI16_EPI32_M64(&y[j + 3]);

        sum0 = _mm_mullo_epi32(vecX0, vecY0);
        sum1 = _mm_mullo_epi32(vecX1, vecY1);
        sum2 = _mm_mullo_epi32(vecX2, vecY2);
        sum3 = _mm_mullo_epi32(vecX3, vecY3);

        sum0 = _mm_add_epi32(sum0, sum1);
        sum2 = _mm_add_epi32(sum2, sum3);
        vecSum = _mm_add_epi32(vecSum, sum0);
        vecSum = _mm_add_epi32(vecSum, sum2);
    }

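    /* Tail of 1..3 samples: reload the last four x values and use the
       high shuffles to pick out only the x[j..len-1] that remain. */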
    vecX = OP_CVTEPI16_EPI32_M64(&x[len - 4]);
    if (len - j == 3)
    {
        vecX0 = _mm_shuffle_epi32(vecX, 0x55);
        vecX1 = _mm_shuffle_epi32(vecX, 0xaa);
        vecX2 = _mm_shuffle_epi32(vecX, 0xff);

        vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
        vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]);
        vecY2 = OP_CVTEPI16_EPI32_M64(&y[j + 2]);

        sum0 = _mm_mullo_epi32(vecX0, vecY0);
        sum1 = _mm_mullo_epi32(vecX1, vecY1);
        sum2 = _mm_mullo_epi32(vecX2, vecY2);

        vecSum = _mm_add_epi32(vecSum, sum0);
        vecSum = _mm_add_epi32(vecSum, sum1);
        vecSum = _mm_add_epi32(vecSum, sum2);
    }
    else if (len - j == 2)
    {
        vecX0 = _mm_shuffle_epi32(vecX, 0xaa);
        vecX1 = _mm_shuffle_epi32(vecX, 0xff);

        vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
        vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]);

        sum0 = _mm_mullo_epi32(vecX0, vecY0);
        sum1 = _mm_mullo_epi32(vecX1, vecY1);

        vecSum = _mm_add_epi32(vecSum, sum0);
        vecSum = _mm_add_epi32(vecSum, sum1);
    }
    else if (len - j == 1)
    {
        vecX0 = _mm_shuffle_epi32(vecX, 0xff);

        vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);

        sum0 = _mm_mullo_epi32(vecX0, vecY0);

        vecSum = _mm_add_epi32(vecSum, sum0);
    }

    initSum = _mm_loadu_si128((__m128i *)(&sum[0]));
    initSum = _mm_add_epi32(initSum, vecSum);
    _mm_storeu_si128((__m128i *)sum, initSum);

#ifdef OPUS_CHECK_ASM
    celt_assert(!memcmp(sum_c, sum, sizeof(sum_c)));
#endif
}
#endif