/*
 * Copyright (C) 2008 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* ---- includes ----------------------------------------------------------- */

#include "b_BasicEm/Basic.h" /* to disable some warnings in VC++ */

#if ( defined( WIN64 ) || defined( HW_SSE2 ) )

#include "emmintrin.h"

/* disable warning "local variable 'x' used without having been initialized" */
#pragma warning( disable : 4700 )

/** Using half registers (64-bit loads) in SSE2 to calculate the dot product.
 *  This is an SSE2 reimplementation of bbs_dotProduct_intelMMX16 in Math.c.
 *  Dependencies: input vectors only need natural int16 alignment; the 64-bit
 *                loads used here do not require 16-byte alignment.
 *  Return Value: int32 containing the result of the dot product
 */
int32 bbs_dotProduct_64SSE2( const int16* vec1A, const int16* vec2A, uint32 sizeA )
{
    __m128i m_XMM0, m_XMM1, m_XMM2, m_XMM3, m_XMM4, m_XMM5, m_XMM6, m_XMM7, m_XMM8;
    int16* vec1L = ( int16* )vec1A;
    int16* vec2L = ( int16* )vec2A;

    int32 resultL = 0;
    uint32 alignOffSetL = 0;

    /* initialize registers to 0 */
    m_XMM4 = _mm_xor_si128( m_XMM4, m_XMM4 );
    m_XMM6 = _mm_xor_si128( m_XMM6, m_XMM6 );
    m_XMM7 = _mm_xor_si128( m_XMM7, m_XMM7 );

    alignOffSetL = sizeA % 16;
    sizeA >>= 4;

    if( sizeA )
    {
        while( sizeA > 0 )
        {
            m_XMM0 = _mm_loadl_epi64( (__m128i *)&0[vec1L] );
            m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM4 );

            m_XMM1 = _mm_loadl_epi64( (__m128i *)&0[vec2L] );
            m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM6 );

            m_XMM2 = _mm_loadl_epi64( (__m128i *)&4[vec1L] );

            m_XMM0 = _mm_madd_epi16( m_XMM0, m_XMM1 );

            m_XMM3 = _mm_loadl_epi64( (__m128i *)&4[vec2L] );
            m_XMM4 = _mm_loadl_epi64( (__m128i *)&8[vec1L] );

            m_XMM2 = _mm_madd_epi16( m_XMM2, m_XMM3 );

            m_XMM5 = _mm_loadl_epi64( (__m128i *)&8[vec2L] );

            m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM0 );

            m_XMM6 = _mm_loadl_epi64( (__m128i *)&12[vec1L] );

            m_XMM4 = _mm_madd_epi16( m_XMM4, m_XMM5 );

            m_XMM8 = _mm_loadl_epi64( (__m128i *)&12[vec2L] );
            m_XMM6 = _mm_madd_epi16( m_XMM6, m_XMM8 );

            m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM2 );

            vec1L += 16;
            vec2L += 16;
            sizeA--;
        }

        /* sum up accumulators */
        m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM4 );

        m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM6 );

        m_XMM0 = _mm_loadl_epi64( (__m128i *)&m_XMM7 );

        m_XMM0 = _mm_srli_epi64( m_XMM0, 32 );

        m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM0 );

        resultL = _mm_cvtsi128_si32( m_XMM7 );
    }

    /* handle the remaining ( sizeA % 16 ) elements; the unrolled fall-through
       switch produces faster code than a loop */
    switch( alignOffSetL )
    {
        case 15:
            resultL += ( int32 )*vec1L++ * *vec2L++;
        case 14:
            resultL += ( int32 )*vec1L++ * *vec2L++;
        case 13:
            resultL += ( int32 )*vec1L++ * *vec2L++;
        case 12:
            resultL += ( int32 )*vec1L++ * *vec2L++;
        case 11:
            resultL += ( int32 )*vec1L++ * *vec2L++;
        case 10:
            resultL += ( int32 )*vec1L++ * *vec2L++;
        case 9:
            resultL += ( int32 )*vec1L++ * *vec2L++;
        case 8:
            resultL += ( int32 )*vec1L++ * *vec2L++;
        case 7:
            resultL += ( int32 )*vec1L++ * *vec2L++;
        case 6:
            resultL += ( int32 )*vec1L++ * *vec2L++;
        case 5:
            resultL += ( int32 )*vec1L++ * *vec2L++;
        case 4:
            resultL += ( int32 )*vec1L++ * *vec2L++;
        case 3:
            resultL += ( int32 )*vec1L++ * *vec2L++;
        case 2:
            resultL += ( int32 )*vec1L++ * *vec2L++;
        case 1:
            resultL += ( int32 )*vec1L++ * *vec2L++;
    }

    return resultL;
}
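
/* Illustrative only: a plain C sketch of what the SSE2 routines in this file
 * compute, i.e. sum( vec1A[ i ] * vec2A[ i ] ) accumulated in 32 bit.
 * The name bbs_dotProduct_refC is hypothetical and not part of this library;
 * the block is disabled so it does not affect the build.
 */
#if 0
int32 bbs_dotProduct_refC( const int16* vec1A, const int16* vec2A, uint32 sizeA )
{
    uint32 iL;
    int32 resultL = 0;
    for( iL = 0; iL < sizeA; iL++ )
    {
        resultL += ( int32 )vec1A[ iL ] * vec2A[ iL ];
    }
    return resultL;
}
#endif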

/* ------------------------------------------------------------------------- */

/** Using the full register width (128 bit) in SSE2 to calculate the dot product.
 *  Dependencies: input vectors need to be 16-byte aligned
 *  Return Value: int32 containing the dot product
 */
int32 bbs_dotProduct_128SSE2( const int16* vec1A, const int16* vec2A, uint32 sizeA )
{
    __m128i m_XMM0, m_XMM2, m_XMM3, m_XMM5, m_XMM6;
    int16* vec1L = ( int16* )vec1A;
    int16* vec2L = ( int16* )vec2A;

    int32 resultL = 0;
    uint32 alignOffSetL = 0;

    m_XMM5 = _mm_xor_si128( m_XMM5, m_XMM5 );
    m_XMM6 = _mm_xor_si128( m_XMM6, m_XMM6 );

    alignOffSetL = sizeA % 16;
    sizeA >>= 4;

    if( sizeA )
    {
        while( sizeA > 0 )
        {
            m_XMM0 = _mm_load_si128( (__m128i *)&0[vec1L] );
            m_XMM5 = _mm_add_epi32( m_XMM5, m_XMM6 );

            m_XMM2 = _mm_load_si128( (__m128i *)&0[vec2L] );

            m_XMM6 = _mm_load_si128( (__m128i *)&8[vec1L] );

            m_XMM0 = _mm_madd_epi16( m_XMM0, m_XMM2 );

            m_XMM5 = _mm_add_epi32( m_XMM5, m_XMM0 );

            m_XMM3 = _mm_load_si128( (__m128i *)&8[vec2L] );

            m_XMM6 = _mm_madd_epi16( m_XMM6, m_XMM3 );

            vec1L += 16;
            vec2L += 16;
            sizeA--;
        }

        /* sum up accumulators */
        m_XMM5 = _mm_add_epi32( m_XMM5, m_XMM6 );

        m_XMM0 = _mm_load_si128( (__m128i *)&m_XMM5 );

        resultL = _mm_cvtsi128_si32( m_XMM0 ); /* 1st 32bits */

        m_XMM0 = _mm_srli_si128( m_XMM0, 4 );

        resultL += _mm_cvtsi128_si32( m_XMM0 ); /* 2nd 32bits */

        m_XMM0 = _mm_srli_si128( m_XMM0, 4 );

        resultL += _mm_cvtsi128_si32( m_XMM0 ); /* 3rd 32bits */

        m_XMM0 = _mm_srli_si128( m_XMM0, 4 );

        resultL += _mm_cvtsi128_si32( m_XMM0 ); /* 4th 32bits */
    }

    switch( alignOffSetL )
    {
        case 15:
            resultL += ( int32 )*vec1L++ * *vec2L++;
        case 14:
            resultL += ( int32 )*vec1L++ * *vec2L++;
        case 13:
            resultL += ( int32 )*vec1L++ * *vec2L++;
        case 12:
            resultL += ( int32 )*vec1L++ * *vec2L++;
        case 11:
            resultL += ( int32 )*vec1L++ * *vec2L++;
        case 10:
            resultL += ( int32 )*vec1L++ * *vec2L++;
        case 9:
            resultL += ( int32 )*vec1L++ * *vec2L++;
        case 8:
            resultL += ( int32 )*vec1L++ * *vec2L++;
        case 7:
            resultL += ( int32 )*vec1L++ * *vec2L++;
        case 6:
            resultL += ( int32 )*vec1L++ * *vec2L++;
        case 5:
            resultL += ( int32 )*vec1L++ * *vec2L++;
        case 4:
            resultL += ( int32 )*vec1L++ * *vec2L++;
        case 3:
            resultL += ( int32 )*vec1L++ * *vec2L++;
        case 2:
            resultL += ( int32 )*vec1L++ * *vec2L++;
        case 1:
            resultL += ( int32 )*vec1L++ * *vec2L++;
    }

    return resultL;
}
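
/* Usage sketch (illustrative only): bbs_dotProduct_128SSE2 uses aligned
 * 128-bit loads ( _mm_load_si128 ), so both input pointers must be 16-byte
 * aligned.  The alignment specifier below assumes a VC++ build, matching the
 * #pragma above; the function name and buffers are made up for illustration,
 * and the block is disabled so it does not affect the build.
 */
#if 0
void bbs_dotProduct_128SSE2_usageExample( void )
{
    __declspec( align( 16 ) ) int16 aL[ 32 ];
    __declspec( align( 16 ) ) int16 bL[ 32 ];
    int32 dotL;
    uint32 iL;
    for( iL = 0; iL < 32; iL++ )
    {
        aL[ iL ] = ( int16 )iL;
        bL[ iL ] = ( int16 )( 2 * iL );
    }
    dotL = bbs_dotProduct_128SSE2( aL, bL, 32 );
    /* dotL now equals sum( aL[ iL ] * bL[ iL ] ) for iL = 0..31 */
}
#endif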

/* ------------------------------------------------------------------------- */


/** Using the full register width (128 bit) in SSE2 to calculate the dot product (non-aligned version).
 *  Dependencies: memory does not need to be 16-byte aligned
 *  Return Value: int32 containing the dot product
 */
int32 bbs_dotProduct_u128SSE2( const int16* vec1A, const int16* vec2A, uint32 sizeA )
{
    __m128i m_XMM0, m_XMM2, m_XMM3, m_XMM5, m_XMM6;
    int16* vec1L = ( int16* )vec1A;
    int16* vec2L = ( int16* )vec2A;
    int32 resultL = 0;
    uint32 alignOffSetL = 0;

    /* initialize registers to 0 */
    m_XMM5 = _mm_xor_si128( m_XMM5, m_XMM5 );
    m_XMM6 = _mm_xor_si128( m_XMM6, m_XMM6 );


    alignOffSetL = sizeA % 16;
    sizeA >>= 4;

    if( sizeA )
    {
        while( sizeA > 0 )
        {
            m_XMM0 = _mm_loadu_si128( (__m128i *)&0[vec1L] );
            m_XMM5 = _mm_add_epi32( m_XMM5, m_XMM6 );

            m_XMM2 = _mm_loadu_si128( (__m128i *)&0[vec2L] );

            m_XMM6 = _mm_loadu_si128( (__m128i *)&8[vec1L] );

            m_XMM0 = _mm_madd_epi16( m_XMM0, m_XMM2 );

            m_XMM5 = _mm_add_epi32( m_XMM5, m_XMM0 );

            m_XMM3 = _mm_loadu_si128( (__m128i *)&8[vec2L] );

            m_XMM6 = _mm_madd_epi16( m_XMM6, m_XMM3 );

            vec1L += 16;
            vec2L += 16;
            sizeA--;
        }

        /* sum up accumulators */
        m_XMM5 = _mm_add_epi32( m_XMM5, m_XMM6 );

        m_XMM0 = _mm_loadu_si128( (__m128i *)&m_XMM5 );

        resultL = _mm_cvtsi128_si32( m_XMM0 ); /* 1st 32bits */

        m_XMM0 = _mm_srli_si128( m_XMM0, 4 );

        resultL += _mm_cvtsi128_si32( m_XMM0 ); /* 2nd 32bits */

        m_XMM0 = _mm_srli_si128( m_XMM0, 4 );

        resultL += _mm_cvtsi128_si32( m_XMM0 ); /* 3rd 32bits */

        m_XMM0 = _mm_srli_si128( m_XMM0, 4 );

        resultL += _mm_cvtsi128_si32( m_XMM0 ); /* 4th 32bits */
    }


    switch( alignOffSetL )
    {
        case 15:
            resultL += ( int32 )*vec1L++ * *vec2L++;
        case 14:
            resultL += ( int32 )*vec1L++ * *vec2L++;
        case 13:
            resultL += ( int32 )*vec1L++ * *vec2L++;
        case 12:
            resultL += ( int32 )*vec1L++ * *vec2L++;
        case 11:
            resultL += ( int32 )*vec1L++ * *vec2L++;
        case 10:
            resultL += ( int32 )*vec1L++ * *vec2L++;
        case 9:
            resultL += ( int32 )*vec1L++ * *vec2L++;
        case 8:
            resultL += ( int32 )*vec1L++ * *vec2L++;
        case 7:
            resultL += ( int32 )*vec1L++ * *vec2L++;
        case 6:
            resultL += ( int32 )*vec1L++ * *vec2L++;
        case 5:
            resultL += ( int32 )*vec1L++ * *vec2L++;
        case 4:
            resultL += ( int32 )*vec1L++ * *vec2L++;
        case 3:
            resultL += ( int32 )*vec1L++ * *vec2L++;
        case 2:
            resultL += ( int32 )*vec1L++ * *vec2L++;
        case 1:
            resultL += ( int32 )*vec1L++ * *vec2L++;
    }

    return resultL;
}
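
/* Note (illustrative only): bbs_dotProduct_u128SSE2 differs from
 * bbs_dotProduct_128SSE2 only in its use of unaligned loads ( _mm_loadu_si128 ),
 * so it can be called on arbitrarily aligned int16 pointers, e.g. a sub-vector
 * starting at an odd element index.  The function name below is made up for
 * illustration, and the block is disabled so it does not affect the build.
 */
#if 0
void bbs_dotProduct_u128SSE2_usageExample( const int16* baseA, uint32 sizeA )
{
    /* baseA + 1 is in general not 16-byte aligned */
    int32 dotL = bbs_dotProduct_u128SSE2( baseA + 1, baseA + 1, sizeA - 1 );
    ( void )dotL;
}
#endif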

/* ------------------------------------------------------------------------- */

#endif /* defined( WIN64 ) || defined( HW_SSE2 ) */