/*
 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <emmintrin.h>  // SSE2

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"

#include "vpx_ports/mem.h"

// Kernel signature shared by the fixed-size block helpers below: each one
// fills *sse with the sum of squared differences and *sum with the sum of
// differences between an NxM src block and the corresponding ref block.
typedef void (*getNxMvar_fn_t) (const unsigned char *src, int src_stride,
                                const unsigned char *ref, int ref_stride,
                                unsigned int *sse, int *sum);

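// Sum of squares over 256 consecutive int16_t values (a 16x16 macroblock of
// residuals): each _mm_madd_epi16(v, v) squares eight 16-bit lanes and adds
// adjacent products into four 32-bit lanes; the two shift/add steps at the
// end fold those four lanes into one. Scalar reference (illustration only,
// not part of the original file):
//   uint32_t ss = 0;
//   for (int i = 0; i < 256; ++i) ss += (int32_t)src[i] * src[i];
//   return ss;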
unsigned int vpx_get_mb_ss_sse2(const int16_t *src) {
  __m128i vsum = _mm_setzero_si128();
  int i;

  for (i = 0; i < 32; ++i) {
    const __m128i v = _mm_loadu_si128((const __m128i *)src);
    vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v));
    src += 8;
  }

  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
  return _mm_cvtsi128_si32(vsum);
}

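// READ64 interleaves four bytes from row i with four bytes from row (i + 1)
// into the low eight bytes of an xmm register. The bytes end up interleaved
// rather than concatenated, which is harmless because src and ref are loaded
// with the same layout before the element-wise subtraction.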
#define READ64(p, stride, i) \
  _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const uint32_t *)(p + i * stride)), \
                    _mm_cvtsi32_si128(*(const uint32_t *)(p + (i + 1) * stride)))

static void get4x4var_sse2(const uint8_t *src, int src_stride,
                           const uint8_t *ref, int ref_stride,
                           unsigned int *sse, int *sum) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i src0 = _mm_unpacklo_epi8(READ64(src, src_stride, 0), zero);
  const __m128i src1 = _mm_unpacklo_epi8(READ64(src, src_stride, 2), zero);
  const __m128i ref0 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 0), zero);
  const __m128i ref1 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 2), zero);
  const __m128i diff0 = _mm_sub_epi16(src0, ref0);
  const __m128i diff1 = _mm_sub_epi16(src1, ref1);

  // sum
  __m128i vsum = _mm_add_epi16(diff0, diff1);
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
  *sum = (int16_t)_mm_extract_epi16(vsum, 0);

  // sse
  vsum = _mm_add_epi32(_mm_madd_epi16(diff0, diff0),
                       _mm_madd_epi16(diff1, diff1));
  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
  *sse = _mm_cvtsi128_si32(vsum);
}

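// 8x8 block: two rows are loaded per iteration and widened to 16 bits. The
// 16-bit sum accumulator cannot overflow here because the 64 per-pixel
// differences each lie in [-255, 255], so even the fully reduced sum is at
// most 64 * 255 = 16320, well inside the int16_t range.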
void vpx_get8x8var_sse2(const uint8_t *src, int src_stride,
                        const uint8_t *ref, int ref_stride,
                        unsigned int *sse, int *sum) {
  const __m128i zero = _mm_setzero_si128();
  __m128i vsum = _mm_setzero_si128();
  __m128i vsse = _mm_setzero_si128();
  int i;

  for (i = 0; i < 8; i += 2) {
    const __m128i src0 = _mm_unpacklo_epi8(_mm_loadl_epi64(
        (const __m128i *)(src + i * src_stride)), zero);
    const __m128i ref0 = _mm_unpacklo_epi8(_mm_loadl_epi64(
        (const __m128i *)(ref + i * ref_stride)), zero);
    const __m128i diff0 = _mm_sub_epi16(src0, ref0);

    const __m128i src1 = _mm_unpacklo_epi8(_mm_loadl_epi64(
        (const __m128i *)(src + (i + 1) * src_stride)), zero);
    const __m128i ref1 = _mm_unpacklo_epi8(_mm_loadl_epi64(
        (const __m128i *)(ref + (i + 1) * ref_stride)), zero);
    const __m128i diff1 = _mm_sub_epi16(src1, ref1);

    vsum = _mm_add_epi16(vsum, diff0);
    vsum = _mm_add_epi16(vsum, diff1);
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
  }

  // sum
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
  *sum = (int16_t)_mm_extract_epi16(vsum, 0);

  // sse
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
  *sse = _mm_cvtsi128_si32(vsse);
}

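// 16x16 block: the total sum can reach 256 * 255 = 65280, which no longer fits
// in int16_t, so the horizontal reduction stops one step earlier than in the
// 8x8 case and the two remaining 16-bit partial sums (128 differences each, at
// most 32640 in magnitude) are sign-extended and added as ints.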
void vpx_get16x16var_sse2(const uint8_t *src, int src_stride,
                          const uint8_t *ref, int ref_stride,
                          unsigned int *sse, int *sum) {
  const __m128i zero = _mm_setzero_si128();
  __m128i vsum = _mm_setzero_si128();
  __m128i vsse = _mm_setzero_si128();
  int i;

  for (i = 0; i < 16; ++i) {
    const __m128i s = _mm_loadu_si128((const __m128i *)src);
    const __m128i r = _mm_loadu_si128((const __m128i *)ref);

    const __m128i src0 = _mm_unpacklo_epi8(s, zero);
    const __m128i ref0 = _mm_unpacklo_epi8(r, zero);
    const __m128i diff0 = _mm_sub_epi16(src0, ref0);

    const __m128i src1 = _mm_unpackhi_epi8(s, zero);
    const __m128i ref1 = _mm_unpackhi_epi8(r, zero);
    const __m128i diff1 = _mm_sub_epi16(src1, ref1);

    vsum = _mm_add_epi16(vsum, diff0);
    vsum = _mm_add_epi16(vsum, diff1);
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));

    src += src_stride;
    ref += ref_stride;
  }

  // sum
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
  *sum = (int16_t)_mm_extract_epi16(vsum, 0) +
         (int16_t)_mm_extract_epi16(vsum, 1);

  // sse
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
  *sse = _mm_cvtsi128_si32(vsse);
}

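// Generic driver: tiles a w x h region into block_size x block_size sub-blocks,
// runs the given kernel on each tile, and accumulates the partial *sse and
// *sum values.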
static void variance_sse2(const unsigned char *src, int src_stride,
                          const unsigned char *ref, int ref_stride,
                          int w, int h, unsigned int *sse, int *sum,
                          getNxMvar_fn_t var_fn, int block_size) {
  int i, j;

  *sse = 0;
  *sum = 0;

  for (i = 0; i < h; i += block_size) {
    for (j = 0; j < w; j += block_size) {
      unsigned int sse0;
      int sum0;
      var_fn(src + src_stride * i + j, src_stride,
             ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
      *sse += sse0;
      *sum += sum0;
    }
  }
}

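// Each wrapper below applies the identity
//   variance = SSE - sum^2 / (w * h),
// with the division implemented as a right shift by log2(w * h) (4 for 4x4,
// 5 for 8x4, ..., 12 for 64x64). For blocks of up to 256 pixels,
// |sum| <= 256 * 255 = 65280, so sum * sum still fits in 32 bits and an
// unsigned int cast suffices; from 512 pixels (32x16, 16x32) upward the square
// can exceed 32 bits, hence the int64_t casts.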
unsigned int vpx_variance4x4_sse2(const unsigned char *src, int src_stride,
                                  const unsigned char *ref, int ref_stride,
                                  unsigned int *sse) {
  int sum;
  get4x4var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
  return *sse - (((unsigned int)sum * sum) >> 4);
}

unsigned int vpx_variance8x4_sse2(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride,
                                  unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 8, 4,
                sse, &sum, get4x4var_sse2, 4);
  return *sse - (((unsigned int)sum * sum) >> 5);
}

unsigned int vpx_variance4x8_sse2(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride,
                                  unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 4, 8,
                sse, &sum, get4x4var_sse2, 4);
  return *sse - (((unsigned int)sum * sum) >> 5);
}

unsigned int vpx_variance8x8_sse2(const unsigned char *src, int src_stride,
                                  const unsigned char *ref, int ref_stride,
                                  unsigned int *sse) {
  int sum;
  vpx_get8x8var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
  return *sse - (((unsigned int)sum * sum) >> 6);
}

unsigned int vpx_variance16x8_sse2(const unsigned char *src, int src_stride,
                                   const unsigned char *ref, int ref_stride,
                                   unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 16, 8,
                sse, &sum, vpx_get8x8var_sse2, 8);
  return *sse - (((unsigned int)sum * sum) >> 7);
}

unsigned int vpx_variance8x16_sse2(const unsigned char *src, int src_stride,
                                   const unsigned char *ref, int ref_stride,
                                   unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 8, 16,
                sse, &sum, vpx_get8x8var_sse2, 8);
  return *sse - (((unsigned int)sum * sum) >> 7);
}

unsigned int vpx_variance16x16_sse2(const unsigned char *src, int src_stride,
                                    const unsigned char *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  vpx_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
  return *sse - (((unsigned int)sum * sum) >> 8);
}

unsigned int vpx_variance32x32_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 32, 32,
                sse, &sum, vpx_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 10);
}

unsigned int vpx_variance32x16_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 32, 16,
                sse, &sum, vpx_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 9);
}

unsigned int vpx_variance16x32_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 16, 32,
                sse, &sum, vpx_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 9);
}

unsigned int vpx_variance64x64_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 64, 64,
                sse, &sum, vpx_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 12);
}

unsigned int vpx_variance64x32_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 64, 32,
                sse, &sum, vpx_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 11);
}

unsigned int vpx_variance32x64_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 32, 64,
                sse, &sum, vpx_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 11);
}

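// The MSE wrappers reuse the variance routines but return the raw sum of
// squared errors; the mean-removed variance value is simply discarded.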
unsigned int vpx_mse8x8_sse2(const uint8_t *src, int src_stride,
                             const uint8_t *ref, int ref_stride,
                             unsigned int *sse) {
  vpx_variance8x8_sse2(src, src_stride, ref, ref_stride, sse);
  return *sse;
}

unsigned int vpx_mse8x16_sse2(const uint8_t *src, int src_stride,
                              const uint8_t *ref, int ref_stride,
                              unsigned int *sse) {
  vpx_variance8x16_sse2(src, src_stride, ref, ref_stride, sse);
  return *sse;
}

unsigned int vpx_mse16x8_sse2(const uint8_t *src, int src_stride,
                              const uint8_t *ref, int ref_stride,
                              unsigned int *sse) {
  vpx_variance16x8_sse2(src, src_stride, ref, ref_stride, sse);
  return *sse;
}

unsigned int vpx_mse16x16_sse2(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride,
                               unsigned int *sse) {
  vpx_variance16x16_sse2(src, src_stride, ref, ref_stride, sse);
  return *sse;
}

#if CONFIG_USE_X86INC
// The two unused parameters are placeholders for the PIC-enabled build.
// These are declarations for functions defined in subpel_variance.asm.
#define DECL(w, opt) \
  int vpx_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \
                                          ptrdiff_t src_stride, \
                                          int x_offset, int y_offset, \
                                          const uint8_t *dst, \
                                          ptrdiff_t dst_stride, \
                                          int height, unsigned int *sse, \
                                          void *unused0, void *unused)
#define DECLS(opt1, opt2) \
  DECL(4, opt2); \
  DECL(8, opt1); \
  DECL(16, opt1)

DECLS(sse2, sse);
DECLS(ssse3, ssse3);
#undef DECLS
#undef DECL

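// FN(w, h, wf, wlog2, hlog2, opt, cast) builds a full-size sub-pixel variance
// function from the wf-wide column kernel declared above: the kernel covers a
// wf x h column, and for w > wf it is re-run at x offsets 16, 32 and 48 (wf is
// always 16 in those cases), with the partial sums and SSEs accumulated before
// the usual variance = sse - sum^2 >> (wlog2 + hlog2) step. For example,
// FN(64, 32, 16, 6, 5, sse2, (int64_t)) expands to
// vpx_sub_pixel_variance64x32_sse2(), built from four calls to
// vpx_sub_pixel_variance16xh_sse2().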
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
unsigned int vpx_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src, \
                                                     int src_stride, \
                                                     int x_offset, \
                                                     int y_offset, \
                                                     const uint8_t *dst, \
                                                     int dst_stride, \
                                                     unsigned int *sse_ptr) { \
  unsigned int sse; \
  int se = vpx_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \
                                                y_offset, dst, dst_stride, \
                                                h, &sse, NULL, NULL); \
  if (w > wf) { \
    unsigned int sse2; \
    int se2 = vpx_sub_pixel_variance##wf##xh_##opt(src + 16, src_stride, \
                                                   x_offset, y_offset, \
                                                   dst + 16, dst_stride, \
                                                   h, &sse2, NULL, NULL); \
    se += se2; \
    sse += sse2; \
    if (w > wf * 2) { \
      se2 = vpx_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
                                                 x_offset, y_offset, \
                                                 dst + 32, dst_stride, \
                                                 h, &sse2, NULL, NULL); \
      se += se2; \
      sse += sse2; \
      se2 = vpx_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
                                                 x_offset, y_offset, \
                                                 dst + 48, dst_stride, \
                                                 h, &sse2, NULL, NULL); \
      se += se2; \
      sse += sse2; \
    } \
  } \
  *sse_ptr = sse; \
  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}

#define FNS(opt1, opt2) \
FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
FN(16, 16, 16, 4, 4, opt1, (uint32_t)); \
FN(16, 8, 16, 4, 3, opt1, (uint32_t)); \
FN(8, 16, 8, 3, 4, opt1, (uint32_t)); \
FN(8, 8, 8, 3, 3, opt1, (uint32_t)); \
FN(8, 4, 8, 3, 2, opt1, (uint32_t)); \
FN(4, 8, 4, 2, 3, opt2, (uint32_t)); \
FN(4, 4, 4, 2, 2, opt2, (uint32_t))

FNS(sse2, sse);
FNS(ssse3, ssse3);

#undef FNS
#undef FN

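// The "avg" variants below follow the same pattern for the compound
// prediction path: the assembly kernels additionally average the sub-pixel
// prediction with a second predictor, sec, before computing the differences.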
// The two unused parameters are placeholders for the PIC-enabled build.
#define DECL(w, opt) \
  int vpx_sub_pixel_avg_variance##w##xh_##opt(const uint8_t *src, \
                                              ptrdiff_t src_stride, \
                                              int x_offset, int y_offset, \
                                              const uint8_t *dst, \
                                              ptrdiff_t dst_stride, \
                                              const uint8_t *sec, \
                                              ptrdiff_t sec_stride, \
                                              int height, unsigned int *sse, \
                                              void *unused0, void *unused)
#define DECLS(opt1, opt2) \
  DECL(4, opt2); \
  DECL(8, opt1); \
  DECL(16, opt1)

DECLS(sse2, sse);
DECLS(ssse3, ssse3);
#undef DECL
#undef DECLS

#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
unsigned int vpx_sub_pixel_avg_variance##w##x##h##_##opt(const uint8_t *src, \
                                                         int src_stride, \
                                                         int x_offset, \
                                                         int y_offset, \
                                                         const uint8_t *dst, \
                                                         int dst_stride, \
                                                         unsigned int *sseptr, \
                                                         const uint8_t *sec) { \
  unsigned int sse; \
  int se = vpx_sub_pixel_avg_variance##wf##xh_##opt(src, src_stride, x_offset, \
                                                    y_offset, dst, dst_stride, \
                                                    sec, w, h, &sse, NULL, \
                                                    NULL); \
  if (w > wf) { \
    unsigned int sse2; \
    int se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(src + 16, src_stride, \
                                                       x_offset, y_offset, \
                                                       dst + 16, dst_stride, \
                                                       sec + 16, w, h, &sse2, \
                                                       NULL, NULL); \
    se += se2; \
    sse += sse2; \
    if (w > wf * 2) { \
      se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(src + 32, src_stride, \
                                                     x_offset, y_offset, \
                                                     dst + 32, dst_stride, \
                                                     sec + 32, w, h, &sse2, \
                                                     NULL, NULL); \
      se += se2; \
      sse += sse2; \
      se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(src + 48, src_stride, \
                                                     x_offset, y_offset, \
                                                     dst + 48, dst_stride, \
                                                     sec + 48, w, h, &sse2, \
                                                     NULL, NULL); \
      se += se2; \
      sse += sse2; \
    } \
  } \
  *sseptr = sse; \
  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}

#define FNS(opt1, opt2) \
FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
FN(16, 16, 16, 4, 4, opt1, (uint32_t)); \
FN(16, 8, 16, 4, 3, opt1, (uint32_t)); \
FN(8, 16, 8, 3, 4, opt1, (uint32_t)); \
FN(8, 8, 8, 3, 3, opt1, (uint32_t)); \
FN(8, 4, 8, 3, 2, opt1, (uint32_t)); \
FN(4, 8, 4, 2, 3, opt2, (uint32_t)); \
FN(4, 4, 4, 2, 2, opt2, (uint32_t))

FNS(sse2, sse);
FNS(ssse3, ssse3);

#undef FNS
#undef FN
#endif  // CONFIG_USE_X86INC