1 /*
2 * Copyright (c) 2013 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 * Authors:
24 * Chris Wilson <chris@chris-wilson.co.uk>
25 *
26 */
27
28 #include "config.h"
29
30 #ifdef HAVE_CPUID_H
31 #include <cpuid.h>
32 #else
33 #define __get_cpuid_max(x, y) 0
34 #define __cpuid(level, a, b, c, d) a = b = c = d = 0
35 #define __cpuid_count(level, count, a, b, c, d) a = b = c = d = 0
36 #endif
37
38 #include "igt_x86.h"
39 #include "igt_aux.h"
40
41 #include <stdint.h>
42 #include <stdio.h>
43 #include <string.h>
44
45 /**
46 * SECTION:igt_x86
47 * @short_description: x86 helper library
48 * @title: x86
49 * @include: igt_x86.h
50 */
51
/* Leaf selectors for __get_cpuid_max(); only BASIC_CPUID is used below. */
#define BASIC_CPUID 0x0
#define EXTENDED_CPUID 0x80000000

/*
 * Feature bit masks. Each one is only defined when the name is not already
 * provided (for instance by <cpuid.h>), so an existing definition wins.
 */

/* Tested against EDX of CPUID leaf 1 */
#ifndef bit_MMX
#define bit_MMX (1 << 23)
#endif
#ifndef bit_SSE
#define bit_SSE (1 << 25)
#endif
#ifndef bit_SSE2
#define bit_SSE2 (1 << 26)
#endif

/* Tested against ECX of CPUID leaf 1 */
#ifndef bit_SSE3
#define bit_SSE3 (1 << 0)
#endif
#ifndef bit_SSSE3
#define bit_SSSE3 (1 << 9)
#endif
#ifndef bit_SSE4_1
#define bit_SSE4_1 (1 << 19)
#endif
#ifndef bit_SSE4_2
#define bit_SSE4_2 (1 << 20)
#endif
#ifndef bit_OSXSAVE
#define bit_OSXSAVE (1 << 27)
#endif
#ifndef bit_AVX
#define bit_AVX (1 << 28)
#endif
#ifndef bit_F16C
#define bit_F16C (1 << 29)
#endif

/* Tested against EBX of CPUID leaf 7, sub-leaf 0 */
#ifndef bit_AVX2
#define bit_AVX2 (1 << 5)
#endif

/*
 * Execute XGETBV with @index in ECX; the result is delivered in @eax (low
 * half, from EAX) and @edx (high half, from EDX).
 */
#define xgetbv(index, eax, edx) \
	__asm__ ("xgetbv" : "=a"(eax), "=d"(edx) : "c" (index))

/* Internal flag: XGETBV reported the state bits needed for YMM usage. */
#define has_YMM 0x1
103
104 #if defined(__x86_64__) || defined(__i386__)
igt_x86_features(void)105 unsigned igt_x86_features(void)
106 {
107 unsigned max = __get_cpuid_max(BASIC_CPUID, 0);
108 unsigned eax, ebx, ecx, edx;
109 unsigned features = 0;
110 unsigned extra = 0;
111
112 if (max >= 1) {
113 __cpuid(1, eax, ebx, ecx, edx);
114
115 if (ecx & bit_SSE3)
116 features |= SSE3;
117
118 if (ecx & bit_SSSE3)
119 features |= SSSE3;
120
121 if (ecx & bit_SSE4_1)
122 features |= SSE4_1;
123
124 if (ecx & bit_SSE4_2)
125 features |= SSE4_2;
126
127 if (ecx & bit_OSXSAVE) {
128 unsigned int bv_eax, bv_ecx;
129 xgetbv(0, bv_eax, bv_ecx);
130 if ((bv_eax & 6) == 6)
131 extra |= has_YMM;
132 }
133
134 if ((extra & has_YMM) && (ecx & bit_AVX))
135 features |= AVX;
136
137 if (edx & bit_MMX)
138 features |= MMX;
139
140 if (edx & bit_SSE)
141 features |= SSE;
142
143 if (edx & bit_SSE2)
144 features |= SSE2;
145
146 if (ecx & bit_F16C)
147 features |= F16C;
148 }
149
150 if (max >= 7) {
151 __cpuid_count(7, 0, eax, ebx, ecx, edx);
152
153 if ((extra & has_YMM) && (ebx & bit_AVX2))
154 features |= AVX2;
155 }
156
157 return features;
158 }
159
igt_x86_features_to_string(unsigned features,char * line)160 char *igt_x86_features_to_string(unsigned features, char *line)
161 {
162 char *ret = line;
163
164 #ifdef __x86_64__
165 line += sprintf(line, "x86-64");
166 #else
167 line += sprintf(line, "x86");
168 #endif
169
170 if (features & SSE2)
171 line += sprintf(line, ", sse2");
172 if (features & SSE3)
173 line += sprintf(line, ", sse3");
174 if (features & SSSE3)
175 line += sprintf(line, ", ssse3");
176 if (features & SSE4_1)
177 line += sprintf(line, ", sse4.1");
178 if (features & SSE4_2)
179 line += sprintf(line, ", sse4.2");
180 if (features & AVX)
181 line += sprintf(line, ", avx");
182 if (features & AVX2)
183 line += sprintf(line, ", avx2");
184 if (features & F16C)
185 line += sprintf(line, ", f16c");
186
187 (void)line;
188
189 return ret;
190 }
191 #endif
192
193 #if defined(__x86_64__) && !defined(__clang__)
194 #pragma GCC push_options
195 #pragma GCC target("sse4.1")
196 #pragma GCC diagnostic ignored "-Wpointer-arith"
197
198 #include <smmintrin.h>
/*
 * Copy @len bytes from @src to @dst, reading the source exclusively with
 * 16-byte aligned streaming loads (_mm_stream_load_si128).
 *
 * The copy proceeds in four stages:
 *  1. if @src is not 16-byte aligned, the aligned 16-byte chunk containing
 *     it is loaded into a bounce buffer and the wanted bytes copied out;
 *  2. 64-byte chunks, stored with aligned or unaligned stores depending on
 *     the alignment of the destination at that point;
 *  3. remaining 16-byte chunks, stored unaligned;
 *  4. a final partial chunk, again through the bounce buffer.
 *
 * Stages 1 and 4 load a full aligned 16-byte chunk from the source even
 * though fewer bytes are copied to the destination.
 */
static void memcpy_from_wc_sse41(void *dst, const void *src, unsigned long len)
{
	unsigned char *out = dst;
	const unsigned char *in = src;
	char bounce[16];
	unsigned long offset;

	/* Flush the internal buffer of potential stale gfx data */
	_mm_mfence();

	offset = (uintptr_t)in & 15;
	if (offset) {
		const unsigned long room = 16 - offset;
		const unsigned long head = len < room ? len : room;

		/* Load the aligned chunk that contains the first source byte */
		_mm_storeu_si128((__m128i *)bounce,
				 _mm_stream_load_si128((__m128i *)((uintptr_t)in - offset)));

		memcpy(out, bounce + offset, head);

		out += head;
		in += head;
		len -= head;
	}

	/* We assume we are doing bulk transfers, so prefer aligned moves */
	if ((uintptr_t)out & 15) {
		for (; len >= 64; in += 64, out += 64, len -= 64) {
			__m128i *from = (__m128i *)in;
			__m128i *to = (__m128i *)out;
			const __m128i r0 = _mm_stream_load_si128(from + 0);
			const __m128i r1 = _mm_stream_load_si128(from + 1);
			const __m128i r2 = _mm_stream_load_si128(from + 2);
			const __m128i r3 = _mm_stream_load_si128(from + 3);

			_mm_storeu_si128(to + 0, r0);
			_mm_storeu_si128(to + 1, r1);
			_mm_storeu_si128(to + 2, r2);
			_mm_storeu_si128(to + 3, r3);
		}
	} else {
		for (; len >= 64; in += 64, out += 64, len -= 64) {
			__m128i *from = (__m128i *)in;
			__m128i *to = (__m128i *)out;
			const __m128i r0 = _mm_stream_load_si128(from + 0);
			const __m128i r1 = _mm_stream_load_si128(from + 1);
			const __m128i r2 = _mm_stream_load_si128(from + 2);
			const __m128i r3 = _mm_stream_load_si128(from + 3);

			_mm_store_si128(to + 0, r0);
			_mm_store_si128(to + 1, r1);
			_mm_store_si128(to + 2, r2);
			_mm_store_si128(to + 3, r3);
		}
	}

	/* Whole 16-byte chunks that did not fill a 64-byte group */
	for (; len >= 16; in += 16, out += 16, len -= 16)
		_mm_storeu_si128((__m128i *)out,
				 _mm_stream_load_si128((__m128i *)in));

	/* Trailing bytes: load a full chunk, hand out only what was asked for */
	if (len) {
		_mm_storeu_si128((__m128i *)bounce,
				 _mm_stream_load_si128((__m128i *)in));
		memcpy(out, bounce, len);
	}
}
279
280 #pragma GCC pop_options
281
/*
 * Generic implementation returned by the resolver when SSE4.1 is not
 * reported: copies @len bytes from @src to @dst with plain memcpy().
 */
static void memcpy_from_wc(void *dst, const void *src, unsigned long len)
{
	(void)memcpy(dst, src, len);
}
286
resolve_memcpy_from_wc(void)287 static void (*resolve_memcpy_from_wc(void))(void *, const void *, unsigned long)
288 {
289 if (igt_x86_features() & SSE4_1)
290 return memcpy_from_wc_sse41;
291
292 return memcpy_from_wc;
293 }
294
295 void igt_memcpy_from_wc(void *dst, const void *src, unsigned long len)
296 __attribute__((ifunc("resolve_memcpy_from_wc")));
297
298 #else
/**
 * igt_memcpy_from_wc:
 * @dst: destination buffer
 * @src: source buffer
 * @len: number of bytes to copy
 *
 * Fallback used when the build is not x86-64 or the compiler is clang:
 * simply forwards to memcpy().
 */
void igt_memcpy_from_wc(void *dst, const void *src, unsigned long len)
{
	(void)memcpy(dst, src, len);
}
303 #endif
304