• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  *          Copyright Andrey Semashev 2007 - 2015.
3  * Distributed under the Boost Software License, Version 1.0.
4  *    (See accompanying file LICENSE_1_0.txt or copy at
5  *          http://www.boost.org/LICENSE_1_0.txt)
6  */
7 /*!
8  * \file   dump_avx2.cpp
9  * \author Andrey Semashev
10  * \date   05.05.2013
11  *
12  * \brief  This header is the Boost.Log library implementation, see the library documentation
13  *         at http://www.boost.org/doc/libs/release/libs/log/doc/html/index.html.
14  */
15 
16 // NOTE: You should generally avoid including headers as much as possible here, because this file
17 //       is compiled with special compiler options, and any included header may result in generation of
18 //       unintended code with these options and violation of ODR.
19 #include <boost/log/detail/config.hpp>
20 #include <ostream>
21 #include <immintrin.h>
22 #include <boost/cstdint.hpp>
23 #include <boost/log/detail/header.hpp>
24 
// Detect a 64-bit x86 target by checking the architecture macros predefined by the supported compilers.
// BOOST_LOG_AUX_X86_64 is tested further below to choose how the SIMD constants are supplied to dump_pack.
#if defined(__x86_64) || defined(__x86_64__) || \
    defined(__amd64__) || defined(__amd64) || \
    defined(_M_X64)
#define BOOST_LOG_AUX_X86_64
#endif
30 
31 namespace boost {
32 
33 BOOST_LOG_OPEN_NAMESPACE
34 
35 namespace aux {
36 
//! Table of hexadecimal digit characters, defined in another translation unit.
//! In this file the first index is 0 when std::ios_base::uppercase is not set and 1 when it is; the second index is a half-byte value.
extern const char g_hex_char_table[2][16];

//! Generic (non-AVX2) dumping routine, defined in another translation unit. Used in this file for inputs shorter than 32 bytes.
template< typename CharT >
extern void dump_data_generic(const void* data, std::size_t size, std::basic_ostream< CharT >& strm);
41 
42 BOOST_LOG_ANONYMOUS_NAMESPACE {
43 
//! Sizes that control how much input is converted before the output buffer is written to the stream
enum
{
    //! Number of 32-byte input packs converted per stride
    packs_per_stride = 32,
    //! Number of input bytes in one stride; every input byte produces 3 output characters
    stride = packs_per_stride * 32
};
49 
//! A 256-bit constant that can be brace-initialized byte by byte (through the first member) and then read as a __m256i value
union ymm_constant
{
    //! Byte-wise view, used by the aggregate initializers of the constants in this file
    uint8_t as_bytes[32];
    //! SIMD view of the same 32 bytes
    __m256i as_mm;

    //! Implicit conversion to the SIMD view
    BOOST_FORCEINLINE operator __m256i () const { return as_mm; }
};
57 
// Byte shuffle masks used by dump_pack to spread the stringized half-bytes and leave room for separators.
// An index with the highest bit set (0x80) makes the shuffle produce a zero byte; dump_pack later turns these
// zero bytes into spaces by taking the unsigned maximum with mm_char_space. Other indices select a byte
// within the same 128-bit lane of the shuffled register.
//
// Patterns 1, 2 and 3 repeat the same 16 indices in both lanes. Pattern 13 has the indices of pattern 1 in
// the lower lane and the indices of pattern 3 in the upper lane; it is used by the 128-bit dump_pack overload.
static const ymm_constant mm_shuffle_pattern1 = {{ 0x80, 0, 1, 0x80, 2, 3, 0x80, 4, 5, 0x80, 6, 7, 0x80, 8, 9, 0x80,       0x80, 0, 1, 0x80, 2, 3, 0x80, 4, 5, 0x80, 6, 7, 0x80, 8, 9, 0x80 }};
static const ymm_constant mm_shuffle_pattern2 = {{ 0, 1, 0x80, 2, 3, 0x80, 4, 5, 0x80, 6, 7, 0x80, 8, 9, 0x80, 10,         0, 1, 0x80, 2, 3, 0x80, 4, 5, 0x80, 6, 7, 0x80, 8, 9, 0x80, 10 }};
static const ymm_constant mm_shuffle_pattern3 = {{ 5, 0x80, 6, 7, 0x80, 8, 9, 0x80, 10, 11, 0x80, 12, 13, 0x80, 14, 15,    5, 0x80, 6, 7, 0x80, 8, 9, 0x80, 10, 11, 0x80, 12, 13, 0x80, 14, 15 }};
static const ymm_constant mm_shuffle_pattern13 = {{ 0x80, 0, 1, 0x80, 2, 3, 0x80, 4, 5, 0x80, 6, 7, 0x80, 8, 9, 0x80,      5, 0x80, 6, 7, 0x80, 8, 9, 0x80, 10, 11, 0x80, 12, 13, 0x80, 14, 15 }};
62 
// The four byte-broadcast constants used by dump_pack (0x0F, 0x09, '0' = 0x30 and ' ' = 0x20) are supplied in one of two ways:
//  - on x86-64 they are local variables declared by BOOST_LOG_AUX_MM_CONSTANTS and passed to dump_pack as leading arguments;
//  - otherwise they are file-scope constants with the same names, and the three macros expand to nothing.
// Note that BOOST_LOG_AUX_MM_CONSTANT_ARGS_DECL and BOOST_LOG_AUX_MM_CONSTANT_ARGS end with a comma when non-empty,
// so they must be placed directly in front of the remaining parameters/arguments.
#if defined(BOOST_LOG_AUX_X86_64)

// x86-64 architecture has more registers which we can utilize to pass constants
#define BOOST_LOG_AUX_MM_CONSTANT_ARGS_DECL __m256i mm_15, __m256i mm_9, __m256i mm_char_0, __m256i mm_char_space,
#define BOOST_LOG_AUX_MM_CONSTANT_ARGS mm_15, mm_9, mm_char_0, mm_char_space,
#define BOOST_LOG_AUX_MM_CONSTANTS \
    const __m256i mm_15 = _mm256_set1_epi32(0x0F0F0F0F);\
    const __m256i mm_9 = _mm256_set1_epi32(0x09090909);\
    const __m256i mm_char_0 = _mm256_set1_epi32(0x30303030);\
    const __m256i mm_char_space = _mm256_set1_epi32(0x20202020);

#else

// MSVC in 32-bit mode is not able to pass all constants to dump_pack, and is also not able to align them on the stack, so we have to fetch them from global constants
static const ymm_constant mm_15 = {{ 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F,   0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F }};
static const ymm_constant mm_9 = {{ 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09,   0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09 }};
static const ymm_constant mm_char_0 = {{ 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30,   0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30 }};
static const ymm_constant mm_char_space = {{ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,   0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20 }};
// The constants are visible at file scope in this configuration, so nothing needs to be declared or passed
#define BOOST_LOG_AUX_MM_CONSTANT_ARGS_DECL
#define BOOST_LOG_AUX_MM_CONSTANT_ARGS
#define BOOST_LOG_AUX_MM_CONSTANTS

#endif
86 
/*!
 * \brief Dumps a pack of input data into a string of 8 bit ASCII characters.
 *
 * Every input byte is converted to three characters: a space followed by two hexadecimal digits.
 * 32 input bytes therefore produce 96 characters, returned in three 256-bit registers.
 *
 * The composed string is placed as follows (in Intel notation): mm_output1[127:0], mm_output2[127:0], mm_output3[127:0], mm_output1[255:128], mm_output2[255:128], mm_output3[255:128].
 *
 * \param mm_char_10_to_a Value added to the digits greater than 9 (on top of '0') to turn them into letters; the caller selects it according to the letter case
 * \param mm_input 32 bytes of input data
 * \param mm_output1 Receives the first 16 characters of each 128-bit lane
 * \param mm_output2 Receives the second 16 characters of each 128-bit lane
 * \param mm_output3 Receives the third 16 characters of each 128-bit lane
 *
 * On x86-64 the constants mm_15, mm_9, mm_char_0 and mm_char_space are passed as leading parameters, otherwise the file-scope constants are used.
 */
static BOOST_FORCEINLINE void dump_pack
(
    BOOST_LOG_AUX_MM_CONSTANT_ARGS_DECL
    __m256i mm_char_10_to_a, __m256i mm_input,
    __m256i& mm_output1, __m256i& mm_output2, __m256i& mm_output3
)
{
    // Split half-bytes. The shift operates on 16-bit elements, so bits of the neighbouring byte leak in; the mask removes them.
    __m256i mm_input_hi = _mm256_and_si256(_mm256_srli_epi16(mm_input, 4), mm_15);
    __m256i mm_input_lo = _mm256_and_si256(mm_input, mm_15);

    // Stringize each of the halves
    // The comparison yields all-ones bytes where the half-byte is greater than 9, which selects the letter addend for those bytes only
    __m256i mm_addend_hi = _mm256_cmpgt_epi8(mm_input_hi, mm_9);
    __m256i mm_addend_lo = _mm256_cmpgt_epi8(mm_input_lo, mm_9);
    mm_addend_hi = _mm256_and_si256(mm_char_10_to_a, mm_addend_hi);
    mm_addend_lo = _mm256_and_si256(mm_char_10_to_a, mm_addend_lo);

    // Convert every half-byte to '0' + value...
    mm_input_hi = _mm256_add_epi8(mm_input_hi, mm_char_0);
    mm_input_lo = _mm256_add_epi8(mm_input_lo, mm_char_0);

    // ...and move the values above 9 into the letter range
    mm_input_hi = _mm256_add_epi8(mm_input_hi, mm_addend_hi);
    mm_input_lo = _mm256_add_epi8(mm_input_lo, mm_addend_lo);

    // Join them back together
    // Within each 128-bit lane, mm_1 holds the digit pairs of input bytes 0-7 and mm_2 holds the digit pairs of input bytes 8-15
    __m256i mm_1 = _mm256_unpacklo_epi8(mm_input_hi, mm_input_lo);
    __m256i mm_2 = _mm256_unpackhi_epi8(mm_input_hi, mm_input_lo);

    // Insert spaces between stringized bytes:
    // |0123456789abcdef|0123456789abcdef|
    // | 01 23 45 67 89 |ab cd ef 01 23 4|5 67 89 ab cd ef|
    // The middle part needs digits from both mm_1 and mm_2, so it is shuffled from a register that starts at the 10th digit of mm_1
    __m256i mm_out1 = _mm256_shuffle_epi8(mm_1, mm_shuffle_pattern1.as_mm);
    __m256i mm_out3 = _mm256_shuffle_epi8(mm_2, mm_shuffle_pattern3.as_mm);
    __m256i mm_out2 = _mm256_shuffle_epi8(_mm256_alignr_epi8(mm_2, mm_1, 10), mm_shuffle_pattern2.as_mm);

    // The shuffles left zero bytes at the separator positions; the unsigned maximum with ' ' replaces them with spaces and keeps the digits intact
    mm_output1 = _mm256_max_epu8(mm_out1, mm_char_space);
    mm_output2 = _mm256_max_epu8(mm_out2, mm_char_space);
    mm_output3 = _mm256_max_epu8(mm_out3, mm_char_space);
}
130 
/*!
 * \brief Dumps a pack of input data into a string of 8 bit ASCII characters
 *
 * This overload converts 16 input bytes into 48 characters (a space and two hexadecimal digits per byte),
 * returned in three 128-bit registers in the order mm_output1, mm_output2, mm_output3.
 *
 * \param mm_char_10_to_a Value added to the digits greater than 9 (on top of '0') to turn them into letters; the caller selects it according to the letter case
 * \param mm_input 16 bytes of input data
 * \param mm_output1 Receives characters 0-15 of the result
 * \param mm_output2 Receives characters 16-31 of the result
 * \param mm_output3 Receives characters 32-47 of the result
 *
 * On x86-64 the constants mm_15, mm_9, mm_char_0 and mm_char_space are passed as leading parameters, otherwise the file-scope constants are used.
 */
static BOOST_FORCEINLINE void dump_pack
(
    BOOST_LOG_AUX_MM_CONSTANT_ARGS_DECL
    __m256i mm_char_10_to_a, __m128i mm_input,
    __m128i& mm_output1, __m128i& mm_output2, __m128i& mm_output3
)
{
    // Split half-bytes
    // The shifted and the original bytes are interleaved into a single 256-bit register first (input bytes 0-7 in the lower lane,
    // bytes 8-15 in the upper lane), so that one mask operation isolates both the high and the low half-bytes
    __m128i mm_input_hi = _mm_srli_epi16(mm_input, 4);
    __m256i mm = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_unpacklo_epi8(mm_input_hi, mm_input)), _mm_unpackhi_epi8(mm_input_hi, mm_input), 1);
    mm = _mm256_and_si256(mm, mm_15);

    // Stringize the halves
    // The comparison yields all-ones bytes where the half-byte is greater than 9, which selects the letter addend for those bytes only
    __m256i mm_addend = _mm256_cmpgt_epi8(mm, mm_9);
    mm_addend = _mm256_and_si256(mm_char_10_to_a, mm_addend);

    mm = _mm256_add_epi8(mm, mm_char_0);
    mm = _mm256_add_epi8(mm, mm_addend);

    // Insert spaces between stringized bytes:
    // The first and the third parts are produced by a single 256-bit shuffle (pattern 1 in the lower lane, pattern 3 in the upper lane).
    // The middle part needs digits from both lanes, so it is shuffled from a 128-bit register that starts at the 10th digit of the lower lane.
    __m256i mm_out13 = _mm256_shuffle_epi8(mm, mm_shuffle_pattern13.as_mm);
    __m128i mm_out2 = _mm_shuffle_epi8(_mm_alignr_epi8(_mm256_extracti128_si256(mm, 1), _mm256_castsi256_si128(mm), 10), _mm256_castsi256_si128(mm_shuffle_pattern2.as_mm));

    // The shuffles left zero bytes at the separator positions; the unsigned maximum with ' ' replaces them with spaces and keeps the digits intact
    mm_out13 = _mm256_max_epu8(mm_out13, mm_char_space);
    mm_output2 = _mm_max_epu8(mm_out2, _mm256_castsi256_si128(mm_char_space));
    mm_output1 = _mm256_castsi256_si128(mm_out13);
    mm_output3 = _mm256_extracti128_si256(mm_out13, 1);
}
160 
161 template< typename CharT >
162 BOOST_FORCEINLINE void store_characters(__m128i mm_chars, CharT* buf)
163 {
164     switch (sizeof(CharT))
165     {
166     case 1:
167         _mm_store_si128(reinterpret_cast< __m128i* >(buf), mm_chars);
168         break;
169 
170     case 2:
171         _mm256_store_si256(reinterpret_cast< __m256i* >(buf), _mm256_cvtepu8_epi16(mm_chars));
172         break;
173 
174     case 4:
175         {
176             __m128i mm = _mm_unpackhi_epi64(mm_chars, mm_chars);
177             _mm256_store_si256(reinterpret_cast< __m256i* >(buf), _mm256_cvtepu8_epi32(mm_chars));
178             _mm256_store_si256(reinterpret_cast< __m256i* >(buf) + 1, _mm256_cvtepu8_epi32(mm));
179         }
180         break;
181     }
182 }
183 
184 template< typename CharT >
185 BOOST_FORCEINLINE void store_characters_x3(__m256i mm_chars1, __m256i mm_chars2, __m256i mm_chars3, CharT* buf)
186 {
187     store_characters(_mm256_castsi256_si128(mm_chars1), buf);
188     store_characters(_mm256_castsi256_si128(mm_chars2), buf + 16);
189     store_characters(_mm256_castsi256_si128(mm_chars3), buf + 32);
190     store_characters(_mm256_extracti128_si256(mm_chars1, 1), buf + 48);
191     store_characters(_mm256_extracti128_si256(mm_chars2, 1), buf + 64);
192     store_characters(_mm256_extracti128_si256(mm_chars3, 1), buf + 80);
193 }
194 
/*!
 * \brief Writes a hexadecimal dump of a memory region to a stream, using AVX2 instructions.
 *
 * Every input byte is written as two hexadecimal digits; bytes are separated by a single space and
 * no space is written before the first byte. The letter case follows the \c std::ios_base::uppercase
 * flag of the stream.
 *
 * The input is processed in up to three phases: an unaligned 32-byte pack that brings the input pointer
 * to a 32-byte boundary, a sequence of full strides read with aligned loads, and a tail shorter than one stride.
 *
 * \param data Pointer to the data to dump
 * \param size Size of the data, in bytes. The first phase may read 32 bytes from \a data regardless of the alignment
 *             distance, so \a size has to be at least 32; the callers in this file check this.
 * \param strm Output stream
 */
template< typename CharT >
BOOST_FORCEINLINE void dump_data_avx2(const void* data, std::size_t size, std::basic_ostream< CharT >& strm)
{
    typedef CharT char_type;

    // The buffer holds the characters of one stride (3 per input byte) plus 32 spare elements that are consumed by the alignment below
    char_type buf_storage[stride * 3u + 32u];
    // Align the temporary buffer at 32 bytes
    // Note that the start is advanced by 1 to 32 bytes, i.e. it is moved even if the storage is already aligned
    char_type* const buf = reinterpret_cast< char_type* >((uint8_t*)buf_storage + (32u - (((uintptr_t)(char_type*)buf_storage) & 31u)));
    char_type* buf_begin = buf + 1u; // skip the first space of the first chunk
    char_type* buf_end = buf + stride * 3u;

    __m256i mm_char_10_to_a;
    if (strm.flags() & std::ios_base::uppercase)
        mm_char_10_to_a = _mm256_set1_epi32(0x07070707); // '9' is 0x39 and 'A' is 0x41 in ASCII, so we have to add 0x07 to 0x3A to get uppercase letters
    else
        mm_char_10_to_a = _mm256_set1_epi32(0x27272727); // ...and 'a' is 0x61, which means we have to add 0x27 to 0x3A to get lowercase letters

    // First, check the input alignment. Also, if we can dump the whole data in one go, do it right away. It turns out to be faster than splitting
    // the work between prealign and tail part. It is also a fairly common case since on most platforms memory is not aligned to 32 bytes (i.e. prealign is often needed).
    const uint8_t* p = static_cast< const uint8_t* >(data);
    // prealign_size is the number of bytes up to the next 32-byte boundary (0 if already aligned), or the whole input if it is exactly 32 bytes long
    const std::size_t prealign_size = size == 32u ? static_cast< std::size_t >(32u) : static_cast< std::size_t >((32u - ((uintptr_t)p & 31u)) & 31u);
    if (prealign_size)
    {
        // A full 32-byte pack is loaded and converted, but only the characters of the first prealign_size bytes are written out.
        // The remaining bytes are converted again by the following phases.
        __m256i mm_input = _mm256_lddqu_si256(reinterpret_cast< const __m256i* >(p));
        BOOST_LOG_AUX_MM_CONSTANTS

        __m256i mm_output1, mm_output2, mm_output3;
        dump_pack(BOOST_LOG_AUX_MM_CONSTANT_ARGS mm_char_10_to_a, mm_input, mm_output1, mm_output2, mm_output3);

        store_characters_x3(mm_output1, mm_output2, mm_output3, buf);

        _mm256_zeroall(); // need to zero all ymm registers to avoid register spills/restores the compiler generates around the function call
        // 3 characters per byte, minus the leading space which is skipped by buf_begin
        strm.write(buf_begin, prealign_size * 3u - 1u);

        // The first byte has been written, so the following chunks are written with their leading space
        buf_begin = buf;
        size -= prealign_size;
        p += prealign_size;
    }

    const std::size_t stride_count = size / stride;
    std::size_t tail_size = size % stride;
    for (std::size_t i = 0; i < stride_count; ++i)
    {
        char_type* b = buf;
        // The constants are declared anew in every iteration because the ymm registers are zeroed at the end of the previous one
        BOOST_LOG_AUX_MM_CONSTANTS

        for (unsigned int j = 0; j < packs_per_stride; ++j, b += 3u * 32u, p += 32u)
        {
            // p is aligned to 32 bytes here, so the aligned load can be used
            __m256i mm_input = _mm256_load_si256(reinterpret_cast< const __m256i* >(p));
            __m256i mm_output1, mm_output2, mm_output3;
            dump_pack(BOOST_LOG_AUX_MM_CONSTANT_ARGS mm_char_10_to_a, mm_input, mm_output1, mm_output2, mm_output3);

            store_characters_x3(mm_output1, mm_output2, mm_output3, b);
        }

        _mm256_zeroall(); // need to zero all ymm registers to avoid register spills/restores the compiler generates around the function call
        strm.write(buf_begin, buf_end - buf_begin);
        buf_begin = buf;
    }

    if (BOOST_UNLIKELY(tail_size > 0))
    {
        char_type* b = buf;
        // Convert the tail in 16-byte packs while possible...
        while (tail_size >= 16u)
        {
            __m128i mm_input = _mm_load_si128(reinterpret_cast< const __m128i* >(p));
            BOOST_LOG_AUX_MM_CONSTANTS

            __m128i mm_output1, mm_output2, mm_output3;
            dump_pack(BOOST_LOG_AUX_MM_CONSTANT_ARGS mm_char_10_to_a, mm_input, mm_output1, mm_output2, mm_output3);

            store_characters(mm_output1, b);
            store_characters(mm_output2, b + 16u);
            store_characters(mm_output3, b + 32u);

            b += 3u * 16u;
            p += 16u;
            tail_size -= 16u;
        }

        _mm256_zeroall(); // need to zero all ymm registers to avoid register spills/restores the compiler generates around the function call
        // ...and convert the remaining bytes (less than 16) one by one using the lookup table
        const char* const char_table = g_hex_char_table[(strm.flags() & std::ios_base::uppercase) != 0];
        for (unsigned int i = 0; i < tail_size; ++i, ++p, b += 3u)
        {
            uint32_t n = *p;
            b[0] = static_cast< char_type >(' ');
            b[1] = static_cast< char_type >(char_table[n >> 4]);
            b[2] = static_cast< char_type >(char_table[n & 0x0F]);
        }

        // The whole tail is written with a single call
        strm.write(buf_begin, b - buf_begin);
    }
}
288 
289 } // namespace
290 
291 void dump_data_char_avx2(const void* data, std::size_t size, std::basic_ostream< char >& strm)
292 {
293     if (size >= 32)
294     {
295         dump_data_avx2(data, size, strm);
296     }
297     else
298     {
299         dump_data_generic(data, size, strm);
300     }
301 }
302 
dump_data_wchar_avx2(const void * data,std::size_t size,std::basic_ostream<wchar_t> & strm)303 void dump_data_wchar_avx2(const void* data, std::size_t size, std::basic_ostream< wchar_t >& strm)
304 {
305     if (size >= 32)
306     {
307         dump_data_avx2(data, size, strm);
308     }
309     else
310     {
311         dump_data_generic(data, size, strm);
312     }
313 }
314 
315 #if !defined(BOOST_NO_CXX11_CHAR16_T)
dump_data_char16_avx2(const void * data,std::size_t size,std::basic_ostream<char16_t> & strm)316 void dump_data_char16_avx2(const void* data, std::size_t size, std::basic_ostream< char16_t >& strm)
317 {
318     if (size >= 32)
319     {
320         dump_data_avx2(data, size, strm);
321     }
322     else
323     {
324         dump_data_generic(data, size, strm);
325     }
326 }
327 #endif
328 
329 #if !defined(BOOST_NO_CXX11_CHAR32_T)
dump_data_char32_avx2(const void * data,std::size_t size,std::basic_ostream<char32_t> & strm)330 void dump_data_char32_avx2(const void* data, std::size_t size, std::basic_ostream< char32_t >& strm)
331 {
332     if (size >= 32)
333     {
334         dump_data_avx2(data, size, strm);
335     }
336     else
337     {
338         dump_data_generic(data, size, strm);
339     }
340 }
341 #endif
342 
343 } // namespace aux
344 
345 BOOST_LOG_CLOSE_NAMESPACE // namespace log
346 
347 } // namespace boost
348 
349 #include <boost/log/detail/footer.hpp>
350