/*
 * Copyright 2019 The libgav1 Authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef LIBGAV1_SRC_DSP_X86_COMMON_SSE4_H_
#define LIBGAV1_SRC_DSP_X86_COMMON_SSE4_H_

#include "src/utils/compiler_attributes.h"
#include "src/utils/cpu.h"

#if LIBGAV1_ENABLE_SSE4_1

#include <emmintrin.h>
#include <smmintrin.h>

#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <cstring>

#if 0
#include <cinttypes>
#include <cstdio>

// Quite useful macro for debugging. Left here for convenience.
inline void PrintReg(const __m128i r, const char* const name, int size) {
  int n;
  union {
    __m128i r;
    uint8_t i8[16];
    uint16_t i16[8];
    uint32_t i32[4];
    uint64_t i64[2];
  } tmp;
  tmp.r = r;
  fprintf(stderr, "%s\t: ", name);
  if (size == 8) {
    for (n = 0; n < 16; ++n) fprintf(stderr, "%.2x ", tmp.i8[n]);
  } else if (size == 16) {
    for (n = 0; n < 8; ++n) fprintf(stderr, "%.4x ", tmp.i16[n]);
  } else if (size == 32) {
    for (n = 0; n < 4; ++n) fprintf(stderr, "%.8x ", tmp.i32[n]);
  } else {
    for (n = 0; n < 2; ++n)
      fprintf(stderr, "%.16" PRIx64 " ", static_cast<uint64_t>(tmp.i64[n]));
  }
  fprintf(stderr, "\n");
}

inline void PrintReg(const int r, const char* const name) {
  fprintf(stderr, "%s: %d\n", name, r);
}

inline void PrintRegX(const int r, const char* const name) {
  fprintf(stderr, "%s: %.8x\n", name, r);
}

#define PR(var, N) PrintReg(var, #var, N)
#define PD(var) PrintReg(var, #var);
#define PX(var) PrintRegX(var, #var);
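
// Example usage (hypothetical variable names): dump a vector as 16-bit lanes,
// then a couple of scalars:
//   PR(v_sum, 16);
//   PD(count);
//   PX(flags);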
#endif  // 0

namespace libgav1 {
namespace dsp {

//------------------------------------------------------------------------------
// Load functions.

inline __m128i Load2(const void* src) {
  int16_t val;
  memcpy(&val, src, sizeof(val));
  return _mm_cvtsi32_si128(val);
}

inline __m128i Load2x2(const void* src1, const void* src2) {
  uint16_t val1;
  uint16_t val2;
  memcpy(&val1, src1, sizeof(val1));
  memcpy(&val2, src2, sizeof(val2));
  return _mm_cvtsi32_si128(val1 | (val2 << 16));
}

// Load 2 uint8_t values into |lane| * 2 and |lane| * 2 + 1.
template <int lane>
inline __m128i Load2(const void* const buf, __m128i val) {
  uint16_t temp;
  memcpy(&temp, buf, 2);
  return _mm_insert_epi16(val, temp, lane);
}

inline __m128i Load4(const void* src) {
  // With new compilers such as clang 8.0.0 we can use the new _mm_loadu_si32
  // intrinsic. Both _mm_loadu_si32(src) and the code here are compiled into a
  // movss instruction.
  //
  // Until compiler support of _mm_loadu_si32 is widespread, use of
  // _mm_loadu_si32 is banned.
  int val;
  memcpy(&val, src, sizeof(val));
  return _mm_cvtsi32_si128(val);
}

inline __m128i Load4x2(const void* src1, const void* src2) {
  // With new compilers such as clang 8.0.0 we can use the new _mm_loadu_si32
  // intrinsic. Both _mm_loadu_si32(src) and the code here are compiled into a
  // movss instruction.
  //
  // Until compiler support of _mm_loadu_si32 is widespread, use of
  // _mm_loadu_si32 is banned.
  int val1, val2;
  memcpy(&val1, src1, sizeof(val1));
  memcpy(&val2, src2, sizeof(val2));
  return _mm_insert_epi32(_mm_cvtsi32_si128(val1), val2, 1);
}

inline __m128i LoadLo8(const void* a) {
  return _mm_loadl_epi64(static_cast<const __m128i*>(a));
}

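// Load 8 bytes from |a| into the high half of |v|; the low half of |v| is
// passed through unchanged.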
inline __m128i LoadHi8(const __m128i v, const void* a) {
  const __m128 x =
      _mm_loadh_pi(_mm_castsi128_ps(v), static_cast<const __m64*>(a));
  return _mm_castps_si128(x);
}

inline __m128i LoadUnaligned16(const void* a) {
  return _mm_loadu_si128(static_cast<const __m128i*>(a));
}

inline __m128i LoadAligned16(const void* a) {
  assert((reinterpret_cast<uintptr_t>(a) & 0xf) == 0);
  return _mm_load_si128(static_cast<const __m128i*>(a));
}

//------------------------------------------------------------------------------
// Load functions to avoid MemorySanitizer's use-of-uninitialized-value warning.
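//
// |over_read_in_bytes| is the number of trailing bytes of the load that may
// extend past the valid data; those bytes are masked to zero so that
// MemorySanitizer does not flag later use of the register.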

inline __m128i MaskOverreads(const __m128i source,
                             const int over_read_in_bytes) {
  __m128i dst = source;
#if LIBGAV1_MSAN
  if (over_read_in_bytes > 0) {
    __m128i mask = _mm_set1_epi8(-1);
    for (int i = 0; i < over_read_in_bytes; ++i) {
      mask = _mm_srli_si128(mask, 1);
    }
    dst = _mm_and_si128(dst, mask);
  }
#else
  static_cast<void>(over_read_in_bytes);
#endif
  return dst;
}

inline __m128i LoadLo8Msan(const void* const source,
                           const int over_read_in_bytes) {
  return MaskOverreads(LoadLo8(source), over_read_in_bytes + 8);
}

inline __m128i LoadAligned16Msan(const void* const source,
                                 const int over_read_in_bytes) {
  return MaskOverreads(LoadAligned16(source), over_read_in_bytes);
}

inline __m128i LoadUnaligned16Msan(const void* const source,
                                   const int over_read_in_bytes) {
  return MaskOverreads(LoadUnaligned16(source), over_read_in_bytes);
}

//------------------------------------------------------------------------------
// Store functions.

inline void Store2(void* dst, const __m128i x) {
  const int val = _mm_cvtsi128_si32(x);
  memcpy(dst, &val, 2);
}

inline void Store4(void* dst, const __m128i x) {
  const int val = _mm_cvtsi128_si32(x);
  memcpy(dst, &val, sizeof(val));
}

inline void StoreLo8(void* a, const __m128i v) {
  _mm_storel_epi64(static_cast<__m128i*>(a), v);
}

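// Store the high 8 bytes of |v| to |a|.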
inline void StoreHi8(void* a, const __m128i v) {
  _mm_storeh_pi(static_cast<__m64*>(a), _mm_castsi128_ps(v));
}

inline void StoreAligned16(void* a, const __m128i v) {
  _mm_store_si128(static_cast<__m128i*>(a), v);
}

inline void StoreUnaligned16(void* a, const __m128i v) {
  _mm_storeu_si128(static_cast<__m128i*>(a), v);
}

//------------------------------------------------------------------------------
// Arithmetic utilities.
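//
// The RightShiftWithRounding_* helpers compute, per lane,
//   (v + ((1 << bits) >> 1)) >> bits
// i.e. a right shift by |bits| with the rounding bias added first. The _U*
// variants use a logical shift; the _S* variants use an arithmetic shift.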

inline __m128i RightShiftWithRounding_U16(const __m128i v_val_d, int bits) {
  assert(bits <= 16);
  const __m128i v_bias_d =
      _mm_set1_epi16(static_cast<int16_t>((1 << bits) >> 1));
  const __m128i v_tmp_d = _mm_add_epi16(v_val_d, v_bias_d);
  return _mm_srli_epi16(v_tmp_d, bits);
}

inline __m128i RightShiftWithRounding_S16(const __m128i v_val_d, int bits) {
  assert(bits <= 16);
  const __m128i v_bias_d =
      _mm_set1_epi16(static_cast<int16_t>((1 << bits) >> 1));
  const __m128i v_tmp_d = _mm_add_epi16(v_val_d, v_bias_d);
  return _mm_srai_epi16(v_tmp_d, bits);
}

inline __m128i RightShiftWithRounding_U32(const __m128i v_val_d, int bits) {
  const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
  const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
  return _mm_srli_epi32(v_tmp_d, bits);
}

inline __m128i RightShiftWithRounding_S32(const __m128i v_val_d, int bits) {
  const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
  const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
  return _mm_srai_epi32(v_tmp_d, bits);
}

//------------------------------------------------------------------------------
// Masking utilities
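//
// Returns a 16-byte mask whose first 16 - n bytes are 0x00 and whose last n
// bytes are 0xff, for 0 <= n <= 16.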
inline __m128i MaskHighNBytes(int n) {
  static constexpr uint8_t kMask[32] = {
      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0,   0,   255, 255, 255, 255, 255, 255,
      255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  };

  return LoadUnaligned16(kMask + n);
}
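
// For example (hypothetical usage), the mask can keep the first 16 - |n| bytes
// of |a| and take the last |n| bytes from |b|:
//   const __m128i blended = _mm_blendv_epi8(a, b, MaskHighNBytes(n));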

}  // namespace dsp
}  // namespace libgav1

#endif  // LIBGAV1_ENABLE_SSE4_1
#endif  // LIBGAV1_SRC_DSP_X86_COMMON_SSE4_H_