/*
 * Copyright 2019 The libgav1 Authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16 
17 #ifndef LIBGAV1_SRC_DSP_X86_COMMON_SSE4_H_
18 #define LIBGAV1_SRC_DSP_X86_COMMON_SSE4_H_
19 
20 #include "src/utils/compiler_attributes.h"
21 #include "src/utils/cpu.h"
22 
23 #if LIBGAV1_ENABLE_SSE4_1
24 
25 #include <emmintrin.h>
26 #include <smmintrin.h>
27 
28 #include <cassert>
29 #include <cstdint>
30 #include <cstdlib>
31 #include <cstring>
32 
33 #if 0
34 #include <cinttypes>
35 #include <cstdio>
36 
// Debug helper, normally compiled out (the enclosing #if 0). Prints the
// lanes of |r| to stderr as |size|-bit hex values. Left here for convenience.
inline void PrintReg(const __m128i r, const char* const name, int size) {
  // View the register bytes as lanes of each supported width.
  union {
    __m128i r;
    uint8_t i8[16];
    uint16_t i16[8];
    uint32_t i32[4];
    uint64_t i64[2];
  } tmp;
  tmp.r = r;
  fprintf(stderr, "%s\t: ", name);
  switch (size) {
    case 8:
      for (int n = 0; n < 16; ++n) fprintf(stderr, "%.2x ", tmp.i8[n]);
      break;
    case 16:
      for (int n = 0; n < 8; ++n) fprintf(stderr, "%.4x ", tmp.i16[n]);
      break;
    case 32:
      for (int n = 0; n < 4; ++n) fprintf(stderr, "%.8x ", tmp.i32[n]);
      break;
    default:  // 64-bit lanes.
      for (int n = 0; n < 2; ++n)
        fprintf(stderr, "%.16" PRIx64 " ", static_cast<uint64_t>(tmp.i64[n]));
      break;
  }
  fprintf(stderr, "\n");
}
61 
// Debug helper: prints |r| to stderr in decimal.
inline void PrintReg(const int r, const char* const name) {
  fprintf(stderr, "%s: %d\n", name, r);
}
65 
// Debug helper: prints |r| to stderr as a 32-bit hex value.
inline void PrintRegX(const int r, const char* const name) {
  fprintf(stderr, "%s: %.8x\n", name, r);
}
69 
70 #define PR(var, N) PrintReg(var, #var, N)
71 #define PD(var) PrintReg(var, #var);
72 #define PX(var) PrintRegX(var, #var);
73 #endif  // 0
74 
75 namespace libgav1 {
76 namespace dsp {
77 
//------------------------------------------------------------------------------
// Load functions.

// Loads 2 bytes from |src| into the low 16 bits of an xmm register.
// NOTE: the int16_t value is sign-extended by _mm_cvtsi32_si128's int
// argument, so bits [16:31] mirror bit 15; callers use only the low 16 bits.
inline __m128i Load2(const void* src) {
  int16_t word;
  memcpy(&word, src, sizeof(word));
  return _mm_cvtsi32_si128(word);
}
86 
// Loads 2 bytes from each of |src1| and |src2| and packs them into the low
// 32 bits of an xmm register: |src1| in bits [0:15], |src2| in bits [16:31].
inline __m128i Load2x2(const void* src1, const void* src2) {
  uint16_t val1;
  uint16_t val2;
  memcpy(&val1, src1, sizeof(val1));
  memcpy(&val2, src2, sizeof(val2));
  // Shift in unsigned arithmetic: |val2| is promoted to (signed) int, so
  // val2 << 16 would overflow int (undefined behavior) when val2 >= 0x8000.
  const uint32_t packed = val1 | (static_cast<uint32_t>(val2) << 16);
  return _mm_cvtsi32_si128(static_cast<int>(packed));
}
94 
// Loads 2 uint8_t values into 16-bit lane |lane| of |val| (i.e. bytes
// |lane| * 2 and |lane| * 2 + 1). The other lanes are unchanged.
template <int lane>
inline __m128i Load2(const void* const buf, __m128i val) {
  uint16_t word;
  memcpy(&word, buf, sizeof(word));
  return _mm_insert_epi16(val, word, lane);
}
102 
// Loads 4 bytes from |src| into the low 32 bits of an xmm register.
//
// Newer compilers (e.g. clang 8.0.0) provide _mm_loadu_si32, and both that
// intrinsic and the code below compile to a movss instruction. Until support
// for _mm_loadu_si32 is widespread, its use is banned here.
inline __m128i Load4(const void* src) {
  int bits;
  memcpy(&bits, src, sizeof(bits));
  return _mm_cvtsi32_si128(bits);
}
114 
Load4x2(const void * src1,const void * src2)115 inline __m128i Load4x2(const void* src1, const void* src2) {
116   // With new compilers such as clang 8.0.0 we can use the new _mm_loadu_si32
117   // intrinsic. Both _mm_loadu_si32(src) and the code here are compiled into a
118   // movss instruction.
119   //
120   // Until compiler support of _mm_loadu_si32 is widespread, use of
121   // _mm_loadu_si32 is banned.
122   int val1, val2;
123   memcpy(&val1, src1, sizeof(val1));
124   memcpy(&val2, src2, sizeof(val2));
125   return _mm_insert_epi32(_mm_cvtsi32_si128(val1), val2, 1);
126 }
127 
// Loads 8 bytes from |a| into the low half of an xmm register; the high
// half is zeroed.
inline __m128i LoadLo8(const void* a) {
  const auto* const src = static_cast<const __m128i*>(a);
  return _mm_loadl_epi64(src);
}
131 
// Loads 8 bytes from |a| into the high half of |v|; the low half of |v| is
// kept.
inline __m128i LoadHi8(const __m128i v, const void* a) {
  const __m128 lo = _mm_castsi128_ps(v);
  const __m128 merged = _mm_loadh_pi(lo, static_cast<const __m64*>(a));
  return _mm_castps_si128(merged);
}
137 
// Loads 16 bytes from |a|, which need not be aligned.
inline __m128i LoadUnaligned16(const void* a) {
  const auto* const src = static_cast<const __m128i*>(a);
  return _mm_loadu_si128(src);
}
141 
// Loads 16 bytes from |a|, which must be 16-byte aligned.
inline __m128i LoadAligned16(const void* a) {
  assert((reinterpret_cast<uintptr_t>(a) & 0xf) == 0);
  const auto* const src = static_cast<const __m128i*>(a);
  return _mm_load_si128(src);
}
146 
//------------------------------------------------------------------------------
// Load functions to avoid MemorySanitizer's use-of-uninitialized-value warning.

// Returns |source| with its top |over_read_in_bytes| bytes zeroed under
// MemorySanitizer builds; otherwise a pass-through.
inline __m128i MaskOverreads(const __m128i source,
                             const int over_read_in_bytes) {
#if LIBGAV1_MSAN
  if (over_read_in_bytes > 0) {
    // Shift an all-ones mask right one byte at a time, clearing the top
    // |over_read_in_bytes| bytes.
    __m128i keep = _mm_set1_epi8(-1);
    for (int shifted = 0; shifted < over_read_in_bytes; ++shifted) {
      keep = _mm_srli_si128(keep, 1);
    }
    return _mm_and_si128(source, keep);
  }
  return source;
#else
  static_cast<void>(over_read_in_bytes);
  return source;
#endif
}
166 
LoadLo8Msan(const void * const source,const int over_read_in_bytes)167 inline __m128i LoadLo8Msan(const void* const source,
168                            const int over_read_in_bytes) {
169   return MaskOverreads(LoadLo8(source), over_read_in_bytes + 8);
170 }
171 
LoadAligned16Msan(const void * const source,const int over_read_in_bytes)172 inline __m128i LoadAligned16Msan(const void* const source,
173                                  const int over_read_in_bytes) {
174   return MaskOverreads(LoadAligned16(source), over_read_in_bytes);
175 }
176 
LoadUnaligned16Msan(const void * const source,const int over_read_in_bytes)177 inline __m128i LoadUnaligned16Msan(const void* const source,
178                                    const int over_read_in_bytes) {
179   return MaskOverreads(LoadUnaligned16(source), over_read_in_bytes);
180 }
181 
//------------------------------------------------------------------------------
// Store functions.

// Stores the low 2 bytes of |x| to |dst|.
inline void Store2(void* dst, const __m128i x) {
  const int lo = _mm_cvtsi128_si32(x);
  memcpy(dst, &lo, 2);
}
189 
// Stores the low 4 bytes of |x| to |dst|.
inline void Store4(void* dst, const __m128i x) {
  const int lo = _mm_cvtsi128_si32(x);
  memcpy(dst, &lo, sizeof(lo));
}
194 
// Stores the low 8 bytes of |v| to |a|.
inline void StoreLo8(void* a, const __m128i v) {
  auto* const dst = static_cast<__m128i*>(a);
  _mm_storel_epi64(dst, v);
}
198 
// Stores the high 8 bytes of |v| to |a|.
inline void StoreHi8(void* a, const __m128i v) {
  const __m128 as_ps = _mm_castsi128_ps(v);
  _mm_storeh_pi(static_cast<__m64*>(a), as_ps);
}
202 
// Stores all 16 bytes of |v| to |a|, which must be 16-byte aligned.
inline void StoreAligned16(void* a, const __m128i v) {
  auto* const dst = static_cast<__m128i*>(a);
  _mm_store_si128(dst, v);
}
206 
// Stores all 16 bytes of |v| to |a|, which need not be aligned.
inline void StoreUnaligned16(void* a, const __m128i v) {
  auto* const dst = static_cast<__m128i*>(a);
  _mm_storeu_si128(dst, v);
}
210 
//------------------------------------------------------------------------------
// Arithmetic utilities.

// Shifts each unsigned 16-bit lane of |v_val_d| right by |bits|, rounding to
// nearest (ties round up).
inline __m128i RightShiftWithRounding_U16(const __m128i v_val_d, int bits) {
  assert(bits <= 16);
  // Rounding bias: half of the shifted-away range (0 when bits == 0).
  const auto bias = static_cast<int16_t>((1 << bits) >> 1);
  const __m128i rounded = _mm_add_epi16(v_val_d, _mm_set1_epi16(bias));
  return _mm_srli_epi16(rounded, bits);
}
221 
// Shifts each signed 16-bit lane of |v_val_d| right (arithmetic) by |bits|,
// rounding to nearest (ties round toward +infinity).
inline __m128i RightShiftWithRounding_S16(const __m128i v_val_d, int bits) {
  assert(bits <= 16);
  // Rounding bias: half of the shifted-away range (0 when bits == 0).
  const auto bias = static_cast<int16_t>((1 << bits) >> 1);
  const __m128i rounded = _mm_add_epi16(v_val_d, _mm_set1_epi16(bias));
  return _mm_srai_epi16(rounded, bits);
}
229 
// Shifts each unsigned 32-bit lane of |v_val_d| right by |bits|, rounding to
// nearest (ties round up).
inline __m128i RightShiftWithRounding_U32(const __m128i v_val_d, int bits) {
  assert(bits < 32);
  // Compute the bias in unsigned arithmetic: 1 << bits on a signed int is
  // undefined behavior when bits == 31. The result (at most 1 << 30 after
  // the >> 1) always fits in an int.
  const uint32_t bias = (uint32_t{1} << bits) >> 1;
  const __m128i v_bias_d = _mm_set1_epi32(static_cast<int>(bias));
  const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
  return _mm_srli_epi32(v_tmp_d, bits);
}
235 
// Shifts each signed 32-bit lane of |v_val_d| right (arithmetic) by |bits|,
// rounding to nearest (ties round toward +infinity).
inline __m128i RightShiftWithRounding_S32(const __m128i v_val_d, int bits) {
  assert(bits < 32);
  // Compute the bias in unsigned arithmetic: 1 << bits on a signed int is
  // undefined behavior when bits == 31. The result (at most 1 << 30 after
  // the >> 1) always fits in an int.
  const uint32_t bias = (uint32_t{1} << bits) >> 1;
  const __m128i v_bias_d = _mm_set1_epi32(static_cast<int>(bias));
  const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
  return _mm_srai_epi32(v_tmp_d, bits);
}
241 
//------------------------------------------------------------------------------
// Masking utilities
// Returns a mask whose high |n| bytes are 0xff and low (16 - n) bytes are 0.
// |n| must be in [0, 16].
inline __m128i MaskHighNBytes(int n) {
  static constexpr uint8_t kMask[32] = {
      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0,   0,   255, 255, 255, 255, 255, 255,
      255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  };

  // An unaligned 16-byte load starting |n| bytes in picks up (16 - n) zeros
  // followed by n 0xff bytes.
  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(kMask + n));
}
253 
254 }  // namespace dsp
255 }  // namespace libgav1
256 
257 #endif  // LIBGAV1_ENABLE_SSE4_1
258 #endif  // LIBGAV1_SRC_DSP_X86_COMMON_SSE4_H_
259