/*
 * Copyright 2021 The libgav1 Authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

//------------------------------------------------------------------------------
// Load functions.

inline __m128i Load2(const void* src) {
  int16_t val;
  memcpy(&val, src, sizeof(val));
  return _mm_cvtsi32_si128(val);
}

inline __m128i Load2x2(const void* src1, const void* src2) {
  uint16_t val1;
  uint16_t val2;
  memcpy(&val1, src1, sizeof(val1));
  memcpy(&val2, src2, sizeof(val2));
  return _mm_cvtsi32_si128(val1 | (val2 << 16));
}

// Load 2 uint8_t values into |lane| * 2 and |lane| * 2 + 1.
template <int lane>
inline __m128i Load2(const void* const buf, __m128i val) {
  int16_t temp;
  memcpy(&temp, buf, 2);
  return _mm_insert_epi16(val, temp, lane);
}

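// Usage sketch (not part of the original header): gather 2-byte pairs from
// four consecutive rows into the low 8 bytes of one register, the pattern the
// lane-templated Load2 is built for. |src| and |stride| are assumed names for
// a caller-provided buffer and its row pitch.
inline __m128i LoadRows2x4_Sketch(const uint8_t* src, const ptrdiff_t stride) {
  __m128i v = Load2(src);           // Row 0 in bytes 0-1.
  v = Load2<1>(src + stride, v);    // Row 1 in bytes 2-3.
  v = Load2<2>(src + 2 * stride, v);  // Row 2 in bytes 4-5.
  v = Load2<3>(src + 3 * stride, v);  // Row 3 in bytes 6-7.
  return v;
}
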
inline __m128i Load4(const void* src) {
  // With new compilers such as clang 8.0.0 we can use the new _mm_loadu_si32
  // intrinsic. Both _mm_loadu_si32(src) and the code here are compiled into a
  // movss instruction.
  //
  // Until compiler support of _mm_loadu_si32 is widespread, use of
  // _mm_loadu_si32 is banned.
  int val;
  memcpy(&val, src, sizeof(val));
  return _mm_cvtsi32_si128(val);
}

inline __m128i Load4x2(const void* src1, const void* src2) {
  // With new compilers such as clang 8.0.0 we can use the new _mm_loadu_si32
  // intrinsic. Both _mm_loadu_si32(src) and the code here are compiled into a
  // movss instruction.
  //
  // Until compiler support of _mm_loadu_si32 is widespread, use of
  // _mm_loadu_si32 is banned.
  int val1, val2;
  memcpy(&val1, src1, sizeof(val1));
  memcpy(&val2, src2, sizeof(val2));
  return _mm_insert_epi32(_mm_cvtsi32_si128(val1), val2, 1);
}

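// Usage sketch (not part of the original header): fetch 4 pixels from each of
// two rows with Load4x2, then widen all 8 bytes to 16-bit lanes so they can
// feed 16-bit arithmetic. |src| and |stride| are assumed names;
// _mm_cvtepu8_epi16 is SSE4.1, which this file already targets.
inline __m128i LoadAndWiden4x2_Sketch(const uint8_t* src,
                                      const ptrdiff_t stride) {
  // Row 0 in bytes 0-3, row 1 in bytes 4-7.
  const __m128i rows = Load4x2(src, src + stride);
  // Zero-extends bytes 0-7 (both rows) to eight uint16_t lanes.
  return _mm_cvtepu8_epi16(rows);
}
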
inline __m128i LoadLo8(const void* a) {
  return _mm_loadl_epi64(static_cast<const __m128i*>(a));
}

inline __m128i LoadHi8(const __m128i v, const void* a) {
  const __m128 x =
      _mm_loadh_pi(_mm_castsi128_ps(v), static_cast<const __m64*>(a));
  return _mm_castps_si128(x);
}

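// Usage sketch (not part of the original header): build one 16-byte vector
// from two 8-byte rows, the low half from |row0| and the high half from
// |row1|. |row0| and |row1| are assumed names.
inline __m128i LoadRows8x2_Sketch(const uint8_t* row0, const uint8_t* row1) {
  const __m128i lo = LoadLo8(row0);
  return LoadHi8(lo, row1);
}
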
inline __m128i LoadUnaligned16(const void* a) {
  return _mm_loadu_si128(static_cast<const __m128i*>(a));
}

inline __m128i LoadAligned16(const void* a) {
  assert((reinterpret_cast<uintptr_t>(a) & 0xf) == 0);
  return _mm_load_si128(static_cast<const __m128i*>(a));
}

//------------------------------------------------------------------------------
// Load functions to avoid MemorySanitizer's use-of-uninitialized-value warning.

inline __m128i MaskOverreads(const __m128i source,
                             const ptrdiff_t over_read_in_bytes) {
  __m128i dst = source;
#if LIBGAV1_MSAN
  if (over_read_in_bytes > 0) {
    __m128i mask = _mm_set1_epi8(-1);
    for (ptrdiff_t i = 0; i < over_read_in_bytes; ++i) {
      mask = _mm_srli_si128(mask, 1);
    }
    dst = _mm_and_si128(dst, mask);
  }
#else
  static_cast<void>(over_read_in_bytes);
#endif
  return dst;
}

inline __m128i LoadLo8Msan(const void* const source,
                           const ptrdiff_t over_read_in_bytes) {
  return MaskOverreads(LoadLo8(source), over_read_in_bytes + 8);
}

inline __m128i LoadHi8Msan(const __m128i v, const void* source,
                           const ptrdiff_t over_read_in_bytes) {
  return MaskOverreads(LoadHi8(v, source), over_read_in_bytes);
}

inline __m128i LoadAligned16Msan(const void* const source,
                                 const ptrdiff_t over_read_in_bytes) {
  return MaskOverreads(LoadAligned16(source), over_read_in_bytes);
}

inline __m128i LoadUnaligned16Msan(const void* const source,
                                   const ptrdiff_t over_read_in_bytes) {
  return MaskOverreads(LoadUnaligned16(source), over_read_in_bytes);
}

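// Usage sketch (not part of the original header): when a loop reads 16 bytes
// at a time but a row holds only |width| valid bytes, the final iteration may
// read past the initialized data. Passing the number of over-read bytes lets
// MaskOverreads zero those high lanes under MSAN while leaving the valid
// lanes untouched. |src|, |width|, and |x| are assumed names.
inline __m128i LoadRowTail_Sketch(const uint8_t* src, const int width,
                                  const int x) {
  // Bytes at or beyond |width| are over-read; the count is <= 0 (a no-op)
  // whenever the full 16 bytes are valid.
  return LoadUnaligned16Msan(src + x, x + 16 - width);
}
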
//------------------------------------------------------------------------------
// Store functions.

inline void Store2(void* dst, const __m128i x) {
  const int val = _mm_cvtsi128_si32(x);
  memcpy(dst, &val, 2);
}

inline void Store4(void* dst, const __m128i x) {
  const int val = _mm_cvtsi128_si32(x);
  memcpy(dst, &val, sizeof(val));
}

inline void StoreLo8(void* a, const __m128i v) {
  _mm_storel_epi64(static_cast<__m128i*>(a), v);
}

inline void StoreHi8(void* a, const __m128i v) {
  _mm_storeh_pi(static_cast<__m64*>(a), _mm_castsi128_ps(v));
}

inline void StoreAligned16(void* a, const __m128i v) {
  assert((reinterpret_cast<uintptr_t>(a) & 0xf) == 0);
  _mm_store_si128(static_cast<__m128i*>(a), v);
}

inline void StoreUnaligned16(void* a, const __m128i v) {
  _mm_storeu_si128(static_cast<__m128i*>(a), v);
}

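// Usage sketch (not part of the original header): scatter one register across
// two narrow rows, 4 pixels each, by shifting the next row's bytes down
// before the second store. |dst| and |stride| are assumed names.
inline void StoreRows4x2_Sketch(uint8_t* dst, const ptrdiff_t stride,
                                const __m128i v) {
  Store4(dst, v);                              // Bytes 0-3 to row 0.
  Store4(dst + stride, _mm_srli_si128(v, 4));  // Bytes 4-7 to row 1.
}
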

//------------------------------------------------------------------------------
// Arithmetic utilities.

inline __m128i RightShiftWithRounding_U16(const __m128i v_val_d, int bits) {
  assert(bits <= 16);
  // Shift out all but the last bit.
  const __m128i v_tmp_d = _mm_srli_epi16(v_val_d, bits - 1);
  // Avg with zero will shift by 1 and round.
  return _mm_avg_epu16(v_tmp_d, _mm_setzero_si128());
}

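// Worked example (not part of the original header) of the avg trick above:
// a rounding right shift of 37 by 3 is (37 + 4) >> 3 = 5. The two-step form
// computes 37 >> 2 = 9, then _mm_avg_epu16(9, 0) = (9 + 0 + 1) >> 1 = 5,
// which matches.
inline __m128i RoundingShiftExample_Sketch() {
  const __m128i v = _mm_set1_epi16(37);
  return RightShiftWithRounding_U16(v, 3);  // Every lane becomes 5.
}
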
inline __m128i RightShiftWithRounding_S16(const __m128i v_val_d, int bits) {
  assert(bits < 16);
  const __m128i v_bias_d =
      _mm_set1_epi16(static_cast<int16_t>((1 << bits) >> 1));
  const __m128i v_tmp_d = _mm_add_epi16(v_val_d, v_bias_d);
  return _mm_srai_epi16(v_tmp_d, bits);
}

inline __m128i RightShiftWithRounding_U32(const __m128i v_val_d, int bits) {
  const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
  const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
  return _mm_srli_epi32(v_tmp_d, bits);
}

inline __m128i RightShiftWithRounding_S32(const __m128i v_val_d, int bits) {
  const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
  const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
  return _mm_srai_epi32(v_tmp_d, bits);
}

// Use this when |bits| is not an immediate value.
inline __m128i VariableRightShiftWithRounding_S32(const __m128i v_val_d,
                                                  int bits) {
  const __m128i v_bias_d =
      _mm_set1_epi32(static_cast<int32_t>((1 << bits) >> 1));
  const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
  return _mm_sra_epi32(v_tmp_d, _mm_cvtsi32_si128(bits));
}

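// Usage sketch (not part of the original header): per the comment above,
// RightShiftWithRounding_S32 is intended for a |bits| value that is an
// immediate; when the shift count is only known at run time (for example,
// computed from the bitdepth), call the variable form, which passes the count
// through a register via _mm_sra_epi32. |runtime_bits| is an assumed name.
inline __m128i RoundingShiftRuntime_Sketch(const __m128i v,
                                           const int runtime_bits) {
  return VariableRightShiftWithRounding_S32(v, runtime_bits);
}
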
//------------------------------------------------------------------------------
// Masking utilities
inline __m128i MaskHighNBytes(int n) {
  static constexpr uint8_t kMask[32] = {
      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0,   0,   255, 255, 255, 255, 255, 255,
      255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  };

  return LoadUnaligned16(kMask + n);
}

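// Usage sketch (not part of the original header): MaskHighNBytes(n) returns a
// vector whose high |n| bytes are 0xff and whose low 16 - |n| bytes are zero,
// so it can select the tail of one vector over another. |a|, |b|, and |n| are
// assumed names; _mm_blendv_epi8 is SSE4.1, which this file targets.
inline __m128i BlendHighNBytes_Sketch(const __m128i a, const __m128i b,
                                      const int n) {
  const __m128i mask = MaskHighNBytes(n);
  // Keeps the low 16 - |n| bytes of |a| and the high |n| bytes of |b|.
  return _mm_blendv_epi8(a, b, mask);
}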