/*
 * Copyright 2021 The libgav1 Authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

//------------------------------------------------------------------------------
// Load functions.

inline __m128i Load2(const void* src) {
  int16_t val;
  memcpy(&val, src, sizeof(val));
  return _mm_cvtsi32_si128(val);
}

inline __m128i Load2x2(const void* src1, const void* src2) {
  uint16_t val1;
  uint16_t val2;
  memcpy(&val1, src1, sizeof(val1));
  memcpy(&val2, src2, sizeof(val2));
  return _mm_cvtsi32_si128(val1 | (val2 << 16));
}

// Load 2 uint8_t values into |lane| * 2 and |lane| * 2 + 1.
template <int lane>
inline __m128i Load2(const void* const buf, __m128i val) {
  int16_t temp;
  memcpy(&temp, buf, 2);
  return _mm_insert_epi16(val, temp, lane);
}

inline __m128i Load4(const void* src) {
  // With new compilers such as clang 8.0.0 we can use the new _mm_loadu_si32
  // intrinsic. Both _mm_loadu_si32(src) and the code here are compiled into a
  // movss instruction.
  //
  // Until compiler support of _mm_loadu_si32 is widespread, use of
  // _mm_loadu_si32 is banned.
  int val;
  memcpy(&val, src, sizeof(val));
  return _mm_cvtsi32_si128(val);
}

inline __m128i Load4x2(const void* src1, const void* src2) {
  // With new compilers such as clang 8.0.0 we can use the new _mm_loadu_si32
  // intrinsic. Both _mm_loadu_si32(src) and the code here are compiled into a
  // movss instruction.
  //
  // Until compiler support of _mm_loadu_si32 is widespread, use of
  // _mm_loadu_si32 is banned.
  int val1, val2;
  memcpy(&val1, src1, sizeof(val1));
  memcpy(&val2, src2, sizeof(val2));
  return _mm_insert_epi32(_mm_cvtsi32_si128(val1), val2, 1);
}

inline __m128i LoadLo8(const void* a) {
  return _mm_loadl_epi64(static_cast<const __m128i*>(a));
}

inline __m128i LoadHi8(const __m128i v, const void* a) {
  const __m128 x =
      _mm_loadh_pi(_mm_castsi128_ps(v), static_cast<const __m64*>(a));
  return _mm_castps_si128(x);
}

inline __m128i LoadUnaligned16(const void* a) {
  return _mm_loadu_si128(static_cast<const __m128i*>(a));
}

inline __m128i LoadAligned16(const void* a) {
  assert((reinterpret_cast<uintptr_t>(a) & 0xf) == 0);
  return _mm_load_si128(static_cast<const __m128i*>(a));
}

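// Usage sketch (illustrative only; the function below is hypothetical and not
// part of the library): the Load2<lane> template can be chained to gather
// 2 bytes from each of several rows into consecutive 16-bit lanes of a single
// vector. The name LoadRows2x4_Example and the |stride| layout are assumptions
// made for this sketch.
inline __m128i LoadRows2x4_Example(const uint8_t* src, const ptrdiff_t stride) {
  __m128i v = Load2(src);             // bytes 0-1 (16-bit lane 0).
  v = Load2<1>(src + stride, v);      // bytes 2-3 (16-bit lane 1).
  v = Load2<2>(src + 2 * stride, v);  // bytes 4-5 (16-bit lane 2).
  v = Load2<3>(src + 3 * stride, v);  // bytes 6-7 (16-bit lane 3).
  return v;
}
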
//------------------------------------------------------------------------------
// Load functions to avoid MemorySanitizer's use-of-uninitialized-value warning.

inline __m128i MaskOverreads(const __m128i source,
                             const ptrdiff_t over_read_in_bytes) {
  __m128i dst = source;
#if LIBGAV1_MSAN
  if (over_read_in_bytes > 0) {
    __m128i mask = _mm_set1_epi8(-1);
    for (ptrdiff_t i = 0; i < over_read_in_bytes; ++i) {
      mask = _mm_srli_si128(mask, 1);
    }
    dst = _mm_and_si128(dst, mask);
  }
#else
  static_cast<void>(over_read_in_bytes);
#endif
  return dst;
}

inline __m128i LoadLo8Msan(const void* const source,
                           const ptrdiff_t over_read_in_bytes) {
  return MaskOverreads(LoadLo8(source), over_read_in_bytes + 8);
}

inline __m128i LoadHi8Msan(const __m128i v, const void* source,
                           const ptrdiff_t over_read_in_bytes) {
  return MaskOverreads(LoadHi8(v, source), over_read_in_bytes);
}

inline __m128i LoadAligned16Msan(const void* const source,
                                 const ptrdiff_t over_read_in_bytes) {
  return MaskOverreads(LoadAligned16(source), over_read_in_bytes);
}

inline __m128i LoadUnaligned16Msan(const void* const source,
                                   const ptrdiff_t over_read_in_bytes) {
  return MaskOverreads(LoadUnaligned16(source), over_read_in_bytes);
}

//------------------------------------------------------------------------------
// Store functions.

inline void Store2(void* dst, const __m128i x) {
  const int val = _mm_cvtsi128_si32(x);
  memcpy(dst, &val, 2);
}

inline void Store4(void* dst, const __m128i x) {
  const int val = _mm_cvtsi128_si32(x);
  memcpy(dst, &val, sizeof(val));
}

inline void StoreLo8(void* a, const __m128i v) {
  _mm_storel_epi64(static_cast<__m128i*>(a), v);
}

inline void StoreHi8(void* a, const __m128i v) {
  _mm_storeh_pi(static_cast<__m64*>(a), _mm_castsi128_ps(v));
}

inline void StoreAligned16(void* a, const __m128i v) {
  assert((reinterpret_cast<uintptr_t>(a) & 0xf) == 0);
  _mm_store_si128(static_cast<__m128i*>(a), v);
}

inline void StoreUnaligned16(void* a, const __m128i v) {
  _mm_storeu_si128(static_cast<__m128i*>(a), v);
}

//------------------------------------------------------------------------------
// Arithmetic utilities.

inline __m128i RightShiftWithRounding_U16(const __m128i v_val_d, int bits) {
  assert(bits <= 16);
  // Shift out all but the last bit.
  const __m128i v_tmp_d = _mm_srli_epi16(v_val_d, bits - 1);
  // Avg with zero will shift by 1 and round.
  return _mm_avg_epu16(v_tmp_d, _mm_setzero_si128());
}

inline __m128i RightShiftWithRounding_S16(const __m128i v_val_d, int bits) {
  assert(bits < 16);
  const __m128i v_bias_d =
      _mm_set1_epi16(static_cast<int16_t>((1 << bits) >> 1));
  const __m128i v_tmp_d = _mm_add_epi16(v_val_d, v_bias_d);
  return _mm_srai_epi16(v_tmp_d, bits);
}

inline __m128i RightShiftWithRounding_U32(const __m128i v_val_d, int bits) {
  const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
  const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
  return _mm_srli_epi32(v_tmp_d, bits);
}

inline __m128i RightShiftWithRounding_S32(const __m128i v_val_d, int bits) {
  const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
  const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
  return _mm_srai_epi32(v_tmp_d, bits);
}

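// Scalar reference for the rounding shifts above (illustrative only; this
// function is hypothetical and not part of the library): each
// RightShiftWithRounding_* variant computes, per lane,
// (v + ((1 << bits) >> 1)) >> bits, assuming the biased sum fits in the lane
// width. The _U16 variant reaches the same result via _mm_avg_epu16 so that
// the intermediate sum cannot overflow 16 bits.
inline int32_t RightShiftWithRoundingScalar_Example(const int32_t v,
                                                    const int bits) {
  return (v + ((1 << bits) >> 1)) >> bits;
}
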
// Use this when |bits| is not an immediate value.
inline __m128i VariableRightShiftWithRounding_S32(const __m128i v_val_d,
                                                  int bits) {
  const __m128i v_bias_d =
      _mm_set1_epi32(static_cast<int32_t>((1 << bits) >> 1));
  const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
  return _mm_sra_epi32(v_tmp_d, _mm_cvtsi32_si128(bits));
}

//------------------------------------------------------------------------------
// Masking utilities
inline __m128i MaskHighNBytes(int n) {
  static constexpr uint8_t kMask[32] = {
      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0,   0,   255, 255, 255, 255, 255, 255,
      255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  };

  return LoadUnaligned16(kMask + n);
}

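// Usage sketch (illustrative only; the function below is hypothetical and not
// part of the library): MaskHighNBytes(n) yields a vector whose high |n| bytes
// are 0xff and whose low 16 - |n| bytes are zero. Combined with
// _mm_andnot_si128 it can clear the high |n| bytes of a vector, the same
// masking that MaskOverreads applies in MSAN builds, for 0 <= |n| <= 16.
inline __m128i ClearHighNBytes_Example(const __m128i v, const int n) {
  // (~mask) & v keeps the low 16 - |n| bytes and zeroes the high |n| bytes.
  return _mm_andnot_si128(MaskHighNBytes(n), v);
}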