1/* 2 * Copyright 2021 The libgav1 Authors 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17//------------------------------------------------------------------------------ 18// Compatibility functions. 19 20inline __m256i SetrM128i(const __m128i lo, const __m128i hi) { 21 // For compatibility with older gcc toolchains (< 8) use 22 // _mm256_inserti128_si256 over _mm256_setr_m128i. Newer gcc implementations 23 // are implemented similarly to the following, clang uses a different method 24 // but no differences in assembly have been observed. 25 return _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1); 26} 27 28//------------------------------------------------------------------------------ 29// Load functions. 30 31inline __m256i LoadAligned32(const void* a) { 32 assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0); 33 return _mm256_load_si256(static_cast<const __m256i*>(a)); 34} 35 36inline void LoadAligned64(const void* a, __m256i dst[2]) { 37 assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0); 38 dst[0] = _mm256_load_si256(static_cast<const __m256i*>(a) + 0); 39 dst[1] = _mm256_load_si256(static_cast<const __m256i*>(a) + 1); 40} 41 42inline __m256i LoadUnaligned32(const void* a) { 43 return _mm256_loadu_si256(static_cast<const __m256i*>(a)); 44} 45 46//------------------------------------------------------------------------------ 47// Load functions to avoid MemorySanitizer's use-of-uninitialized-value warning. 48 49inline __m256i MaskOverreads(const __m256i source, 50 const ptrdiff_t over_read_in_bytes) { 51 __m256i dst = source; 52#if LIBGAV1_MSAN 53 if (over_read_in_bytes >= 32) return _mm256_setzero_si256(); 54 if (over_read_in_bytes > 0) { 55 __m128i m = _mm_set1_epi8(-1); 56 for (ptrdiff_t i = 0; i < over_read_in_bytes % 16; ++i) { 57 m = _mm_srli_si128(m, 1); 58 } 59 const __m256i mask = (over_read_in_bytes < 16) 60 ? SetrM128i(_mm_set1_epi8(-1), m) 61 : SetrM128i(m, _mm_setzero_si128()); 62 dst = _mm256_and_si256(dst, mask); 63 } 64#else 65 static_cast<void>(over_read_in_bytes); 66#endif 67 return dst; 68} 69 70inline __m256i LoadAligned32Msan(const void* const source, 71 const ptrdiff_t over_read_in_bytes) { 72 return MaskOverreads(LoadAligned32(source), over_read_in_bytes); 73} 74 75inline void LoadAligned64Msan(const void* const source, 76 const ptrdiff_t over_read_in_bytes, 77 __m256i dst[2]) { 78 dst[0] = MaskOverreads(LoadAligned32(source), over_read_in_bytes); 79 dst[1] = MaskOverreads(LoadAligned32(static_cast<const __m256i*>(source) + 1), 80 over_read_in_bytes); 81} 82 83inline __m256i LoadUnaligned32Msan(const void* const source, 84 const ptrdiff_t over_read_in_bytes) { 85 return MaskOverreads(LoadUnaligned32(source), over_read_in_bytes); 86} 87 88//------------------------------------------------------------------------------ 89// Store functions. 90 91inline void StoreAligned32(void* a, const __m256i v) { 92 assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0); 93 _mm256_store_si256(static_cast<__m256i*>(a), v); 94} 95 96inline void StoreAligned64(void* a, const __m256i v[2]) { 97 assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0); 98 _mm256_store_si256(static_cast<__m256i*>(a) + 0, v[0]); 99 _mm256_store_si256(static_cast<__m256i*>(a) + 1, v[1]); 100} 101 102inline void StoreUnaligned32(void* a, const __m256i v) { 103 _mm256_storeu_si256(static_cast<__m256i*>(a), v); 104} 105 106//------------------------------------------------------------------------------ 107// Arithmetic utilities. 108 109inline __m256i RightShiftWithRounding_S16(const __m256i v_val_d, int bits) { 110 assert(bits <= 16); 111 const __m256i v_bias_d = 112 _mm256_set1_epi16(static_cast<int16_t>((1 << bits) >> 1)); 113 const __m256i v_tmp_d = _mm256_add_epi16(v_val_d, v_bias_d); 114 return _mm256_srai_epi16(v_tmp_d, bits); 115} 116 117inline __m256i RightShiftWithRounding_S32(const __m256i v_val_d, int bits) { 118 const __m256i v_bias_d = _mm256_set1_epi32((1 << bits) >> 1); 119 const __m256i v_tmp_d = _mm256_add_epi32(v_val_d, v_bias_d); 120 return _mm256_srai_epi32(v_tmp_d, bits); 121} 122