1/**************************************************************************** 2 * Copyright (C) 2017 Intel Corporation. All Rights Reserved. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 ****************************************************************************/ 23#if !defined(__SIMD_LIB_AVX2_HPP__) 24#error Do not include this file directly, use "simdlib.hpp" instead. 25#endif 26 27//============================================================================ 28// SIMD4 AVX (2) implementation 29// 30// Since this implementation inherits from the AVX (1) implementation, 31// the only operations below are ones that replace AVX (1) operations.
32// Only 2 shifts and 2 gathers were introduced with AVX 2 33// Also, add native support for FMA operations 34//============================================================================ 35#define SIMD_WRAPPER_3(op) \ 36 static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) { return _mm_##op(a, b, c); } 37 38SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c 39SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c 40 41static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer vA, Integer vB) // return a << b (uint32) 42{ 43 return _mm_sllv_epi32(vA, vB); 44} 45 46static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer vA, Integer vB) // return a >> b (uint32) 47{ 48 return _mm_srlv_epi32(vA, vB); 49} 50 51template <ScaleFactor ScaleT = ScaleFactor::SF_1> 52static SIMDINLINE Float SIMDCALL 53 i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT)) 54{ 55 return _mm_i32gather_ps(p, idx, static_cast<const int>(ScaleT)); 56} 57 58// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old 59template <ScaleFactor ScaleT = ScaleFactor::SF_1> 60static SIMDINLINE Float SIMDCALL 61 mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask) 62{ 63 return _mm_mask_i32gather_ps(old, p, idx, mask, static_cast<const int>(ScaleT)); 64} 65 66#undef SIMD_WRAPPER_3 67