/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
11 
12 #ifndef AOM_AV1_COMMON_X86_AV1_TXFM_SSE4_H_
13 #define AOM_AV1_COMMON_X86_AV1_TXFM_SSE4_H_
14 
15 #include <smmintrin.h>
16 
17 #ifdef __cplusplus
18 extern "C" {
19 #endif
20 
av1_round_shift_32_sse4_1(__m128i vec,int bit)21 static INLINE __m128i av1_round_shift_32_sse4_1(__m128i vec, int bit) {
22   __m128i tmp, round;
23   round = _mm_set1_epi32(1 << (bit - 1));
24   tmp = _mm_add_epi32(vec, round);
25   return _mm_srai_epi32(tmp, bit);
26 }
27 
av1_round_shift_array_32_sse4_1(__m128i * input,__m128i * output,const int size,const int bit)28 static INLINE void av1_round_shift_array_32_sse4_1(__m128i *input,
29                                                    __m128i *output,
30                                                    const int size,
31                                                    const int bit) {
32   if (bit > 0) {
33     int i;
34     for (i = 0; i < size; i++) {
35       output[i] = av1_round_shift_32_sse4_1(input[i], bit);
36     }
37   } else {
38     int i;
39     for (i = 0; i < size; i++) {
40       output[i] = _mm_slli_epi32(input[i], -bit);
41     }
42   }
43 }
44 
av1_round_shift_rect_array_32_sse4_1(__m128i * input,__m128i * output,const int size,const int bit,const int val)45 static INLINE void av1_round_shift_rect_array_32_sse4_1(__m128i *input,
46                                                         __m128i *output,
47                                                         const int size,
48                                                         const int bit,
49                                                         const int val) {
50   const __m128i sqrt2 = _mm_set1_epi32(val);
51   if (bit > 0) {
52     int i;
53     for (i = 0; i < size; i++) {
54       const __m128i r0 = av1_round_shift_32_sse4_1(input[i], bit);
55       const __m128i r1 = _mm_mullo_epi32(sqrt2, r0);
56       output[i] = av1_round_shift_32_sse4_1(r1, NewSqrt2Bits);
57     }
58   } else {
59     int i;
60     for (i = 0; i < size; i++) {
61       const __m128i r0 = _mm_slli_epi32(input[i], -bit);
62       const __m128i r1 = _mm_mullo_epi32(sqrt2, r0);
63       output[i] = av1_round_shift_32_sse4_1(r1, NewSqrt2Bits);
64     }
65   }
66 }
67 
68 #ifdef __cplusplus
69 }
70 #endif
71 
72 #endif  // AOM_AV1_COMMON_X86_AV1_TXFM_SSE4_H_
73