1 /*
2 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11 #ifndef AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_SSE2_H_
12 #define AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_SSE2_H_
13
14 #include <immintrin.h>
15
16 #include "config/aom_config.h"
17 #include "config/av1_rtcd.h"
18
19 #include "aom/aom_integer.h"
20 #include "aom_dsp/x86/transpose_sse2.h"
21 #include "aom_dsp/x86/txfm_common_sse2.h"
22
23 #ifdef __cplusplus
24 extern "C" {
25 #endif
26
// 1-D kernels declared here and defined outside this header. Both take an
// array of input vectors, an array of output vectors, and a cos_bit argument.
// fdct8x32_new_sse2 is installed in col_txfm8x32_arr further down.
// NOTE(review): the names suggest forward DCTs over 32 and 64 vectors of eight
// 16-bit lanes respectively -- confirm against the definitions.
void fdct8x32_new_sse2(const __m128i *input,
                       __m128i *output,
                       int8_t cos_bit);
void fdct8x64_new_sse2(const __m128i *input,
                       __m128i *output,
                       int8_t cos_bit);
29
fidentity4x4_new_sse2(const __m128i * const input,__m128i * const output,const int8_t cos_bit)30 static INLINE void fidentity4x4_new_sse2(const __m128i *const input,
31 __m128i *const output,
32 const int8_t cos_bit) {
33 (void)cos_bit;
34 const __m128i one = _mm_set1_epi16(1);
35
36 for (int i = 0; i < 4; ++i) {
37 const __m128i a = _mm_unpacklo_epi16(input[i], one);
38 const __m128i b = scale_round_sse2(a, NewSqrt2);
39 output[i] = _mm_packs_epi32(b, b);
40 }
41 }
42
fidentity8x4_new_sse2(const __m128i * const input,__m128i * const output,const int8_t cos_bit)43 static INLINE void fidentity8x4_new_sse2(const __m128i *const input,
44 __m128i *const output,
45 const int8_t cos_bit) {
46 (void)cos_bit;
47 const __m128i one = _mm_set1_epi16(1);
48
49 for (int i = 0; i < 4; ++i) {
50 const __m128i a_lo = _mm_unpacklo_epi16(input[i], one);
51 const __m128i a_hi = _mm_unpackhi_epi16(input[i], one);
52 const __m128i b_lo = scale_round_sse2(a_lo, NewSqrt2);
53 const __m128i b_hi = scale_round_sse2(a_hi, NewSqrt2);
54 output[i] = _mm_packs_epi32(b_lo, b_hi);
55 }
56 }
57
fidentity8x8_new_sse2(const __m128i * input,__m128i * output,int8_t cos_bit)58 static INLINE void fidentity8x8_new_sse2(const __m128i *input, __m128i *output,
59 int8_t cos_bit) {
60 (void)cos_bit;
61
62 output[0] = _mm_adds_epi16(input[0], input[0]);
63 output[1] = _mm_adds_epi16(input[1], input[1]);
64 output[2] = _mm_adds_epi16(input[2], input[2]);
65 output[3] = _mm_adds_epi16(input[3], input[3]);
66 output[4] = _mm_adds_epi16(input[4], input[4]);
67 output[5] = _mm_adds_epi16(input[5], input[5]);
68 output[6] = _mm_adds_epi16(input[6], input[6]);
69 output[7] = _mm_adds_epi16(input[7], input[7]);
70 }
71
fidentity8x16_new_sse2(const __m128i * input,__m128i * output,int8_t cos_bit)72 static INLINE void fidentity8x16_new_sse2(const __m128i *input, __m128i *output,
73 int8_t cos_bit) {
74 (void)cos_bit;
75 const __m128i one = _mm_set1_epi16(1);
76
77 for (int i = 0; i < 16; ++i) {
78 const __m128i a_lo = _mm_unpacklo_epi16(input[i], one);
79 const __m128i a_hi = _mm_unpackhi_epi16(input[i], one);
80 const __m128i b_lo = scale_round_sse2(a_lo, 2 * NewSqrt2);
81 const __m128i b_hi = scale_round_sse2(a_hi, 2 * NewSqrt2);
82 output[i] = _mm_packs_epi32(b_lo, b_hi);
83 }
84 }
85
fidentity8x32_new_sse2(const __m128i * input,__m128i * output,int8_t cos_bit)86 static INLINE void fidentity8x32_new_sse2(const __m128i *input, __m128i *output,
87 int8_t cos_bit) {
88 (void)cos_bit;
89 for (int i = 0; i < 32; ++i) {
90 output[i] = _mm_slli_epi16(input[i], 2);
91 }
92 }
93
94 static const transform_1d_sse2 col_txfm8x32_arr[TX_TYPES] = {
95 fdct8x32_new_sse2, // DCT_DCT
96 NULL, // ADST_DCT
97 NULL, // DCT_ADST
98 NULL, // ADST_ADST
99 NULL, // FLIPADST_DCT
100 NULL, // DCT_FLIPADST
101 NULL, // FLIPADST_FLIPADST
102 NULL, // ADST_FLIPADST
103 NULL, // FLIPADST_ADST
104 fidentity8x32_new_sse2, // IDTX
105 fdct8x32_new_sse2, // V_DCT
106 fidentity8x32_new_sse2, // H_DCT
107 NULL, // V_ADST
108 NULL, // H_ADST
109 NULL, // V_FLIPADST
110 NULL // H_FLIPADST
111 };
112
113 #ifdef __cplusplus
114 }
115 #endif
116
117 #endif // AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_SSE2_H_
118