1 /*
2 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11 #ifndef AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
12 #define AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
13
14 #include <emmintrin.h> // SSE2
15 #include <tmmintrin.h> // SSSE3
16
17 #include "config/aom_config.h"
18 #include "config/av1_rtcd.h"
19
20 #include "aom/aom_integer.h"
21 #include "aom_dsp/x86/transpose_sse2.h"
22 #include "aom_dsp/x86/txfm_common_sse2.h"
23
24 #ifdef __cplusplus
25 extern "C" {
26 #endif
27
// Scales every signed 16-bit lane of `in` by two weights:
//   out0 = _mm_mulhrs_epi16(in, w0 * 8)
//   out1 = _mm_mulhrs_epi16(in, w1 * 8)
// Each weight is multiplied by 8 and broadcast with _mm_set1_epi16, so the
// scaled weight has to fit in 16 bits. `in` is copied before either output is
// written, which lets out0/out1 name the same variable as `in`.
// The weight and input arguments are parenthesized so that expression
// arguments (for example `a + b`) are scaled as a whole.
#define btf_16_ssse3(w0, w1, in, out0, out1)      \
  do {                                            \
    const __m128i _w0 = _mm_set1_epi16((w0) * 8); \
    const __m128i _w1 = _mm_set1_epi16((w1) * 8); \
    const __m128i _in = (in);                     \
    out0 = _mm_mulhrs_epi16(_in, _w0);            \
    out1 = _mm_mulhrs_epi16(_in, _w1);            \
  } while (0)
36
// In-place butterfly on signed 16-bit lanes with saturation:
//   in0 <- _mm_adds_epi16(in0, in1)
//   in1 <- _mm_subs_epi16(in0, in1)
// Both operands are copied first, so the difference is computed from the
// original in0 rather than from the freshly written sum.
#define btf_16_adds_subs_sse2(in0, in1) \
  do {                                  \
    const __m128i _in0 = (in0);         \
    const __m128i _in1 = (in1);         \
    in0 = _mm_adds_epi16(_in0, _in1);   \
    in1 = _mm_subs_epi16(_in0, _in1);   \
  } while (0)
44
// In-place butterfly on signed 16-bit lanes with saturation, storing the
// difference before the sum:
//   in1 <- _mm_subs_epi16(in0, in1)
//   in0 <- _mm_adds_epi16(in0, in1)
// Both operands are copied first, so each result is computed from the
// original values regardless of the store order.
#define btf_16_subs_adds_sse2(in0, in1) \
  do {                                  \
    const __m128i _in0 = (in0);         \
    const __m128i _in1 = (in1);         \
    in1 = _mm_subs_epi16(_in0, _in1);   \
    in0 = _mm_adds_epi16(_in0, _in1);   \
  } while (0)
52
// Out-of-place butterfly on signed 16-bit lanes with saturation:
//   out0 = _mm_adds_epi16(in0, in1)
//   out1 = _mm_subs_epi16(in0, in1)
// The inputs are copied before either output is written, so an output may
// name the same variable as an input.
#define btf_16_adds_subs_out_sse2(out0, out1, in0, in1) \
  do {                                                  \
    const __m128i _in0 = (in0);                         \
    const __m128i _in1 = (in1);                         \
    out0 = _mm_adds_epi16(_in0, _in1);                  \
    out1 = _mm_subs_epi16(_in0, _in1);                  \
  } while (0)
60
// Scales `size` vectors of signed 16-bit lanes by a power of two, in place.
//   bit < 0 : rounding right shift by -bit, implemented as
//             _mm_mulhrs_epi16(in[i], 1 << (15 + bit))
//   bit > 0 : left shift by `bit` (_mm_slli_epi16)
//   bit == 0: `in` is left untouched
static INLINE void round_shift_16bit_ssse3(__m128i *in, int size, int bit) {
  // The multiplier is built as 1 << (15 + bit); anything below -15 would be a
  // negative shift count.
  assert(bit >= -15);
  if (bit < 0) {
    const __m128i scale = _mm_set1_epi16(1 << (15 + bit));
    for (int i = 0; i < size; ++i) {
      in[i] = _mm_mulhrs_epi16(in[i], scale);
    }
  } else if (bit > 0) {
    for (int i = 0; i < size; ++i) {
      in[i] = _mm_slli_epi16(in[i], bit);
    }
  }
}
73
// 1D itx types.
// IFLIPADST_1D deliberately shares the value of IADST_1D, so only three
// distinct kinds exist and ITX_TYPES_1D (the count) is 3.
enum {
  IDCT_1D = 0,
  IADST_1D = 1,
  IFLIPADST_1D = IADST_1D,
  IIDENTITY_1D = 2,
  ITX_TYPES_1D = 3,
} UENUM1BYTE(ITX_TYPE_1D);
82
83 static const ITX_TYPE_1D vitx_1d_tab[TX_TYPES] = {
84 IDCT_1D, IADST_1D, IDCT_1D, IADST_1D,
85 IFLIPADST_1D, IDCT_1D, IFLIPADST_1D, IADST_1D,
86 IFLIPADST_1D, IIDENTITY_1D, IDCT_1D, IIDENTITY_1D,
87 IADST_1D, IIDENTITY_1D, IFLIPADST_1D, IIDENTITY_1D,
88 };
89
90 static const ITX_TYPE_1D hitx_1d_tab[TX_TYPES] = {
91 IDCT_1D, IDCT_1D, IADST_1D, IADST_1D,
92 IDCT_1D, IFLIPADST_1D, IFLIPADST_1D, IFLIPADST_1D,
93 IADST_1D, IIDENTITY_1D, IIDENTITY_1D, IDCT_1D,
94 IIDENTITY_1D, IADST_1D, IIDENTITY_1D, IFLIPADST_1D,
95 };
96
97 DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x8_default[8]) = {
98 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707,
99 };
100
101 DECLARE_ALIGNED(16, static const int16_t,
102 av1_eob_to_eobxy_16x16_default[16]) = {
103 0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
104 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
105 };
106
107 DECLARE_ALIGNED(16, static const int16_t,
108 av1_eob_to_eobxy_32x32_default[32]) = {
109 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
110 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
111 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
112 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
113 };
114
115 DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x16_default[16]) = {
116 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07,
117 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07,
118 };
119
120 DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_16x8_default[8]) = {
121 0x0707, 0x0707, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f,
122 };
123
124 DECLARE_ALIGNED(16, static const int16_t,
125 av1_eob_to_eobxy_16x32_default[32]) = {
126 0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
127 0x0f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
128 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
129 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
130 };
131
132 DECLARE_ALIGNED(16, static const int16_t,
133 av1_eob_to_eobxy_32x16_default[16]) = {
134 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f,
135 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f,
136 };
137
138 DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x32_default[32]) = {
139 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07,
140 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x1f07, 0x1f07, 0x1f07,
141 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07,
142 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07,
143 };
144
145 DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_32x8_default[8]) = {
146 0x0707, 0x070f, 0x070f, 0x071f, 0x071f, 0x071f, 0x071f, 0x071f,
147 };
148
149 DECLARE_ALIGNED(16, static const int16_t *,
150 av1_eob_to_eobxy_default[TX_SIZES_ALL]) = {
151 NULL,
152 av1_eob_to_eobxy_8x8_default,
153 av1_eob_to_eobxy_16x16_default,
154 av1_eob_to_eobxy_32x32_default,
155 av1_eob_to_eobxy_32x32_default,
156 NULL,
157 NULL,
158 av1_eob_to_eobxy_8x16_default,
159 av1_eob_to_eobxy_16x8_default,
160 av1_eob_to_eobxy_16x32_default,
161 av1_eob_to_eobxy_32x16_default,
162 av1_eob_to_eobxy_32x32_default,
163 av1_eob_to_eobxy_32x32_default,
164 NULL,
165 NULL,
166 av1_eob_to_eobxy_8x32_default,
167 av1_eob_to_eobxy_32x8_default,
168 av1_eob_to_eobxy_16x32_default,
169 av1_eob_to_eobxy_32x16_default,
170 };
171
// 32-entry step table: index 0 maps to 0, 1-7 to 1, 8-15 to 2 and 16-31 to 3.
// NOTE(review): presumably selects a reduced 1D kernel variant from an eob
// coordinate; confirm against the users of this table.
static const int lowbd_txfm_all_1d_zeros_idx[32] = {
  0, 1, 1, 1, 1, 1, 1, 1,  // 0-7
  2, 2, 2, 2, 2, 2, 2, 2,  // 8-15
  3, 3, 3, 3, 3, 3, 3, 3,  // 16-23
  3, 3, 3, 3, 3, 3, 3, 3,  // 24-31
};
176
177 // Transform block width in log2 for eob (size of 64 map to 32)
178 static const int tx_size_wide_log2_eob[TX_SIZES_ALL] = {
179 2, 3, 4, 5, 5, 2, 3, 3, 4, 4, 5, 5, 5, 2, 4, 3, 5, 4, 5,
180 };
181
get_eobx_eoby_scan_default(int * eobx,int * eoby,TX_SIZE tx_size,int eob)182 static INLINE void get_eobx_eoby_scan_default(int *eobx, int *eoby,
183 TX_SIZE tx_size, int eob) {
184 if (eob == 1) {
185 *eobx = 0;
186 *eoby = 0;
187 return;
188 }
189
190 const int tx_w_log2 = tx_size_wide_log2_eob[tx_size];
191 const int eob_row = (eob - 1) >> tx_w_log2;
192 const int eobxy = av1_eob_to_eobxy_default[tx_size][eob_row];
193 *eobx = eobxy & 0xFF;
194 *eoby = eobxy >> 8;
195 }
196
// 32-entry step table that rounds an index up to the last position of its
// group: 0 -> 0, 1-7 -> 7, 8-15 -> 15, 16-31 -> 31.
// Read-only lookup data, so it is declared const.
static const int eob_fill[32] = {
  0,  7,  7,  7,  7,  7,  7,  7,  15, 15, 15, 15, 15, 15, 15, 15,
  31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
};
201
get_eobx_eoby_scan_h_identity(int * eobx,int * eoby,TX_SIZE tx_size,int eob)202 static INLINE void get_eobx_eoby_scan_h_identity(int *eobx, int *eoby,
203 TX_SIZE tx_size, int eob) {
204 eob -= 1;
205 const int txfm_size_col = tx_size_wide[tx_size];
206 const int eobx_max = AOMMIN(32, txfm_size_col) - 1;
207 *eobx = (eob >= eobx_max) ? eobx_max : eob_fill[eob];
208 const int temp_eoby = eob / (eobx_max + 1);
209 assert(temp_eoby < 32);
210 *eoby = eob_fill[temp_eoby];
211 }
212
get_eobx_eoby_scan_v_identity(int * eobx,int * eoby,TX_SIZE tx_size,int eob)213 static INLINE void get_eobx_eoby_scan_v_identity(int *eobx, int *eoby,
214 TX_SIZE tx_size, int eob) {
215 eob -= 1;
216 const int txfm_size_row = tx_size_high[tx_size];
217 const int eoby_max = AOMMIN(32, txfm_size_row) - 1;
218 *eobx = eob / (eoby_max + 1);
219 *eoby = (eob >= eoby_max) ? eoby_max : eob_fill[eob];
220 }
221
// Function-pointer type for a 1D transform kernel: reads vectors from
// `input`, writes vectors to `output`, and takes an 8-bit `cos_bit`.
// NOTE(review): cos_bit presumably selects the cosine table precision;
// confirm against the kernels that use this type.
typedef void (*transform_1d_ssse3)(
    const __m128i *input,
    __m128i *output,
    int8_t cos_bit);
224
225 void av1_lowbd_inv_txfm2d_add_ssse3(const int32_t *input, uint8_t *output,
226 int stride, TX_TYPE tx_type,
227 TX_SIZE tx_size, int eob);
228 #ifdef __cplusplus
229 } // extern "C"
230 #endif
231
232 #endif // AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
233