/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
#ifndef AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_
#define AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_

#include <emmintrin.h>  // SSE2

#include "config/aom_config.h"
#include "config/av1_rtcd.h"

#include "aom/aom_integer.h"
#include "aom_dsp/x86/transpose_sse2.h"
#include "aom_dsp/x86/txfm_common_sse2.h"
#include "av1/common/av1_txfm.h"

#ifdef __cplusplus
extern "C" {
#endif

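// Butterfly on 4 16-bit lanes: interleave *in0/*in1, multiply-accumulate
// against the weight pairs *w0 and *w1 with _mm_madd_epi16, then round and
// shift by cos_bit. Results are saturating-packed back to 16 bits into
// *out0 and *out1.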
static INLINE void btf_16_w4_sse2(
    const __m128i *const w0, const __m128i *const w1, const __m128i __rounding,
    const int8_t cos_bit, const __m128i *const in0, const __m128i *const in1,
    __m128i *const out0, __m128i *const out1) {
  const __m128i t0 = _mm_unpacklo_epi16(*in0, *in1);
  const __m128i u0 = _mm_madd_epi16(t0, *w0);
  const __m128i v0 = _mm_madd_epi16(t0, *w1);
  const __m128i a0 = _mm_add_epi32(u0, __rounding);
  const __m128i b0 = _mm_add_epi32(v0, __rounding);
  const __m128i c0 = _mm_srai_epi32(a0, cos_bit);
  const __m128i d0 = _mm_srai_epi32(b0, cos_bit);

  *out0 = _mm_packs_epi32(c0, c0);
  *out1 = _mm_packs_epi32(d0, d0);
}

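// Macro form of the 4-lane butterfly above. Note that `__rounding` and
// `cos_bit` are not macro parameters: both must be defined in the enclosing
// scope.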
#define btf_16_4p_sse2(w0, w1, in0, in1, out0, out1) \
  {                                                  \
    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
    __m128i u0 = _mm_madd_epi16(t0, w0);             \
    __m128i v0 = _mm_madd_epi16(t0, w1);             \
                                                     \
    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
                                                     \
    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
                                                     \
    out0 = _mm_packs_epi32(c0, c0);                  \
    out1 = _mm_packs_epi32(d0, d0);                  \
  }

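// Full-width butterfly on all 8 16-bit lanes: the same computation as
// btf_16_4p_sse2, applied to both the low and high halves of in0/in1. As
// above, `__rounding` and `cos_bit` are taken from the enclosing scope.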
#define btf_16_sse2(w0, w1, in0, in1, out0, out1) \
  {                                               \
    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
    __m128i u0 = _mm_madd_epi16(t0, w0);          \
    __m128i u1 = _mm_madd_epi16(t1, w0);          \
    __m128i v0 = _mm_madd_epi16(t0, w1);          \
    __m128i v1 = _mm_madd_epi16(t1, w1);          \
                                                  \
    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
                                                  \
    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
                                                  \
    out0 = _mm_packs_epi32(c0, c1);               \
    out1 = _mm_packs_epi32(d0, d1);               \
  }
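
// A minimal usage sketch (illustrative only; `x` and `output` are
// placeholder names). Callers typically build the weight pairs from the
// cosine table for the current cos_bit and define __rounding to match:
//
//   const int32_t *cospi = cospi_arr(cos_bit);
//   const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
//   const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
//   const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
//   btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], output[0],
//               output[1]);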

static INLINE __m128i load_16bit_to_16bit(const int16_t *a) {
  return _mm_load_si128((const __m128i *)a);
}

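// Load 8 32-bit values and saturating-pack them to 8 16-bit values.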
static INLINE __m128i load_32bit_to_16bit(const int32_t *a) {
  const __m128i a_low = _mm_load_si128((const __m128i *)a);
  return _mm_packs_epi32(a_low, *(const __m128i *)(a + 4));
}

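// Load 4 32-bit values and saturating-pack them into the low four 16-bit
// lanes (the high lanes duplicate the low ones).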
static INLINE __m128i load_32bit_to_16bit_w4(const int32_t *a) {
  const __m128i a_low = _mm_load_si128((const __m128i *)a);
  return _mm_packs_epi32(a_low, a_low);
}

// Store 4 16-bit values, sign-extending them to 32 bits.
static INLINE void store_16bit_to_32bit_w4(const __m128i a, int32_t *const b) {
  const __m128i a_lo = _mm_unpacklo_epi16(a, a);
  const __m128i a_1 = _mm_srai_epi32(a_lo, 16);
  _mm_store_si128((__m128i *)b, a_1);
}

// Store 8 16-bit values, sign-extending them to 32 bits.
static INLINE void store_16bit_to_32bit(__m128i a, int32_t *b) {
  const __m128i a_lo = _mm_unpacklo_epi16(a, a);
  const __m128i a_hi = _mm_unpackhi_epi16(a, a);
  const __m128i a_1 = _mm_srai_epi32(a_lo, 16);
  const __m128i a_2 = _mm_srai_epi32(a_hi, 16);
  _mm_store_si128((__m128i *)b, a_1);
  _mm_store_si128((__m128i *)(b + 4), a_2);
}

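// Multiply 16-bit values by `scale` with rounding and shift the 32-bit
// products down by NewSqrt2Bits. The input is expected to have 1s
// interleaved into the odd lanes (see the store_rect_* helpers below), so
// that _mm_madd_epi16 yields a * scale + rounding per 32-bit lane.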
static INLINE __m128i scale_round_sse2(const __m128i a, const int scale) {
  const __m128i scale_rounding = pair_set_epi16(scale, 1 << (NewSqrt2Bits - 1));
  const __m128i b = _mm_madd_epi16(a, scale_rounding);
  return _mm_srai_epi32(b, NewSqrt2Bits);
}

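// Rectangular (non-square) transforms scale their output by
// NewSqrt2 / 2^NewSqrt2Bits, an approximation of sqrt(2). The store_rect_*
// helpers fold that scaling into the 16-bit-to-32-bit store.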
static INLINE void store_rect_16bit_to_32bit_w4(const __m128i a,
                                                int32_t *const b) {
  const __m128i one = _mm_set1_epi16(1);
  const __m128i a_lo = _mm_unpacklo_epi16(a, one);
  const __m128i b_lo = scale_round_sse2(a_lo, NewSqrt2);
  _mm_store_si128((__m128i *)b, b_lo);
}

static INLINE void store_rect_16bit_to_32bit(const __m128i a,
                                             int32_t *const b) {
  const __m128i one = _mm_set1_epi16(1);
  const __m128i a_lo = _mm_unpacklo_epi16(a, one);
  const __m128i a_hi = _mm_unpackhi_epi16(a, one);
  const __m128i b_lo = scale_round_sse2(a_lo, NewSqrt2);
  const __m128i b_hi = scale_round_sse2(a_hi, NewSqrt2);
  _mm_store_si128((__m128i *)b, b_lo);
  _mm_store_si128((__m128i *)(b + 4), b_hi);
}

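// Row-by-row load/store helpers for whole blocks. The *_flip variants
// reverse the row order, which is how the vertically flipped transform
// types are handled.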
static INLINE void load_buffer_16bit_to_16bit_w4(const int16_t *const in,
                                                 const int stride,
                                                 __m128i *const out,
                                                 const int out_size) {
  for (int i = 0; i < out_size; ++i) {
    out[i] = _mm_loadl_epi64((const __m128i *)(in + i * stride));
  }
}

static INLINE void load_buffer_16bit_to_16bit_w4_flip(const int16_t *const in,
                                                      const int stride,
                                                      __m128i *const out,
                                                      const int out_size) {
  for (int i = 0; i < out_size; ++i) {
    out[out_size - i - 1] = _mm_loadl_epi64((const __m128i *)(in + i * stride));
  }
}

static INLINE void load_buffer_16bit_to_16bit(const int16_t *in, int stride,
                                              __m128i *out, int out_size) {
  for (int i = 0; i < out_size; ++i) {
    out[i] = load_16bit_to_16bit(in + i * stride);
  }
}

static INLINE void load_buffer_16bit_to_16bit_flip(const int16_t *in,
                                                   int stride, __m128i *out,
                                                   int out_size) {
  for (int i = 0; i < out_size; ++i) {
    out[out_size - i - 1] = load_16bit_to_16bit(in + i * stride);
  }
}

static INLINE void load_buffer_32bit_to_16bit(const int32_t *in, int stride,
                                              __m128i *out, int out_size) {
  for (int i = 0; i < out_size; ++i) {
    out[i] = load_32bit_to_16bit(in + i * stride);
  }
}

static INLINE void load_buffer_32bit_to_16bit_w4(const int32_t *in, int stride,
                                                 __m128i *out, int out_size) {
  for (int i = 0; i < out_size; ++i) {
    out[i] = load_32bit_to_16bit_w4(in + i * stride);
  }
}

static INLINE void load_buffer_32bit_to_16bit_flip(const int32_t *in,
                                                   int stride, __m128i *out,
                                                   int out_size) {
  for (int i = 0; i < out_size; ++i) {
    out[out_size - i - 1] = load_32bit_to_16bit(in + i * stride);
  }
}

static INLINE void store_buffer_16bit_to_32bit_w4(const __m128i *const in,
                                                  int32_t *const out,
                                                  const int stride,
                                                  const int out_size) {
  for (int i = 0; i < out_size; ++i) {
    store_16bit_to_32bit_w4(in[i], out + i * stride);
  }
}

static INLINE void store_buffer_16bit_to_32bit_w8(const __m128i *const in,
                                                  int32_t *const out,
                                                  const int stride,
                                                  const int out_size) {
  for (int i = 0; i < out_size; ++i) {
    store_16bit_to_32bit(in[i], out + i * stride);
  }
}

static INLINE void store_rect_buffer_16bit_to_32bit_w4(const __m128i *const in,
                                                       int32_t *const out,
                                                       const int stride,
                                                       const int out_size) {
  for (int i = 0; i < out_size; ++i) {
    store_rect_16bit_to_32bit_w4(in[i], out + i * stride);
  }
}

static INLINE void store_rect_buffer_16bit_to_32bit_w8(const __m128i *const in,
                                                       int32_t *const out,
                                                       const int stride,
                                                       const int out_size) {
  for (int i = 0; i < out_size; ++i) {
    store_rect_16bit_to_32bit(in[i], out + i * stride);
  }
}

static INLINE void store_buffer_16bit_to_16bit_8x8(const __m128i *in,
                                                   uint16_t *out,
                                                   const int stride) {
  for (int i = 0; i < 8; ++i) {
    _mm_store_si128((__m128i *)(out + i * stride), in[i]);
  }
}

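// Round-shift `size` vectors of 16-bit values by `bit`: a negative `bit`
// rounds and shifts right (with a saturating add), a positive `bit` shifts
// left, and `bit == 0` is a no-op.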
static INLINE void round_shift_16bit(__m128i *in, int size, int bit) {
  if (bit < 0) {
    bit = -bit;
    __m128i rounding = _mm_set1_epi16(1 << (bit - 1));
    for (int i = 0; i < size; ++i) {
      in[i] = _mm_adds_epi16(in[i], rounding);
      in[i] = _mm_srai_epi16(in[i], bit);
    }
  } else if (bit > 0) {
    for (int i = 0; i < size; ++i) {
      in[i] = _mm_slli_epi16(in[i], bit);
    }
  }
}

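// Reverse the order of `size` vectors from `in` into `out` (used by the
// flipped transform types).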
static INLINE void flip_buf_sse2(__m128i *in, __m128i *out, int size) {
  for (int i = 0; i < size; ++i) {
    out[size - i - 1] = in[i];
  }
}

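// Low-bitdepth forward 2D transform kernels. Each reads a 16-bit input
// block with the given stride and writes 32-bit coefficients to `output`.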
void av1_lowbd_fwd_txfm2d_4x4_sse2(const int16_t *input, int32_t *output,
                                   int stride, TX_TYPE tx_type, int bd);

void av1_lowbd_fwd_txfm2d_4x8_sse2(const int16_t *input, int32_t *output,
                                   int stride, TX_TYPE tx_type, int bd);

void av1_lowbd_fwd_txfm2d_4x16_sse2(const int16_t *input, int32_t *output,
                                    int stride, TX_TYPE tx_type, int bd);

void av1_lowbd_fwd_txfm2d_8x4_sse2(const int16_t *input, int32_t *output,
                                   int stride, TX_TYPE tx_type, int bd);

void av1_lowbd_fwd_txfm2d_8x8_sse2(const int16_t *input, int32_t *output,
                                   int stride, TX_TYPE tx_type, int bd);

void av1_lowbd_fwd_txfm2d_8x16_sse2(const int16_t *input, int32_t *output,
                                    int stride, TX_TYPE tx_type, int bd);

void av1_lowbd_fwd_txfm2d_8x32_sse2(const int16_t *input, int32_t *output,
                                    int stride, TX_TYPE tx_type, int bd);

void av1_lowbd_fwd_txfm2d_16x4_sse2(const int16_t *input, int32_t *output,
                                    int stride, TX_TYPE tx_type, int bd);

void av1_lowbd_fwd_txfm2d_16x8_sse2(const int16_t *input, int32_t *output,
                                    int stride, TX_TYPE tx_type, int bd);

void av1_lowbd_fwd_txfm2d_16x16_sse2(const int16_t *input, int32_t *output,
                                     int stride, TX_TYPE tx_type, int bd);

void av1_lowbd_fwd_txfm2d_16x32_sse2(const int16_t *input, int32_t *output,
                                     int stride, TX_TYPE tx_type, int bd);

void av1_lowbd_fwd_txfm2d_32x8_sse2(const int16_t *input, int32_t *output,
                                    int stride, TX_TYPE tx_type, int bd);

void av1_lowbd_fwd_txfm2d_32x16_sse2(const int16_t *input, int32_t *output,
                                     int stride, TX_TYPE tx_type, int bd);

void av1_lowbd_fwd_txfm2d_32x32_sse2(const int16_t *input, int32_t *output,
                                     int stride, TX_TYPE tx_type, int bd);

void av1_lowbd_fwd_txfm2d_16x64_sse2(const int16_t *input, int32_t *output,
                                     int stride, TX_TYPE tx_type, int bd);

void av1_lowbd_fwd_txfm2d_64x16_sse2(const int16_t *input, int32_t *output,
                                     int stride, TX_TYPE tx_type, int bd);

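// A 1D transform stage operating on __m128i vectors, and the column/row
// pair that composes a 2D transform.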
typedef void (*transform_1d_sse2)(const __m128i *input, __m128i *output,
                                  int8_t cos_bit);

typedef struct {
  transform_1d_sse2 col, row;  // vertical and horizontal
} transform_2d_sse2;

#ifdef __cplusplus
}
#endif  // __cplusplus
#endif  // AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_