/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_
#define AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_

#include <smmintrin.h>
#include "av1/common/av1_txfm.h"
#include "av1/common/x86/av1_txfm_sse4.h"

#ifdef __cplusplus
extern "C" {
#endif

void av1_fdct4_new_sse4_1(const __m128i *input, __m128i *output,
                          const int8_t cos_bit, const int8_t *stage_range);
void av1_fdct8_new_sse4_1(const __m128i *input, __m128i *output,
                          const int8_t cos_bit, const int8_t *stage_range);
void av1_fdct16_new_sse4_1(const __m128i *input, __m128i *output,
                           const int8_t cos_bit, const int8_t *stage_range);
void av1_fdct32_new_sse4_1(__m128i *input, __m128i *output, int cos_bit,
                           const int stride);
void av1_fdct64_new_sse4_1(__m128i *input, __m128i *output, int8_t cos_bit,
                           const int instride, const int outstride);
void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output,
                           const int8_t cos_bit, const int8_t *stage_range);
void av1_fadst8_new_sse4_1(const __m128i *input, __m128i *output,
                           const int8_t cos_bit, const int8_t *stage_range);
void av1_fadst16_new_sse4_1(const __m128i *input, __m128i *output,
                            const int8_t cos_bit, const int8_t *stage_range);

void av1_idct4_new_sse4_1(const __m128i *input, __m128i *output,
                          const int8_t cos_bit, const int8_t *stage_range);
void av1_idct8_new_sse4_1(const __m128i *input, __m128i *output,
                          const int8_t cos_bit, const int8_t *stage_range);
void av1_idct16_new_sse4_1(const __m128i *input, __m128i *output,
                           const int8_t cos_bit, const int8_t *stage_range);
void av1_idct32_new_sse4_1(const __m128i *input, __m128i *output,
                           const int8_t cos_bit, const int8_t *stage_range);
void av1_idct64_new_sse4_1(const __m128i *input, __m128i *output,
                           const int8_t cos_bit, const int8_t *stage_range);

void av1_iadst4_new_sse4_1(const __m128i *input, __m128i *output,
                           const int8_t cos_bit, const int8_t *stage_range);
void av1_iadst8_new_sse4_1(const __m128i *input, __m128i *output,
                           const int8_t cos_bit, const int8_t *stage_range);
void av1_iadst16_new_sse4_1(const __m128i *input, __m128i *output,
                            const int8_t cos_bit, const int8_t *stage_range);

void av1_idtx32_new_sse4_1(__m128i *input, __m128i *output, int cos_bit,
                           const int col_num);

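// Transpose a 4x4 block of 32-bit elements. `stride` is measured in units of
// __m128i: the four input rows are read from input[0 * stride] through
// input[3 * stride], and the transposed rows are written back at the same
// spacing.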
static INLINE void transpose_32_4x4(int stride, const __m128i *input,
                                    __m128i *output) {
  __m128i temp0 = _mm_unpacklo_epi32(input[0 * stride], input[2 * stride]);
  __m128i temp1 = _mm_unpackhi_epi32(input[0 * stride], input[2 * stride]);
  __m128i temp2 = _mm_unpacklo_epi32(input[1 * stride], input[3 * stride]);
  __m128i temp3 = _mm_unpackhi_epi32(input[1 * stride], input[3 * stride]);

  output[0 * stride] = _mm_unpacklo_epi32(temp0, temp2);
  output[1 * stride] = _mm_unpackhi_epi32(temp0, temp2);
  output[2 * stride] = _mm_unpacklo_epi32(temp1, temp3);
  output[3 * stride] = _mm_unpackhi_epi32(temp1, temp3);
}

// The entire input block can be represented as a grid of 4x4 blocks, and
// each 4x4 block can be represented by 4 vertical __m128i vectors. We first
// transpose each 4x4 block internally, then transpose the grid.
static INLINE void transpose_32(int txfm_size, const __m128i *input,
                                __m128i *output) {
  const int num_per_128 = 4;
  const int row_size = txfm_size;
  const int col_size = txfm_size / num_per_128;
  int r, c;

  // transpose each 4x4 block internally
  for (r = 0; r < row_size; r += 4) {
    for (c = 0; c < col_size; c++) {
      transpose_32_4x4(col_size, &input[r * col_size + c],
                       &output[c * 4 * col_size + r / 4]);
    }
  }
}
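
// Usage sketch (illustrative only, not part of the upstream API): transposing
// a 16x16 block of 32-bit coefficients stored row-major as 64 __m128i vectors
// (16 rows x 4 vectors per row), so txfm_size = 16 below. `input` and
// `output` must not overlap.
static INLINE void transpose_32_16x16_example(const __m128i *input,
                                              __m128i *output) {
  // Inside transpose_32 this gives row_size = 16 and col_size = 16 / 4 = 4.
  transpose_32(16, input, output);
}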

// out0 = in0*w0 + in1*w1
// out1 = -in1*w0 + in0*w1
#define btf_32_sse4_1_type0(w0, w1, in0, in1, out0, out1, bit) \
  do {                                                         \
    const __m128i ww0 = _mm_set1_epi32(w0);                    \
    const __m128i ww1 = _mm_set1_epi32(w1);                    \
    const __m128i in0_w0 = _mm_mullo_epi32(in0, ww0);          \
    const __m128i in1_w1 = _mm_mullo_epi32(in1, ww1);          \
    out0 = _mm_add_epi32(in0_w0, in1_w1);                      \
    out0 = av1_round_shift_32_sse4_1(out0, bit);               \
    const __m128i in0_w1 = _mm_mullo_epi32(in0, ww1);          \
    const __m128i in1_w0 = _mm_mullo_epi32(in1, ww0);          \
    out1 = _mm_sub_epi32(in0_w1, in1_w0);                      \
    out1 = av1_round_shift_32_sse4_1(out1, bit);               \
  } while (0)
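
// Usage sketch (illustrative only, not part of the upstream API): with equal
// weights, the type0 butterfly is the 45-degree rotation used throughout the
// DCT stages. `cospi_arr()` comes from av1/common/av1_txfm.h (included
// above); cospi[32] is cos(pi/4) scaled to cos_bit fractional bits.
static INLINE void btf_32_rot45_example(const __m128i in0, const __m128i in1,
                                        __m128i *out0, __m128i *out1,
                                        const int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  // *out0 = round_shift((in0 + in1) * cospi[32], cos_bit)
  // *out1 = round_shift((in0 - in1) * cospi[32], cos_bit)
  btf_32_sse4_1_type0(cospi[32], cospi[32], in0, in1, *out0, *out1, cos_bit);
}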

// out0 = in0*w0 + in1*w1
// out1 = in1*w0 - in0*w1
#define btf_32_sse4_1_type1(w0, w1, in0, in1, out0, out1, bit) \
  do {                                                         \
    btf_32_sse4_1_type0(w1, w0, in1, in0, out0, out1, bit);    \
  } while (0)

// out0 = in0*w0 + in1*w1
// out1 = -in1*w0 + in0*w1
#define btf_32_type0_sse4_1_new(ww0, ww1, in0, in1, out0, out1, r, bit) \
  do {                                                                  \
    const __m128i in0_w0 = _mm_mullo_epi32(in0, ww0);                   \
    const __m128i in1_w1 = _mm_mullo_epi32(in1, ww1);                   \
    out0 = _mm_add_epi32(in0_w0, in1_w1);                               \
    out0 = _mm_add_epi32(out0, r);                                      \
    out0 = _mm_srai_epi32(out0, bit);                                   \
    const __m128i in0_w1 = _mm_mullo_epi32(in0, ww1);                   \
    const __m128i in1_w0 = _mm_mullo_epi32(in1, ww0);                   \
    out1 = _mm_sub_epi32(in0_w1, in1_w0);                               \
    out1 = _mm_add_epi32(out1, r);                                      \
    out1 = _mm_srai_epi32(out1, bit);                                   \
  } while (0)

// out0 = in0*w0 + in1*w1
// out1 = in1*w0 - in0*w1
#define btf_32_type1_sse4_1_new(ww0, ww1, in0, in1, out0, out1, r, bit) \
  do {                                                                  \
    btf_32_type0_sse4_1_new(ww1, ww0, in1, in0, out0, out1, r, bit);    \
  } while (0)
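
// Usage sketch (illustrative only, not part of the upstream API): the *_new
// variants take pre-splatted weight vectors and a rounding vector so that all
// three can be hoisted out of a butterfly loop instead of being rebuilt on
// every call. Note that the outputs must not alias the inputs: the macros
// read in0/in1 again after writing out0.
static INLINE void btf_32_new_loop_example(const __m128i *in, __m128i *out,
                                           const int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  const __m128i ww0 = _mm_set1_epi32(cospi[48]);
  const __m128i ww1 = _mm_set1_epi32(cospi[16]);
  const __m128i rnding = _mm_set1_epi32(1 << (cos_bit - 1));
  int i;
  for (i = 0; i < 4; ++i) {
    // out[i]     = round_shift(in[i] * cospi[48] + in[i + 4] * cospi[16], ..)
    // out[i + 4] = round_shift(in[i] * cospi[16] - in[i + 4] * cospi[48], ..)
    btf_32_type0_sse4_1_new(ww0, ww1, in[i], in[i + 4], out[i], out[i + 4],
                            rnding, cos_bit);
  }
}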

#ifdef __cplusplus
}
#endif

#endif  // AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_