/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
11 
12 #ifndef AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_
13 #define AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_
14 
15 #include <smmintrin.h>
16 #include "av1/common/av1_txfm.h"
17 #include "av1/common/x86/av1_txfm_sse4.h"
18 
19 #ifdef __cplusplus
20 extern "C" {
21 #endif
22 
// 1-D transform kernels that read and write arrays of __m128i. Only the
// prototypes live in this header. Judging by the names these are forward
// (f*) and inverse (i*) DCT / ADST / identity transforms of the size given
// in the name -- confirm against the definitions.

// Forward DCT.
void av1_fdct4_new_sse4_1(const __m128i *input, __m128i *output,
                          const int8_t cos_bit, const int8_t *stage_range);
void av1_fdct8_new_sse4_1(const __m128i *input, __m128i *output,
                          const int8_t cos_bit, const int8_t *stage_range);
void av1_fdct16_new_sse4_1(const __m128i *input, __m128i *output,
                           const int8_t cos_bit, const int8_t *stage_range);
// NOTE(review): the 32- and 64-point forward DCTs take a mutable input and
// stride argument(s) instead of stage_range, and fdct32 takes cos_bit as int
// rather than int8_t. The expected buffer layout is not visible from here.
void av1_fdct32_new_sse4_1(__m128i *input, __m128i *output, int cos_bit,
                           const int stride);
void av1_fdct64_new_sse4_1(__m128i *input, __m128i *output, int8_t cos_bit,
                           const int instride, const int outstride);

// Forward ADST.
void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output,
                           const int8_t cos_bit, const int8_t *stage_range);
void av1_fadst8_new_sse4_1(const __m128i *input, __m128i *output,
                           const int8_t cos_bit, const int8_t *stage_range);
void av1_fadst16_new_sse4_1(const __m128i *input, __m128i *output,
                            const int8_t cos_bit, const int8_t *stage_range);

// Inverse DCT.
void av1_idct4_new_sse4_1(const __m128i *input, __m128i *output,
                          const int8_t cos_bit, const int8_t *stage_range);
void av1_idct8_new_sse4_1(const __m128i *input, __m128i *output,
                          const int8_t cos_bit, const int8_t *stage_range);
void av1_idct16_new_sse4_1(const __m128i *input, __m128i *output,
                           const int8_t cos_bit, const int8_t *stage_range);
void av1_idct32_new_sse4_1(const __m128i *input, __m128i *output,
                           const int8_t cos_bit, const int8_t *stage_range);
void av1_idct64_new_sse4_1(const __m128i *input, __m128i *output,
                           const int8_t cos_bit, const int8_t *stage_range);

// Inverse ADST.
void av1_iadst4_new_sse4_1(const __m128i *input, __m128i *output,
                           const int8_t cos_bit, const int8_t *stage_range);
void av1_iadst8_new_sse4_1(const __m128i *input, __m128i *output,
                           const int8_t cos_bit, const int8_t *stage_range);
void av1_iadst16_new_sse4_1(const __m128i *input, __m128i *output,
                            const int8_t cos_bit, const int8_t *stage_range);

// 32-point identity transform (per its name). Takes a mutable input and a
// col_num argument in place of stage_range.
void av1_idtx32_new_sse4_1(__m128i *input, __m128i *output, int cos_bit,
                           const int col_num);
60 
transpose_32_4x4(int stride,const __m128i * input,__m128i * output)61 static INLINE void transpose_32_4x4(int stride, const __m128i *input,
62                                     __m128i *output) {
63   __m128i temp0 = _mm_unpacklo_epi32(input[0 * stride], input[2 * stride]);
64   __m128i temp1 = _mm_unpackhi_epi32(input[0 * stride], input[2 * stride]);
65   __m128i temp2 = _mm_unpacklo_epi32(input[1 * stride], input[3 * stride]);
66   __m128i temp3 = _mm_unpackhi_epi32(input[1 * stride], input[3 * stride]);
67 
68   output[0 * stride] = _mm_unpacklo_epi32(temp0, temp2);
69   output[1 * stride] = _mm_unpackhi_epi32(temp0, temp2);
70   output[2 * stride] = _mm_unpacklo_epi32(temp1, temp3);
71   output[3 * stride] = _mm_unpackhi_epi32(temp1, temp3);
72 }
73 
74 // the entire input block can be represent by a grid of 4x4 blocks
75 // each 4x4 blocks can be represent by 4 vertical __m128i
76 // we first transpose each 4x4 block internally
77 // then transpose the grid
transpose_32(int txfm_size,const __m128i * input,__m128i * output)78 static INLINE void transpose_32(int txfm_size, const __m128i *input,
79                                 __m128i *output) {
80   const int num_per_128 = 4;
81   const int row_size = txfm_size;
82   const int col_size = txfm_size / num_per_128;
83   int r, c;
84 
85   // transpose each 4x4 block internally
86   for (r = 0; r < row_size; r += 4) {
87     for (c = 0; c < col_size; c++) {
88       transpose_32_4x4(col_size, &input[r * col_size + c],
89                        &output[c * 4 * col_size + r / 4]);
90     }
91   }
92 }
93 
// Butterfly on packed 32-bit lanes with scalar weights:
//   out0 = round_shift(in0 * w0 + in1 * w1, bit)
//   out1 = round_shift(in0 * w1 - in1 * w0, bit)
//
// w0 / w1 are ints broadcast to every lane with _mm_set1_epi32; in0 / in1
// are __m128i values; out0 / out1 must be assignable __m128i lvalues.
// Products come from _mm_mullo_epi32, which keeps only the low 32 bits of
// each lane's product. Rounding and shifting are delegated to
// av1_round_shift_32_sse4_1().
//
// Both inputs are captured before either output is stored, so an output may
// name the same object as an input. w0, w1, in0 and in1 are each evaluated
// exactly once; bit is expanded twice, so pass a side-effect-free expression.
#define btf_32_sse4_1_type0(w0, w1, in0, in1, out0, out1, bit) \
  do {                                                         \
    const __m128i btf_w0_ = _mm_set1_epi32(w0);                \
    const __m128i btf_w1_ = _mm_set1_epi32(w1);                \
    const __m128i btf_in0_ = (in0);                            \
    const __m128i btf_in1_ = (in1);                            \
    const __m128i btf_sum_ =                                   \
        _mm_add_epi32(_mm_mullo_epi32(btf_in0_, btf_w0_),      \
                      _mm_mullo_epi32(btf_in1_, btf_w1_));     \
    const __m128i btf_diff_ =                                  \
        _mm_sub_epi32(_mm_mullo_epi32(btf_in0_, btf_w1_),      \
                      _mm_mullo_epi32(btf_in1_, btf_w0_));     \
    (out0) = av1_round_shift_32_sse4_1(btf_sum_, (bit));       \
    (out1) = av1_round_shift_32_sse4_1(btf_diff_, (bit));      \
  } while (0)
109 
// Butterfly with the sign of the second output flipped relative to type0:
//   out0 = in0 * w0 + in1 * w1
//   out1 = in1 * w0 - in0 * w1
// Implemented by handing type0 the weights and the inputs in swapped order;
// type0 already expands to a single do { ... } while (0) statement.
#define btf_32_sse4_1_type1(w0, w1, in0, in1, out0, out1, bit) \
  btf_32_sse4_1_type0(w1, w0, in1, in0, out0, out1, bit)
116 
// Butterfly on packed 32-bit lanes with pre-broadcast vector weights:
//   out0 = (in0 * ww0 + in1 * ww1 + r) >> bit
//   out1 = (in0 * ww1 - in1 * ww0 + r) >> bit
//
// ww0 / ww1, in0 / in1 and r are __m128i values; out0 / out1 must be
// assignable __m128i lvalues. r is added before the arithmetic right shift
// (_mm_srai_epi32); presumably the caller passes the rounding offset
// 1 << (bit - 1) in every lane -- confirm at the call sites. Products come
// from _mm_mullo_epi32, which keeps only the low 32 bits of each lane's
// product.
//
// Every vector operand is captured once before either output is stored, so
// an output may name the same object as an input or as r. bit is expanded
// twice, so pass a side-effect-free expression.
#define btf_32_type0_sse4_1_new(ww0, ww1, in0, in1, out0, out1, r, bit) \
  do {                                                                  \
    const __m128i btf_w0_ = (ww0);                                      \
    const __m128i btf_w1_ = (ww1);                                      \
    const __m128i btf_in0_ = (in0);                                     \
    const __m128i btf_in1_ = (in1);                                     \
    const __m128i btf_rnd_ = (r);                                       \
    const __m128i btf_sum_ =                                            \
        _mm_add_epi32(_mm_mullo_epi32(btf_in0_, btf_w0_),               \
                      _mm_mullo_epi32(btf_in1_, btf_w1_));              \
    const __m128i btf_diff_ =                                           \
        _mm_sub_epi32(_mm_mullo_epi32(btf_in0_, btf_w1_),               \
                      _mm_mullo_epi32(btf_in1_, btf_w0_));              \
    (out0) = _mm_srai_epi32(_mm_add_epi32(btf_sum_, btf_rnd_), (bit));  \
    (out1) = _mm_srai_epi32(_mm_add_epi32(btf_diff_, btf_rnd_), (bit)); \
  } while (0)
132 
// Vector-weight butterfly with the sign of the second output flipped
// relative to btf_32_type0_sse4_1_new:
//   out0 = in0 * ww0 + in1 * ww1
//   out1 = in1 * ww0 - in0 * ww1
// Implemented by handing the type0 macro the weights and the inputs in
// swapped order; that macro already expands to one do { ... } while (0).
#define btf_32_type1_sse4_1_new(ww0, ww1, in0, in1, out0, out1, r, bit) \
  btf_32_type0_sse4_1_new(ww1, ww0, in1, in0, out0, out1, r, bit)
139 
140 #ifdef __cplusplus
141 }
142 #endif
143 
144 #endif  // AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_
145