• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #ifndef AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_
13 #define AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_
14 
15 #include <smmintrin.h>
16 #include "av1/common/av1_txfm.h"
17 #include "av1/common/x86/av1_txfm_sse4.h"
18 
19 #ifdef __cplusplus
20 extern "C" {
21 #endif
22 
// 1-D transform kernels whose definitions live outside this header. Each one
// takes an `input` and an `output` pointer to __m128i, a `cos_bit`, and one or
// two integer stride/column parameters.
// NOTE(review): cos_bit is `int` for fdct32/idtx32 but `int8_t` for fdct64;
// confirm against the definitions before unifying the type.

// presumably the 32-point forward DCT -- confirm against the definition
void av1_fdct32_sse4_1(__m128i *input, __m128i *output, int cos_bit,
                       const int stride);

// presumably the 64-point forward DCT, with separate input and output
// strides -- confirm against the definition
void av1_fdct64_sse4_1(__m128i *input, __m128i *output, int8_t cos_bit,
                       const int instride, const int outstride);

// presumably the 32-point identity transform -- confirm against the
// definition
void av1_idtx32_sse4_1(__m128i *input, __m128i *output, int cos_bit,
                       const int col_num);
30 
// Transposes one 4x4 tile of 32-bit lanes.
//
// The tile's rows are read from input[0], input[stride], input[2 * stride]
// and input[3 * stride]; the transposed rows are written to the same offsets
// of output. All four rows are loaded before the first store, so output may
// be the very same pointer as input.
static inline void transpose_32_4x4(int stride, const __m128i *input,
                                    __m128i *output) {
  // Call the rows a, b, c, d; the lane comments show the resulting layout.
  const __m128i row_a = input[0 * stride];
  const __m128i row_b = input[1 * stride];
  const __m128i row_c = input[2 * stride];
  const __m128i row_d = input[3 * stride];

  // First interleave: pair rows that are two apart.
  const __m128i ac_lo = _mm_unpacklo_epi32(row_a, row_c);  // a0 c0 a1 c1
  const __m128i ac_hi = _mm_unpackhi_epi32(row_a, row_c);  // a2 c2 a3 c3
  const __m128i bd_lo = _mm_unpacklo_epi32(row_b, row_d);  // b0 d0 b1 d1
  const __m128i bd_hi = _mm_unpackhi_epi32(row_b, row_d);  // b2 d2 b3 d3

  // Second interleave: each result holds one column of the original tile.
  output[0 * stride] = _mm_unpacklo_epi32(ac_lo, bd_lo);  // a0 b0 c0 d0
  output[1 * stride] = _mm_unpackhi_epi32(ac_lo, bd_lo);  // a1 b1 c1 d1
  output[2 * stride] = _mm_unpacklo_epi32(ac_hi, bd_hi);  // a2 b2 c2 d2
  output[3 * stride] = _mm_unpackhi_epi32(ac_hi, bd_hi);  // a3 b3 c3 d3
}
43 
// Transposes a txfm_size x txfm_size matrix of 32-bit values.
//
// The matrix is stored row by row with four values per __m128i, so each row
// occupies txfm_size / 4 vectors. It is handled as a grid of 4x4 tiles, each
// tile being four vertically adjacent __m128i. Every tile is transposed
// internally by transpose_32_4x4() and stored at the mirrored grid position,
// so the in-tile transpose and the grid transpose happen in a single pass.
//
// input and output must be distinct buffers: a tile is stored at its mirrored
// position before the tile originally located there has been read.
static inline void transpose_32(int txfm_size, const __m128i *input,
                                __m128i *output) {
  const int num_per_128 = 4;
  // Vectors per matrix row; also the row stride (in vectors) of both buffers.
  const int col_size = txfm_size / num_per_128;

  for (int r = 0; r < txfm_size; r += num_per_128) {
    const int tile_row = r / num_per_128;
    for (int tile_col = 0; tile_col < col_size; ++tile_col) {
      // Source tile (tile_row, tile_col) lands at (tile_col, tile_row).
      const __m128i *src = &input[r * col_size + tile_col];
      __m128i *dst = &output[tile_col * num_per_128 * col_size + tile_row];
      transpose_32_4x4(col_size, src, dst);
    }
  }
}
63 
// Butterfly on four packed 32-bit lanes:
//   out0 = round_shift(in0 * w0 + in1 * w1, bit)
//   out1 = round_shift(in0 * w1 - in1 * w0, bit)
// w0 and w1 are scalar weights broadcast to every lane with _mm_set1_epi32;
// in0 and in1 are __m128i values; out0 and out1 are assignable __m128i
// expressions. Products are formed with _mm_mullo_epi32 (low 32 bits of each
// lane product) and the results are passed through
// av1_round_shift_32_sse4_1(x, bit).
//
// in0, in1, w0 and w1 are each expanded once, and both inputs are latched
// before either output is assigned, so out0/out1 may name the same variables
// as in0/in1 (in-place use).
#define btf_32_sse4_1_type0(w0, w1, in0, in1, out0, out1, bit)         \
  do {                                                                 \
    const __m128i btf_in0_ = (in0);                                    \
    const __m128i btf_in1_ = (in1);                                    \
    const __m128i btf_w0_ = _mm_set1_epi32(w0);                        \
    const __m128i btf_w1_ = _mm_set1_epi32(w1);                        \
    const __m128i btf_in0_w0_ = _mm_mullo_epi32(btf_in0_, btf_w0_);    \
    const __m128i btf_in1_w1_ = _mm_mullo_epi32(btf_in1_, btf_w1_);    \
    const __m128i btf_in0_w1_ = _mm_mullo_epi32(btf_in0_, btf_w1_);    \
    const __m128i btf_in1_w0_ = _mm_mullo_epi32(btf_in1_, btf_w0_);    \
    (out0) = av1_round_shift_32_sse4_1(                                \
        _mm_add_epi32(btf_in0_w0_, btf_in1_w1_), (bit));               \
    (out1) = av1_round_shift_32_sse4_1(                                \
        _mm_sub_epi32(btf_in0_w1_, btf_in1_w0_), (bit));               \
  } while (0)
79 
// Companion of btf_32_sse4_1_type0 with the sign of the second output
// reversed:
//   out0 = in0 * w0 + in1 * w1
//   out1 = in1 * w0 - in0 * w1
// Implemented by handing type0 both the weights and the inputs in swapped
// order; rounding by `bit` is whatever type0 applies. type0 already expands
// to a single do/while statement, so no extra wrapper is needed here.
#define btf_32_sse4_1_type1(w0, w1, in0, in1, out0, out1, bit) \
  btf_32_sse4_1_type0(w1, w0, in1, in0, out0, out1, bit)
86 
// Butterfly on four packed 32-bit lanes with pre-broadcast weights and an
// explicit rounding term:
//   out0 = (in0 * ww0 + in1 * ww1 + r) >> bit
//   out1 = (in0 * ww1 - in1 * ww0 + r) >> bit
// ww0, ww1, in0, in1 and r are __m128i values; out0 and out1 are assignable
// __m128i expressions. Products use _mm_mullo_epi32 (low 32 bits of each lane
// product) and the shift is arithmetic (_mm_srai_epi32).
// NOTE(review): r is presumably 1 << (bit - 1) in every lane -- confirm with
// callers.
//
// ww0, ww1, in0, in1 and r are each expanded once, and all of them are
// latched before either output is assigned, so out0/out1 may name the same
// variables as in0/in1 (in-place use). bit is forwarded as written.
#define btf_32_type0_sse4_1_new(ww0, ww1, in0, in1, out0, out1, r, bit) \
  do {                                                                  \
    const __m128i btf_in0_ = (in0);                                     \
    const __m128i btf_in1_ = (in1);                                     \
    const __m128i btf_w0_ = (ww0);                                      \
    const __m128i btf_w1_ = (ww1);                                      \
    const __m128i btf_rnd_ = (r);                                       \
    const __m128i btf_sum_ =                                            \
        _mm_add_epi32(_mm_mullo_epi32(btf_in0_, btf_w0_),               \
                      _mm_mullo_epi32(btf_in1_, btf_w1_));              \
    const __m128i btf_diff_ =                                           \
        _mm_sub_epi32(_mm_mullo_epi32(btf_in0_, btf_w1_),               \
                      _mm_mullo_epi32(btf_in1_, btf_w0_));              \
    (out0) = _mm_srai_epi32(_mm_add_epi32(btf_sum_, btf_rnd_), (bit));  \
    (out1) = _mm_srai_epi32(_mm_add_epi32(btf_diff_, btf_rnd_), (bit)); \
  } while (0)
102 
// Companion of btf_32_type0_sse4_1_new with the sign of the second output
// reversed:
//   out0 = in0 * ww0 + in1 * ww1
//   out1 = in1 * ww0 - in0 * ww1
// Implemented by handing type0 both the weights and the inputs in swapped
// order; the rounding term r and the shift by bit are applied by type0.
// type0 already expands to a single do/while statement, so no extra wrapper
// is needed here.
#define btf_32_type1_sse4_1_new(ww0, ww1, in0, in1, out0, out1, r, bit) \
  btf_32_type0_sse4_1_new(ww1, ww0, in1, in0, out0, out1, r, bit)
109 
110 #ifdef __cplusplus
111 }
112 #endif
113 
114 #endif  // AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_
115