/*
 * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>

#include "./vp9_rtcd.h"
#include "vpx_dsp/x86/inv_txfm_sse2.h"

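// SSE2 implementations of the VP9 inverse hybrid transforms (IHT): the
// 4x4, 8x8, and 16x16 inverse transforms that combine DCT and ADST per
// dimension according to tx_type, adding the reconstructed residual to
// the prediction already in dest.
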
void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
                            int tx_type) {
  __m128i in[2];
  const __m128i eight = _mm_set1_epi16(8);

  in[0] = load_input_data8(input);
  in[1] = load_input_data8(input + 8);

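  // The 2-D inverse transform factors into two 1-D passes; the first call
  // in each case handles the rows and the second the columns, with tx_type
  // selecting DCT or ADST for each pass.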
  switch (tx_type) {
    case DCT_DCT:
      idct4_sse2(in);
      idct4_sse2(in);
      break;
    case ADST_DCT:
      idct4_sse2(in);
      iadst4_sse2(in);
      break;
    case DCT_ADST:
      iadst4_sse2(in);
      idct4_sse2(in);
      break;
    default:
      assert(tx_type == ADST_ADST);
      iadst4_sse2(in);
      iadst4_sse2(in);
      break;
  }

  // Final rounding and shift: (x + 8) >> 4
  in[0] = _mm_add_epi16(in[0], eight);
  in[1] = _mm_add_epi16(in[1], eight);

  in[0] = _mm_srai_epi16(in[0], 4);
  in[1] = _mm_srai_epi16(in[1], 4);

  recon_and_store4x4_sse2(in, dest, stride);
}
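
/* For reference, a scalar sketch of the round/shift/reconstruct step above
 * (assuming 8-bit pixels; "residual" and "clip_to_uint8" are illustrative
 * names, not helpers defined in this file):
 *
 *   for (r = 0; r < 4; ++r)
 *     for (c = 0; c < 4; ++c)
 *       dest[r * stride + c] = clip_to_uint8(dest[r * stride + c] +
 *                                            ((residual[r][c] + 8) >> 4));
 */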

void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
                            int tx_type) {
  __m128i in[8];
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);

  // load input data
  in[0] = load_input_data8(input);
  in[1] = load_input_data8(input + 8 * 1);
  in[2] = load_input_data8(input + 8 * 2);
  in[3] = load_input_data8(input + 8 * 3);
  in[4] = load_input_data8(input + 8 * 4);
  in[5] = load_input_data8(input + 8 * 5);
  in[6] = load_input_data8(input + 8 * 6);
  in[7] = load_input_data8(input + 8 * 7);

  switch (tx_type) {
    case DCT_DCT:
      vpx_idct8_sse2(in);
      vpx_idct8_sse2(in);
      break;
    case ADST_DCT:
      vpx_idct8_sse2(in);
      iadst8_sse2(in);
      break;
    case DCT_ADST:
      iadst8_sse2(in);
      vpx_idct8_sse2(in);
      break;
    default:
      assert(tx_type == ADST_ADST);
      iadst8_sse2(in);
      iadst8_sse2(in);
      break;
  }

  // Final rounding and shift: (x + 16) >> 5
  in[0] = _mm_adds_epi16(in[0], final_rounding);
  in[1] = _mm_adds_epi16(in[1], final_rounding);
  in[2] = _mm_adds_epi16(in[2], final_rounding);
  in[3] = _mm_adds_epi16(in[3], final_rounding);
  in[4] = _mm_adds_epi16(in[4], final_rounding);
  in[5] = _mm_adds_epi16(in[5], final_rounding);
  in[6] = _mm_adds_epi16(in[6], final_rounding);
  in[7] = _mm_adds_epi16(in[7], final_rounding);

  in[0] = _mm_srai_epi16(in[0], 5);
  in[1] = _mm_srai_epi16(in[1], 5);
  in[2] = _mm_srai_epi16(in[2], 5);
  in[3] = _mm_srai_epi16(in[3], 5);
  in[4] = _mm_srai_epi16(in[4], 5);
  in[5] = _mm_srai_epi16(in[5], 5);
  in[6] = _mm_srai_epi16(in[6], 5);
  in[7] = _mm_srai_epi16(in[7], 5);

  recon_and_store(dest + 0 * stride, in[0]);
  recon_and_store(dest + 1 * stride, in[1]);
  recon_and_store(dest + 2 * stride, in[2]);
  recon_and_store(dest + 3 * stride, in[3]);
  recon_and_store(dest + 4 * stride, in[4]);
  recon_and_store(dest + 5 * stride, in[5]);
  recon_and_store(dest + 6 * stride, in[6]);
  recon_and_store(dest + 7 * stride, in[7]);
}
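
/* Note: the 8x8 and 16x16 paths bias with _mm_adds_epi16 (a saturating add)
 * rather than the plain _mm_add_epi16 used for 4x4, which guards the wider
 * intermediate range of the larger transforms against 16-bit wraparound
 * before the arithmetic shift.
 */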

// Loads one 8-column half of a 16x16 coefficient block: 8 coefficients
// from each of the 16 rows (row pitch 16).
static INLINE void load_buffer_8x16(const tran_low_t *const input,
                                    __m128i *const in) {
  in[0] = load_input_data8(input + 0 * 16);
  in[1] = load_input_data8(input + 1 * 16);
  in[2] = load_input_data8(input + 2 * 16);
  in[3] = load_input_data8(input + 3 * 16);
  in[4] = load_input_data8(input + 4 * 16);
  in[5] = load_input_data8(input + 5 * 16);
  in[6] = load_input_data8(input + 6 * 16);
  in[7] = load_input_data8(input + 7 * 16);

  in[8] = load_input_data8(input + 8 * 16);
  in[9] = load_input_data8(input + 9 * 16);
  in[10] = load_input_data8(input + 10 * 16);
  in[11] = load_input_data8(input + 11 * 16);
  in[12] = load_input_data8(input + 12 * 16);
  in[13] = load_input_data8(input + 13 * 16);
  in[14] = load_input_data8(input + 14 * 16);
  in[15] = load_input_data8(input + 15 * 16);
}

// Rounds each value, adds it to the prediction already in dest, and stores
// the resulting 8-pixel-wide column of 16 rows.
static INLINE void write_buffer_8x16(uint8_t *const dest, __m128i *const in,
                                     const int stride) {
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
  // Final rounding and shift: (x + 32) >> 6
  in[0] = _mm_adds_epi16(in[0], final_rounding);
  in[1] = _mm_adds_epi16(in[1], final_rounding);
  in[2] = _mm_adds_epi16(in[2], final_rounding);
  in[3] = _mm_adds_epi16(in[3], final_rounding);
  in[4] = _mm_adds_epi16(in[4], final_rounding);
  in[5] = _mm_adds_epi16(in[5], final_rounding);
  in[6] = _mm_adds_epi16(in[6], final_rounding);
  in[7] = _mm_adds_epi16(in[7], final_rounding);
  in[8] = _mm_adds_epi16(in[8], final_rounding);
  in[9] = _mm_adds_epi16(in[9], final_rounding);
  in[10] = _mm_adds_epi16(in[10], final_rounding);
  in[11] = _mm_adds_epi16(in[11], final_rounding);
  in[12] = _mm_adds_epi16(in[12], final_rounding);
  in[13] = _mm_adds_epi16(in[13], final_rounding);
  in[14] = _mm_adds_epi16(in[14], final_rounding);
  in[15] = _mm_adds_epi16(in[15], final_rounding);

  in[0] = _mm_srai_epi16(in[0], 6);
  in[1] = _mm_srai_epi16(in[1], 6);
  in[2] = _mm_srai_epi16(in[2], 6);
  in[3] = _mm_srai_epi16(in[3], 6);
  in[4] = _mm_srai_epi16(in[4], 6);
  in[5] = _mm_srai_epi16(in[5], 6);
  in[6] = _mm_srai_epi16(in[6], 6);
  in[7] = _mm_srai_epi16(in[7], 6);
  in[8] = _mm_srai_epi16(in[8], 6);
  in[9] = _mm_srai_epi16(in[9], 6);
  in[10] = _mm_srai_epi16(in[10], 6);
  in[11] = _mm_srai_epi16(in[11], 6);
  in[12] = _mm_srai_epi16(in[12], 6);
  in[13] = _mm_srai_epi16(in[13], 6);
  in[14] = _mm_srai_epi16(in[14], 6);
  in[15] = _mm_srai_epi16(in[15], 6);

  recon_and_store(dest + 0 * stride, in[0]);
  recon_and_store(dest + 1 * stride, in[1]);
  recon_and_store(dest + 2 * stride, in[2]);
  recon_and_store(dest + 3 * stride, in[3]);
  recon_and_store(dest + 4 * stride, in[4]);
  recon_and_store(dest + 5 * stride, in[5]);
  recon_and_store(dest + 6 * stride, in[6]);
  recon_and_store(dest + 7 * stride, in[7]);
  recon_and_store(dest + 8 * stride, in[8]);
  recon_and_store(dest + 9 * stride, in[9]);
  recon_and_store(dest + 10 * stride, in[10]);
  recon_and_store(dest + 11 * stride, in[11]);
  recon_and_store(dest + 12 * stride, in[12]);
  recon_and_store(dest + 13 * stride, in[13]);
  recon_and_store(dest + 14 * stride, in[14]);
  recon_and_store(dest + 15 * stride, in[15]);
}
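
/* The final shift differs per transform size (4 for 4x4, 5 for 8x8, 6 for
 * 16x16) because the inverse transforms carry size-dependent scaling that
 * must be removed before reconstruction.
 */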

void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
                               int stride, int tx_type) {
  __m128i in0[16], in1[16];

  // Process the 16x16 block as two 8-column halves: in0 holds the left
  // half, in1 the right.
  load_buffer_8x16(input, in0);
  input += 8;
  load_buffer_8x16(input, in1);

  switch (tx_type) {
    case DCT_DCT:
      idct16_sse2(in0, in1);
      idct16_sse2(in0, in1);
      break;
    case ADST_DCT:
      idct16_sse2(in0, in1);
      iadst16_sse2(in0, in1);
      break;
    case DCT_ADST:
      iadst16_sse2(in0, in1);
      idct16_sse2(in0, in1);
      break;
    default:
      assert(tx_type == ADST_ADST);
      iadst16_sse2(in0, in1);
      iadst16_sse2(in0, in1);
      break;
  }

  write_buffer_8x16(dest, in0, stride);
  dest += 8;
  write_buffer_8x16(dest, in1, stride);
}
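
/* These entry points are reached through the run-time CPU dispatch declared
 * in vp9_rtcd.h; a call such as vp9_iht16x16_256_add(input, dest, stride,
 * tx_type) resolves to the *_sse2 version on SSE2-capable machines. (The
 * generic dispatch name shown here follows the usual rtcd convention and is
 * declared in vp9_rtcd.h, not in this file.)
 */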