/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
#include <assert.h>
#include <smmintrin.h> /* SSE4.1 */

#include "config/aom_config.h"
#include "config/av1_rtcd.h"

#include "av1/common/av1_inv_txfm1d_cfg.h"
#include "av1/common/idct.h"
#include "av1/common/x86/av1_inv_txfm_ssse3.h"
#include "av1/common/x86/av1_txfm_sse2.h"
#include "av1/common/x86/av1_txfm_sse4.h"
#include "av1/common/x86/highbd_txfm_utility_sse4.h"

static INLINE __m128i highbd_clamp_epi16(__m128i u, int bd) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i one = _mm_set1_epi16(1);
  const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
  __m128i clamped, mask;

  mask = _mm_cmpgt_epi16(u, max);
  clamped = _mm_andnot_si128(mask, u);
  mask = _mm_and_si128(mask, max);
  clamped = _mm_or_si128(mask, clamped);
  mask = _mm_cmpgt_epi16(clamped, zero);
  clamped = _mm_and_si128(clamped, mask);

  return clamped;
}
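
// Per-lane scalar equivalent of the clamp above (an illustrative sketch, not
// part of the build): each 16-bit lane is limited to the valid pixel range
// [0, (1 << bd) - 1] for bit depth bd.
//   int16_t clamp_pixel(int16_t v, int bd) {
//     const int16_t max = (int16_t)((1 << bd) - 1);
//     return v < 0 ? 0 : (v > max ? max : v);
//   }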

static INLINE __m128i highbd_get_recon_8x8_sse4_1(const __m128i pred,
                                                  __m128i res0, __m128i res1,
                                                  const int bd) {
  __m128i x0 = _mm_cvtepi16_epi32(pred);
  __m128i x1 = _mm_cvtepi16_epi32(_mm_srli_si128(pred, 8));

  x0 = _mm_add_epi32(res0, x0);
  x1 = _mm_add_epi32(res1, x1);
  x0 = _mm_packus_epi32(x0, x1);
  x0 = highbd_clamp_epi16(x0, bd);
  return x0;
}

static INLINE __m128i highbd_get_recon_4xn_sse4_1(const __m128i pred,
                                                  __m128i res0, const int bd) {
  __m128i x0 = _mm_cvtepi16_epi32(pred);

  x0 = _mm_add_epi32(res0, x0);
  x0 = _mm_packus_epi32(x0, x0);
  x0 = highbd_clamp_epi16(x0, bd);
  return x0;
}

static INLINE void highbd_write_buffer_4xn_sse4_1(__m128i *in,
                                                  uint16_t *output, int stride,
                                                  int flipud, int height,
                                                  const int bd) {
  int j = flipud ? (height - 1) : 0;
  const int step = flipud ? -1 : 1;
  for (int i = 0; i < height; ++i, j += step) {
    __m128i v = _mm_loadl_epi64((__m128i const *)(output + i * stride));
    __m128i u = highbd_get_recon_4xn_sse4_1(v, in[j], bd);

    _mm_storel_epi64((__m128i *)(output + i * stride), u);
  }
}

static INLINE void highbd_write_buffer_8xn_sse4_1(__m128i *in,
                                                  uint16_t *output, int stride,
                                                  int flipud, int height,
                                                  const int bd) {
  int j = flipud ? (height - 1) : 0;
  const int step = flipud ? -1 : 1;
  for (int i = 0; i < height; ++i, j += step) {
    __m128i v = _mm_loadu_si128((__m128i const *)(output + i * stride));
    __m128i u = highbd_get_recon_8x8_sse4_1(v, in[j], in[j + height], bd);

    _mm_storeu_si128((__m128i *)(output + i * stride), u);
  }
}

static INLINE void load_buffer_32bit_input(const int32_t *in, int stride,
                                           __m128i *out, int out_size) {
  for (int i = 0; i < out_size; ++i) {
    out[i] = _mm_loadu_si128((const __m128i *)(in + i * stride));
  }
}

static INLINE void load_buffer_4x4(const int32_t *coeff, __m128i *in) {
  in[0] = _mm_load_si128((const __m128i *)(coeff + 0));
  in[1] = _mm_load_si128((const __m128i *)(coeff + 4));
  in[2] = _mm_load_si128((const __m128i *)(coeff + 8));
  in[3] = _mm_load_si128((const __m128i *)(coeff + 12));
}

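// Butterfly helpers: out0 = in0 + in1 and out1 = in0 - in1, either clamped
// to [*clamp_lo, *clamp_hi], left unclamped, or rounded, shifted right by
// 'shift' and then clamped. These are the add/sub pairs used by every
// butterfly stage of the inverse transforms below.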
static void addsub_sse4_1(const __m128i in0, const __m128i in1, __m128i *out0,
                          __m128i *out1, const __m128i *clamp_lo,
                          const __m128i *clamp_hi) {
  __m128i a0 = _mm_add_epi32(in0, in1);
  __m128i a1 = _mm_sub_epi32(in0, in1);

  a0 = _mm_max_epi32(a0, *clamp_lo);
  a0 = _mm_min_epi32(a0, *clamp_hi);
  a1 = _mm_max_epi32(a1, *clamp_lo);
  a1 = _mm_min_epi32(a1, *clamp_hi);

  *out0 = a0;
  *out1 = a1;
}

static void addsub_no_clamp_sse4_1(const __m128i in0, const __m128i in1,
                                   __m128i *out0, __m128i *out1) {
  __m128i a0 = _mm_add_epi32(in0, in1);
  __m128i a1 = _mm_sub_epi32(in0, in1);

  *out0 = a0;
  *out1 = a1;
}

static void addsub_shift_sse4_1(const __m128i in0, const __m128i in1,
                                __m128i *out0, __m128i *out1,
                                const __m128i *clamp_lo,
                                const __m128i *clamp_hi, int shift) {
  __m128i offset = _mm_set1_epi32((1 << shift) >> 1);
  __m128i in0_w_offset = _mm_add_epi32(in0, offset);
  __m128i a0 = _mm_add_epi32(in0_w_offset, in1);
  __m128i a1 = _mm_sub_epi32(in0_w_offset, in1);

  a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift));
  a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift));

  a0 = _mm_max_epi32(a0, *clamp_lo);
  a0 = _mm_min_epi32(a0, *clamp_hi);
  a1 = _mm_max_epi32(a1, *clamp_lo);
  a1 = _mm_min_epi32(a1, *clamp_hi);

  *out0 = a0;
  *out1 = a1;
}

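// Stage helpers for the 32-point inverse DCT. half_btf_sse4_1(w0, n0, w1,
// n1, rounding, bit) (from highbd_txfm_utility_sse4.h) computes the rotation
// (w0 * n0 + w1 * n1 + (1 << (bit - 1))) >> bit, so each temp1/temp2 pair
// below is one in-place butterfly rotation.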
static INLINE void idct32_stage4_sse4_1(
    __m128i *bf1, const __m128i *cospim8, const __m128i *cospi56,
    const __m128i *cospi8, const __m128i *cospim56, const __m128i *cospim40,
    const __m128i *cospi24, const __m128i *cospi40, const __m128i *cospim24,
    const __m128i *rounding, int bit) {
  __m128i temp1, temp2;
  temp1 = half_btf_sse4_1(cospim8, &bf1[17], cospi56, &bf1[30], rounding, bit);
  bf1[30] = half_btf_sse4_1(cospi56, &bf1[17], cospi8, &bf1[30], rounding, bit);
  bf1[17] = temp1;

  temp2 = half_btf_sse4_1(cospim56, &bf1[18], cospim8, &bf1[29], rounding, bit);
  bf1[29] =
      half_btf_sse4_1(cospim8, &bf1[18], cospi56, &bf1[29], rounding, bit);
  bf1[18] = temp2;

  temp1 = half_btf_sse4_1(cospim40, &bf1[21], cospi24, &bf1[26], rounding, bit);
  bf1[26] =
      half_btf_sse4_1(cospi24, &bf1[21], cospi40, &bf1[26], rounding, bit);
  bf1[21] = temp1;

  temp2 =
      half_btf_sse4_1(cospim24, &bf1[22], cospim40, &bf1[25], rounding, bit);
  bf1[25] =
      half_btf_sse4_1(cospim40, &bf1[22], cospi24, &bf1[25], rounding, bit);
  bf1[22] = temp2;
}

static INLINE void idct32_stage5_sse4_1(
    __m128i *bf1, const __m128i *cospim16, const __m128i *cospi48,
    const __m128i *cospi16, const __m128i *cospim48, const __m128i *clamp_lo,
    const __m128i *clamp_hi, const __m128i *rounding, int bit) {
  __m128i temp1, temp2;
  temp1 = half_btf_sse4_1(cospim16, &bf1[9], cospi48, &bf1[14], rounding, bit);
  bf1[14] = half_btf_sse4_1(cospi48, &bf1[9], cospi16, &bf1[14], rounding, bit);
  bf1[9] = temp1;

  temp2 =
      half_btf_sse4_1(cospim48, &bf1[10], cospim16, &bf1[13], rounding, bit);
  bf1[13] =
      half_btf_sse4_1(cospim16, &bf1[10], cospi48, &bf1[13], rounding, bit);
  bf1[10] = temp2;

  addsub_sse4_1(bf1[16], bf1[19], bf1 + 16, bf1 + 19, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[17], bf1[18], bf1 + 17, bf1 + 18, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[23], bf1[20], bf1 + 23, bf1 + 20, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[22], bf1[21], bf1 + 22, bf1 + 21, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[24], bf1[27], bf1 + 24, bf1 + 27, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[25], bf1[26], bf1 + 25, bf1 + 26, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[31], bf1[28], bf1 + 31, bf1 + 28, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[30], bf1[29], bf1 + 30, bf1 + 29, clamp_lo, clamp_hi);
}

static INLINE void idct32_stage6_sse4_1(
    __m128i *bf1, const __m128i *cospim32, const __m128i *cospi32,
    const __m128i *cospim16, const __m128i *cospi48, const __m128i *cospi16,
    const __m128i *cospim48, const __m128i *clamp_lo, const __m128i *clamp_hi,
    const __m128i *rounding, int bit) {
  __m128i temp1, temp2;
  temp1 = half_btf_sse4_1(cospim32, &bf1[5], cospi32, &bf1[6], rounding, bit);
  bf1[6] = half_btf_sse4_1(cospi32, &bf1[5], cospi32, &bf1[6], rounding, bit);
  bf1[5] = temp1;

  addsub_sse4_1(bf1[8], bf1[11], bf1 + 8, bf1 + 11, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[9], bf1[10], bf1 + 9, bf1 + 10, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[15], bf1[12], bf1 + 15, bf1 + 12, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[14], bf1[13], bf1 + 14, bf1 + 13, clamp_lo, clamp_hi);

  temp1 = half_btf_sse4_1(cospim16, &bf1[18], cospi48, &bf1[29], rounding, bit);
  bf1[29] =
      half_btf_sse4_1(cospi48, &bf1[18], cospi16, &bf1[29], rounding, bit);
  bf1[18] = temp1;
  temp2 = half_btf_sse4_1(cospim16, &bf1[19], cospi48, &bf1[28], rounding, bit);
  bf1[28] =
      half_btf_sse4_1(cospi48, &bf1[19], cospi16, &bf1[28], rounding, bit);
  bf1[19] = temp2;
  temp1 =
      half_btf_sse4_1(cospim48, &bf1[20], cospim16, &bf1[27], rounding, bit);
  bf1[27] =
      half_btf_sse4_1(cospim16, &bf1[20], cospi48, &bf1[27], rounding, bit);
  bf1[20] = temp1;
  temp2 =
      half_btf_sse4_1(cospim48, &bf1[21], cospim16, &bf1[26], rounding, bit);
  bf1[26] =
      half_btf_sse4_1(cospim16, &bf1[21], cospi48, &bf1[26], rounding, bit);
  bf1[21] = temp2;
}

static INLINE void idct32_stage7_sse4_1(__m128i *bf1, const __m128i *cospim32,
                                        const __m128i *cospi32,
                                        const __m128i *clamp_lo,
                                        const __m128i *clamp_hi,
                                        const __m128i *rounding, int bit) {
  __m128i temp1, temp2;
  addsub_sse4_1(bf1[0], bf1[7], bf1 + 0, bf1 + 7, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[1], bf1[6], bf1 + 1, bf1 + 6, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[2], bf1[5], bf1 + 2, bf1 + 5, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[3], bf1[4], bf1 + 3, bf1 + 4, clamp_lo, clamp_hi);

  temp1 = half_btf_sse4_1(cospim32, &bf1[10], cospi32, &bf1[13], rounding, bit);
  bf1[13] =
      half_btf_sse4_1(cospi32, &bf1[10], cospi32, &bf1[13], rounding, bit);
  bf1[10] = temp1;
  temp2 = half_btf_sse4_1(cospim32, &bf1[11], cospi32, &bf1[12], rounding, bit);
  bf1[12] =
      half_btf_sse4_1(cospi32, &bf1[11], cospi32, &bf1[12], rounding, bit);
  bf1[11] = temp2;

  addsub_sse4_1(bf1[16], bf1[23], bf1 + 16, bf1 + 23, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[17], bf1[22], bf1 + 17, bf1 + 22, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[18], bf1[21], bf1 + 18, bf1 + 21, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[19], bf1[20], bf1 + 19, bf1 + 20, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[31], bf1[24], bf1 + 31, bf1 + 24, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[30], bf1[25], bf1 + 30, bf1 + 25, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[29], bf1[26], bf1 + 29, bf1 + 26, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[28], bf1[27], bf1 + 28, bf1 + 27, clamp_lo, clamp_hi);
}

static INLINE void idct32_stage8_sse4_1(__m128i *bf1, const __m128i *cospim32,
                                        const __m128i *cospi32,
                                        const __m128i *clamp_lo,
                                        const __m128i *clamp_hi,
                                        const __m128i *rounding, int bit) {
  __m128i temp1, temp2;
  addsub_sse4_1(bf1[0], bf1[15], bf1 + 0, bf1 + 15, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[1], bf1[14], bf1 + 1, bf1 + 14, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[2], bf1[13], bf1 + 2, bf1 + 13, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[3], bf1[12], bf1 + 3, bf1 + 12, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[4], bf1[11], bf1 + 4, bf1 + 11, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[5], bf1[10], bf1 + 5, bf1 + 10, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[6], bf1[9], bf1 + 6, bf1 + 9, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[7], bf1[8], bf1 + 7, bf1 + 8, clamp_lo, clamp_hi);

  temp1 = half_btf_sse4_1(cospim32, &bf1[20], cospi32, &bf1[27], rounding, bit);
  bf1[27] =
      half_btf_sse4_1(cospi32, &bf1[20], cospi32, &bf1[27], rounding, bit);
  bf1[20] = temp1;
  temp2 = half_btf_sse4_1(cospim32, &bf1[21], cospi32, &bf1[26], rounding, bit);
  bf1[26] =
      half_btf_sse4_1(cospi32, &bf1[21], cospi32, &bf1[26], rounding, bit);
  bf1[21] = temp2;
  temp1 = half_btf_sse4_1(cospim32, &bf1[22], cospi32, &bf1[25], rounding, bit);
  bf1[25] =
      half_btf_sse4_1(cospi32, &bf1[22], cospi32, &bf1[25], rounding, bit);
  bf1[22] = temp1;
  temp2 = half_btf_sse4_1(cospim32, &bf1[23], cospi32, &bf1[24], rounding, bit);
  bf1[24] =
      half_btf_sse4_1(cospi32, &bf1[23], cospi32, &bf1[24], rounding, bit);
  bf1[23] = temp2;
}

static INLINE void idct32_stage9_sse4_1(__m128i *bf1, __m128i *out,
                                        const int do_cols, const int bd,
                                        const int out_shift,
                                        const int log_range) {
  if (do_cols) {
    addsub_no_clamp_sse4_1(bf1[0], bf1[31], out + 0, out + 31);
    addsub_no_clamp_sse4_1(bf1[1], bf1[30], out + 1, out + 30);
    addsub_no_clamp_sse4_1(bf1[2], bf1[29], out + 2, out + 29);
    addsub_no_clamp_sse4_1(bf1[3], bf1[28], out + 3, out + 28);
    addsub_no_clamp_sse4_1(bf1[4], bf1[27], out + 4, out + 27);
    addsub_no_clamp_sse4_1(bf1[5], bf1[26], out + 5, out + 26);
    addsub_no_clamp_sse4_1(bf1[6], bf1[25], out + 6, out + 25);
    addsub_no_clamp_sse4_1(bf1[7], bf1[24], out + 7, out + 24);
    addsub_no_clamp_sse4_1(bf1[8], bf1[23], out + 8, out + 23);
    addsub_no_clamp_sse4_1(bf1[9], bf1[22], out + 9, out + 22);
    addsub_no_clamp_sse4_1(bf1[10], bf1[21], out + 10, out + 21);
    addsub_no_clamp_sse4_1(bf1[11], bf1[20], out + 11, out + 20);
    addsub_no_clamp_sse4_1(bf1[12], bf1[19], out + 12, out + 19);
    addsub_no_clamp_sse4_1(bf1[13], bf1[18], out + 13, out + 18);
    addsub_no_clamp_sse4_1(bf1[14], bf1[17], out + 14, out + 17);
    addsub_no_clamp_sse4_1(bf1[15], bf1[16], out + 15, out + 16);
  } else {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
        -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
    const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
        (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));

    addsub_shift_sse4_1(bf1[0], bf1[31], out + 0, out + 31, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[1], bf1[30], out + 1, out + 30, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[2], bf1[29], out + 2, out + 29, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[3], bf1[28], out + 3, out + 28, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[4], bf1[27], out + 4, out + 27, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[5], bf1[26], out + 5, out + 26, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[6], bf1[25], out + 6, out + 25, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[7], bf1[24], out + 7, out + 24, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[8], bf1[23], out + 8, out + 23, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[9], bf1[22], out + 9, out + 22, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[10], bf1[21], out + 10, out + 21, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[11], bf1[20], out + 11, out + 20, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[12], bf1[19], out + 12, out + 19, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[13], bf1[18], out + 13, out + 18, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[14], bf1[17], out + 14, out + 17, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[15], bf1[16], out + 15, out + 16, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
  }
}

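// Rounded shift of an output pair where the second value enters negated:
// out0 = (in0 + offset) >> shift and out1 = (-in1 + offset) >> shift, both
// clamped. Used for the sign-alternating outputs of the inverse ADST.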
static void neg_shift_sse4_1(const __m128i in0, const __m128i in1,
                             __m128i *out0, __m128i *out1,
                             const __m128i *clamp_lo, const __m128i *clamp_hi,
                             int shift) {
  __m128i offset = _mm_set1_epi32((1 << shift) >> 1);
  __m128i a0 = _mm_add_epi32(offset, in0);
  __m128i a1 = _mm_sub_epi32(offset, in1);

  a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift));
  a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift));

  a0 = _mm_max_epi32(a0, *clamp_lo);
  a0 = _mm_min_epi32(a0, *clamp_hi);
  a1 = _mm_max_epi32(a1, *clamp_lo);
  a1 = _mm_min_epi32(a1, *clamp_hi);

  *out0 = a0;
  *out1 = a1;
}

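// 4-point inverse DCT applied to all four columns at once. The two unpack
// stages below form an in-register 4x4 transpose; the butterflies then
// compute
//   v0 = (u0 + u2) * cospi32,          v1 = (u0 - u2) * cospi32,
//   v2 = u1 * cospi48 - u3 * cospi16,  v3 = u1 * cospi16 + u3 * cospi48,
// each rounded by (1 << (bit - 1)) and shifted right by bit, followed by the
// final add/sub stage.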
static void idct4x4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                           int bd, int out_shift) {
  (void)out_shift;
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));

  __m128i u0, u1, u2, u3;
  __m128i v0, v1, v2, v3, x, y;

  v0 = _mm_unpacklo_epi32(in[0], in[1]);
  v1 = _mm_unpackhi_epi32(in[0], in[1]);
  v2 = _mm_unpacklo_epi32(in[2], in[3]);
  v3 = _mm_unpackhi_epi32(in[2], in[3]);

  u0 = _mm_unpacklo_epi64(v0, v2);
  u1 = _mm_unpackhi_epi64(v0, v2);
  u2 = _mm_unpacklo_epi64(v1, v3);
  u3 = _mm_unpackhi_epi64(v1, v3);

  x = _mm_mullo_epi32(u0, cospi32);
  y = _mm_mullo_epi32(u2, cospi32);
  v0 = _mm_add_epi32(x, y);
  v0 = _mm_add_epi32(v0, rnding);
  v0 = _mm_srai_epi32(v0, bit);

  v1 = _mm_sub_epi32(x, y);
  v1 = _mm_add_epi32(v1, rnding);
  v1 = _mm_srai_epi32(v1, bit);

  x = _mm_mullo_epi32(u1, cospi48);
  y = _mm_mullo_epi32(u3, cospim16);
  v2 = _mm_add_epi32(x, y);
  v2 = _mm_add_epi32(v2, rnding);
  v2 = _mm_srai_epi32(v2, bit);

  x = _mm_mullo_epi32(u1, cospi16);
  y = _mm_mullo_epi32(u3, cospi48);
  v3 = _mm_add_epi32(x, y);
  v3 = _mm_add_epi32(v3, rnding);
  v3 = _mm_srai_epi32(v3, bit);

  if (do_cols) {
    addsub_no_clamp_sse4_1(v0, v3, out + 0, out + 3);
    addsub_no_clamp_sse4_1(v1, v2, out + 1, out + 2);
  } else {
    const int log_range = AOMMAX(16, bd + 6);
    const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
    const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
    addsub_sse4_1(v0, v3, out + 0, out + 3, &clamp_lo, &clamp_hi);
    addsub_sse4_1(v1, v2, out + 1, out + 2, &clamp_lo, &clamp_hi);
  }
}

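// 4-point inverse ADST, using the sinpi constants instead of cospi. The
// unpack stages transpose the 4x4 tile; the s* terms then form the av1
// iadst4 combination, e.g. the first output before rounding is
//   u0 = sinpi1 * x0 + sinpi3 * x1 + sinpi4 * x2 + sinpi2 * x3.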
static void iadst4x4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                            int bd, int out_shift) {
  (void)out_shift;
  const int32_t *sinpi = sinpi_arr(bit);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const __m128i sinpi1 = _mm_set1_epi32((int)sinpi[1]);
  const __m128i sinpi2 = _mm_set1_epi32((int)sinpi[2]);
  const __m128i sinpi3 = _mm_set1_epi32((int)sinpi[3]);
  const __m128i sinpi4 = _mm_set1_epi32((int)sinpi[4]);
  __m128i t;
  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
  __m128i x0, x1, x2, x3;
  __m128i u0, u1, u2, u3;
  __m128i v0, v1, v2, v3;

  v0 = _mm_unpacklo_epi32(in[0], in[1]);
  v1 = _mm_unpackhi_epi32(in[0], in[1]);
  v2 = _mm_unpacklo_epi32(in[2], in[3]);
  v3 = _mm_unpackhi_epi32(in[2], in[3]);

  x0 = _mm_unpacklo_epi64(v0, v2);
  x1 = _mm_unpackhi_epi64(v0, v2);
  x2 = _mm_unpacklo_epi64(v1, v3);
  x3 = _mm_unpackhi_epi64(v1, v3);

  s0 = _mm_mullo_epi32(x0, sinpi1);
  s1 = _mm_mullo_epi32(x0, sinpi2);
  s2 = _mm_mullo_epi32(x1, sinpi3);
  s3 = _mm_mullo_epi32(x2, sinpi4);
  s4 = _mm_mullo_epi32(x2, sinpi1);
  s5 = _mm_mullo_epi32(x3, sinpi2);
  s6 = _mm_mullo_epi32(x3, sinpi4);
  t = _mm_sub_epi32(x0, x2);
  s7 = _mm_add_epi32(t, x3);

  t = _mm_add_epi32(s0, s3);
  s0 = _mm_add_epi32(t, s5);
  t = _mm_sub_epi32(s1, s4);
  s1 = _mm_sub_epi32(t, s6);
  s3 = s2;
  s2 = _mm_mullo_epi32(s7, sinpi3);

  u0 = _mm_add_epi32(s0, s3);
  u1 = _mm_add_epi32(s1, s3);
  u2 = s2;
  t = _mm_add_epi32(s0, s1);
  u3 = _mm_sub_epi32(t, s3);

  u0 = _mm_add_epi32(u0, rnding);
  u0 = _mm_srai_epi32(u0, bit);

  u1 = _mm_add_epi32(u1, rnding);
  u1 = _mm_srai_epi32(u1, bit);

  u2 = _mm_add_epi32(u2, rnding);
  u2 = _mm_srai_epi32(u2, bit);

  u3 = _mm_add_epi32(u3, rnding);
  u3 = _mm_srai_epi32(u3, bit);

  if (!do_cols) {
    const int log_range = AOMMAX(16, bd + 6);
    const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
    const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);

    u0 = _mm_max_epi32(u0, clamp_lo);
    u0 = _mm_min_epi32(u0, clamp_hi);
    u1 = _mm_max_epi32(u1, clamp_lo);
    u1 = _mm_min_epi32(u1, clamp_hi);
    u2 = _mm_max_epi32(u2, clamp_lo);
    u2 = _mm_min_epi32(u2, clamp_hi);
    u3 = _mm_max_epi32(u3, clamp_lo);
    u3 = _mm_min_epi32(u3, clamp_hi);
  }

  out[0] = u0;
  out[1] = u1;
  out[2] = u2;
  out[3] = u3;
}

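// Rounding right shift of four registers: x = (x + (1 << (shift - 1))) >>
// shift.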
static INLINE void round_shift_4x4(__m128i *in, int shift) {
  __m128i rnding = _mm_set1_epi32(1 << (shift - 1));

  in[0] = _mm_add_epi32(in[0], rnding);
  in[1] = _mm_add_epi32(in[1], rnding);
  in[2] = _mm_add_epi32(in[2], rnding);
  in[3] = _mm_add_epi32(in[3], rnding);

  in[0] = _mm_srai_epi32(in[0], shift);
  in[1] = _mm_srai_epi32(in[1], shift);
  in[2] = _mm_srai_epi32(in[2], shift);
  in[3] = _mm_srai_epi32(in[3], shift);
}

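// Final reconstruction of a 4x4 block: round-shift the residue, add the
// 16-bit prediction (widened to 32 bits), pack with unsigned saturation,
// clamp to the bit depth and store 4 pixels per row. fliplr reverses each
// row with a 0x1B (3, 2, 1, 0) shuffle; flipud reads the rows backwards.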
static void write_buffer_4x4(__m128i *in, uint16_t *output, int stride,
                             int fliplr, int flipud, int shift, int bd) {
  const __m128i zero = _mm_setzero_si128();
  __m128i u0, u1, u2, u3;
  __m128i v0, v1, v2, v3;

  round_shift_4x4(in, shift);

  v0 = _mm_loadl_epi64((__m128i const *)(output + 0 * stride));
  v1 = _mm_loadl_epi64((__m128i const *)(output + 1 * stride));
  v2 = _mm_loadl_epi64((__m128i const *)(output + 2 * stride));
  v3 = _mm_loadl_epi64((__m128i const *)(output + 3 * stride));

  v0 = _mm_unpacklo_epi16(v0, zero);
  v1 = _mm_unpacklo_epi16(v1, zero);
  v2 = _mm_unpacklo_epi16(v2, zero);
  v3 = _mm_unpacklo_epi16(v3, zero);

  if (fliplr) {
    in[0] = _mm_shuffle_epi32(in[0], 0x1B);
    in[1] = _mm_shuffle_epi32(in[1], 0x1B);
    in[2] = _mm_shuffle_epi32(in[2], 0x1B);
    in[3] = _mm_shuffle_epi32(in[3], 0x1B);
  }

  if (flipud) {
    u0 = _mm_add_epi32(in[3], v0);
    u1 = _mm_add_epi32(in[2], v1);
    u2 = _mm_add_epi32(in[1], v2);
    u3 = _mm_add_epi32(in[0], v3);
  } else {
    u0 = _mm_add_epi32(in[0], v0);
    u1 = _mm_add_epi32(in[1], v1);
    u2 = _mm_add_epi32(in[2], v2);
    u3 = _mm_add_epi32(in[3], v3);
  }

  v0 = _mm_packus_epi32(u0, u1);
  v2 = _mm_packus_epi32(u2, u3);

  u0 = highbd_clamp_epi16(v0, bd);
  u2 = highbd_clamp_epi16(v2, bd);

  v0 = _mm_unpacklo_epi64(u0, u0);
  v1 = _mm_unpackhi_epi64(u0, u0);
  v2 = _mm_unpacklo_epi64(u2, u2);
  v3 = _mm_unpackhi_epi64(u2, u2);

  _mm_storel_epi64((__m128i *)(output + 0 * stride), v0);
  _mm_storel_epi64((__m128i *)(output + 1 * stride), v1);
  _mm_storel_epi64((__m128i *)(output + 2 * stride), v2);
  _mm_storel_epi64((__m128i *)(output + 3 * stride), v3);
}
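
// Clamp 'size' registers of 32-bit lanes to [*clamp_lo, *clamp_hi], four
// registers per iteration ('size' must be a multiple of 4).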
static void highbd_clamp_epi32_sse4_1(const __m128i *in, __m128i *out,
                                      const __m128i *clamp_lo,
                                      const __m128i *clamp_hi, int size) {
  __m128i a0, a1;
  for (int i = 0; i < size; i += 4) {
    a0 = _mm_max_epi32(in[i], *clamp_lo);
    out[i] = _mm_min_epi32(a0, *clamp_hi);

    a1 = _mm_max_epi32(in[i + 1], *clamp_lo);
    out[i + 1] = _mm_min_epi32(a1, *clamp_hi);

    a0 = _mm_max_epi32(in[i + 2], *clamp_lo);
    out[i + 2] = _mm_min_epi32(a0, *clamp_hi);

    a1 = _mm_max_epi32(in[i + 3], *clamp_lo);
    out[i + 3] = _mm_min_epi32(a1, *clamp_hi);
  }
}
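
// 4x4 identity "transform": scale by sqrt(2) in Q(NewSqrt2Bits) fixed point,
// x = (x * NewSqrt2 + (1 << (NewSqrt2Bits - 1))) >> NewSqrt2Bits, then
// transpose so the output layout matches the other 4x4 kernels.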
static void iidentity4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                              int bd, int out_shift) {
  (void)bit;
  (void)out_shift;
  __m128i v[4];
  __m128i fact = _mm_set1_epi32(NewSqrt2);
  __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1));
  __m128i a0, a1;

  a0 = _mm_mullo_epi32(in[0], fact);
  a1 = _mm_mullo_epi32(in[1], fact);
  a0 = _mm_add_epi32(a0, offset);
  a1 = _mm_add_epi32(a1, offset);
  out[0] = _mm_srai_epi32(a0, NewSqrt2Bits);
  out[1] = _mm_srai_epi32(a1, NewSqrt2Bits);

  a0 = _mm_mullo_epi32(in[2], fact);
  a1 = _mm_mullo_epi32(in[3], fact);
  a0 = _mm_add_epi32(a0, offset);
  a1 = _mm_add_epi32(a1, offset);
  out[2] = _mm_srai_epi32(a0, NewSqrt2Bits);
  out[3] = _mm_srai_epi32(a1, NewSqrt2Bits);

  if (!do_cols) {
    const int log_range = AOMMAX(16, bd + 6);
    const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
    const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);

    highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 4);
  }

  // Transpose for 4x4
  v[0] = _mm_unpacklo_epi32(out[0], out[1]);
  v[1] = _mm_unpackhi_epi32(out[0], out[1]);
  v[2] = _mm_unpacklo_epi32(out[2], out[3]);
  v[3] = _mm_unpackhi_epi32(out[2], out[3]);

  out[0] = _mm_unpacklo_epi64(v[0], v[2]);
  out[1] = _mm_unpackhi_epi64(v[0], v[2]);
  out[2] = _mm_unpacklo_epi64(v[1], v[3]);
  out[3] = _mm_unpackhi_epi64(v[1], v[3]);
}

void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *coeff, uint16_t *output,
                                   int stride, TX_TYPE tx_type, int bd) {
  __m128i in[4];
  const int8_t *shift = inv_txfm_shift_ls[TX_4X4];
  const int txw_idx = get_txw_idx(TX_4X4);
  const int txh_idx = get_txh_idx(TX_4X4);

  switch (tx_type) {
    case DCT_DCT:
      load_buffer_4x4(coeff, in);
      idct4x4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
      idct4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case ADST_DCT:
      load_buffer_4x4(coeff, in);
      idct4x4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
      iadst4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case DCT_ADST:
      load_buffer_4x4(coeff, in);
      iadst4x4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
      idct4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case ADST_ADST:
      load_buffer_4x4(coeff, in);
      iadst4x4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
      iadst4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case FLIPADST_DCT:
      load_buffer_4x4(coeff, in);
      idct4x4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
      iadst4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
      break;
    case DCT_FLIPADST:
      load_buffer_4x4(coeff, in);
      iadst4x4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
      idct4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
      break;
    case FLIPADST_FLIPADST:
      load_buffer_4x4(coeff, in);
      iadst4x4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
      iadst4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_4x4(in, output, stride, 1, 1, -shift[1], bd);
      break;
    case ADST_FLIPADST:
      load_buffer_4x4(coeff, in);
      iadst4x4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
      iadst4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
      break;
    case FLIPADST_ADST:
      load_buffer_4x4(coeff, in);
      iadst4x4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
      iadst4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
      break;
    case IDTX:
      load_buffer_4x4(coeff, in);
      iidentity4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
      iidentity4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case V_DCT:
      load_buffer_4x4(coeff, in);
      iidentity4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
      idct4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case H_DCT:
      load_buffer_4x4(coeff, in);
      idct4x4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
      iidentity4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case V_ADST:
      load_buffer_4x4(coeff, in);
      iidentity4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
      iadst4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case H_ADST:
      load_buffer_4x4(coeff, in);
      iadst4x4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
      iidentity4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case V_FLIPADST:
      load_buffer_4x4(coeff, in);
      iidentity4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
      iadst4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
      break;
    case H_FLIPADST:
      load_buffer_4x4(coeff, in);
      iadst4x4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
      iidentity4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
      break;
    default: assert(0);
  }
}

// 8x8
static void load_buffer_8x8(const int32_t *coeff, __m128i *in) {
  in[0] = _mm_load_si128((const __m128i *)(coeff + 0));
  in[1] = _mm_load_si128((const __m128i *)(coeff + 4));
  in[2] = _mm_load_si128((const __m128i *)(coeff + 8));
  in[3] = _mm_load_si128((const __m128i *)(coeff + 12));
  in[4] = _mm_load_si128((const __m128i *)(coeff + 16));
  in[5] = _mm_load_si128((const __m128i *)(coeff + 20));
  in[6] = _mm_load_si128((const __m128i *)(coeff + 24));
  in[7] = _mm_load_si128((const __m128i *)(coeff + 28));
  in[8] = _mm_load_si128((const __m128i *)(coeff + 32));
  in[9] = _mm_load_si128((const __m128i *)(coeff + 36));
  in[10] = _mm_load_si128((const __m128i *)(coeff + 40));
  in[11] = _mm_load_si128((const __m128i *)(coeff + 44));
  in[12] = _mm_load_si128((const __m128i *)(coeff + 48));
  in[13] = _mm_load_si128((const __m128i *)(coeff + 52));
  in[14] = _mm_load_si128((const __m128i *)(coeff + 56));
  in[15] = _mm_load_si128((const __m128i *)(coeff + 60));
}

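// 8-point inverse DCT over an 8x8 tile, processing two interleaved 4-wide
// column groups (see the layout note inside). Stage 2 rotates the odd
// inputs, stage 3 rotates the even ones, and stages 4-5 finish the
// butterflies, with the row pass additionally round-shifted by out_shift.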
static void idct8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                           int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
  __m128i x, y;
  int col;

  // Note:
  // Even columns: 0, 2, ..., 14
  // Odd columns: 1, 3, ..., 15
  // One even column plus one odd column constructs one row (8 coeffs),
  // so in total we have 8 rows (8x8).
  for (col = 0; col < 2; ++col) {
    // stage 0
    // stage 1
    // stage 2
    u0 = in[0 * 2 + col];
    u1 = in[4 * 2 + col];
    u2 = in[2 * 2 + col];
    u3 = in[6 * 2 + col];

    x = _mm_mullo_epi32(in[1 * 2 + col], cospi56);
    y = _mm_mullo_epi32(in[7 * 2 + col], cospim8);
    u4 = _mm_add_epi32(x, y);
    u4 = _mm_add_epi32(u4, rnding);
    u4 = _mm_srai_epi32(u4, bit);

    x = _mm_mullo_epi32(in[1 * 2 + col], cospi8);
    y = _mm_mullo_epi32(in[7 * 2 + col], cospi56);
    u7 = _mm_add_epi32(x, y);
    u7 = _mm_add_epi32(u7, rnding);
    u7 = _mm_srai_epi32(u7, bit);

    x = _mm_mullo_epi32(in[5 * 2 + col], cospi24);
    y = _mm_mullo_epi32(in[3 * 2 + col], cospim40);
    u5 = _mm_add_epi32(x, y);
    u5 = _mm_add_epi32(u5, rnding);
    u5 = _mm_srai_epi32(u5, bit);

    x = _mm_mullo_epi32(in[5 * 2 + col], cospi40);
    y = _mm_mullo_epi32(in[3 * 2 + col], cospi24);
    u6 = _mm_add_epi32(x, y);
    u6 = _mm_add_epi32(u6, rnding);
    u6 = _mm_srai_epi32(u6, bit);

    // stage 3
    x = _mm_mullo_epi32(u0, cospi32);
    y = _mm_mullo_epi32(u1, cospi32);
    v0 = _mm_add_epi32(x, y);
    v0 = _mm_add_epi32(v0, rnding);
    v0 = _mm_srai_epi32(v0, bit);

    v1 = _mm_sub_epi32(x, y);
    v1 = _mm_add_epi32(v1, rnding);
    v1 = _mm_srai_epi32(v1, bit);

    x = _mm_mullo_epi32(u2, cospi48);
    y = _mm_mullo_epi32(u3, cospim16);
    v2 = _mm_add_epi32(x, y);
    v2 = _mm_add_epi32(v2, rnding);
    v2 = _mm_srai_epi32(v2, bit);

    x = _mm_mullo_epi32(u2, cospi16);
    y = _mm_mullo_epi32(u3, cospi48);
    v3 = _mm_add_epi32(x, y);
    v3 = _mm_add_epi32(v3, rnding);
    v3 = _mm_srai_epi32(v3, bit);

    addsub_sse4_1(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi);
    addsub_sse4_1(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi);

    // stage 4
    addsub_sse4_1(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi);
    addsub_sse4_1(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi);
    u4 = v4;
    u7 = v7;

    x = _mm_mullo_epi32(v5, cospi32);
    y = _mm_mullo_epi32(v6, cospi32);
    u6 = _mm_add_epi32(y, x);
    u6 = _mm_add_epi32(u6, rnding);
    u6 = _mm_srai_epi32(u6, bit);

    u5 = _mm_sub_epi32(y, x);
    u5 = _mm_add_epi32(u5, rnding);
    u5 = _mm_srai_epi32(u5, bit);

    // stage 5
    if (do_cols) {
      addsub_no_clamp_sse4_1(u0, u7, out + 0 * 2 + col, out + 7 * 2 + col);
      addsub_no_clamp_sse4_1(u1, u6, out + 1 * 2 + col, out + 6 * 2 + col);
      addsub_no_clamp_sse4_1(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col);
      addsub_no_clamp_sse4_1(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col);
    } else {
      const int log_range_out = AOMMAX(16, bd + 6);
      const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
          -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
      const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
          (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
      addsub_shift_sse4_1(u0, u7, out + 0 * 2 + col, out + 7 * 2 + col,
                          &clamp_lo_out, &clamp_hi_out, out_shift);
      addsub_shift_sse4_1(u1, u6, out + 1 * 2 + col, out + 6 * 2 + col,
                          &clamp_lo_out, &clamp_hi_out, out_shift);
      addsub_shift_sse4_1(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col,
                          &clamp_lo_out, &clamp_hi_out, out_shift);
      addsub_shift_sse4_1(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col,
                          &clamp_lo_out, &clamp_hi_out, out_shift);
    }
  }
}

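// 8-point inverse ADST, run twice: once over the even-index registers
// (columns 0-3 of the tile) and once over the odd-index ones (columns 4-7).
// The last stage alternates output signs, hence the kZero subtractions in
// the column pass and the neg_shift_sse4_1 calls in the row pass.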
static void iadst8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                            int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
  const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
  const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const __m128i kZero = _mm_setzero_si128();
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i u[8], v[8], x;

  // Even 8 points: 0, 2, ..., 14
  // stage 0
  // stage 1
  // stage 2
  // (1)
  u[0] = _mm_mullo_epi32(in[14], cospi4);
  x = _mm_mullo_epi32(in[0], cospi60);
  u[0] = _mm_add_epi32(u[0], x);
  u[0] = _mm_add_epi32(u[0], rnding);
  u[0] = _mm_srai_epi32(u[0], bit);

  u[1] = _mm_mullo_epi32(in[14], cospi60);
  x = _mm_mullo_epi32(in[0], cospi4);
  u[1] = _mm_sub_epi32(u[1], x);
  u[1] = _mm_add_epi32(u[1], rnding);
  u[1] = _mm_srai_epi32(u[1], bit);

  // (2)
  u[2] = _mm_mullo_epi32(in[10], cospi20);
  x = _mm_mullo_epi32(in[4], cospi44);
  u[2] = _mm_add_epi32(u[2], x);
  u[2] = _mm_add_epi32(u[2], rnding);
  u[2] = _mm_srai_epi32(u[2], bit);

  u[3] = _mm_mullo_epi32(in[10], cospi44);
  x = _mm_mullo_epi32(in[4], cospi20);
  u[3] = _mm_sub_epi32(u[3], x);
  u[3] = _mm_add_epi32(u[3], rnding);
  u[3] = _mm_srai_epi32(u[3], bit);

  // (3)
  u[4] = _mm_mullo_epi32(in[6], cospi36);
  x = _mm_mullo_epi32(in[8], cospi28);
  u[4] = _mm_add_epi32(u[4], x);
  u[4] = _mm_add_epi32(u[4], rnding);
  u[4] = _mm_srai_epi32(u[4], bit);

  u[5] = _mm_mullo_epi32(in[6], cospi28);
  x = _mm_mullo_epi32(in[8], cospi36);
  u[5] = _mm_sub_epi32(u[5], x);
  u[5] = _mm_add_epi32(u[5], rnding);
  u[5] = _mm_srai_epi32(u[5], bit);

  // (4)
  u[6] = _mm_mullo_epi32(in[2], cospi52);
  x = _mm_mullo_epi32(in[12], cospi12);
  u[6] = _mm_add_epi32(u[6], x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_mullo_epi32(in[2], cospi12);
  x = _mm_mullo_epi32(in[12], cospi52);
  u[7] = _mm_sub_epi32(u[7], x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  // stage 3
  addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);

  // stage 4
  u[0] = v[0];
  u[1] = v[1];
  u[2] = v[2];
  u[3] = v[3];

  u[4] = _mm_mullo_epi32(v[4], cospi16);
  x = _mm_mullo_epi32(v[5], cospi48);
  u[4] = _mm_add_epi32(u[4], x);
  u[4] = _mm_add_epi32(u[4], rnding);
  u[4] = _mm_srai_epi32(u[4], bit);

  u[5] = _mm_mullo_epi32(v[4], cospi48);
  x = _mm_mullo_epi32(v[5], cospi16);
  u[5] = _mm_sub_epi32(u[5], x);
  u[5] = _mm_add_epi32(u[5], rnding);
  u[5] = _mm_srai_epi32(u[5], bit);

  u[6] = _mm_mullo_epi32(v[6], cospim48);
  x = _mm_mullo_epi32(v[7], cospi16);
  u[6] = _mm_add_epi32(u[6], x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_mullo_epi32(v[6], cospi16);
  x = _mm_mullo_epi32(v[7], cospim48);
  u[7] = _mm_sub_epi32(u[7], x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  // stage 5
  addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);

  // stage 6
  u[0] = v[0];
  u[1] = v[1];
  u[4] = v[4];
  u[5] = v[5];

  v[0] = _mm_mullo_epi32(v[2], cospi32);
  x = _mm_mullo_epi32(v[3], cospi32);
  u[2] = _mm_add_epi32(v[0], x);
  u[2] = _mm_add_epi32(u[2], rnding);
  u[2] = _mm_srai_epi32(u[2], bit);

  u[3] = _mm_sub_epi32(v[0], x);
  u[3] = _mm_add_epi32(u[3], rnding);
  u[3] = _mm_srai_epi32(u[3], bit);

  v[0] = _mm_mullo_epi32(v[6], cospi32);
  x = _mm_mullo_epi32(v[7], cospi32);
  u[6] = _mm_add_epi32(v[0], x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_sub_epi32(v[0], x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  // stage 7
  if (do_cols) {
    out[0] = u[0];
    out[2] = _mm_sub_epi32(kZero, u[4]);
    out[4] = u[6];
    out[6] = _mm_sub_epi32(kZero, u[2]);
    out[8] = u[3];
    out[10] = _mm_sub_epi32(kZero, u[7]);
    out[12] = u[5];
    out[14] = _mm_sub_epi32(kZero, u[1]);
  } else {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);

    neg_shift_sse4_1(u[0], u[4], out + 0, out + 2, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
    neg_shift_sse4_1(u[6], u[2], out + 4, out + 6, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
    neg_shift_sse4_1(u[3], u[7], out + 8, out + 10, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(u[5], u[1], out + 12, out + 14, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
  }

  // Odd 8 points: 1, 3, ..., 15
  // stage 0
  // stage 1
  // stage 2
  // (1)
  u[0] = _mm_mullo_epi32(in[15], cospi4);
  x = _mm_mullo_epi32(in[1], cospi60);
  u[0] = _mm_add_epi32(u[0], x);
  u[0] = _mm_add_epi32(u[0], rnding);
  u[0] = _mm_srai_epi32(u[0], bit);

  u[1] = _mm_mullo_epi32(in[15], cospi60);
  x = _mm_mullo_epi32(in[1], cospi4);
  u[1] = _mm_sub_epi32(u[1], x);
  u[1] = _mm_add_epi32(u[1], rnding);
  u[1] = _mm_srai_epi32(u[1], bit);

  // (2)
  u[2] = _mm_mullo_epi32(in[11], cospi20);
  x = _mm_mullo_epi32(in[5], cospi44);
  u[2] = _mm_add_epi32(u[2], x);
  u[2] = _mm_add_epi32(u[2], rnding);
  u[2] = _mm_srai_epi32(u[2], bit);

  u[3] = _mm_mullo_epi32(in[11], cospi44);
  x = _mm_mullo_epi32(in[5], cospi20);
  u[3] = _mm_sub_epi32(u[3], x);
  u[3] = _mm_add_epi32(u[3], rnding);
  u[3] = _mm_srai_epi32(u[3], bit);

  // (3)
  u[4] = _mm_mullo_epi32(in[7], cospi36);
  x = _mm_mullo_epi32(in[9], cospi28);
  u[4] = _mm_add_epi32(u[4], x);
  u[4] = _mm_add_epi32(u[4], rnding);
  u[4] = _mm_srai_epi32(u[4], bit);

  u[5] = _mm_mullo_epi32(in[7], cospi28);
  x = _mm_mullo_epi32(in[9], cospi36);
  u[5] = _mm_sub_epi32(u[5], x);
  u[5] = _mm_add_epi32(u[5], rnding);
  u[5] = _mm_srai_epi32(u[5], bit);

  // (4)
  u[6] = _mm_mullo_epi32(in[3], cospi52);
  x = _mm_mullo_epi32(in[13], cospi12);
  u[6] = _mm_add_epi32(u[6], x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_mullo_epi32(in[3], cospi12);
  x = _mm_mullo_epi32(in[13], cospi52);
  u[7] = _mm_sub_epi32(u[7], x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  // stage 3
  addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);

  // stage 4
  u[0] = v[0];
  u[1] = v[1];
  u[2] = v[2];
  u[3] = v[3];

  u[4] = _mm_mullo_epi32(v[4], cospi16);
  x = _mm_mullo_epi32(v[5], cospi48);
  u[4] = _mm_add_epi32(u[4], x);
  u[4] = _mm_add_epi32(u[4], rnding);
  u[4] = _mm_srai_epi32(u[4], bit);

  u[5] = _mm_mullo_epi32(v[4], cospi48);
  x = _mm_mullo_epi32(v[5], cospi16);
  u[5] = _mm_sub_epi32(u[5], x);
  u[5] = _mm_add_epi32(u[5], rnding);
  u[5] = _mm_srai_epi32(u[5], bit);

  u[6] = _mm_mullo_epi32(v[6], cospim48);
  x = _mm_mullo_epi32(v[7], cospi16);
  u[6] = _mm_add_epi32(u[6], x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_mullo_epi32(v[6], cospi16);
  x = _mm_mullo_epi32(v[7], cospim48);
  u[7] = _mm_sub_epi32(u[7], x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  // stage 5
  addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);

  // stage 6
  u[0] = v[0];
  u[1] = v[1];
  u[4] = v[4];
  u[5] = v[5];

  v[0] = _mm_mullo_epi32(v[2], cospi32);
  x = _mm_mullo_epi32(v[3], cospi32);
  u[2] = _mm_add_epi32(v[0], x);
  u[2] = _mm_add_epi32(u[2], rnding);
  u[2] = _mm_srai_epi32(u[2], bit);

  u[3] = _mm_sub_epi32(v[0], x);
  u[3] = _mm_add_epi32(u[3], rnding);
  u[3] = _mm_srai_epi32(u[3], bit);

  v[0] = _mm_mullo_epi32(v[6], cospi32);
  x = _mm_mullo_epi32(v[7], cospi32);
  u[6] = _mm_add_epi32(v[0], x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_sub_epi32(v[0], x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  // stage 7
  if (do_cols) {
    out[1] = u[0];
    out[3] = _mm_sub_epi32(kZero, u[4]);
    out[5] = u[6];
    out[7] = _mm_sub_epi32(kZero, u[2]);
    out[9] = u[3];
    out[11] = _mm_sub_epi32(kZero, u[7]);
    out[13] = u[5];
    out[15] = _mm_sub_epi32(kZero, u[1]);
  } else {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);

    neg_shift_sse4_1(u[0], u[4], out + 1, out + 3, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
    neg_shift_sse4_1(u[6], u[2], out + 5, out + 7, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
    neg_shift_sse4_1(u[3], u[7], out + 9, out + 11, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(u[5], u[1], out + 13, out + 15, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
  }
}
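
// Batched rounding shift: round, shift right by 'shift' and clamp 'size'
// registers, four per iteration.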
static void shift_sse4_1(const __m128i *in, __m128i *out,
                         const __m128i *clamp_lo, const __m128i *clamp_hi,
                         int shift, int size) {
  __m128i offset = _mm_set1_epi32((1 << shift) >> 1);
  __m128i shift_vec = _mm_cvtsi32_si128(shift);
  __m128i a0, a1;
  for (int i = 0; i < size; i += 4) {
    a0 = _mm_add_epi32(in[i], offset);
    a1 = _mm_add_epi32(in[i + 1], offset);
    a0 = _mm_sra_epi32(a0, shift_vec);
    a1 = _mm_sra_epi32(a1, shift_vec);
    a0 = _mm_max_epi32(a0, *clamp_lo);
    a1 = _mm_max_epi32(a1, *clamp_lo);
    out[i] = _mm_min_epi32(a0, *clamp_hi);
    out[i + 1] = _mm_min_epi32(a1, *clamp_hi);

    a0 = _mm_add_epi32(in[i + 2], offset);
    a1 = _mm_add_epi32(in[i + 3], offset);
    a0 = _mm_sra_epi32(a0, shift_vec);
    a1 = _mm_sra_epi32(a1, shift_vec);
    a0 = _mm_max_epi32(a0, *clamp_lo);
    a1 = _mm_max_epi32(a1, *clamp_lo);
    out[i + 2] = _mm_min_epi32(a0, *clamp_hi);
    out[i + 3] = _mm_min_epi32(a1, *clamp_hi);
  }
}

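// 8x8 identity transform: the scale factor is exactly 2, so each register
// is doubled before the usual shift (row pass) or clamp (column pass).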
static void iidentity8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                              int bd, int out_shift) {
  (void)bit;
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i v[8];
  v[0] = _mm_add_epi32(in[0], in[0]);
  v[1] = _mm_add_epi32(in[1], in[1]);
  v[2] = _mm_add_epi32(in[2], in[2]);
  v[3] = _mm_add_epi32(in[3], in[3]);
  v[4] = _mm_add_epi32(in[4], in[4]);
  v[5] = _mm_add_epi32(in[5], in[5]);
  v[6] = _mm_add_epi32(in[6], in[6]);
  v[7] = _mm_add_epi32(in[7], in[7]);

  if (!do_cols) {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
        -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
    const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
        (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));

    shift_sse4_1(v, out, &clamp_lo_out, &clamp_hi_out, out_shift, 8);
  } else {
    highbd_clamp_epi32_sse4_1(v, out, &clamp_lo, &clamp_hi, 8);
  }
}

static void round_shift_8x8(__m128i *in, int shift) {
  round_shift_4x4(&in[0], shift);
  round_shift_4x4(&in[4], shift);
  round_shift_4x4(&in[8], shift);
  round_shift_4x4(&in[12], shift);
}

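// Add one 8-wide row of 32-bit residue (res_lo holds lanes 0-3, res_hi
// lanes 4-7) to the 16-bit prediction and clamp to the bit depth. fliplr
// reverses the full row by 0x1B-shuffling each half and swapping the two.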
static __m128i get_recon_8x8(const __m128i pred, __m128i res_lo,
                             __m128i res_hi, int fliplr, int bd) {
  __m128i x0, x1;
  const __m128i zero = _mm_setzero_si128();

  x0 = _mm_unpacklo_epi16(pred, zero);
  x1 = _mm_unpackhi_epi16(pred, zero);

  if (fliplr) {
    res_lo = _mm_shuffle_epi32(res_lo, 0x1B);
    res_hi = _mm_shuffle_epi32(res_hi, 0x1B);
    x0 = _mm_add_epi32(res_hi, x0);
    x1 = _mm_add_epi32(res_lo, x1);
  } else {
    x0 = _mm_add_epi32(res_lo, x0);
    x1 = _mm_add_epi32(res_hi, x1);
  }

  x0 = _mm_packus_epi32(x0, x1);
  return highbd_clamp_epi16(x0, bd);
}

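// Final reconstruction of an 8x8 block: each row r uses the register pair
// in[2 * r] / in[2 * r + 1], added to the prediction with get_recon_8x8 and
// stored as 8 pixels; flipud consumes the register pairs in reverse order.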
static void write_buffer_8x8(__m128i *in, uint16_t *output, int stride,
                             int fliplr, int flipud, int shift, int bd) {
  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7;

  round_shift_8x8(in, shift);

  v0 = _mm_load_si128((__m128i const *)(output + 0 * stride));
  v1 = _mm_load_si128((__m128i const *)(output + 1 * stride));
  v2 = _mm_load_si128((__m128i const *)(output + 2 * stride));
  v3 = _mm_load_si128((__m128i const *)(output + 3 * stride));
  v4 = _mm_load_si128((__m128i const *)(output + 4 * stride));
  v5 = _mm_load_si128((__m128i const *)(output + 5 * stride));
  v6 = _mm_load_si128((__m128i const *)(output + 6 * stride));
  v7 = _mm_load_si128((__m128i const *)(output + 7 * stride));

  if (flipud) {
    u0 = get_recon_8x8(v0, in[14], in[15], fliplr, bd);
    u1 = get_recon_8x8(v1, in[12], in[13], fliplr, bd);
    u2 = get_recon_8x8(v2, in[10], in[11], fliplr, bd);
    u3 = get_recon_8x8(v3, in[8], in[9], fliplr, bd);
    u4 = get_recon_8x8(v4, in[6], in[7], fliplr, bd);
    u5 = get_recon_8x8(v5, in[4], in[5], fliplr, bd);
    u6 = get_recon_8x8(v6, in[2], in[3], fliplr, bd);
    u7 = get_recon_8x8(v7, in[0], in[1], fliplr, bd);
  } else {
    u0 = get_recon_8x8(v0, in[0], in[1], fliplr, bd);
    u1 = get_recon_8x8(v1, in[2], in[3], fliplr, bd);
    u2 = get_recon_8x8(v2, in[4], in[5], fliplr, bd);
    u3 = get_recon_8x8(v3, in[6], in[7], fliplr, bd);
    u4 = get_recon_8x8(v4, in[8], in[9], fliplr, bd);
    u5 = get_recon_8x8(v5, in[10], in[11], fliplr, bd);
    u6 = get_recon_8x8(v6, in[12], in[13], fliplr, bd);
    u7 = get_recon_8x8(v7, in[14], in[15], fliplr, bd);
  }

  _mm_store_si128((__m128i *)(output + 0 * stride), u0);
  _mm_store_si128((__m128i *)(output + 1 * stride), u1);
  _mm_store_si128((__m128i *)(output + 2 * stride), u2);
  _mm_store_si128((__m128i *)(output + 3 * stride), u3);
  _mm_store_si128((__m128i *)(output + 4 * stride), u4);
  _mm_store_si128((__m128i *)(output + 5 * stride), u5);
  _mm_store_si128((__m128i *)(output + 6 * stride), u6);
  _mm_store_si128((__m128i *)(output + 7 * stride), u7);
}

void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *coeff, uint16_t *output,
                                   int stride, TX_TYPE tx_type, int bd) {
  __m128i in[16], out[16];
  const int8_t *shift = inv_txfm_shift_ls[TX_8X8];
  const int txw_idx = get_txw_idx(TX_8X8);
  const int txh_idx = get_txh_idx(TX_8X8);

  switch (tx_type) {
    case DCT_DCT:
      load_buffer_8x8(coeff, in);
      transpose_8x8(in, out);
      idct8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
                     -shift[0]);
      transpose_8x8(in, out);
      idct8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case DCT_ADST:
      load_buffer_8x8(coeff, in);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
                      -shift[0]);
      transpose_8x8(in, out);
      idct8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case ADST_DCT:
      load_buffer_8x8(coeff, in);
      transpose_8x8(in, out);
      idct8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
                     -shift[0]);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case ADST_ADST:
      load_buffer_8x8(coeff, in);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
                      -shift[0]);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case FLIPADST_DCT:
      load_buffer_8x8(coeff, in);
      transpose_8x8(in, out);
      idct8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
                     -shift[0]);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_8x8(in, output, stride, 0, 1, -shift[1], bd);
      break;
    case DCT_FLIPADST:
      load_buffer_8x8(coeff, in);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
                      -shift[0]);
      transpose_8x8(in, out);
      idct8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_8x8(in, output, stride, 1, 0, -shift[1], bd);
      break;
    case ADST_FLIPADST:
      load_buffer_8x8(coeff, in);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
                      -shift[0]);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_8x8(in, output, stride, 1, 0, -shift[1], bd);
      break;
    case FLIPADST_FLIPADST:
      load_buffer_8x8(coeff, in);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
                      -shift[0]);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_8x8(in, output, stride, 1, 1, -shift[1], bd);
      break;
    case FLIPADST_ADST:
      load_buffer_8x8(coeff, in);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
                      -shift[0]);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_8x8(in, output, stride, 0, 1, -shift[1], bd);
      break;
    default: assert(0);
  }
}

1445 static void idct8x8_low1_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
1446 int bd, int out_shift) {
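  // "low1" fast path: callers use this when only the DC coefficient in[0] is
  // nonzero, so the 8-point IDCT reduces to a single cospi32 scaling
  // (cos(PI/4) in Q`bit` fixed point) broadcast to all eight outputs.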
1447 const int32_t *cospi = cospi_arr(bit);
1448 const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
1449 const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
1450 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
1451 __m128i x;
1452
1453 // stage 0
1454 // stage 1
1455 // stage 2
1456 // stage 3
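  // Fixed-point rounding idiom used throughout this file:
  //   result = (in * cospi[k] + (1 << (bit - 1))) >> bit
  // cospi[k] is cos(k * PI / 128) in Q`bit` precision; adding half of the
  // divisor before the arithmetic shift rounds the product to nearest.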
1457 x = _mm_mullo_epi32(in[0], cospi32);
1458 x = _mm_add_epi32(x, rnding);
1459 x = _mm_srai_epi32(x, bit);
1460
1461 // stage 4
1462 // stage 5
1463 if (!do_cols) {
1464 const int log_range_out = AOMMAX(16, bd + 6);
1465 const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
1466 -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
1467 const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
1468 (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
1469
1470 __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
1471 x = _mm_add_epi32(x, offset);
1472 x = _mm_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
1473 x = _mm_max_epi32(x, clamp_lo_out);
1474 x = _mm_min_epi32(x, clamp_hi_out);
1475 }
1476
1477 out[0] = x;
1478 out[1] = x;
1479 out[2] = x;
1480 out[3] = x;
1481 out[4] = x;
1482 out[5] = x;
1483 out[6] = x;
1484 out[7] = x;
1485 }
1486
1487 static void idct8x8_new_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
1488 int bd, int out_shift) {
1489 const int32_t *cospi = cospi_arr(bit);
1490 const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
1491 const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
1492 const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
1493 const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
1494 const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
1495 const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
1496 const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
1497 const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
1498 const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
1499 const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
1500 const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
1501 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
1502 const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
1503 const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
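  // Working range, per the libaom inverse-transform range analysis:
  // intermediates need bd+8 bits in the row pass (do_cols == 0) and bd+6 bits
  // in the column pass, so every butterfly is clamped to that signed range
  // (never narrower than 16 bits).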
1504 __m128i u0, u1, u2, u3, u4, u5, u6, u7;
1505 __m128i v0, v1, v2, v3, v4, v5, v6, v7;
1506 __m128i x, y;
1507
1508 // stage 0
1509 // stage 1
1510 // stage 2
1511 u0 = in[0];
1512 u1 = in[4];
1513 u2 = in[2];
1514 u3 = in[6];
1515
1516 x = _mm_mullo_epi32(in[1], cospi56);
1517 y = _mm_mullo_epi32(in[7], cospim8);
1518 u4 = _mm_add_epi32(x, y);
1519 u4 = _mm_add_epi32(u4, rnding);
1520 u4 = _mm_srai_epi32(u4, bit);
1521
1522 x = _mm_mullo_epi32(in[1], cospi8);
1523 y = _mm_mullo_epi32(in[7], cospi56);
1524 u7 = _mm_add_epi32(x, y);
1525 u7 = _mm_add_epi32(u7, rnding);
1526 u7 = _mm_srai_epi32(u7, bit);
1527
1528 x = _mm_mullo_epi32(in[5], cospi24);
1529 y = _mm_mullo_epi32(in[3], cospim40);
1530 u5 = _mm_add_epi32(x, y);
1531 u5 = _mm_add_epi32(u5, rnding);
1532 u5 = _mm_srai_epi32(u5, bit);
1533
1534 x = _mm_mullo_epi32(in[5], cospi40);
1535 y = _mm_mullo_epi32(in[3], cospi24);
1536 u6 = _mm_add_epi32(x, y);
1537 u6 = _mm_add_epi32(u6, rnding);
1538 u6 = _mm_srai_epi32(u6, bit);
1539
1540 // stage 3
1541 x = _mm_mullo_epi32(u0, cospi32);
1542 y = _mm_mullo_epi32(u1, cospi32);
1543 v0 = _mm_add_epi32(x, y);
1544 v0 = _mm_add_epi32(v0, rnding);
1545 v0 = _mm_srai_epi32(v0, bit);
1546
1547 v1 = _mm_sub_epi32(x, y);
1548 v1 = _mm_add_epi32(v1, rnding);
1549 v1 = _mm_srai_epi32(v1, bit);
1550
1551 x = _mm_mullo_epi32(u2, cospi48);
1552 y = _mm_mullo_epi32(u3, cospim16);
1553 v2 = _mm_add_epi32(x, y);
1554 v2 = _mm_add_epi32(v2, rnding);
1555 v2 = _mm_srai_epi32(v2, bit);
1556
1557 x = _mm_mullo_epi32(u2, cospi16);
1558 y = _mm_mullo_epi32(u3, cospi48);
1559 v3 = _mm_add_epi32(x, y);
1560 v3 = _mm_add_epi32(v3, rnding);
1561 v3 = _mm_srai_epi32(v3, bit);
1562
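  // addsub_sse4_1(a, b, &s, &d, lo, hi) is the butterfly primitive used from
  // here on: s = clamp(a + b), d = clamp(a - b), clamped lanewise to [lo, hi].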
1563 addsub_sse4_1(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi);
1564 addsub_sse4_1(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi);
1565
1566 // stage 4
1567 addsub_sse4_1(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi);
1568 addsub_sse4_1(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi);
1569 u4 = v4;
1570 u7 = v7;
1571
1572 x = _mm_mullo_epi32(v5, cospi32);
1573 y = _mm_mullo_epi32(v6, cospi32);
1574 u6 = _mm_add_epi32(y, x);
1575 u6 = _mm_add_epi32(u6, rnding);
1576 u6 = _mm_srai_epi32(u6, bit);
1577
1578 u5 = _mm_sub_epi32(y, x);
1579 u5 = _mm_add_epi32(u5, rnding);
1580 u5 = _mm_srai_epi32(u5, bit);
1581
1582 // stage 5
1583 if (do_cols) {
1584 addsub_no_clamp_sse4_1(u0, u7, out + 0, out + 7);
1585 addsub_no_clamp_sse4_1(u1, u6, out + 1, out + 6);
1586 addsub_no_clamp_sse4_1(u2, u5, out + 2, out + 5);
1587 addsub_no_clamp_sse4_1(u3, u4, out + 3, out + 4);
1588 } else {
1589 const int log_range_out = AOMMAX(16, bd + 6);
1590 const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
1591 -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
1592 const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
1593 (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
1594 addsub_shift_sse4_1(u0, u7, out + 0, out + 7, &clamp_lo_out, &clamp_hi_out,
1595 out_shift);
1596 addsub_shift_sse4_1(u1, u6, out + 1, out + 6, &clamp_lo_out, &clamp_hi_out,
1597 out_shift);
1598 addsub_shift_sse4_1(u2, u5, out + 2, out + 5, &clamp_lo_out, &clamp_hi_out,
1599 out_shift);
1600 addsub_shift_sse4_1(u3, u4, out + 3, out + 4, &clamp_lo_out, &clamp_hi_out,
1601 out_shift);
1602 }
1603 }
1604
1605 static void iadst8x8_low1_sse4_1(__m128i *in, __m128i *out, int bit,
1606 int do_cols, int bd, int out_shift) {
1607 const int32_t *cospi = cospi_arr(bit);
1608 const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
1609 const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
1610 const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
1611 const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
1612 const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
1613 const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
1614 const __m128i kZero = _mm_setzero_si128();
1615 __m128i u[8], x;
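  // "low1" 8-point IADST: with only in[0] live, the full lattice degenerates
  // to the two stage-2 multiplies (cospi60/cospi4), one cospi16/cospi48
  // rotation and two cospi32 rotations; the skipped stages are copies and
  // sign flips.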
1616
1617 // stage 0
1618 // stage 1
1619 // stage 2
1620
1621 x = _mm_mullo_epi32(in[0], cospi60);
1622 u[0] = _mm_add_epi32(x, rnding);
1623 u[0] = _mm_srai_epi32(u[0], bit);
1624
1625 x = _mm_mullo_epi32(in[0], cospi4);
1626 u[1] = _mm_sub_epi32(kZero, x);
1627 u[1] = _mm_add_epi32(u[1], rnding);
1628 u[1] = _mm_srai_epi32(u[1], bit);
1629
1630 // stage 3
1631 // stage 4
1632 __m128i temp1, temp2;
1633 temp1 = _mm_mullo_epi32(u[0], cospi16);
1634 x = _mm_mullo_epi32(u[1], cospi48);
1635 temp1 = _mm_add_epi32(temp1, x);
1636 temp1 = _mm_add_epi32(temp1, rnding);
1637 temp1 = _mm_srai_epi32(temp1, bit);
1638 u[4] = temp1;
1639
1640 temp2 = _mm_mullo_epi32(u[0], cospi48);
1641 x = _mm_mullo_epi32(u[1], cospi16);
1642 u[5] = _mm_sub_epi32(temp2, x);
1643 u[5] = _mm_add_epi32(u[5], rnding);
1644 u[5] = _mm_srai_epi32(u[5], bit);
1645
1646 // stage 5
1647 // stage 6
1648 temp1 = _mm_mullo_epi32(u[0], cospi32);
1649 x = _mm_mullo_epi32(u[1], cospi32);
1650 u[2] = _mm_add_epi32(temp1, x);
1651 u[2] = _mm_add_epi32(u[2], rnding);
1652 u[2] = _mm_srai_epi32(u[2], bit);
1653
1654 u[3] = _mm_sub_epi32(temp1, x);
1655 u[3] = _mm_add_epi32(u[3], rnding);
1656 u[3] = _mm_srai_epi32(u[3], bit);
1657
1658 temp1 = _mm_mullo_epi32(u[4], cospi32);
1659 x = _mm_mullo_epi32(u[5], cospi32);
1660 u[6] = _mm_add_epi32(temp1, x);
1661 u[6] = _mm_add_epi32(u[6], rnding);
1662 u[6] = _mm_srai_epi32(u[6], bit);
1663
1664 u[7] = _mm_sub_epi32(temp1, x);
1665 u[7] = _mm_add_epi32(u[7], rnding);
1666 u[7] = _mm_srai_epi32(u[7], bit);
1667
1668 // stage 7
1669 if (do_cols) {
1670 out[0] = u[0];
1671 out[1] = _mm_sub_epi32(kZero, u[4]);
1672 out[2] = u[6];
1673 out[3] = _mm_sub_epi32(kZero, u[2]);
1674 out[4] = u[3];
1675 out[5] = _mm_sub_epi32(kZero, u[7]);
1676 out[6] = u[5];
1677 out[7] = _mm_sub_epi32(kZero, u[1]);
1678 } else {
1679 const int log_range_out = AOMMAX(16, bd + 6);
1680 const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
1681 const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
1682
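    // neg_shift_sse4_1(a, b, out0, out1, ...) applies the final rounding
    // shift and clamp, writing (a >> out_shift) and (-b >> out_shift); the
    // built-in negation of the second operand matches the sign alternation
    // of the IADST output permutation in the do_cols branch above.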
1683 neg_shift_sse4_1(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
1684 out_shift);
1685 neg_shift_sse4_1(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
1686 out_shift);
1687 neg_shift_sse4_1(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
1688 out_shift);
1689 neg_shift_sse4_1(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
1690 out_shift);
1691 }
1692 }
1693
1694 static void iadst8x8_new_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
1695 int bd, int out_shift) {
1696 const int32_t *cospi = cospi_arr(bit);
1697 const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
1698 const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
1699 const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
1700 const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
1701 const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
1702 const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
1703 const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
1704 const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
1705 const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
1706 const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
1707 const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
1708 const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
1709 const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
1710 const __m128i kZero = _mm_setzero_si128();
1711 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
1712 const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
1713 const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
1714 __m128i u[8], v[8], x;
1715
1716 // stage 0
1717 // stage 1
1718 // stage 2
1719
1720 u[0] = _mm_mullo_epi32(in[7], cospi4);
1721 x = _mm_mullo_epi32(in[0], cospi60);
1722 u[0] = _mm_add_epi32(u[0], x);
1723 u[0] = _mm_add_epi32(u[0], rnding);
1724 u[0] = _mm_srai_epi32(u[0], bit);
1725
1726 u[1] = _mm_mullo_epi32(in[7], cospi60);
1727 x = _mm_mullo_epi32(in[0], cospi4);
1728 u[1] = _mm_sub_epi32(u[1], x);
1729 u[1] = _mm_add_epi32(u[1], rnding);
1730 u[1] = _mm_srai_epi32(u[1], bit);
1731
1732 // (2): rotate the (in[5], in[2]) pair by the cospi20/cospi44 angle
1733 u[2] = _mm_mullo_epi32(in[5], cospi20);
1734 x = _mm_mullo_epi32(in[2], cospi44);
1735 u[2] = _mm_add_epi32(u[2], x);
1736 u[2] = _mm_add_epi32(u[2], rnding);
1737 u[2] = _mm_srai_epi32(u[2], bit);
1738
1739 u[3] = _mm_mullo_epi32(in[5], cospi44);
1740 x = _mm_mullo_epi32(in[2], cospi20);
1741 u[3] = _mm_sub_epi32(u[3], x);
1742 u[3] = _mm_add_epi32(u[3], rnding);
1743 u[3] = _mm_srai_epi32(u[3], bit);
1744
1745 // (3): rotate the (in[3], in[4]) pair by the cospi36/cospi28 angle
1746 u[4] = _mm_mullo_epi32(in[3], cospi36);
1747 x = _mm_mullo_epi32(in[4], cospi28);
1748 u[4] = _mm_add_epi32(u[4], x);
1749 u[4] = _mm_add_epi32(u[4], rnding);
1750 u[4] = _mm_srai_epi32(u[4], bit);
1751
1752 u[5] = _mm_mullo_epi32(in[3], cospi28);
1753 x = _mm_mullo_epi32(in[4], cospi36);
1754 u[5] = _mm_sub_epi32(u[5], x);
1755 u[5] = _mm_add_epi32(u[5], rnding);
1756 u[5] = _mm_srai_epi32(u[5], bit);
1757
1758 // (4): rotate the (in[1], in[6]) pair by the cospi52/cospi12 angle
1759 u[6] = _mm_mullo_epi32(in[1], cospi52);
1760 x = _mm_mullo_epi32(in[6], cospi12);
1761 u[6] = _mm_add_epi32(u[6], x);
1762 u[6] = _mm_add_epi32(u[6], rnding);
1763 u[6] = _mm_srai_epi32(u[6], bit);
1764
1765 u[7] = _mm_mullo_epi32(in[1], cospi12);
1766 x = _mm_mullo_epi32(in[6], cospi52);
1767 u[7] = _mm_sub_epi32(u[7], x);
1768 u[7] = _mm_add_epi32(u[7], rnding);
1769 u[7] = _mm_srai_epi32(u[7], bit);
1770
1771 // stage 3
1772 addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
1773 addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
1774 addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
1775 addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);
1776
1777 // stage 4
1778 u[0] = v[0];
1779 u[1] = v[1];
1780 u[2] = v[2];
1781 u[3] = v[3];
1782
1783 u[4] = _mm_mullo_epi32(v[4], cospi16);
1784 x = _mm_mullo_epi32(v[5], cospi48);
1785 u[4] = _mm_add_epi32(u[4], x);
1786 u[4] = _mm_add_epi32(u[4], rnding);
1787 u[4] = _mm_srai_epi32(u[4], bit);
1788
1789 u[5] = _mm_mullo_epi32(v[4], cospi48);
1790 x = _mm_mullo_epi32(v[5], cospi16);
1791 u[5] = _mm_sub_epi32(u[5], x);
1792 u[5] = _mm_add_epi32(u[5], rnding);
1793 u[5] = _mm_srai_epi32(u[5], bit);
1794
1795 u[6] = _mm_mullo_epi32(v[6], cospim48);
1796 x = _mm_mullo_epi32(v[7], cospi16);
1797 u[6] = _mm_add_epi32(u[6], x);
1798 u[6] = _mm_add_epi32(u[6], rnding);
1799 u[6] = _mm_srai_epi32(u[6], bit);
1800
1801 u[7] = _mm_mullo_epi32(v[6], cospi16);
1802 x = _mm_mullo_epi32(v[7], cospim48);
1803 u[7] = _mm_sub_epi32(u[7], x);
1804 u[7] = _mm_add_epi32(u[7], rnding);
1805 u[7] = _mm_srai_epi32(u[7], bit);
1806
1807 // stage 5
1808 addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
1809 addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
1810 addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
1811 addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);
1812
1813 // stage 6
1814 u[0] = v[0];
1815 u[1] = v[1];
1816 u[4] = v[4];
1817 u[5] = v[5];
1818
1819 v[0] = _mm_mullo_epi32(v[2], cospi32);
1820 x = _mm_mullo_epi32(v[3], cospi32);
1821 u[2] = _mm_add_epi32(v[0], x);
1822 u[2] = _mm_add_epi32(u[2], rnding);
1823 u[2] = _mm_srai_epi32(u[2], bit);
1824
1825 u[3] = _mm_sub_epi32(v[0], x);
1826 u[3] = _mm_add_epi32(u[3], rnding);
1827 u[3] = _mm_srai_epi32(u[3], bit);
1828
1829 v[0] = _mm_mullo_epi32(v[6], cospi32);
1830 x = _mm_mullo_epi32(v[7], cospi32);
1831 u[6] = _mm_add_epi32(v[0], x);
1832 u[6] = _mm_add_epi32(u[6], rnding);
1833 u[6] = _mm_srai_epi32(u[6], bit);
1834
1835 u[7] = _mm_sub_epi32(v[0], x);
1836 u[7] = _mm_add_epi32(u[7], rnding);
1837 u[7] = _mm_srai_epi32(u[7], bit);
1838
1839 // stage 7
1840 if (do_cols) {
1841 out[0] = u[0];
1842 out[1] = _mm_sub_epi32(kZero, u[4]);
1843 out[2] = u[6];
1844 out[3] = _mm_sub_epi32(kZero, u[2]);
1845 out[4] = u[3];
1846 out[5] = _mm_sub_epi32(kZero, u[7]);
1847 out[6] = u[5];
1848 out[7] = _mm_sub_epi32(kZero, u[1]);
1849 } else {
1850 const int log_range_out = AOMMAX(16, bd + 6);
1851 const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
1852 const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
1853
1854 neg_shift_sse4_1(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
1855 out_shift);
1856 neg_shift_sse4_1(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
1857 out_shift);
1858 neg_shift_sse4_1(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
1859 out_shift);
1860 neg_shift_sse4_1(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
1861 out_shift);
1862 }
1863 }
1864
1865 static void idct16x16_low1_sse4_1(__m128i *in, __m128i *out, int bit,
1866 int do_cols, int bd, int out_shift) {
1867 const int32_t *cospi = cospi_arr(bit);
1868 const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
1869 const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
1870 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
1871 const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
1872 const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
1873
1874 {
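    // DC-only 16-point IDCT: one cospi32 scaling of in[0] (the stage-4
    // operation of the full transform), then the result is broadcast to all
    // 16 outputs.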
1875 // stage 0
1876 // stage 1
1877 // stage 2
1878 // stage 3
1879 // stage 4
1880 in[0] = _mm_mullo_epi32(in[0], cospi32);
1881 in[0] = _mm_add_epi32(in[0], rnding);
1882 in[0] = _mm_srai_epi32(in[0], bit);
1883
1884 // stage 5
1885 // stage 6
1886 // stage 7
1887 if (do_cols) {
1888 in[0] = _mm_max_epi32(in[0], clamp_lo);
1889 in[0] = _mm_min_epi32(in[0], clamp_hi);
1890 } else {
1891 const int log_range_out = AOMMAX(16, bd + 6);
1892 const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
1893 -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
1894 const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
1895 (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
1896 __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
1897 in[0] = _mm_add_epi32(in[0], offset);
1898 in[0] = _mm_sra_epi32(in[0], _mm_cvtsi32_si128(out_shift));
1899 in[0] = _mm_max_epi32(in[0], clamp_lo_out);
1900 in[0] = _mm_min_epi32(in[0], clamp_hi_out);
1901 }
1902
1903 out[0] = in[0];
1904 out[1] = in[0];
1905 out[2] = in[0];
1906 out[3] = in[0];
1907 out[4] = in[0];
1908 out[5] = in[0];
1909 out[6] = in[0];
1910 out[7] = in[0];
1911 out[8] = in[0];
1912 out[9] = in[0];
1913 out[10] = in[0];
1914 out[11] = in[0];
1915 out[12] = in[0];
1916 out[13] = in[0];
1917 out[14] = in[0];
1918 out[15] = in[0];
1919 }
1920 }
1921
1922 static void idct16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit,
1923 int do_cols, int bd, int out_shift) {
1924 const int32_t *cospi = cospi_arr(bit);
1925 const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
1926 const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
1927 const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
1928 const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
1929 const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
1930 const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
1931 const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
1932 const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
1933 const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
1934 const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
1935 const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
1936 const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
1937 const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
1938 const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
1939 const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
1940 const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
1941 const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
1942 const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
1943 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
1944 const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
1945 const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
1946 __m128i u[16], x, y;
1947
1948 {
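    // "low8" variant: only the first 8 coefficients of this 4-column batch
    // are assumed nonzero.  Stage 1 seeds the even slots (the embedded idct8:
    // in[0], in[4], in[2], in[6]) and the odd tail (in[1], in[5], in[3],
    // in[7]) directly, skipping the zero inputs of the full transform.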
1949 // stage 0
1950 // stage 1
1951 u[0] = in[0];
1952 u[2] = in[4];
1953 u[4] = in[2];
1954 u[6] = in[6];
1955 u[8] = in[1];
1956 u[10] = in[5];
1957 u[12] = in[3];
1958 u[14] = in[7];
1959
1960 // stage 2
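    // half_btf_0_sse4_1(&w, &x, &rnding, bit) computes (w * x + rnding) >> bit;
    // half_btf_sse4_1(&w0, &x0, &w1, &x1, &rnding, bit) is the two-term form
    // (w0*x0 + w1*x1 + rnding) >> bit (helpers from
    // highbd_txfm_utility_sse4.h).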
1961 u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
1962 u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
1963
1964 u[9] = half_btf_0_sse4_1(&cospim36, &u[14], &rnding, bit);
1965 u[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit);
1966
1967 u[13] = half_btf_0_sse4_1(&cospi20, &u[10], &rnding, bit);
1968 u[10] = half_btf_0_sse4_1(&cospi44, &u[10], &rnding, bit);
1969
1970 u[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit);
1971 u[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit);
1972
1973 // stage 3
1974 u[7] = half_btf_0_sse4_1(&cospi8, &u[4], &rnding, bit);
1975 u[4] = half_btf_0_sse4_1(&cospi56, &u[4], &rnding, bit);
1976 u[5] = half_btf_0_sse4_1(&cospim40, &u[6], &rnding, bit);
1977 u[6] = half_btf_0_sse4_1(&cospi24, &u[6], &rnding, bit);
1978
1979 addsub_sse4_1(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
1980 addsub_sse4_1(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
1981 addsub_sse4_1(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
1982 addsub_sse4_1(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
1983
1984 // stage 4
1985 x = _mm_mullo_epi32(u[0], cospi32);
1986 u[0] = _mm_add_epi32(x, rnding);
1987 u[0] = _mm_srai_epi32(u[0], bit);
1988 u[1] = u[0];
1989
1990 u[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit);
1991 u[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit);
1992
1993 addsub_sse4_1(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi);
1994 addsub_sse4_1(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi);
1995
1996 x = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
1997 u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
1998 u[9] = x;
1999 y = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
2000 u[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
2001 u[10] = y;
2002
2003 // stage 5
2004 addsub_sse4_1(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
2005 addsub_sse4_1(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
2006
2007 x = _mm_mullo_epi32(u[5], cospi32);
2008 y = _mm_mullo_epi32(u[6], cospi32);
2009 u[5] = _mm_sub_epi32(y, x);
2010 u[5] = _mm_add_epi32(u[5], rnding);
2011 u[5] = _mm_srai_epi32(u[5], bit);
2012
2013 u[6] = _mm_add_epi32(y, x);
2014 u[6] = _mm_add_epi32(u[6], rnding);
2015 u[6] = _mm_srai_epi32(u[6], bit);
2016
2017 addsub_sse4_1(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
2018 addsub_sse4_1(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
2019 addsub_sse4_1(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
2020 addsub_sse4_1(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
2021
2022 // stage 6
2023 addsub_sse4_1(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi);
2024 addsub_sse4_1(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi);
2025 addsub_sse4_1(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi);
2026 addsub_sse4_1(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi);
2027
2028 x = _mm_mullo_epi32(u[10], cospi32);
2029 y = _mm_mullo_epi32(u[13], cospi32);
2030 u[10] = _mm_sub_epi32(y, x);
2031 u[10] = _mm_add_epi32(u[10], rnding);
2032 u[10] = _mm_srai_epi32(u[10], bit);
2033
2034 u[13] = _mm_add_epi32(x, y);
2035 u[13] = _mm_add_epi32(u[13], rnding);
2036 u[13] = _mm_srai_epi32(u[13], bit);
2037
2038 x = _mm_mullo_epi32(u[11], cospi32);
2039 y = _mm_mullo_epi32(u[12], cospi32);
2040 u[11] = _mm_sub_epi32(y, x);
2041 u[11] = _mm_add_epi32(u[11], rnding);
2042 u[11] = _mm_srai_epi32(u[11], bit);
2043
2044 u[12] = _mm_add_epi32(x, y);
2045 u[12] = _mm_add_epi32(u[12], rnding);
2046 u[12] = _mm_srai_epi32(u[12], bit);
2047 // stage 7
2048 if (do_cols) {
2049 addsub_no_clamp_sse4_1(u[0], u[15], out + 0, out + 15);
2050 addsub_no_clamp_sse4_1(u[1], u[14], out + 1, out + 14);
2051 addsub_no_clamp_sse4_1(u[2], u[13], out + 2, out + 13);
2052 addsub_no_clamp_sse4_1(u[3], u[12], out + 3, out + 12);
2053 addsub_no_clamp_sse4_1(u[4], u[11], out + 4, out + 11);
2054 addsub_no_clamp_sse4_1(u[5], u[10], out + 5, out + 10);
2055 addsub_no_clamp_sse4_1(u[6], u[9], out + 6, out + 9);
2056 addsub_no_clamp_sse4_1(u[7], u[8], out + 7, out + 8);
2057 } else {
2058 const int log_range_out = AOMMAX(16, bd + 6);
2059 const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
2060 -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
2061 const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
2062 (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
2063
2064 addsub_shift_sse4_1(u[0], u[15], out + 0, out + 15, &clamp_lo_out,
2065 &clamp_hi_out, out_shift);
2066 addsub_shift_sse4_1(u[1], u[14], out + 1, out + 14, &clamp_lo_out,
2067 &clamp_hi_out, out_shift);
2068 addsub_shift_sse4_1(u[2], u[13], out + 2, out + 13, &clamp_lo_out,
2069 &clamp_hi_out, out_shift);
2070 addsub_shift_sse4_1(u[3], u[12], out + 3, out + 12, &clamp_lo_out,
2071 &clamp_hi_out, out_shift);
2072 addsub_shift_sse4_1(u[4], u[11], out + 4, out + 11, &clamp_lo_out,
2073 &clamp_hi_out, out_shift);
2074 addsub_shift_sse4_1(u[5], u[10], out + 5, out + 10, &clamp_lo_out,
2075 &clamp_hi_out, out_shift);
2076 addsub_shift_sse4_1(u[6], u[9], out + 6, out + 9, &clamp_lo_out,
2077 &clamp_hi_out, out_shift);
2078 addsub_shift_sse4_1(u[7], u[8], out + 7, out + 8, &clamp_lo_out,
2079 &clamp_hi_out, out_shift);
2080 }
2081 }
2082 }
2083
2084 static void iadst16x16_low1_sse4_1(__m128i *in, __m128i *out, int bit,
2085 int do_cols, int bd, int out_shift) {
2086 const int32_t *cospi = cospi_arr(bit);
2087 const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
2088 const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
2089 const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
2090 const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
2091 const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
2092 const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
2093 const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
2094 const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
2095 const __m128i zero = _mm_setzero_si128();
2096 __m128i v[16], x, y, temp1, temp2;
2097
2098 // Calculate columns 0, 1, 2, 3
2099 {
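    // DC-only 16-point IADST: v[0] and v[1] are the only live values, so the
    // copy stages (3, 5, 7) just fan the pair out to the slots consumed by
    // the next rotation.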
2100 // stage 0
2101 // stage 1
2102 // stage 2
2103 x = _mm_mullo_epi32(in[0], cospi62);
2104 v[0] = _mm_add_epi32(x, rnding);
2105 v[0] = _mm_srai_epi32(v[0], bit);
2106
2107 x = _mm_mullo_epi32(in[0], cospi2);
2108 v[1] = _mm_sub_epi32(zero, x);
2109 v[1] = _mm_add_epi32(v[1], rnding);
2110 v[1] = _mm_srai_epi32(v[1], bit);
2111
2112 // stage 3
2113 v[8] = v[0];
2114 v[9] = v[1];
2115
2116 // stage 4
2117 temp1 = _mm_mullo_epi32(v[8], cospi8);
2118 x = _mm_mullo_epi32(v[9], cospi56);
2119 temp1 = _mm_add_epi32(temp1, x);
2120 temp1 = _mm_add_epi32(temp1, rnding);
2121 temp1 = _mm_srai_epi32(temp1, bit);
2122
2123 temp2 = _mm_mullo_epi32(v[8], cospi56);
2124 x = _mm_mullo_epi32(v[9], cospi8);
2125 temp2 = _mm_sub_epi32(temp2, x);
2126 temp2 = _mm_add_epi32(temp2, rnding);
2127 temp2 = _mm_srai_epi32(temp2, bit);
2128 v[8] = temp1;
2129 v[9] = temp2;
2130
2131 // stage 5
2132 v[4] = v[0];
2133 v[5] = v[1];
2134 v[12] = v[8];
2135 v[13] = v[9];
2136
2137 // stage 6
2138 temp1 = _mm_mullo_epi32(v[4], cospi16);
2139 x = _mm_mullo_epi32(v[5], cospi48);
2140 temp1 = _mm_add_epi32(temp1, x);
2141 temp1 = _mm_add_epi32(temp1, rnding);
2142 temp1 = _mm_srai_epi32(temp1, bit);
2143
2144 temp2 = _mm_mullo_epi32(v[4], cospi48);
2145 x = _mm_mullo_epi32(v[5], cospi16);
2146 temp2 = _mm_sub_epi32(temp2, x);
2147 temp2 = _mm_add_epi32(temp2, rnding);
2148 temp2 = _mm_srai_epi32(temp2, bit);
2149 v[4] = temp1;
2150 v[5] = temp2;
2151
2152 temp1 = _mm_mullo_epi32(v[12], cospi16);
2153 x = _mm_mullo_epi32(v[13], cospi48);
2154 temp1 = _mm_add_epi32(temp1, x);
2155 temp1 = _mm_add_epi32(temp1, rnding);
2156 temp1 = _mm_srai_epi32(temp1, bit);
2157
2158 temp2 = _mm_mullo_epi32(v[12], cospi48);
2159 x = _mm_mullo_epi32(v[13], cospi16);
2160 temp2 = _mm_sub_epi32(temp2, x);
2161 temp2 = _mm_add_epi32(temp2, rnding);
2162 temp2 = _mm_srai_epi32(temp2, bit);
2163 v[12] = temp1;
2164 v[13] = temp2;
2165
2166 // stage 7
2167 v[2] = v[0];
2168 v[3] = v[1];
2169 v[6] = v[4];
2170 v[7] = v[5];
2171 v[10] = v[8];
2172 v[11] = v[9];
2173 v[14] = v[12];
2174 v[15] = v[13];
2175
2176 // stage 8
2177 y = _mm_mullo_epi32(v[2], cospi32);
2178 x = _mm_mullo_epi32(v[3], cospi32);
2179 v[2] = _mm_add_epi32(y, x);
2180 v[2] = _mm_add_epi32(v[2], rnding);
2181 v[2] = _mm_srai_epi32(v[2], bit);
2182
2183 v[3] = _mm_sub_epi32(y, x);
2184 v[3] = _mm_add_epi32(v[3], rnding);
2185 v[3] = _mm_srai_epi32(v[3], bit);
2186
2187 y = _mm_mullo_epi32(v[6], cospi32);
2188 x = _mm_mullo_epi32(v[7], cospi32);
2189 v[6] = _mm_add_epi32(y, x);
2190 v[6] = _mm_add_epi32(v[6], rnding);
2191 v[6] = _mm_srai_epi32(v[6], bit);
2192
2193 v[7] = _mm_sub_epi32(y, x);
2194 v[7] = _mm_add_epi32(v[7], rnding);
2195 v[7] = _mm_srai_epi32(v[7], bit);
2196
2197 y = _mm_mullo_epi32(v[10], cospi32);
2198 x = _mm_mullo_epi32(v[11], cospi32);
2199 v[10] = _mm_add_epi32(y, x);
2200 v[10] = _mm_add_epi32(v[10], rnding);
2201 v[10] = _mm_srai_epi32(v[10], bit);
2202
2203 v[11] = _mm_sub_epi32(y, x);
2204 v[11] = _mm_add_epi32(v[11], rnding);
2205 v[11] = _mm_srai_epi32(v[11], bit);
2206
2207 y = _mm_mullo_epi32(v[14], cospi32);
2208 x = _mm_mullo_epi32(v[15], cospi32);
2209 v[14] = _mm_add_epi32(y, x);
2210 v[14] = _mm_add_epi32(v[14], rnding);
2211 v[14] = _mm_srai_epi32(v[14], bit);
2212
2213 v[15] = _mm_sub_epi32(y, x);
2214 v[15] = _mm_add_epi32(v[15], rnding);
2215 v[15] = _mm_srai_epi32(v[15], bit);
2216
2217 // stage 9
2218 if (do_cols) {
2219 out[0] = v[0];
2220 out[1] = _mm_sub_epi32(_mm_setzero_si128(), v[8]);
2221 out[2] = v[12];
2222 out[3] = _mm_sub_epi32(_mm_setzero_si128(), v[4]);
2223 out[4] = v[6];
2224 out[5] = _mm_sub_epi32(_mm_setzero_si128(), v[14]);
2225 out[6] = v[10];
2226 out[7] = _mm_sub_epi32(_mm_setzero_si128(), v[2]);
2227 out[8] = v[3];
2228 out[9] = _mm_sub_epi32(_mm_setzero_si128(), v[11]);
2229 out[10] = v[15];
2230 out[11] = _mm_sub_epi32(_mm_setzero_si128(), v[7]);
2231 out[12] = v[5];
2232 out[13] = _mm_sub_epi32(_mm_setzero_si128(), v[13]);
2233 out[14] = v[9];
2234 out[15] = _mm_sub_epi32(_mm_setzero_si128(), v[1]);
2235 } else {
2236 const int log_range_out = AOMMAX(16, bd + 6);
2237 const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
2238 const __m128i clamp_hi_out =
2239 _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
2240
2241 neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out,
2242 &clamp_hi_out, out_shift);
2243 neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
2244 &clamp_hi_out, out_shift);
2245 neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
2246 &clamp_hi_out, out_shift);
2247 neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
2248 &clamp_hi_out, out_shift);
2249 neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
2250 &clamp_hi_out, out_shift);
2251 neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
2252 &clamp_hi_out, out_shift);
2253 neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
2254 &clamp_hi_out, out_shift);
2255 neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
2256 &clamp_hi_out, out_shift);
2257 }
2258 }
2259 }
2260
2261 static void iadst16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit,
2262 int do_cols, int bd, int out_shift) {
2263 const int32_t *cospi = cospi_arr(bit);
2264 const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
2265 const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
2266 const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
2267 const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
2268 const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
2269 const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
2270 const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
2271 const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
2272 const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
2273 const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
2274 const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
2275 const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
2276 const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
2277 const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
2278 const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
2279 const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
2280 const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
2281 const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
2282 const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
2283 const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
2284 const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
2285 const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
2286 const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
2287 const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
2288 const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
2289 const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
2290 const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
2291 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
2292 const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
2293 const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
2294 __m128i u[16], x, y;
2295
2296 // Calculate columns 0, 1, 2, 3
2297 {
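    // "low8" 16-point IADST: with in[8..15] zero, every stage-2 rotation
    // loses one term and collapses to a single multiply (negated where the
    // dropped term carried the positive product, e.g. u[1] and u[3]).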
2298 // stage 0
2299 // stage 1
2300 // stage 2
2301 __m128i zero = _mm_setzero_si128();
2302 x = _mm_mullo_epi32(in[0], cospi62);
2303 u[0] = _mm_add_epi32(x, rnding);
2304 u[0] = _mm_srai_epi32(u[0], bit);
2305
2306 x = _mm_mullo_epi32(in[0], cospi2);
2307 u[1] = _mm_sub_epi32(zero, x);
2308 u[1] = _mm_add_epi32(u[1], rnding);
2309 u[1] = _mm_srai_epi32(u[1], bit);
2310
2311 x = _mm_mullo_epi32(in[2], cospi54);
2312 u[2] = _mm_add_epi32(x, rnding);
2313 u[2] = _mm_srai_epi32(u[2], bit);
2314
2315 x = _mm_mullo_epi32(in[2], cospi10);
2316 u[3] = _mm_sub_epi32(zero, x);
2317 u[3] = _mm_add_epi32(u[3], rnding);
2318 u[3] = _mm_srai_epi32(u[3], bit);
2319
2320 x = _mm_mullo_epi32(in[4], cospi46);
2321 u[4] = _mm_add_epi32(x, rnding);
2322 u[4] = _mm_srai_epi32(u[4], bit);
2323
2324 x = _mm_mullo_epi32(in[4], cospi18);
2325 u[5] = _mm_sub_epi32(zero, x);
2326 u[5] = _mm_add_epi32(u[5], rnding);
2327 u[5] = _mm_srai_epi32(u[5], bit);
2328
2329 x = _mm_mullo_epi32(in[6], cospi38);
2330 u[6] = _mm_add_epi32(x, rnding);
2331 u[6] = _mm_srai_epi32(u[6], bit);
2332
2333 x = _mm_mullo_epi32(in[6], cospi26);
2334 u[7] = _mm_sub_epi32(zero, x);
2335 u[7] = _mm_add_epi32(u[7], rnding);
2336 u[7] = _mm_srai_epi32(u[7], bit);
2337
2338 u[8] = _mm_mullo_epi32(in[7], cospi34);
2339 u[8] = _mm_add_epi32(u[8], rnding);
2340 u[8] = _mm_srai_epi32(u[8], bit);
2341
2342 u[9] = _mm_mullo_epi32(in[7], cospi30);
2343 u[9] = _mm_add_epi32(u[9], rnding);
2344 u[9] = _mm_srai_epi32(u[9], bit);
2345
2346 u[10] = _mm_mullo_epi32(in[5], cospi42);
2347 u[10] = _mm_add_epi32(u[10], rnding);
2348 u[10] = _mm_srai_epi32(u[10], bit);
2349
2350 u[11] = _mm_mullo_epi32(in[5], cospi22);
2351 u[11] = _mm_add_epi32(u[11], rnding);
2352 u[11] = _mm_srai_epi32(u[11], bit);
2353
2354 u[12] = _mm_mullo_epi32(in[3], cospi50);
2355 u[12] = _mm_add_epi32(u[12], rnding);
2356 u[12] = _mm_srai_epi32(u[12], bit);
2357
2358 u[13] = _mm_mullo_epi32(in[3], cospi14);
2359 u[13] = _mm_add_epi32(u[13], rnding);
2360 u[13] = _mm_srai_epi32(u[13], bit);
2361
2362 u[14] = _mm_mullo_epi32(in[1], cospi58);
2363 u[14] = _mm_add_epi32(u[14], rnding);
2364 u[14] = _mm_srai_epi32(u[14], bit);
2365
2366 u[15] = _mm_mullo_epi32(in[1], cospi6);
2367 u[15] = _mm_add_epi32(u[15], rnding);
2368 u[15] = _mm_srai_epi32(u[15], bit);
2369
2370 // stage 3
2371 addsub_sse4_1(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
2372 addsub_sse4_1(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
2373 addsub_sse4_1(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
2374 addsub_sse4_1(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
2375 addsub_sse4_1(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
2376 addsub_sse4_1(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
2377 addsub_sse4_1(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
2378 addsub_sse4_1(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
2379
2380 // stage 4
2381 y = _mm_mullo_epi32(u[8], cospi56);
2382 x = _mm_mullo_epi32(u[9], cospi56);
2383 u[8] = _mm_mullo_epi32(u[8], cospi8);
2384 u[8] = _mm_add_epi32(u[8], x);
2385 u[8] = _mm_add_epi32(u[8], rnding);
2386 u[8] = _mm_srai_epi32(u[8], bit);
2387
2388 x = _mm_mullo_epi32(u[9], cospi8);
2389 u[9] = _mm_sub_epi32(y, x);
2390 u[9] = _mm_add_epi32(u[9], rnding);
2391 u[9] = _mm_srai_epi32(u[9], bit);
2392
2393 x = _mm_mullo_epi32(u[11], cospi24);
2394 y = _mm_mullo_epi32(u[10], cospi24);
2395 u[10] = _mm_mullo_epi32(u[10], cospi40);
2396 u[10] = _mm_add_epi32(u[10], x);
2397 u[10] = _mm_add_epi32(u[10], rnding);
2398 u[10] = _mm_srai_epi32(u[10], bit);
2399
2400 x = _mm_mullo_epi32(u[11], cospi40);
2401 u[11] = _mm_sub_epi32(y, x);
2402 u[11] = _mm_add_epi32(u[11], rnding);
2403 u[11] = _mm_srai_epi32(u[11], bit);
2404
2405 x = _mm_mullo_epi32(u[13], cospi8);
2406 y = _mm_mullo_epi32(u[12], cospi8);
2407 u[12] = _mm_mullo_epi32(u[12], cospim56);
2408 u[12] = _mm_add_epi32(u[12], x);
2409 u[12] = _mm_add_epi32(u[12], rnding);
2410 u[12] = _mm_srai_epi32(u[12], bit);
2411
2412 x = _mm_mullo_epi32(u[13], cospim56);
2413 u[13] = _mm_sub_epi32(y, x);
2414 u[13] = _mm_add_epi32(u[13], rnding);
2415 u[13] = _mm_srai_epi32(u[13], bit);
2416
2417 x = _mm_mullo_epi32(u[15], cospi40);
2418 y = _mm_mullo_epi32(u[14], cospi40);
2419 u[14] = _mm_mullo_epi32(u[14], cospim24);
2420 u[14] = _mm_add_epi32(u[14], x);
2421 u[14] = _mm_add_epi32(u[14], rnding);
2422 u[14] = _mm_srai_epi32(u[14], bit);
2423
2424 x = _mm_mullo_epi32(u[15], cospim24);
2425 u[15] = _mm_sub_epi32(y, x);
2426 u[15] = _mm_add_epi32(u[15], rnding);
2427 u[15] = _mm_srai_epi32(u[15], bit);
2428
2429 // stage 5
2430 addsub_sse4_1(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
2431 addsub_sse4_1(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
2432 addsub_sse4_1(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
2433 addsub_sse4_1(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
2434 addsub_sse4_1(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
2435 addsub_sse4_1(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
2436 addsub_sse4_1(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
2437 addsub_sse4_1(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
2438
2439 // stage 6
2440 x = _mm_mullo_epi32(u[5], cospi48);
2441 y = _mm_mullo_epi32(u[4], cospi48);
2442 u[4] = _mm_mullo_epi32(u[4], cospi16);
2443 u[4] = _mm_add_epi32(u[4], x);
2444 u[4] = _mm_add_epi32(u[4], rnding);
2445 u[4] = _mm_srai_epi32(u[4], bit);
2446
2447 x = _mm_mullo_epi32(u[5], cospi16);
2448 u[5] = _mm_sub_epi32(y, x);
2449 u[5] = _mm_add_epi32(u[5], rnding);
2450 u[5] = _mm_srai_epi32(u[5], bit);
2451
2452 x = _mm_mullo_epi32(u[7], cospi16);
2453 y = _mm_mullo_epi32(u[6], cospi16);
2454 u[6] = _mm_mullo_epi32(u[6], cospim48);
2455 u[6] = _mm_add_epi32(u[6], x);
2456 u[6] = _mm_add_epi32(u[6], rnding);
2457 u[6] = _mm_srai_epi32(u[6], bit);
2458
2459 x = _mm_mullo_epi32(u[7], cospim48);
2460 u[7] = _mm_sub_epi32(y, x);
2461 u[7] = _mm_add_epi32(u[7], rnding);
2462 u[7] = _mm_srai_epi32(u[7], bit);
2463
2464 x = _mm_mullo_epi32(u[13], cospi48);
2465 y = _mm_mullo_epi32(u[12], cospi48);
2466 u[12] = _mm_mullo_epi32(u[12], cospi16);
2467 u[12] = _mm_add_epi32(u[12], x);
2468 u[12] = _mm_add_epi32(u[12], rnding);
2469 u[12] = _mm_srai_epi32(u[12], bit);
2470
2471 x = _mm_mullo_epi32(u[13], cospi16);
2472 u[13] = _mm_sub_epi32(y, x);
2473 u[13] = _mm_add_epi32(u[13], rnding);
2474 u[13] = _mm_srai_epi32(u[13], bit);
2475
2476 x = _mm_mullo_epi32(u[15], cospi16);
2477 y = _mm_mullo_epi32(u[14], cospi16);
2478 u[14] = _mm_mullo_epi32(u[14], cospim48);
2479 u[14] = _mm_add_epi32(u[14], x);
2480 u[14] = _mm_add_epi32(u[14], rnding);
2481 u[14] = _mm_srai_epi32(u[14], bit);
2482
2483 x = _mm_mullo_epi32(u[15], cospim48);
2484 u[15] = _mm_sub_epi32(y, x);
2485 u[15] = _mm_add_epi32(u[15], rnding);
2486 u[15] = _mm_srai_epi32(u[15], bit);
2487
2488 // stage 7
2489 addsub_sse4_1(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
2490 addsub_sse4_1(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
2491 addsub_sse4_1(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
2492 addsub_sse4_1(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
2493 addsub_sse4_1(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
2494 addsub_sse4_1(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
2495 addsub_sse4_1(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
2496 addsub_sse4_1(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
2497
2498 // stage 8
2499 y = _mm_mullo_epi32(u[2], cospi32);
2500 x = _mm_mullo_epi32(u[3], cospi32);
2501 u[2] = _mm_add_epi32(y, x);
2502 u[2] = _mm_add_epi32(u[2], rnding);
2503 u[2] = _mm_srai_epi32(u[2], bit);
2504
2505 u[3] = _mm_sub_epi32(y, x);
2506 u[3] = _mm_add_epi32(u[3], rnding);
2507 u[3] = _mm_srai_epi32(u[3], bit);
2508 y = _mm_mullo_epi32(u[6], cospi32);
2509 x = _mm_mullo_epi32(u[7], cospi32);
2510 u[6] = _mm_add_epi32(y, x);
2511 u[6] = _mm_add_epi32(u[6], rnding);
2512 u[6] = _mm_srai_epi32(u[6], bit);
2513
2514 u[7] = _mm_sub_epi32(y, x);
2515 u[7] = _mm_add_epi32(u[7], rnding);
2516 u[7] = _mm_srai_epi32(u[7], bit);
2517
2518 y = _mm_mullo_epi32(u[10], cospi32);
2519 x = _mm_mullo_epi32(u[11], cospi32);
2520 u[10] = _mm_add_epi32(y, x);
2521 u[10] = _mm_add_epi32(u[10], rnding);
2522 u[10] = _mm_srai_epi32(u[10], bit);
2523
2524 u[11] = _mm_sub_epi32(y, x);
2525 u[11] = _mm_add_epi32(u[11], rnding);
2526 u[11] = _mm_srai_epi32(u[11], bit);
2527
2528 y = _mm_mullo_epi32(u[14], cospi32);
2529 x = _mm_mullo_epi32(u[15], cospi32);
2530 u[14] = _mm_add_epi32(y, x);
2531 u[14] = _mm_add_epi32(u[14], rnding);
2532 u[14] = _mm_srai_epi32(u[14], bit);
2533
2534 u[15] = _mm_sub_epi32(y, x);
2535 u[15] = _mm_add_epi32(u[15], rnding);
2536 u[15] = _mm_srai_epi32(u[15], bit);
2537
2538 // stage 9
2539 if (do_cols) {
2540 out[0] = u[0];
2541 out[1] = _mm_sub_epi32(_mm_setzero_si128(), u[8]);
2542 out[2] = u[12];
2543 out[3] = _mm_sub_epi32(_mm_setzero_si128(), u[4]);
2544 out[4] = u[6];
2545 out[5] = _mm_sub_epi32(_mm_setzero_si128(), u[14]);
2546 out[6] = u[10];
2547 out[7] = _mm_sub_epi32(_mm_setzero_si128(), u[2]);
2548 out[8] = u[3];
2549 out[9] = _mm_sub_epi32(_mm_setzero_si128(), u[11]);
2550 out[10] = u[15];
2551 out[11] = _mm_sub_epi32(_mm_setzero_si128(), u[7]);
2552 out[12] = u[5];
2553 out[13] = _mm_sub_epi32(_mm_setzero_si128(), u[13]);
2554 out[14] = u[9];
2555 out[15] = _mm_sub_epi32(_mm_setzero_si128(), u[1]);
2556 } else {
2557 const int log_range_out = AOMMAX(16, bd + 6);
2558 const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
2559 const __m128i clamp_hi_out =
2560 _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
2561
2562 neg_shift_sse4_1(u[0], u[8], out + 0, out + 1, &clamp_lo_out,
2563 &clamp_hi_out, out_shift);
2564 neg_shift_sse4_1(u[12], u[4], out + 2, out + 3, &clamp_lo_out,
2565 &clamp_hi_out, out_shift);
2566 neg_shift_sse4_1(u[6], u[14], out + 4, out + 5, &clamp_lo_out,
2567 &clamp_hi_out, out_shift);
2568 neg_shift_sse4_1(u[10], u[2], out + 6, out + 7, &clamp_lo_out,
2569 &clamp_hi_out, out_shift);
2570 neg_shift_sse4_1(u[3], u[11], out + 8, out + 9, &clamp_lo_out,
2571 &clamp_hi_out, out_shift);
2572 neg_shift_sse4_1(u[15], u[7], out + 10, out + 11, &clamp_lo_out,
2573 &clamp_hi_out, out_shift);
2574 neg_shift_sse4_1(u[5], u[13], out + 12, out + 13, &clamp_lo_out,
2575 &clamp_hi_out, out_shift);
2576 neg_shift_sse4_1(u[9], u[1], out + 14, out + 15, &clamp_lo_out,
2577 &clamp_hi_out, out_shift);
2578 }
2579 }
2580 }
2581
2582 static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
2583 int bd, int out_shift) {
2584 const int32_t *cospi = cospi_arr(bit);
2585 const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
2586 const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
2587 const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
2588 const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
2589 const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
2590 const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
2591 const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
2592 const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
2593 const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
2594 const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
2595 const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
2596 const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
2597 const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
2598 const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
2599 const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
2600 const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
2601 const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
2602 const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
2603 const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
2604 const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
2605 const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
2606 const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
2607 const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
2608 const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
2609 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
2610 const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
2611 const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
2612 __m128i u[16], v[16], x, y;
2613
2614 {
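    // Stage-1 permutation: the natural-order coefficients are mapped into
    // the bit-reversed order the recursive IDCT butterfly network expects
    // (even rows feed the embedded idct8, odd rows the length-16 tail).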
2615 // stage 0
2616 // stage 1
2617 u[0] = in[0];
2618 u[1] = in[8];
2619 u[2] = in[4];
2620 u[3] = in[12];
2621 u[4] = in[2];
2622 u[5] = in[10];
2623 u[6] = in[6];
2624 u[7] = in[14];
2625 u[8] = in[1];
2626 u[9] = in[9];
2627 u[10] = in[5];
2628 u[11] = in[13];
2629 u[12] = in[3];
2630 u[13] = in[11];
2631 u[14] = in[7];
2632 u[15] = in[15];
2633
2634 // stage 2
2635 v[0] = u[0];
2636 v[1] = u[1];
2637 v[2] = u[2];
2638 v[3] = u[3];
2639 v[4] = u[4];
2640 v[5] = u[5];
2641 v[6] = u[6];
2642 v[7] = u[7];
2643
2644 v[8] = half_btf_sse4_1(&cospi60, &u[8], &cospim4, &u[15], &rnding, bit);
2645 v[9] = half_btf_sse4_1(&cospi28, &u[9], &cospim36, &u[14], &rnding, bit);
2646 v[10] = half_btf_sse4_1(&cospi44, &u[10], &cospim20, &u[13], &rnding, bit);
2647 v[11] = half_btf_sse4_1(&cospi12, &u[11], &cospim52, &u[12], &rnding, bit);
2648 v[12] = half_btf_sse4_1(&cospi52, &u[11], &cospi12, &u[12], &rnding, bit);
2649 v[13] = half_btf_sse4_1(&cospi20, &u[10], &cospi44, &u[13], &rnding, bit);
2650 v[14] = half_btf_sse4_1(&cospi36, &u[9], &cospi28, &u[14], &rnding, bit);
2651 v[15] = half_btf_sse4_1(&cospi4, &u[8], &cospi60, &u[15], &rnding, bit);
2652
2653 // stage 3
2654 u[0] = v[0];
2655 u[1] = v[1];
2656 u[2] = v[2];
2657 u[3] = v[3];
2658 u[4] = half_btf_sse4_1(&cospi56, &v[4], &cospim8, &v[7], &rnding, bit);
2659 u[5] = half_btf_sse4_1(&cospi24, &v[5], &cospim40, &v[6], &rnding, bit);
2660 u[6] = half_btf_sse4_1(&cospi40, &v[5], &cospi24, &v[6], &rnding, bit);
2661 u[7] = half_btf_sse4_1(&cospi8, &v[4], &cospi56, &v[7], &rnding, bit);
2662 addsub_sse4_1(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
2663 addsub_sse4_1(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
2664 addsub_sse4_1(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
2665 addsub_sse4_1(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
2666
2667 // stage 4
2668 x = _mm_mullo_epi32(u[0], cospi32);
2669 y = _mm_mullo_epi32(u[1], cospi32);
2670 v[0] = _mm_add_epi32(x, y);
2671 v[0] = _mm_add_epi32(v[0], rnding);
2672 v[0] = _mm_srai_epi32(v[0], bit);
2673
2674 v[1] = _mm_sub_epi32(x, y);
2675 v[1] = _mm_add_epi32(v[1], rnding);
2676 v[1] = _mm_srai_epi32(v[1], bit);
2677
2678 v[2] = half_btf_sse4_1(&cospi48, &u[2], &cospim16, &u[3], &rnding, bit);
2679 v[3] = half_btf_sse4_1(&cospi16, &u[2], &cospi48, &u[3], &rnding, bit);
2680 addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
2681 addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
2682 v[8] = u[8];
2683 v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
2684 v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
2685 v[11] = u[11];
2686 v[12] = u[12];
2687 v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
2688 v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
2689 v[15] = u[15];
2690
2691 // stage 5
2692 addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
2693 addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
2694 u[4] = v[4];
2695
2696 x = _mm_mullo_epi32(v[5], cospi32);
2697 y = _mm_mullo_epi32(v[6], cospi32);
2698 u[5] = _mm_sub_epi32(y, x);
2699 u[5] = _mm_add_epi32(u[5], rnding);
2700 u[5] = _mm_srai_epi32(u[5], bit);
2701
2702 u[6] = _mm_add_epi32(y, x);
2703 u[6] = _mm_add_epi32(u[6], rnding);
2704 u[6] = _mm_srai_epi32(u[6], bit);
2705
2706 u[7] = v[7];
2707 addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
2708 addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
2709 addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
2710 addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
2711
2712 // stage 6
2713 addsub_sse4_1(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi);
2714 addsub_sse4_1(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi);
2715 addsub_sse4_1(u[2], u[5], &v[2], &v[5], &clamp_lo, &clamp_hi);
2716 addsub_sse4_1(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi);
2717 v[8] = u[8];
2718 v[9] = u[9];
2719
2720 x = _mm_mullo_epi32(u[10], cospi32);
2721 y = _mm_mullo_epi32(u[13], cospi32);
2722 v[10] = _mm_sub_epi32(y, x);
2723 v[10] = _mm_add_epi32(v[10], rnding);
2724 v[10] = _mm_srai_epi32(v[10], bit);
2725
2726 v[13] = _mm_add_epi32(x, y);
2727 v[13] = _mm_add_epi32(v[13], rnding);
2728 v[13] = _mm_srai_epi32(v[13], bit);
2729
2730 x = _mm_mullo_epi32(u[11], cospi32);
2731 y = _mm_mullo_epi32(u[12], cospi32);
2732 v[11] = _mm_sub_epi32(y, x);
2733 v[11] = _mm_add_epi32(v[11], rnding);
2734 v[11] = _mm_srai_epi32(v[11], bit);
2735
2736 v[12] = _mm_add_epi32(x, y);
2737 v[12] = _mm_add_epi32(v[12], rnding);
2738 v[12] = _mm_srai_epi32(v[12], bit);
2739
2740 v[14] = u[14];
2741 v[15] = u[15];
2742
2743 // stage 7
2744 if (do_cols) {
2745 addsub_no_clamp_sse4_1(v[0], v[15], out + 0, out + 15);
2746 addsub_no_clamp_sse4_1(v[1], v[14], out + 1, out + 14);
2747 addsub_no_clamp_sse4_1(v[2], v[13], out + 2, out + 13);
2748 addsub_no_clamp_sse4_1(v[3], v[12], out + 3, out + 12);
2749 addsub_no_clamp_sse4_1(v[4], v[11], out + 4, out + 11);
2750 addsub_no_clamp_sse4_1(v[5], v[10], out + 5, out + 10);
2751 addsub_no_clamp_sse4_1(v[6], v[9], out + 6, out + 9);
2752 addsub_no_clamp_sse4_1(v[7], v[8], out + 7, out + 8);
2753 } else {
2754 const int log_range_out = AOMMAX(16, bd + 6);
2755 const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
2756 -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
2757 const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
2758 (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
2759
2760 addsub_shift_sse4_1(v[0], v[15], out + 0, out + 15, &clamp_lo_out,
2761 &clamp_hi_out, out_shift);
2762 addsub_shift_sse4_1(v[1], v[14], out + 1, out + 14, &clamp_lo_out,
2763 &clamp_hi_out, out_shift);
2764 addsub_shift_sse4_1(v[2], v[13], out + 2, out + 13, &clamp_lo_out,
2765 &clamp_hi_out, out_shift);
2766 addsub_shift_sse4_1(v[3], v[12], out + 3, out + 12, &clamp_lo_out,
2767 &clamp_hi_out, out_shift);
2768 addsub_shift_sse4_1(v[4], v[11], out + 4, out + 11, &clamp_lo_out,
2769 &clamp_hi_out, out_shift);
2770 addsub_shift_sse4_1(v[5], v[10], out + 5, out + 10, &clamp_lo_out,
2771 &clamp_hi_out, out_shift);
2772 addsub_shift_sse4_1(v[6], v[9], out + 6, out + 9, &clamp_lo_out,
2773 &clamp_hi_out, out_shift);
2774 addsub_shift_sse4_1(v[7], v[8], out + 7, out + 8, &clamp_lo_out,
2775 &clamp_hi_out, out_shift);
2776 }
2777 }
2778 }
2779
2780 static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
2781 int bd, int out_shift) {
2782 const int32_t *cospi = cospi_arr(bit);
2783 const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
2784 const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
2785 const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
2786 const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
2787 const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
2788 const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
2789 const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
2790 const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
2791 const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
2792 const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
2793 const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
2794 const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
2795 const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
2796 const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
2797 const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
2798 const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
2799 const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
2800 const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
2801 const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
2802 const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
2803 const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
2804 const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
2805 const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
2806 const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
2807 const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
2808 const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
2809 const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
2810 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
2811 const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
2812 const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
2813 __m128i u[16], v[16], x, y;
2814
2815 // Calculate columns 0, 1, 2, 3
2816 {
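    // Full 16-point IADST.  Stage 2 pairs the coefficients end-to-end and
    // applies eight Givens rotations; for k = 0..7 (all Q`bit`, rounded):
    //   v[2k]   = in[15-2k] * cospi[2+8k]  + in[2k] * cospi[62-8k]
    //   v[2k+1] = in[15-2k] * cospi[62-8k] - in[2k] * cospi[2+8k]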
2817 // stage 0
2818 // stage 1
2819 // stage 2
2820 v[0] = _mm_mullo_epi32(in[15], cospi2);
2821 x = _mm_mullo_epi32(in[0], cospi62);
2822 v[0] = _mm_add_epi32(v[0], x);
2823 v[0] = _mm_add_epi32(v[0], rnding);
2824 v[0] = _mm_srai_epi32(v[0], bit);
2825
2826 v[1] = _mm_mullo_epi32(in[15], cospi62);
2827 x = _mm_mullo_epi32(in[0], cospi2);
2828 v[1] = _mm_sub_epi32(v[1], x);
2829 v[1] = _mm_add_epi32(v[1], rnding);
2830 v[1] = _mm_srai_epi32(v[1], bit);
2831
2832 v[2] = _mm_mullo_epi32(in[13], cospi10);
2833 x = _mm_mullo_epi32(in[2], cospi54);
2834 v[2] = _mm_add_epi32(v[2], x);
2835 v[2] = _mm_add_epi32(v[2], rnding);
2836 v[2] = _mm_srai_epi32(v[2], bit);
2837
2838 v[3] = _mm_mullo_epi32(in[13], cospi54);
2839 x = _mm_mullo_epi32(in[2], cospi10);
2840 v[3] = _mm_sub_epi32(v[3], x);
2841 v[3] = _mm_add_epi32(v[3], rnding);
2842 v[3] = _mm_srai_epi32(v[3], bit);
2843
2844 v[4] = _mm_mullo_epi32(in[11], cospi18);
2845 x = _mm_mullo_epi32(in[4], cospi46);
2846 v[4] = _mm_add_epi32(v[4], x);
2847 v[4] = _mm_add_epi32(v[4], rnding);
2848 v[4] = _mm_srai_epi32(v[4], bit);
2849
2850 v[5] = _mm_mullo_epi32(in[11], cospi46);
2851 x = _mm_mullo_epi32(in[4], cospi18);
2852 v[5] = _mm_sub_epi32(v[5], x);
2853 v[5] = _mm_add_epi32(v[5], rnding);
2854 v[5] = _mm_srai_epi32(v[5], bit);
2855
2856 v[6] = _mm_mullo_epi32(in[9], cospi26);
2857 x = _mm_mullo_epi32(in[6], cospi38);
2858 v[6] = _mm_add_epi32(v[6], x);
2859 v[6] = _mm_add_epi32(v[6], rnding);
2860 v[6] = _mm_srai_epi32(v[6], bit);
2861
2862 v[7] = _mm_mullo_epi32(in[9], cospi38);
2863 x = _mm_mullo_epi32(in[6], cospi26);
2864 v[7] = _mm_sub_epi32(v[7], x);
2865 v[7] = _mm_add_epi32(v[7], rnding);
2866 v[7] = _mm_srai_epi32(v[7], bit);
2867
2868 v[8] = _mm_mullo_epi32(in[7], cospi34);
2869 x = _mm_mullo_epi32(in[8], cospi30);
2870 v[8] = _mm_add_epi32(v[8], x);
2871 v[8] = _mm_add_epi32(v[8], rnding);
2872 v[8] = _mm_srai_epi32(v[8], bit);
2873
2874 v[9] = _mm_mullo_epi32(in[7], cospi30);
2875 x = _mm_mullo_epi32(in[8], cospi34);
2876 v[9] = _mm_sub_epi32(v[9], x);
2877 v[9] = _mm_add_epi32(v[9], rnding);
2878 v[9] = _mm_srai_epi32(v[9], bit);
2879
2880 v[10] = _mm_mullo_epi32(in[5], cospi42);
2881 x = _mm_mullo_epi32(in[10], cospi22);
2882 v[10] = _mm_add_epi32(v[10], x);
2883 v[10] = _mm_add_epi32(v[10], rnding);
2884 v[10] = _mm_srai_epi32(v[10], bit);
2885
2886 v[11] = _mm_mullo_epi32(in[5], cospi22);
2887 x = _mm_mullo_epi32(in[10], cospi42);
2888 v[11] = _mm_sub_epi32(v[11], x);
2889 v[11] = _mm_add_epi32(v[11], rnding);
2890 v[11] = _mm_srai_epi32(v[11], bit);
2891
2892 v[12] = _mm_mullo_epi32(in[3], cospi50);
2893 x = _mm_mullo_epi32(in[12], cospi14);
2894 v[12] = _mm_add_epi32(v[12], x);
2895 v[12] = _mm_add_epi32(v[12], rnding);
2896 v[12] = _mm_srai_epi32(v[12], bit);
2897
2898 v[13] = _mm_mullo_epi32(in[3], cospi14);
2899 x = _mm_mullo_epi32(in[12], cospi50);
2900 v[13] = _mm_sub_epi32(v[13], x);
2901 v[13] = _mm_add_epi32(v[13], rnding);
2902 v[13] = _mm_srai_epi32(v[13], bit);
2903
2904 v[14] = _mm_mullo_epi32(in[1], cospi58);
2905 x = _mm_mullo_epi32(in[14], cospi6);
2906 v[14] = _mm_add_epi32(v[14], x);
2907 v[14] = _mm_add_epi32(v[14], rnding);
2908 v[14] = _mm_srai_epi32(v[14], bit);
2909
2910 v[15] = _mm_mullo_epi32(in[1], cospi6);
2911 x = _mm_mullo_epi32(in[14], cospi58);
2912 v[15] = _mm_sub_epi32(v[15], x);
2913 v[15] = _mm_add_epi32(v[15], rnding);
2914 v[15] = _mm_srai_epi32(v[15], bit);
2915
2916 // stage 3
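    // addsub_sse4_1(a, b, &s, &d, lo, hi): s = clamp(a + b), d = clamp(a - b).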
2917 addsub_sse4_1(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
2918 addsub_sse4_1(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
2919 addsub_sse4_1(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
2920 addsub_sse4_1(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
2921 addsub_sse4_1(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
2922 addsub_sse4_1(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
2923 addsub_sse4_1(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
2924 addsub_sse4_1(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
2925
2926 // stage 4
2927 v[0] = u[0];
2928 v[1] = u[1];
2929 v[2] = u[2];
2930 v[3] = u[3];
2931 v[4] = u[4];
2932 v[5] = u[5];
2933 v[6] = u[6];
2934 v[7] = u[7];
2935
2936 v[8] = _mm_mullo_epi32(u[8], cospi8);
2937 x = _mm_mullo_epi32(u[9], cospi56);
2938 v[8] = _mm_add_epi32(v[8], x);
2939 v[8] = _mm_add_epi32(v[8], rnding);
2940 v[8] = _mm_srai_epi32(v[8], bit);
2941
2942 v[9] = _mm_mullo_epi32(u[8], cospi56);
2943 x = _mm_mullo_epi32(u[9], cospi8);
2944 v[9] = _mm_sub_epi32(v[9], x);
2945 v[9] = _mm_add_epi32(v[9], rnding);
2946 v[9] = _mm_srai_epi32(v[9], bit);
2947
2948 v[10] = _mm_mullo_epi32(u[10], cospi40);
2949 x = _mm_mullo_epi32(u[11], cospi24);
2950 v[10] = _mm_add_epi32(v[10], x);
2951 v[10] = _mm_add_epi32(v[10], rnding);
2952 v[10] = _mm_srai_epi32(v[10], bit);
2953
2954 v[11] = _mm_mullo_epi32(u[10], cospi24);
2955 x = _mm_mullo_epi32(u[11], cospi40);
2956 v[11] = _mm_sub_epi32(v[11], x);
2957 v[11] = _mm_add_epi32(v[11], rnding);
2958 v[11] = _mm_srai_epi32(v[11], bit);
2959
2960 v[12] = _mm_mullo_epi32(u[12], cospim56);
2961 x = _mm_mullo_epi32(u[13], cospi8);
2962 v[12] = _mm_add_epi32(v[12], x);
2963 v[12] = _mm_add_epi32(v[12], rnding);
2964 v[12] = _mm_srai_epi32(v[12], bit);
2965
2966 v[13] = _mm_mullo_epi32(u[12], cospi8);
2967 x = _mm_mullo_epi32(u[13], cospim56);
2968 v[13] = _mm_sub_epi32(v[13], x);
2969 v[13] = _mm_add_epi32(v[13], rnding);
2970 v[13] = _mm_srai_epi32(v[13], bit);
2971
2972 v[14] = _mm_mullo_epi32(u[14], cospim24);
2973 x = _mm_mullo_epi32(u[15], cospi40);
2974 v[14] = _mm_add_epi32(v[14], x);
2975 v[14] = _mm_add_epi32(v[14], rnding);
2976 v[14] = _mm_srai_epi32(v[14], bit);
2977
2978 v[15] = _mm_mullo_epi32(u[14], cospi40);
2979 x = _mm_mullo_epi32(u[15], cospim24);
2980 v[15] = _mm_sub_epi32(v[15], x);
2981 v[15] = _mm_add_epi32(v[15], rnding);
2982 v[15] = _mm_srai_epi32(v[15], bit);
2983
2984 // stage 5
2985 addsub_sse4_1(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
2986 addsub_sse4_1(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
2987 addsub_sse4_1(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
2988 addsub_sse4_1(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
2989 addsub_sse4_1(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
2990 addsub_sse4_1(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
2991 addsub_sse4_1(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
2992 addsub_sse4_1(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
2993
2994 // stage 6
2995 v[0] = u[0];
2996 v[1] = u[1];
2997 v[2] = u[2];
2998 v[3] = u[3];
2999
3000 v[4] = _mm_mullo_epi32(u[4], cospi16);
3001 x = _mm_mullo_epi32(u[5], cospi48);
3002 v[4] = _mm_add_epi32(v[4], x);
3003 v[4] = _mm_add_epi32(v[4], rnding);
3004 v[4] = _mm_srai_epi32(v[4], bit);
3005
3006 v[5] = _mm_mullo_epi32(u[4], cospi48);
3007 x = _mm_mullo_epi32(u[5], cospi16);
3008 v[5] = _mm_sub_epi32(v[5], x);
3009 v[5] = _mm_add_epi32(v[5], rnding);
3010 v[5] = _mm_srai_epi32(v[5], bit);
3011
3012 v[6] = _mm_mullo_epi32(u[6], cospim48);
3013 x = _mm_mullo_epi32(u[7], cospi16);
3014 v[6] = _mm_add_epi32(v[6], x);
3015 v[6] = _mm_add_epi32(v[6], rnding);
3016 v[6] = _mm_srai_epi32(v[6], bit);
3017
3018 v[7] = _mm_mullo_epi32(u[6], cospi16);
3019 x = _mm_mullo_epi32(u[7], cospim48);
3020 v[7] = _mm_sub_epi32(v[7], x);
3021 v[7] = _mm_add_epi32(v[7], rnding);
3022 v[7] = _mm_srai_epi32(v[7], bit);
3023
3024 v[8] = u[8];
3025 v[9] = u[9];
3026 v[10] = u[10];
3027 v[11] = u[11];
3028
3029 v[12] = _mm_mullo_epi32(u[12], cospi16);
3030 x = _mm_mullo_epi32(u[13], cospi48);
3031 v[12] = _mm_add_epi32(v[12], x);
3032 v[12] = _mm_add_epi32(v[12], rnding);
3033 v[12] = _mm_srai_epi32(v[12], bit);
3034
3035 v[13] = _mm_mullo_epi32(u[12], cospi48);
3036 x = _mm_mullo_epi32(u[13], cospi16);
3037 v[13] = _mm_sub_epi32(v[13], x);
3038 v[13] = _mm_add_epi32(v[13], rnding);
3039 v[13] = _mm_srai_epi32(v[13], bit);
3040
3041 v[14] = _mm_mullo_epi32(u[14], cospim48);
3042 x = _mm_mullo_epi32(u[15], cospi16);
3043 v[14] = _mm_add_epi32(v[14], x);
3044 v[14] = _mm_add_epi32(v[14], rnding);
3045 v[14] = _mm_srai_epi32(v[14], bit);
3046
3047 v[15] = _mm_mullo_epi32(u[14], cospi16);
3048 x = _mm_mullo_epi32(u[15], cospim48);
3049 v[15] = _mm_sub_epi32(v[15], x);
3050 v[15] = _mm_add_epi32(v[15], rnding);
3051 v[15] = _mm_srai_epi32(v[15], bit);
3052
3053 // stage 7
3054 addsub_sse4_1(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
3055 addsub_sse4_1(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
3056 addsub_sse4_1(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
3057 addsub_sse4_1(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
3058 addsub_sse4_1(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
3059 addsub_sse4_1(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
3060 addsub_sse4_1(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
3061 addsub_sse4_1(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
3062
3063 // stage 8
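    // The remaining rotations are all by +-cospi32, so each output pair
    // reuses the shared products y and x.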
3064 v[0] = u[0];
3065 v[1] = u[1];
3066
3067 y = _mm_mullo_epi32(u[2], cospi32);
3068 x = _mm_mullo_epi32(u[3], cospi32);
3069 v[2] = _mm_add_epi32(y, x);
3070 v[2] = _mm_add_epi32(v[2], rnding);
3071 v[2] = _mm_srai_epi32(v[2], bit);
3072
3073 v[3] = _mm_sub_epi32(y, x);
3074 v[3] = _mm_add_epi32(v[3], rnding);
3075 v[3] = _mm_srai_epi32(v[3], bit);
3076
3077 v[4] = u[4];
3078 v[5] = u[5];
3079
3080 y = _mm_mullo_epi32(u[6], cospi32);
3081 x = _mm_mullo_epi32(u[7], cospi32);
3082 v[6] = _mm_add_epi32(y, x);
3083 v[6] = _mm_add_epi32(v[6], rnding);
3084 v[6] = _mm_srai_epi32(v[6], bit);
3085
3086 v[7] = _mm_sub_epi32(y, x);
3087 v[7] = _mm_add_epi32(v[7], rnding);
3088 v[7] = _mm_srai_epi32(v[7], bit);
3089
3090 v[8] = u[8];
3091 v[9] = u[9];
3092
3093 y = _mm_mullo_epi32(u[10], cospi32);
3094 x = _mm_mullo_epi32(u[11], cospi32);
3095 v[10] = _mm_add_epi32(y, x);
3096 v[10] = _mm_add_epi32(v[10], rnding);
3097 v[10] = _mm_srai_epi32(v[10], bit);
3098
3099 v[11] = _mm_sub_epi32(y, x);
3100 v[11] = _mm_add_epi32(v[11], rnding);
3101 v[11] = _mm_srai_epi32(v[11], bit);
3102
3103 v[12] = u[12];
3104 v[13] = u[13];
3105
3106 y = _mm_mullo_epi32(u[14], cospi32);
3107 x = _mm_mullo_epi32(u[15], cospi32);
3108 v[14] = _mm_add_epi32(y, x);
3109 v[14] = _mm_add_epi32(v[14], rnding);
3110 v[14] = _mm_srai_epi32(v[14], bit);
3111
3112 v[15] = _mm_sub_epi32(y, x);
3113 v[15] = _mm_add_epi32(v[15], rnding);
3114 v[15] = _mm_srai_epi32(v[15], bit);
3115
3116 // stage 9
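    // ADST output permutation: the odd outputs are negated. In the row pass
    // (!do_cols), neg_shift_sse4_1 folds that negation into the out_shift
    // rounding shift and the narrower output clamp.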
3117 if (do_cols) {
3118 out[0] = v[0];
3119 out[1] = _mm_sub_epi32(_mm_setzero_si128(), v[8]);
3120 out[2] = v[12];
3121 out[3] = _mm_sub_epi32(_mm_setzero_si128(), v[4]);
3122 out[4] = v[6];
3123 out[5] = _mm_sub_epi32(_mm_setzero_si128(), v[14]);
3124 out[6] = v[10];
3125 out[7] = _mm_sub_epi32(_mm_setzero_si128(), v[2]);
3126 out[8] = v[3];
3127 out[9] = _mm_sub_epi32(_mm_setzero_si128(), v[11]);
3128 out[10] = v[15];
3129 out[11] = _mm_sub_epi32(_mm_setzero_si128(), v[7]);
3130 out[12] = v[5];
3131 out[13] = _mm_sub_epi32(_mm_setzero_si128(), v[13]);
3132 out[14] = v[9];
3133 out[15] = _mm_sub_epi32(_mm_setzero_si128(), v[1]);
3134 } else {
3135 const int log_range_out = AOMMAX(16, bd + 6);
3136 const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
3137 const __m128i clamp_hi_out =
3138 _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
3139
3140 neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out,
3141 &clamp_hi_out, out_shift);
3142 neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
3143 &clamp_hi_out, out_shift);
3144 neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
3145 &clamp_hi_out, out_shift);
3146 neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
3147 &clamp_hi_out, out_shift);
3148 neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
3149 &clamp_hi_out, out_shift);
3150 neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
3151 &clamp_hi_out, out_shift);
3152 neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
3153 &clamp_hi_out, out_shift);
3154 neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
3155 &clamp_hi_out, out_shift);
3156 }
3157 }
3158 }
3159 static void iidentity16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
3160 int bd, int out_shift) {
3161 (void)bit;
3162 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
3163 const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
3164 const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
3165 __m128i v[16];
3166 __m128i fact = _mm_set1_epi32(2 * NewSqrt2);
3167 __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1));
3168 __m128i a0, a1, a2, a3;
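  // Identity "transform" for 16 points: every coefficient is scaled by
  // 2 * sqrt(2), held in Q(NewSqrt2Bits) fixed point; 'offset' rounds the
  // final shift.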
3169
3170 for (int i = 0; i < 16; i += 8) {
3171 a0 = _mm_mullo_epi32(in[i], fact);
3172 a1 = _mm_mullo_epi32(in[i + 1], fact);
3173 a0 = _mm_add_epi32(a0, offset);
3174 a1 = _mm_add_epi32(a1, offset);
3175 v[i] = _mm_srai_epi32(a0, NewSqrt2Bits);
3176 v[i + 1] = _mm_srai_epi32(a1, NewSqrt2Bits);
3177
3178 a2 = _mm_mullo_epi32(in[i + 2], fact);
3179 a3 = _mm_mullo_epi32(in[i + 3], fact);
3180 a2 = _mm_add_epi32(a2, offset);
3181 a3 = _mm_add_epi32(a3, offset);
3182 v[i + 2] = _mm_srai_epi32(a2, NewSqrt2Bits);
3183 v[i + 3] = _mm_srai_epi32(a3, NewSqrt2Bits);
3184
3185 a0 = _mm_mullo_epi32(in[i + 4], fact);
3186 a1 = _mm_mullo_epi32(in[i + 5], fact);
3187 a0 = _mm_add_epi32(a0, offset);
3188 a1 = _mm_add_epi32(a1, offset);
3189 v[i + 4] = _mm_srai_epi32(a0, NewSqrt2Bits);
3190 v[i + 5] = _mm_srai_epi32(a1, NewSqrt2Bits);
3191
3192 a2 = _mm_mullo_epi32(in[i + 6], fact);
3193 a3 = _mm_mullo_epi32(in[i + 7], fact);
3194 a2 = _mm_add_epi32(a2, offset);
3195 a3 = _mm_add_epi32(a3, offset);
3196 v[i + 6] = _mm_srai_epi32(a2, NewSqrt2Bits);
3197 v[i + 7] = _mm_srai_epi32(a3, NewSqrt2Bits);
3198 }
3199
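  // Row pass (!do_cols): shift by out_shift and clamp to the output range.
  // Column pass: clamp only.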
3200 if (!do_cols) {
3201 const int log_range_out = AOMMAX(16, bd + 6);
3202 const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
3203 -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
3204 const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
3205 (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
3206
3207 shift_sse4_1(v, out, &clamp_lo_out, &clamp_hi_out, out_shift, 16);
3208 } else {
3209 highbd_clamp_epi32_sse4_1(v, out, &clamp_lo, &clamp_hi, 16);
3210 }
3211 }
3212 static INLINE void idct64_stage8_sse4_1(
3213 __m128i *u, const __m128i *cospim32, const __m128i *cospi32,
3214 const __m128i *cospim16, const __m128i *cospi48, const __m128i *cospi16,
3215 const __m128i *cospim48, const __m128i *clamp_lo, const __m128i *clamp_hi,
3216 const __m128i *rnding, int bit) {
3217 int i;
3218 __m128i temp1, temp2, temp3, temp4;
3219 temp1 = half_btf_sse4_1(cospim32, &u[10], cospi32, &u[13], rnding, bit);
3220 u[13] = half_btf_sse4_1(cospi32, &u[10], cospi32, &u[13], rnding, bit);
3221 u[10] = temp1;
3222 temp2 = half_btf_sse4_1(cospim32, &u[11], cospi32, &u[12], rnding, bit);
3223 u[12] = half_btf_sse4_1(cospi32, &u[11], cospi32, &u[12], rnding, bit);
3224 u[11] = temp2;
3225
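  // i ^ 7 mirrors i within [16, 23], and the (i ^ 15, i ^ 8) pair mirrors
  // within [24, 31], so each iteration performs two butterflies of the
  // 16..31 block.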
3226 for (i = 16; i < 20; ++i) {
3227 addsub_sse4_1(u[i], u[i ^ 7], &u[i], &u[i ^ 7], clamp_lo, clamp_hi);
3228 addsub_sse4_1(u[i ^ 15], u[i ^ 8], &u[i ^ 15], &u[i ^ 8], clamp_lo,
3229 clamp_hi);
3230 }
3231
3232 temp1 = half_btf_sse4_1(cospim16, &u[36], cospi48, &u[59], rnding, bit);
3233 temp2 = half_btf_sse4_1(cospim16, &u[37], cospi48, &u[58], rnding, bit);
3234 temp3 = half_btf_sse4_1(cospim16, &u[38], cospi48, &u[57], rnding, bit);
3235 temp4 = half_btf_sse4_1(cospim16, &u[39], cospi48, &u[56], rnding, bit);
3236 u[56] = half_btf_sse4_1(cospi48, &u[39], cospi16, &u[56], rnding, bit);
3237 u[57] = half_btf_sse4_1(cospi48, &u[38], cospi16, &u[57], rnding, bit);
3238 u[58] = half_btf_sse4_1(cospi48, &u[37], cospi16, &u[58], rnding, bit);
3239 u[59] = half_btf_sse4_1(cospi48, &u[36], cospi16, &u[59], rnding, bit);
3240 u[36] = temp1;
3241 u[37] = temp2;
3242 u[38] = temp3;
3243 u[39] = temp4;
3244
3245 temp1 = half_btf_sse4_1(cospim48, &u[40], cospim16, &u[55], rnding, bit);
3246 temp2 = half_btf_sse4_1(cospim48, &u[41], cospim16, &u[54], rnding, bit);
3247 temp3 = half_btf_sse4_1(cospim48, &u[42], cospim16, &u[53], rnding, bit);
3248 temp4 = half_btf_sse4_1(cospim48, &u[43], cospim16, &u[52], rnding, bit);
3249 u[52] = half_btf_sse4_1(cospim16, &u[43], cospi48, &u[52], rnding, bit);
3250 u[53] = half_btf_sse4_1(cospim16, &u[42], cospi48, &u[53], rnding, bit);
3251 u[54] = half_btf_sse4_1(cospim16, &u[41], cospi48, &u[54], rnding, bit);
3252 u[55] = half_btf_sse4_1(cospim16, &u[40], cospi48, &u[55], rnding, bit);
3253 u[40] = temp1;
3254 u[41] = temp2;
3255 u[42] = temp3;
3256 u[43] = temp4;
3257 }
3258
3259 static INLINE void idct64_stage9_sse4_1(__m128i *u, const __m128i *cospim32,
3260 const __m128i *cospi32,
3261 const __m128i *clamp_lo,
3262 const __m128i *clamp_hi,
3263 const __m128i *rnding, int bit) {
3264 int i;
3265 __m128i temp1, temp2, temp3, temp4;
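  // Stage 9: butterfly u[0..15] around its midpoint, rotate u[20..23]
  // against u[24..27] by +-cospi32, then butterfly u[32..47] and u[48..63]
  // via the mirrored (i, i ^ 15) index pairs.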
3266 for (i = 0; i < 8; ++i) {
3267 addsub_sse4_1(u[i], u[15 - i], &u[i], &u[15 - i], clamp_lo, clamp_hi);
3268 }
3269
3270 temp1 = half_btf_sse4_1(cospim32, &u[20], cospi32, &u[27], rnding, bit);
3271 temp2 = half_btf_sse4_1(cospim32, &u[21], cospi32, &u[26], rnding, bit);
3272 temp3 = half_btf_sse4_1(cospim32, &u[22], cospi32, &u[25], rnding, bit);
3273 temp4 = half_btf_sse4_1(cospim32, &u[23], cospi32, &u[24], rnding, bit);
3274 u[24] = half_btf_sse4_1(cospi32, &u[23], cospi32, &u[24], rnding, bit);
3275 u[25] = half_btf_sse4_1(cospi32, &u[22], cospi32, &u[25], rnding, bit);
3276 u[26] = half_btf_sse4_1(cospi32, &u[21], cospi32, &u[26], rnding, bit);
3277 u[27] = half_btf_sse4_1(cospi32, &u[20], cospi32, &u[27], rnding, bit);
3278 u[20] = temp1;
3279 u[21] = temp2;
3280 u[22] = temp3;
3281 u[23] = temp4;
3282 for (i = 32; i < 40; i++) {
3283 addsub_sse4_1(u[i], u[i ^ 15], &u[i], &u[i ^ 15], clamp_lo, clamp_hi);
3284 }
3285
3286 for (i = 48; i < 56; i++) {
3287 addsub_sse4_1(u[i ^ 15], u[i], &u[i ^ 15], &u[i], clamp_lo, clamp_hi);
3288 }
3289 }
3290
3291 static INLINE void idct64_stage10_sse4_1(__m128i *u, const __m128i *cospim32,
3292 const __m128i *cospi32,
3293 const __m128i *clamp_lo,
3294 const __m128i *clamp_hi,
3295 const __m128i *rnding, int bit) {
3296 __m128i temp1, temp2, temp3, temp4;
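  // Stage 10: butterfly u[0..31] around its midpoint, then rotate u[40..47]
  // against u[48..55] by +-cospi32.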
3297 for (int i = 0; i < 16; i++) {
3298 addsub_sse4_1(u[i], u[31 - i], &u[i], &u[31 - i], clamp_lo, clamp_hi);
3299 }
3300
3301 temp1 = half_btf_sse4_1(cospim32, &u[40], cospi32, &u[55], rnding, bit);
3302 temp2 = half_btf_sse4_1(cospim32, &u[41], cospi32, &u[54], rnding, bit);
3303 temp3 = half_btf_sse4_1(cospim32, &u[42], cospi32, &u[53], rnding, bit);
3304 temp4 = half_btf_sse4_1(cospim32, &u[43], cospi32, &u[52], rnding, bit);
3305 u[52] = half_btf_sse4_1(cospi32, &u[43], cospi32, &u[52], rnding, bit);
3306 u[53] = half_btf_sse4_1(cospi32, &u[42], cospi32, &u[53], rnding, bit);
3307 u[54] = half_btf_sse4_1(cospi32, &u[41], cospi32, &u[54], rnding, bit);
3308 u[55] = half_btf_sse4_1(cospi32, &u[40], cospi32, &u[55], rnding, bit);
3309 u[40] = temp1;
3310 u[41] = temp2;
3311 u[42] = temp3;
3312 u[43] = temp4;
3313
3314 temp1 = half_btf_sse4_1(cospim32, &u[44], cospi32, &u[51], rnding, bit);
3315 temp2 = half_btf_sse4_1(cospim32, &u[45], cospi32, &u[50], rnding, bit);
3316 temp3 = half_btf_sse4_1(cospim32, &u[46], cospi32, &u[49], rnding, bit);
3317 temp4 = half_btf_sse4_1(cospim32, &u[47], cospi32, &u[48], rnding, bit);
3318 u[48] = half_btf_sse4_1(cospi32, &u[47], cospi32, &u[48], rnding, bit);
3319 u[49] = half_btf_sse4_1(cospi32, &u[46], cospi32, &u[49], rnding, bit);
3320 u[50] = half_btf_sse4_1(cospi32, &u[45], cospi32, &u[50], rnding, bit);
3321 u[51] = half_btf_sse4_1(cospi32, &u[44], cospi32, &u[51], rnding, bit);
3322 u[44] = temp1;
3323 u[45] = temp2;
3324 u[46] = temp3;
3325 u[47] = temp4;
3326 }
3327
3328 static INLINE void idct64_stage11_sse4_1(__m128i *u, __m128i *out, int do_cols,
3329 int bd, int out_shift,
3330 const int log_range) {
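  // Final stage: combine u[i] with u[63 - i]. The column pass adds and
  // subtracts without further clamping; the row pass applies out_shift with
  // the output-range clamp.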
3331 if (do_cols) {
3332 for (int i = 0; i < 32; i++) {
3333 addsub_no_clamp_sse4_1(u[i], u[63 - i], &out[(i)], &out[(63 - i)]);
3334 }
3335 } else {
3336 const int log_range_out = AOMMAX(16, bd + 6);
3337 const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
3338 -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
3339 const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
3340 (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
3341
3342 for (int i = 0; i < 32; i++) {
3343 addsub_shift_sse4_1(u[i], u[63 - i], &out[(i)], &out[(63 - i)],
3344 &clamp_lo_out, &clamp_hi_out, out_shift);
3345 }
3346 }
3347 }
3348
3349 static void idct64x64_low1_sse4_1(__m128i *in, __m128i *out, int bit,
3350 int do_cols, int bd, int out_shift) {
3351 const int32_t *cospi = cospi_arr(bit);
3352 const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
3353 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
3354 const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
3355 const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
3356
3357 const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
3358
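  // Only in[0] is nonzero here, so the whole 64-point transform collapses to
  // a single DC scaling by cospi32, replicated to all 64 outputs.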
3359 {
3360 __m128i x;
3361
3362 // stage 1
3363 // stage 2
3364 // stage 3
3365 // stage 4
3366 // stage 5
3367 // stage 6
3368 x = half_btf_0_sse4_1(&cospi32, &in[0], &rnding, bit);
3369
3370 // stage 8
3371 // stage 9
3372 // stage 10
3373 // stage 11
3374 if (do_cols) {
3375 x = _mm_max_epi32(x, clamp_lo);
3376 x = _mm_min_epi32(x, clamp_hi);
3377 } else {
3378 const int log_range_out = AOMMAX(16, bd + 6);
3379 const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
3380 -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
3381 const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
3382 (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
3383
3384 __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
3385 x = _mm_add_epi32(x, offset);
3386 x = _mm_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
3387
3388 x = _mm_max_epi32(x, clamp_lo_out);
3389 x = _mm_min_epi32(x, clamp_hi_out);
3390 }
3391
3392     for (int i = 0; i < 64; ++i) out[i] = x;
3456 }
3457 }
3458
3459 static void idct64x64_low8_sse4_1(__m128i *in, __m128i *out, int bit,
3460 int do_cols, int bd, int out_shift) {
3461 int i, j;
3462 const int32_t *cospi = cospi_arr(bit);
3463 const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
3464 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
3465 const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
3466 const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
3467
3468 const __m128i cospi1 = _mm_set1_epi32(cospi[1]);
3469 const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
3470 const __m128i cospi3 = _mm_set1_epi32(cospi[3]);
3471 const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
3472 const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
3473 const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
3474 const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
3475 const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
3476 const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
3477 const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
3478 const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
3479 const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
3480 const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
3481 const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
3482 const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
3483 const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
3484 const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
3485 const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
3486 const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
3487 const __m128i cospim12 = _mm_set1_epi32(-cospi[12]);
3488 const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
3489 const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
3490 const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
3491 const __m128i cospim28 = _mm_set1_epi32(-cospi[28]);
3492 const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
3493 const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
3494 const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
3495 const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
3496 const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
3497 const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
3498 const __m128i cospi63 = _mm_set1_epi32(cospi[63]);
3499 const __m128i cospim57 = _mm_set1_epi32(-cospi[57]);
3500 const __m128i cospi7 = _mm_set1_epi32(cospi[7]);
3501 const __m128i cospi5 = _mm_set1_epi32(cospi[5]);
3502 const __m128i cospi59 = _mm_set1_epi32(cospi[59]);
3503 const __m128i cospim61 = _mm_set1_epi32(-cospi[61]);
3504 const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
3505 const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
3506
3507 {
3508 __m128i u[64];
3509
3510 // stage 1
3511 u[0] = in[0];
3512 u[8] = in[4];
3513 u[16] = in[2];
3514 u[24] = in[6];
3515 u[32] = in[1];
3516 u[40] = in[5];
3517 u[48] = in[3];
3518 u[56] = in[7];
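    // Only the top-left 8x8 coefficients can be nonzero, so stage 1 seeds
    // just the leading element of each 8-wide subtree.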
3519
3520 // stage 2
3521 u[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit);
3522 u[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit);
3523 u[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit);
3524 u[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit);
3525 u[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit);
3526 u[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit);
3527 u[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit);
3528 u[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit);
3529
3530 // stage 3
3531 u[31] = half_btf_0_sse4_1(&cospi2, &u[16], &rnding, bit);
3532 u[16] = half_btf_0_sse4_1(&cospi62, &u[16], &rnding, bit);
3533 u[23] = half_btf_0_sse4_1(&cospim58, &u[24], &rnding, bit);
3534 u[24] = half_btf_0_sse4_1(&cospi6, &u[24], &rnding, bit);
3535 u[33] = u[32];
3536 u[38] = u[39];
3537 u[41] = u[40];
3538 u[46] = u[47];
3539 u[49] = u[48];
3540 u[54] = u[55];
3541 u[57] = u[56];
3542 u[62] = u[63];
3543
3544 // stage 4
3545 __m128i temp1, temp2;
3546 u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
3547 u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
3548 u[17] = u[16];
3549 u[22] = u[23];
3550 u[25] = u[24];
3551 u[30] = u[31];
3552
3553 temp1 = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
3554 u[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
3555 u[33] = temp1;
3556
3557 temp2 = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
3558 u[38] = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
3559 u[57] = temp2;
3560
3561 temp1 = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
3562 u[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
3563 u[41] = temp1;
3564
3565 temp2 = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
3566 u[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
3567 u[46] = temp2;
3568
3569 // stage 5
3570 u[9] = u[8];
3571 u[14] = u[15];
3572
3573 temp1 = half_btf_sse4_1(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
3574 u[30] = half_btf_sse4_1(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
3575 u[17] = temp1;
3576
3577 temp2 = half_btf_sse4_1(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
3578 u[25] = half_btf_sse4_1(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
3579 u[22] = temp2;
3580
3581 u[35] = u[32];
3582 u[34] = u[33];
3583 u[36] = u[39];
3584 u[37] = u[38];
3585 u[43] = u[40];
3586 u[42] = u[41];
3587 u[44] = u[47];
3588 u[45] = u[46];
3589 u[51] = u[48];
3590 u[50] = u[49];
3591 u[52] = u[55];
3592 u[53] = u[54];
3593 u[59] = u[56];
3594 u[58] = u[57];
3595 u[60] = u[63];
3596 u[61] = u[62];
3597
3598 // stage 6
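    // u[0] and u[1] both become in[0] * cospi32: the butterfly's companion
    // input is always zero in this low8 path.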
3599 temp1 = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
3600 u[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
3601 u[0] = temp1;
3602
3603 temp2 = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
3604 u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
3605 u[9] = temp2;
3606 u[19] = u[16];
3607 u[18] = u[17];
3608 u[20] = u[23];
3609 u[21] = u[22];
3610 u[27] = u[24];
3611 u[26] = u[25];
3612 u[28] = u[31];
3613 u[29] = u[30];
3614
3615 temp1 = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
3616 u[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
3617 u[34] = temp1;
3618 temp2 = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
3619 u[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
3620 u[35] = temp2;
3621 temp1 = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
3622 u[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
3623 u[36] = temp1;
3624 temp2 = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
3625 u[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
3626 u[37] = temp2;
3627 temp1 = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
3628 u[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
3629 u[42] = temp1;
3630 temp2 = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
3631 u[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
3632 u[43] = temp2;
3633 temp1 = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
3634 u[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
3635 u[44] = temp1;
3636 temp2 = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
3637 u[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
3638 u[45] = temp2;
3639
3640 // stage 7
3641 u[3] = u[0];
3642 u[2] = u[1];
3643 u[11] = u[8];
3644 u[10] = u[9];
3645 u[12] = u[15];
3646 u[13] = u[14];
3647
3648 temp1 = half_btf_sse4_1(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
3649 u[29] = half_btf_sse4_1(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
3650 u[18] = temp1;
3651 temp2 = half_btf_sse4_1(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
3652 u[28] = half_btf_sse4_1(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
3653 u[19] = temp2;
3654 temp1 = half_btf_sse4_1(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
3655 u[27] = half_btf_sse4_1(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
3656 u[20] = temp1;
3657 temp2 = half_btf_sse4_1(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
3658 u[26] = half_btf_sse4_1(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
3659 u[21] = temp2;
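    // Butterfly each 16-wide block of u[32..63]: j ^ 7 mirrors j within the
    // lower half and (j ^ 15, j ^ 8) mirrors within the upper half.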
3660 for (i = 32; i < 64; i += 16) {
3661 for (j = i; j < i + 4; j++) {
3662 addsub_sse4_1(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
3663 addsub_sse4_1(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
3664 &clamp_hi);
3665 }
3666 }
3667
3668 // stage 8
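    // With only 8 nonzero inputs the u[4..7] branch is zero, so these
    // stage-8 butterflies reduce to plain copies.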
3669 u[7] = u[0];
3670 u[6] = u[1];
3671 u[5] = u[2];
3672 u[4] = u[3];
3674
3675 idct64_stage8_sse4_1(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
3676 &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);
3677
3678 // stage 9
3679 idct64_stage9_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
3680 bit);
3681
3682 // stage 10
3683 idct64_stage10_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
3684 bit);
3685
3686 // stage 11
3687 idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, log_range);
3688 }
3689 }
3690
3691 static void idct64x64_low16_sse4_1(__m128i *in, __m128i *out, int bit,
3692 int do_cols, int bd, int out_shift) {
3693 int i, j;
3694 const int32_t *cospi = cospi_arr(bit);
3695 const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
3696 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
3697 const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
3698 const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
3699
3700 const __m128i cospi1 = _mm_set1_epi32(cospi[1]);
3701 const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
3702 const __m128i cospi3 = _mm_set1_epi32(cospi[3]);
3703 const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
3704 const __m128i cospi5 = _mm_set1_epi32(cospi[5]);
3705 const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
3706 const __m128i cospi7 = _mm_set1_epi32(cospi[7]);
3707 const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
3708 const __m128i cospi9 = _mm_set1_epi32(cospi[9]);
3709 const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
3710 const __m128i cospi11 = _mm_set1_epi32(cospi[11]);
3711 const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
3712 const __m128i cospi13 = _mm_set1_epi32(cospi[13]);
3713 const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
3714 const __m128i cospi15 = _mm_set1_epi32(cospi[15]);
3715 const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
3716 const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
3717 const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
3718 const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
3719 const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
3720 const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
3721 const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
3722 const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
3723 const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
3724 const __m128i cospi51 = _mm_set1_epi32(cospi[51]);
3725 const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
3726 const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
3727 const __m128i cospi55 = _mm_set1_epi32(cospi[55]);
3728 const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
3729 const __m128i cospi59 = _mm_set1_epi32(cospi[59]);
3730 const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
3731 const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
3732 const __m128i cospi63 = _mm_set1_epi32(cospi[63]);
3733
3734 const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
3735 const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
3736 const __m128i cospim12 = _mm_set1_epi32(-cospi[12]);
3737 const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
3738 const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
3739 const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
3740 const __m128i cospim28 = _mm_set1_epi32(-cospi[28]);
3741 const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
3742 const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
3743 const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
3744 const __m128i cospim44 = _mm_set1_epi32(-cospi[44]);
3745 const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
3746 const __m128i cospim49 = _mm_set1_epi32(-cospi[49]);
3747 const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
3748 const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
3749 const __m128i cospim53 = _mm_set1_epi32(-cospi[53]);
3750 const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
3751 const __m128i cospim57 = _mm_set1_epi32(-cospi[57]);
3752 const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
3753 const __m128i cospim60 = _mm_set1_epi32(-cospi[60]);
3754 const __m128i cospim61 = _mm_set1_epi32(-cospi[61]);
3755
3756 {
3757 __m128i u[64];
3758 __m128i tmp1, tmp2, tmp3, tmp4;
3759 // stage 1
3760 u[0] = in[0];
3761 u[32] = in[1];
3762 u[36] = in[9];
3763 u[40] = in[5];
3764 u[44] = in[13];
3765 u[48] = in[3];
3766 u[52] = in[11];
3767 u[56] = in[7];
3768 u[60] = in[15];
3769 u[16] = in[2];
3770 u[20] = in[10];
3771 u[24] = in[6];
3772 u[28] = in[14];
3773 u[4] = in[8];
3774 u[8] = in[4];
3775 u[12] = in[12];
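    // Only the top-left 16x16 coefficients can be nonzero; stage 1 seeds the
    // stride-4 leaders of each subtree.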
3776
3777 // stage 2
3778 u[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit);
3779 u[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit);
3780 u[35] = half_btf_0_sse4_1(&cospim49, &u[60], &rnding, bit);
3781 u[60] = half_btf_0_sse4_1(&cospi15, &u[60], &rnding, bit);
3782 u[59] = half_btf_0_sse4_1(&cospi9, &u[36], &rnding, bit);
3783 u[36] = half_btf_0_sse4_1(&cospi55, &u[36], &rnding, bit);
3784 u[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit);
3785 u[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit);
3786 u[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit);
3787 u[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit);
3788 u[43] = half_btf_0_sse4_1(&cospim53, &u[52], &rnding, bit);
3789 u[52] = half_btf_0_sse4_1(&cospi11, &u[52], &rnding, bit);
3790 u[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit);
3791 u[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit);
3792 u[51] = half_btf_0_sse4_1(&cospi13, &u[44], &rnding, bit);
3793 u[44] = half_btf_0_sse4_1(&cospi51, &u[44], &rnding, bit);
3794
3795 // stage 3
3796 u[31] = half_btf_0_sse4_1(&cospi2, &u[16], &rnding, bit);
3797 u[16] = half_btf_0_sse4_1(&cospi62, &u[16], &rnding, bit);
3798 u[19] = half_btf_0_sse4_1(&cospim50, &u[28], &rnding, bit);
3799 u[28] = half_btf_0_sse4_1(&cospi14, &u[28], &rnding, bit);
3800 u[27] = half_btf_0_sse4_1(&cospi10, &u[20], &rnding, bit);
3801 u[20] = half_btf_0_sse4_1(&cospi54, &u[20], &rnding, bit);
3802 u[23] = half_btf_0_sse4_1(&cospim58, &u[24], &rnding, bit);
3803 u[24] = half_btf_0_sse4_1(&cospi6, &u[24], &rnding, bit);
3804 u[33] = u[32];
3805 u[34] = u[35];
3806 u[37] = u[36];
3807 u[38] = u[39];
3808 u[41] = u[40];
3809 u[42] = u[43];
3810 u[45] = u[44];
3811 u[46] = u[47];
3812 u[49] = u[48];
3813 u[50] = u[51];
3814 u[53] = u[52];
3815 u[54] = u[55];
3816 u[57] = u[56];
3817 u[58] = u[59];
3818 u[61] = u[60];
3819 u[62] = u[63];
3820
3821 // stage 4
3822 u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
3823 u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
3824 u[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit);
3825 u[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit);
3826
3827 u[17] = u[16];
3828 u[18] = u[19];
3829 u[21] = u[20];
3830 u[22] = u[23];
3831 u[25] = u[24];
3832 u[26] = u[27];
3833 u[29] = u[28];
3834 u[30] = u[31];
3835
3836 tmp1 = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
3837 tmp2 = half_btf_sse4_1(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
3838 tmp3 = half_btf_sse4_1(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
3839 tmp4 = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
3840 u[57] = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
3841 u[58] = half_btf_sse4_1(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
3842 u[61] = half_btf_sse4_1(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
3843 u[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
3844 u[33] = tmp1;
3845 u[34] = tmp2;
3846 u[37] = tmp3;
3847 u[38] = tmp4;
3848
3849 tmp1 = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
3850 tmp2 = half_btf_sse4_1(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
3851 tmp3 = half_btf_sse4_1(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
3852 tmp4 = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
3853 u[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
3854 u[50] = half_btf_sse4_1(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
3855 u[53] = half_btf_sse4_1(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
3856 u[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
3857 u[41] = tmp1;
3858 u[42] = tmp2;
3859 u[45] = tmp3;
3860 u[46] = tmp4;
3861
3862 // stage 5
3863 u[7] = half_btf_0_sse4_1(&cospi8, &u[4], &rnding, bit);
3864 u[4] = half_btf_0_sse4_1(&cospi56, &u[4], &rnding, bit);
3865
3866 u[9] = u[8];
3867 u[10] = u[11];
3868 u[13] = u[12];
3869 u[14] = u[15];
3870
3871 tmp1 = half_btf_sse4_1(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
3872 tmp2 = half_btf_sse4_1(&cospim56, &u[18], &cospim8, &u[29], &rnding, bit);
3873 tmp3 = half_btf_sse4_1(&cospim40, &u[21], &cospi24, &u[26], &rnding, bit);
3874 tmp4 = half_btf_sse4_1(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
3875 u[25] = half_btf_sse4_1(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
3876 u[26] = half_btf_sse4_1(&cospi24, &u[21], &cospi40, &u[26], &rnding, bit);
3877 u[29] = half_btf_sse4_1(&cospim8, &u[18], &cospi56, &u[29], &rnding, bit);
3878 u[30] = half_btf_sse4_1(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
3879 u[17] = tmp1;
3880 u[18] = tmp2;
3881 u[21] = tmp3;
3882 u[22] = tmp4;
3883
3884 for (i = 32; i < 64; i += 8) {
3885 addsub_sse4_1(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
3886 &clamp_hi);
3887 addsub_sse4_1(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
3888 &clamp_hi);
3889
3890 addsub_sse4_1(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
3891 &clamp_hi);
3892 addsub_sse4_1(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
3893 &clamp_hi);
3894 }
3895
3896 // stage 6
3897 tmp1 = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
3898 u[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
3899 u[0] = tmp1;
3900 u[5] = u[4];
3901 u[6] = u[7];
3902
3903 tmp1 = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
3904 u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
3905 u[9] = tmp1;
3906 tmp2 = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
3907 u[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
3908 u[10] = tmp2;
3909
3910 for (i = 16; i < 32; i += 8) {
3911 addsub_sse4_1(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
3912 &clamp_hi);
3913 addsub_sse4_1(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
3914 &clamp_hi);
3915
3916 addsub_sse4_1(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
3917 &clamp_hi);
3918 addsub_sse4_1(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
3919 &clamp_hi);
3920 }
3921
3922 tmp1 = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
3923 tmp2 = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
3924 tmp3 = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
3925 tmp4 = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
3926 u[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
3927 u[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
3928 u[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
3929 u[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
3930 u[34] = tmp1;
3931 u[35] = tmp2;
3932 u[36] = tmp3;
3933 u[37] = tmp4;
3934
3935 tmp1 = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
3936 tmp2 = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
3937 tmp3 = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
3938 tmp4 = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
3939 u[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
3940 u[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
3941 u[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
3942 u[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
3943 u[42] = tmp1;
3944 u[43] = tmp2;
3945 u[44] = tmp3;
3946 u[45] = tmp4;
3947
3948 // stage 7
3949 u[3] = u[0];
3950 u[2] = u[1];
3951 tmp1 = half_btf_sse4_1(&cospim32, &u[5], &cospi32, &u[6], &rnding, bit);
3952 u[6] = half_btf_sse4_1(&cospi32, &u[5], &cospi32, &u[6], &rnding, bit);
3953 u[5] = tmp1;
3954 addsub_sse4_1(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
3955 addsub_sse4_1(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
3956 addsub_sse4_1(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
3957 addsub_sse4_1(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
3958
3959 tmp1 = half_btf_sse4_1(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
3960 tmp2 = half_btf_sse4_1(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
3961 tmp3 = half_btf_sse4_1(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
3962 tmp4 = half_btf_sse4_1(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
3963 u[26] = half_btf_sse4_1(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
3964 u[27] = half_btf_sse4_1(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
3965 u[28] = half_btf_sse4_1(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
3966 u[29] = half_btf_sse4_1(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
3967 u[18] = tmp1;
3968 u[19] = tmp2;
3969 u[20] = tmp3;
3970 u[21] = tmp4;
3971
3972 for (i = 32; i < 64; i += 16) {
3973 for (j = i; j < i + 4; j++) {
3974 addsub_sse4_1(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
3975 addsub_sse4_1(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
3976 &clamp_hi);
3977 }
3978 }
3979
3980 // stage 8
3981 for (i = 0; i < 4; ++i) {
3982 addsub_sse4_1(u[i], u[7 - i], &u[i], &u[7 - i], &clamp_lo, &clamp_hi);
3983 }
3984
3985 idct64_stage8_sse4_1(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
3986 &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);
3987
3988 // stage 9
3989 idct64_stage9_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
3990 bit);
3991
3992 // stage 10
3993 idct64_stage10_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
3994 bit);
3995
3996 // stage 11
3997 idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, log_range);
3998 }
3999 }
4000
4001 static void idct64x64_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
4002 int bd, int out_shift) {
4003 int i, j;
4004 const int32_t *cospi = cospi_arr(bit);
4005 const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
4006 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
4007 const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
4008 const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
4009
4010 const __m128i cospi1 = _mm_set1_epi32(cospi[1]);
4011 const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
4012 const __m128i cospi3 = _mm_set1_epi32(cospi[3]);
4013 const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
4014 const __m128i cospi5 = _mm_set1_epi32(cospi[5]);
4015 const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
4016 const __m128i cospi7 = _mm_set1_epi32(cospi[7]);
4017 const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
4018 const __m128i cospi9 = _mm_set1_epi32(cospi[9]);
4019 const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
4020 const __m128i cospi11 = _mm_set1_epi32(cospi[11]);
4021 const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
4022 const __m128i cospi13 = _mm_set1_epi32(cospi[13]);
4023 const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
4024 const __m128i cospi15 = _mm_set1_epi32(cospi[15]);
4025 const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
4026 const __m128i cospi17 = _mm_set1_epi32(cospi[17]);
4027 const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
4028 const __m128i cospi19 = _mm_set1_epi32(cospi[19]);
4029 const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
4030 const __m128i cospi21 = _mm_set1_epi32(cospi[21]);
4031 const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
4032 const __m128i cospi23 = _mm_set1_epi32(cospi[23]);
4033 const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
4034 const __m128i cospi25 = _mm_set1_epi32(cospi[25]);
4035 const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
4036 const __m128i cospi27 = _mm_set1_epi32(cospi[27]);
4037 const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
4038 const __m128i cospi29 = _mm_set1_epi32(cospi[29]);
4039 const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
4040 const __m128i cospi31 = _mm_set1_epi32(cospi[31]);
4041 const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
4042 const __m128i cospi35 = _mm_set1_epi32(cospi[35]);
4043 const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
4044 const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
4045 const __m128i cospi39 = _mm_set1_epi32(cospi[39]);
4046 const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
4047 const __m128i cospi43 = _mm_set1_epi32(cospi[43]);
4048 const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
4049 const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
4050 const __m128i cospi47 = _mm_set1_epi32(cospi[47]);
4051 const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
4052 const __m128i cospi51 = _mm_set1_epi32(cospi[51]);
4053 const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
4054 const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
4055 const __m128i cospi55 = _mm_set1_epi32(cospi[55]);
4056 const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
4057 const __m128i cospi59 = _mm_set1_epi32(cospi[59]);
4058 const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
4059 const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
4060 const __m128i cospi63 = _mm_set1_epi32(cospi[63]);
4061
4062 const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
4063 const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
4064 const __m128i cospim12 = _mm_set1_epi32(-cospi[12]);
4065 const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
4066 const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
4067 const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
4068 const __m128i cospim28 = _mm_set1_epi32(-cospi[28]);
4069 const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
4070 const __m128i cospim33 = _mm_set1_epi32(-cospi[33]);
4071 const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
4072 const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
4073 const __m128i cospim37 = _mm_set1_epi32(-cospi[37]);
4074 const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
4075 const __m128i cospim41 = _mm_set1_epi32(-cospi[41]);
4076 const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
4077 const __m128i cospim44 = _mm_set1_epi32(-cospi[44]);
4078 const __m128i cospim45 = _mm_set1_epi32(-cospi[45]);
4079 const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
4080 const __m128i cospim49 = _mm_set1_epi32(-cospi[49]);
4081 const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
4082 const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
4083 const __m128i cospim53 = _mm_set1_epi32(-cospi[53]);
4084 const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
4085 const __m128i cospim57 = _mm_set1_epi32(-cospi[57]);
4086 const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
4087 const __m128i cospim60 = _mm_set1_epi32(-cospi[60]);
4088 const __m128i cospim61 = _mm_set1_epi32(-cospi[61]);
4089
4090 {
4091 __m128i u[64], v[64];
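    // Full 64-point IDCT: u[] and v[] ping-pong between stages. Stage 1
    // permutes the 32 coded coefficients into butterfly order (AV1 zeroes
    // everything outside the top-left 32x32 for 64-point transforms).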
4092
4093 // stage 1
4094 u[32] = in[1];
4095 u[34] = in[17];
4096 u[36] = in[9];
4097 u[38] = in[25];
4098 u[40] = in[5];
4099 u[42] = in[21];
4100 u[44] = in[13];
4101 u[46] = in[29];
4102 u[48] = in[3];
4103 u[50] = in[19];
4104 u[52] = in[11];
4105 u[54] = in[27];
4106 u[56] = in[7];
4107 u[58] = in[23];
4108 u[60] = in[15];
4109 u[62] = in[31];
4110
4111 v[16] = in[2];
4112 v[18] = in[18];
4113 v[20] = in[10];
4114 v[22] = in[26];
4115 v[24] = in[6];
4116 v[26] = in[22];
4117 v[28] = in[14];
4118 v[30] = in[30];
4119
4120 u[8] = in[4];
4121 u[10] = in[20];
4122 u[12] = in[12];
4123 u[14] = in[28];
4124
4125 v[4] = in[8];
4126 v[6] = in[24];
4127
4128 u[0] = in[0];
4129 u[2] = in[16];
4130
4131 // stage 2
4132 v[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit);
4133 v[33] = half_btf_0_sse4_1(&cospim33, &u[62], &rnding, bit);
4134 v[34] = half_btf_0_sse4_1(&cospi47, &u[34], &rnding, bit);
4135 v[35] = half_btf_0_sse4_1(&cospim49, &u[60], &rnding, bit);
4136 v[36] = half_btf_0_sse4_1(&cospi55, &u[36], &rnding, bit);
4137 v[37] = half_btf_0_sse4_1(&cospim41, &u[58], &rnding, bit);
4138 v[38] = half_btf_0_sse4_1(&cospi39, &u[38], &rnding, bit);
4139 v[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit);
4140 v[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit);
4141 v[41] = half_btf_0_sse4_1(&cospim37, &u[54], &rnding, bit);
4142 v[42] = half_btf_0_sse4_1(&cospi43, &u[42], &rnding, bit);
4143 v[43] = half_btf_0_sse4_1(&cospim53, &u[52], &rnding, bit);
4144 v[44] = half_btf_0_sse4_1(&cospi51, &u[44], &rnding, bit);
4145 v[45] = half_btf_0_sse4_1(&cospim45, &u[50], &rnding, bit);
4146 v[46] = half_btf_0_sse4_1(&cospi35, &u[46], &rnding, bit);
4147 v[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit);
4148 v[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit);
4149 v[49] = half_btf_0_sse4_1(&cospi29, &u[46], &rnding, bit);
4150 v[50] = half_btf_0_sse4_1(&cospi19, &u[50], &rnding, bit);
4151 v[51] = half_btf_0_sse4_1(&cospi13, &u[44], &rnding, bit);
4152 v[52] = half_btf_0_sse4_1(&cospi11, &u[52], &rnding, bit);
4153 v[53] = half_btf_0_sse4_1(&cospi21, &u[42], &rnding, bit);
4154 v[54] = half_btf_0_sse4_1(&cospi27, &u[54], &rnding, bit);
4155 v[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit);
4156 v[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit);
4157 v[57] = half_btf_0_sse4_1(&cospi25, &u[38], &rnding, bit);
4158 v[58] = half_btf_0_sse4_1(&cospi23, &u[58], &rnding, bit);
4159 v[59] = half_btf_0_sse4_1(&cospi9, &u[36], &rnding, bit);
4160 v[60] = half_btf_0_sse4_1(&cospi15, &u[60], &rnding, bit);
4161 v[61] = half_btf_0_sse4_1(&cospi17, &u[34], &rnding, bit);
4162 v[62] = half_btf_0_sse4_1(&cospi31, &u[62], &rnding, bit);
4163 v[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit);
4164
4165 // stage 3
4166 u[16] = half_btf_0_sse4_1(&cospi62, &v[16], &rnding, bit);
4167 u[17] = half_btf_0_sse4_1(&cospim34, &v[30], &rnding, bit);
4168 u[18] = half_btf_0_sse4_1(&cospi46, &v[18], &rnding, bit);
4169 u[19] = half_btf_0_sse4_1(&cospim50, &v[28], &rnding, bit);
4170 u[20] = half_btf_0_sse4_1(&cospi54, &v[20], &rnding, bit);
4171 u[21] = half_btf_0_sse4_1(&cospim42, &v[26], &rnding, bit);
4172 u[22] = half_btf_0_sse4_1(&cospi38, &v[22], &rnding, bit);
4173 u[23] = half_btf_0_sse4_1(&cospim58, &v[24], &rnding, bit);
4174 u[24] = half_btf_0_sse4_1(&cospi6, &v[24], &rnding, bit);
4175 u[25] = half_btf_0_sse4_1(&cospi26, &v[22], &rnding, bit);
4176 u[26] = half_btf_0_sse4_1(&cospi22, &v[26], &rnding, bit);
4177 u[27] = half_btf_0_sse4_1(&cospi10, &v[20], &rnding, bit);
4178 u[28] = half_btf_0_sse4_1(&cospi14, &v[28], &rnding, bit);
4179 u[29] = half_btf_0_sse4_1(&cospi18, &v[18], &rnding, bit);
4180 u[30] = half_btf_0_sse4_1(&cospi30, &v[30], &rnding, bit);
4181 u[31] = half_btf_0_sse4_1(&cospi2, &v[16], &rnding, bit);
4182
4183 for (i = 32; i < 64; i += 4) {
4184 addsub_sse4_1(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
4185 &clamp_hi);
4186 addsub_sse4_1(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
4187 &clamp_hi);
4188 }
4189
4190 // stage 4
4191 v[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
4192 v[9] = half_btf_0_sse4_1(&cospim36, &u[14], &rnding, bit);
4193 v[10] = half_btf_0_sse4_1(&cospi44, &u[10], &rnding, bit);
4194 v[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit);
4195 v[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit);
4196 v[13] = half_btf_0_sse4_1(&cospi20, &u[10], &rnding, bit);
4197 v[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit);
4198 v[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
4199
4200 for (i = 16; i < 32; i += 4) {
4201 addsub_sse4_1(u[i + 0], u[i + 1], &v[i + 0], &v[i + 1], &clamp_lo,
4202 &clamp_hi);
4203 addsub_sse4_1(u[i + 3], u[i + 2], &v[i + 3], &v[i + 2], &clamp_lo,
4204 &clamp_hi);
4205 }
4206
4207 for (i = 32; i < 64; i += 4) {
4208 v[i + 0] = u[i + 0];
4209 v[i + 3] = u[i + 3];
4210 }
4211
4212 v[33] = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
4213 v[34] = half_btf_sse4_1(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
4214 v[37] = half_btf_sse4_1(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
4215 v[38] = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
4216 v[41] = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
4217 v[42] = half_btf_sse4_1(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
4218 v[45] = half_btf_sse4_1(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
4219 v[46] = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
4220 v[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
4221 v[50] = half_btf_sse4_1(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
4222 v[53] = half_btf_sse4_1(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
4223 v[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
4224 v[57] = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
4225 v[58] = half_btf_sse4_1(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
4226 v[61] = half_btf_sse4_1(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
4227 v[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
4228
4229 // stage 5
4230 u[4] = half_btf_0_sse4_1(&cospi56, &v[4], &rnding, bit);
4231 u[5] = half_btf_0_sse4_1(&cospim40, &v[6], &rnding, bit);
4232 u[6] = half_btf_0_sse4_1(&cospi24, &v[6], &rnding, bit);
4233 u[7] = half_btf_0_sse4_1(&cospi8, &v[4], &rnding, bit);
4234
4235 for (i = 8; i < 16; i += 4) {
4236 addsub_sse4_1(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
4237 &clamp_hi);
4238 addsub_sse4_1(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
4239 &clamp_hi);
4240 }
4241
4242 for (i = 16; i < 32; i += 4) {
4243 u[i + 0] = v[i + 0];
4244 u[i + 3] = v[i + 3];
4245 }
4246
4247 u[17] = half_btf_sse4_1(&cospim8, &v[17], &cospi56, &v[30], &rnding, bit);
4248 u[18] = half_btf_sse4_1(&cospim56, &v[18], &cospim8, &v[29], &rnding, bit);
4249 u[21] = half_btf_sse4_1(&cospim40, &v[21], &cospi24, &v[26], &rnding, bit);
4250 u[22] = half_btf_sse4_1(&cospim24, &v[22], &cospim40, &v[25], &rnding, bit);
4251 u[25] = half_btf_sse4_1(&cospim40, &v[22], &cospi24, &v[25], &rnding, bit);
4252 u[26] = half_btf_sse4_1(&cospi24, &v[21], &cospi40, &v[26], &rnding, bit);
4253 u[29] = half_btf_sse4_1(&cospim8, &v[18], &cospi56, &v[29], &rnding, bit);
4254 u[30] = half_btf_sse4_1(&cospi56, &v[17], &cospi8, &v[30], &rnding, bit);
4255
4256 for (i = 32; i < 64; i += 8) {
4257 addsub_sse4_1(v[i + 0], v[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
4258 &clamp_hi);
4259 addsub_sse4_1(v[i + 1], v[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
4260 &clamp_hi);
4261
4262 addsub_sse4_1(v[i + 7], v[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
4263 &clamp_hi);
4264 addsub_sse4_1(v[i + 6], v[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
4265 &clamp_hi);
4266 }
4267
4268 // stage 6
4269 v[0] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
4270 v[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
4271 v[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit);
4272 v[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit);
4273
4274 addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
4275 addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
4276
4277 for (i = 8; i < 16; i += 4) {
4278 v[i + 0] = u[i + 0];
4279 v[i + 3] = u[i + 3];
4280 }
4281
4282 v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
4283 v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
4284 v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
4285 v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
4286
4287 for (i = 16; i < 32; i += 8) {
4288 addsub_sse4_1(u[i + 0], u[i + 3], &v[i + 0], &v[i + 3], &clamp_lo,
4289 &clamp_hi);
4290 addsub_sse4_1(u[i + 1], u[i + 2], &v[i + 1], &v[i + 2], &clamp_lo,
4291 &clamp_hi);
4292
4293 addsub_sse4_1(u[i + 7], u[i + 4], &v[i + 7], &v[i + 4], &clamp_lo,
4294 &clamp_hi);
4295 addsub_sse4_1(u[i + 6], u[i + 5], &v[i + 6], &v[i + 5], &clamp_lo,
4296 &clamp_hi);
4297 }
4298
4299 for (i = 32; i < 64; i += 8) {
4300 v[i + 0] = u[i + 0];
4301 v[i + 1] = u[i + 1];
4302 v[i + 6] = u[i + 6];
4303 v[i + 7] = u[i + 7];
4304 }
4305
4306 v[34] = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
4307 v[35] = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
4308 v[36] = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
4309 v[37] = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
4310 v[42] = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
4311 v[43] = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
4312 v[44] = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
4313 v[45] = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
4314 v[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
4315 v[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
4316 v[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
4317 v[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
4318 v[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
4319 v[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
4320 v[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
4321 v[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
4322
4323 // stage 7
4324 addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
4325 addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
4326
4327 u[4] = v[4];
4328 u[7] = v[7];
4329 u[5] = half_btf_sse4_1(&cospim32, &v[5], &cospi32, &v[6], &rnding, bit);
4330 u[6] = half_btf_sse4_1(&cospi32, &v[5], &cospi32, &v[6], &rnding, bit);
4331
4332 addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
4333 addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
4334 addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
4335 addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
4336
4337 for (i = 16; i < 32; i += 8) {
4338 u[i + 0] = v[i + 0];
4339 u[i + 1] = v[i + 1];
4340 u[i + 6] = v[i + 6];
4341 u[i + 7] = v[i + 7];
4342 }
4343
4344 u[18] = half_btf_sse4_1(&cospim16, &v[18], &cospi48, &v[29], &rnding, bit);
4345 u[19] = half_btf_sse4_1(&cospim16, &v[19], &cospi48, &v[28], &rnding, bit);
4346 u[20] = half_btf_sse4_1(&cospim48, &v[20], &cospim16, &v[27], &rnding, bit);
4347 u[21] = half_btf_sse4_1(&cospim48, &v[21], &cospim16, &v[26], &rnding, bit);
4348 u[26] = half_btf_sse4_1(&cospim16, &v[21], &cospi48, &v[26], &rnding, bit);
4349 u[27] = half_btf_sse4_1(&cospim16, &v[20], &cospi48, &v[27], &rnding, bit);
4350 u[28] = half_btf_sse4_1(&cospi48, &v[19], &cospi16, &v[28], &rnding, bit);
4351 u[29] = half_btf_sse4_1(&cospi48, &v[18], &cospi16, &v[29], &rnding, bit);
4352
4353 for (i = 32; i < 64; i += 16) {
4354 for (j = i; j < i + 4; j++) {
4355 addsub_sse4_1(v[j], v[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
4356 addsub_sse4_1(v[j ^ 15], v[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
4357 &clamp_hi);
4358 }
4359 }
4360
4361 // stage 8
4362 for (i = 0; i < 4; ++i) {
4363 addsub_sse4_1(u[i], u[7 - i], &v[i], &v[7 - i], &clamp_lo, &clamp_hi);
4364 }
4365
4366 v[8] = u[8];
4367 v[9] = u[9];
4368 v[14] = u[14];
4369 v[15] = u[15];
4370
4371 v[10] = half_btf_sse4_1(&cospim32, &u[10], &cospi32, &u[13], &rnding, bit);
4372 v[11] = half_btf_sse4_1(&cospim32, &u[11], &cospi32, &u[12], &rnding, bit);
4373 v[12] = half_btf_sse4_1(&cospi32, &u[11], &cospi32, &u[12], &rnding, bit);
4374 v[13] = half_btf_sse4_1(&cospi32, &u[10], &cospi32, &u[13], &rnding, bit);
4375
4376 for (i = 16; i < 20; ++i) {
4377 addsub_sse4_1(u[i], u[i ^ 7], &v[i], &v[i ^ 7], &clamp_lo, &clamp_hi);
4378 addsub_sse4_1(u[i ^ 15], u[i ^ 8], &v[i ^ 15], &v[i ^ 8], &clamp_lo,
4379 &clamp_hi);
4380 }
4381
4382 for (i = 32; i < 36; ++i) {
4383 v[i] = u[i];
4384 v[i + 12] = u[i + 12];
4385 v[i + 16] = u[i + 16];
4386 v[i + 28] = u[i + 28];
4387 }
4388
4389 v[36] = half_btf_sse4_1(&cospim16, &u[36], &cospi48, &u[59], &rnding, bit);
4390 v[37] = half_btf_sse4_1(&cospim16, &u[37], &cospi48, &u[58], &rnding, bit);
4391 v[38] = half_btf_sse4_1(&cospim16, &u[38], &cospi48, &u[57], &rnding, bit);
4392 v[39] = half_btf_sse4_1(&cospim16, &u[39], &cospi48, &u[56], &rnding, bit);
4393 v[40] = half_btf_sse4_1(&cospim48, &u[40], &cospim16, &u[55], &rnding, bit);
4394 v[41] = half_btf_sse4_1(&cospim48, &u[41], &cospim16, &u[54], &rnding, bit);
4395 v[42] = half_btf_sse4_1(&cospim48, &u[42], &cospim16, &u[53], &rnding, bit);
4396 v[43] = half_btf_sse4_1(&cospim48, &u[43], &cospim16, &u[52], &rnding, bit);
4397 v[52] = half_btf_sse4_1(&cospim16, &u[43], &cospi48, &u[52], &rnding, bit);
4398 v[53] = half_btf_sse4_1(&cospim16, &u[42], &cospi48, &u[53], &rnding, bit);
4399 v[54] = half_btf_sse4_1(&cospim16, &u[41], &cospi48, &u[54], &rnding, bit);
4400 v[55] = half_btf_sse4_1(&cospim16, &u[40], &cospi48, &u[55], &rnding, bit);
4401 v[56] = half_btf_sse4_1(&cospi48, &u[39], &cospi16, &u[56], &rnding, bit);
4402 v[57] = half_btf_sse4_1(&cospi48, &u[38], &cospi16, &u[57], &rnding, bit);
4403 v[58] = half_btf_sse4_1(&cospi48, &u[37], &cospi16, &u[58], &rnding, bit);
4404 v[59] = half_btf_sse4_1(&cospi48, &u[36], &cospi16, &u[59], &rnding, bit);
4405
4406 // stage 9
4407 for (i = 0; i < 8; ++i) {
4408 addsub_sse4_1(v[i], v[15 - i], &u[i], &u[15 - i], &clamp_lo, &clamp_hi);
4409 }
4410
4411 for (i = 16; i < 20; ++i) {
4412 u[i] = v[i];
4413 u[i + 12] = v[i + 12];
4414 }
4415
4416 u[20] = half_btf_sse4_1(&cospim32, &v[20], &cospi32, &v[27], &rnding, bit);
4417 u[21] = half_btf_sse4_1(&cospim32, &v[21], &cospi32, &v[26], &rnding, bit);
4418 u[22] = half_btf_sse4_1(&cospim32, &v[22], &cospi32, &v[25], &rnding, bit);
4419 u[23] = half_btf_sse4_1(&cospim32, &v[23], &cospi32, &v[24], &rnding, bit);
4420 u[24] = half_btf_sse4_1(&cospi32, &v[23], &cospi32, &v[24], &rnding, bit);
4421 u[25] = half_btf_sse4_1(&cospi32, &v[22], &cospi32, &v[25], &rnding, bit);
4422 u[26] = half_btf_sse4_1(&cospi32, &v[21], &cospi32, &v[26], &rnding, bit);
4423 u[27] = half_btf_sse4_1(&cospi32, &v[20], &cospi32, &v[27], &rnding, bit);
4424
4425 for (i = 32; i < 40; i++) {
4426 addsub_sse4_1(v[i], v[i ^ 15], &u[i], &u[i ^ 15], &clamp_lo, &clamp_hi);
4427 }
4428
4429 for (i = 48; i < 56; i++) {
4430 addsub_sse4_1(v[i ^ 15], v[i], &u[i ^ 15], &u[i], &clamp_lo, &clamp_hi);
4431 }
4432
4433 // stage 10
4434 for (i = 0; i < 16; i++) {
4435 addsub_sse4_1(u[i], u[31 - i], &v[i], &v[31 - i], &clamp_lo, &clamp_hi);
4436 }
4437
4438 for (i = 32; i < 40; i++) v[i] = u[i];
4439
4440 v[40] = half_btf_sse4_1(&cospim32, &u[40], &cospi32, &u[55], &rnding, bit);
4441 v[41] = half_btf_sse4_1(&cospim32, &u[41], &cospi32, &u[54], &rnding, bit);
4442 v[42] = half_btf_sse4_1(&cospim32, &u[42], &cospi32, &u[53], &rnding, bit);
4443 v[43] = half_btf_sse4_1(&cospim32, &u[43], &cospi32, &u[52], &rnding, bit);
4444 v[44] = half_btf_sse4_1(&cospim32, &u[44], &cospi32, &u[51], &rnding, bit);
4445 v[45] = half_btf_sse4_1(&cospim32, &u[45], &cospi32, &u[50], &rnding, bit);
4446 v[46] = half_btf_sse4_1(&cospim32, &u[46], &cospi32, &u[49], &rnding, bit);
4447 v[47] = half_btf_sse4_1(&cospim32, &u[47], &cospi32, &u[48], &rnding, bit);
4448 v[48] = half_btf_sse4_1(&cospi32, &u[47], &cospi32, &u[48], &rnding, bit);
4449 v[49] = half_btf_sse4_1(&cospi32, &u[46], &cospi32, &u[49], &rnding, bit);
4450 v[50] = half_btf_sse4_1(&cospi32, &u[45], &cospi32, &u[50], &rnding, bit);
4451 v[51] = half_btf_sse4_1(&cospi32, &u[44], &cospi32, &u[51], &rnding, bit);
4452 v[52] = half_btf_sse4_1(&cospi32, &u[43], &cospi32, &u[52], &rnding, bit);
4453 v[53] = half_btf_sse4_1(&cospi32, &u[42], &cospi32, &u[53], &rnding, bit);
4454 v[54] = half_btf_sse4_1(&cospi32, &u[41], &cospi32, &u[54], &rnding, bit);
4455 v[55] = half_btf_sse4_1(&cospi32, &u[40], &cospi32, &u[55], &rnding, bit);
4456
4457 for (i = 56; i < 64; i++) v[i] = u[i];
4458
4459 // stage 11
4460 if (do_cols) {
4461 for (i = 0; i < 32; i++) {
4462 addsub_no_clamp_sse4_1(v[i], v[63 - i], &out[(i)], &out[(63 - i)]);
4463 }
4464 } else {
4465 const int log_range_out = AOMMAX(16, bd + 6);
4466 const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
4467 -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
4468 const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
4469 (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
4470
4471 for (i = 0; i < 32; i++) {
4472 addsub_shift_sse4_1(v[i], v[63 - i], &out[(i)], &out[(63 - i)],
4473 &clamp_lo_out, &clamp_hi_out, out_shift);
4474 }
4475 }
4476 }
4477 }
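
/*
 * A minimal scalar sketch of the per-lane arithmetic used by the stages
 * above. half_btf_scalar_model() is a hypothetical helper added for
 * illustration; it is not part of the libaom API. Each 32-bit lane of
 * half_btf_sse4_1() computes the rounded half-butterfly
 *   out = (w0 * in0 + w1 * in1 + (1 << (bit - 1))) >> bit
 * and half_btf_0_sse4_1() is the one-input special case (w1 == 0). The
 * products are widened to 64 bits here for clarity; the SIMD kernels keep
 * them in 32 bits, which the clamp stages make safe.
 */
static INLINE int32_t half_btf_scalar_model(int32_t w0, int32_t in0,
                                            int32_t w1, int32_t in1, int bit) {
  const int64_t sum = (int64_t)w0 * in0 + (int64_t)w1 * in1;
  return (int32_t)((sum + ((int64_t)1 << (bit - 1))) >> bit);
}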
4478
static void idct32x32_low1_sse4_1(__m128i *in, __m128i *out, int bit,
                                  int do_cols, int bd, int out_shift) {
4481 const int32_t *cospi = cospi_arr(bit);
4482 const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
4483 const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
4484 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
4485 const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
4486 const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
4487 __m128i bf1;
4488
4489 // stage 0
4490 // stage 1
4491 bf1 = in[0];
4492
4493 // stage 2
4494 // stage 3
4495 // stage 4
4496 // stage 5
4497 bf1 = half_btf_0_sse4_1(&cospi32, &bf1, &rounding, bit);
4498
4499 // stage 6
4500 // stage 7
4501 // stage 8
4502 // stage 9
4503 if (do_cols) {
4504 bf1 = _mm_max_epi32(bf1, clamp_lo);
4505 bf1 = _mm_min_epi32(bf1, clamp_hi);
4506 } else {
4507 const int log_range_out = AOMMAX(16, bd + 6);
4508 const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
4509 -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
4510 const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
4511 (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
4512
4513 __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
4514 bf1 = _mm_add_epi32(bf1, offset);
4515 bf1 = _mm_sra_epi32(bf1, _mm_cvtsi32_si128(out_shift));
4516 bf1 = _mm_max_epi32(bf1, clamp_lo_out);
4517 bf1 = _mm_min_epi32(bf1, clamp_hi_out);
4518 }
  for (int i = 0; i < 32; ++i) out[i] = bf1;
4551 }
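
/*
 * An illustrative scalar model of the DC-only kernel above (hypothetical,
 * not used by the codec): with only in[0] nonzero, the 32-point transform
 * collapses to a single half-butterfly by cospi[32] followed by the rounded
 * output shift of the !do_cols branch. Clamping is omitted for brevity.
 */
static INLINE int32_t idct32_dc_scalar_model(int32_t in0, int32_t cospi32,
                                             int bit, int out_shift) {
  const int32_t dc = half_btf_scalar_model(cospi32, in0, 0, 0, bit);
  return (dc + ((1 << out_shift) >> 1)) >> out_shift;
}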
4552
static void idct32x32_low8_sse4_1(__m128i *in, __m128i *out, int bit,
                                  int do_cols, int bd, int out_shift) {
4555 const int32_t *cospi = cospi_arr(bit);
4556 const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
4557 const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
4558 const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
4559 const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
4560 const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
4561 const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
4562 const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
4563 const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
4564 const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
4565 const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
4566 const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
4567 const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
4568 const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
4569 const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
4570 const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
4571 const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
4572 const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
4573 const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
4574 const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
4575 const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
4576 const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
4577 const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
4578 const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
4579 const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
4580 const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
4581 const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
4582 const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
4583 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
4584 const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
4585 const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
4586 __m128i bf1[32];
4587
4588 // stage 0
4589 // stage 1
4590 bf1[0] = in[0];
4591 bf1[4] = in[4];
4592 bf1[8] = in[2];
4593 bf1[12] = in[6];
4594 bf1[16] = in[1];
4595 bf1[20] = in[5];
4596 bf1[24] = in[3];
4597 bf1[28] = in[7];
4598
4599 // stage 2
4600 bf1[31] = half_btf_0_sse4_1(&cospi2, &bf1[16], &rounding, bit);
4601 bf1[16] = half_btf_0_sse4_1(&cospi62, &bf1[16], &rounding, bit);
4602 bf1[19] = half_btf_0_sse4_1(&cospim50, &bf1[28], &rounding, bit);
4603 bf1[28] = half_btf_0_sse4_1(&cospi14, &bf1[28], &rounding, bit);
4604 bf1[27] = half_btf_0_sse4_1(&cospi10, &bf1[20], &rounding, bit);
4605 bf1[20] = half_btf_0_sse4_1(&cospi54, &bf1[20], &rounding, bit);
4606 bf1[23] = half_btf_0_sse4_1(&cospim58, &bf1[24], &rounding, bit);
4607 bf1[24] = half_btf_0_sse4_1(&cospi6, &bf1[24], &rounding, bit);
4608
4609 // stage 3
4610 bf1[15] = half_btf_0_sse4_1(&cospi4, &bf1[8], &rounding, bit);
4611 bf1[8] = half_btf_0_sse4_1(&cospi60, &bf1[8], &rounding, bit);
4612
4613 bf1[11] = half_btf_0_sse4_1(&cospim52, &bf1[12], &rounding, bit);
4614 bf1[12] = half_btf_0_sse4_1(&cospi12, &bf1[12], &rounding, bit);
4615 bf1[17] = bf1[16];
4616 bf1[18] = bf1[19];
4617 bf1[21] = bf1[20];
4618 bf1[22] = bf1[23];
4619 bf1[25] = bf1[24];
4620 bf1[26] = bf1[27];
4621 bf1[29] = bf1[28];
4622 bf1[30] = bf1[31];
4623
  // stage 4
4625 bf1[7] = half_btf_0_sse4_1(&cospi8, &bf1[4], &rounding, bit);
4626 bf1[4] = half_btf_0_sse4_1(&cospi56, &bf1[4], &rounding, bit);
4627
4628 bf1[9] = bf1[8];
4629 bf1[10] = bf1[11];
4630 bf1[13] = bf1[12];
4631 bf1[14] = bf1[15];
4632
4633 idct32_stage4_sse4_1(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
4634 &cospi24, &cospi40, &cospim24, &rounding, bit);
4635
4636 // stage 5
4637 bf1[0] = half_btf_0_sse4_1(&cospi32, &bf1[0], &rounding, bit);
4638 bf1[1] = bf1[0];
4639 bf1[5] = bf1[4];
4640 bf1[6] = bf1[7];
4641
4642 idct32_stage5_sse4_1(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
4643 &clamp_hi, &rounding, bit);
4644
4645 // stage 6
4646 bf1[3] = bf1[0];
4647 bf1[2] = bf1[1];
4648
4649 idct32_stage6_sse4_1(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
4650 &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);
4651
4652 // stage 7
4653 idct32_stage7_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
4654 &rounding, bit);
4655
4656 // stage 8
4657 idct32_stage8_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
4658 &rounding, bit);
4659
4660 // stage 9
4661 idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, log_range);
4662 }
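
/*
 * Note: the _low8 kernel above assumes at most the first 8 input
 * coefficients of the 1-D transform are nonzero (small eob). Stage 1
 * therefore loads only the bit-reversed slots 0/4/8/12/16/20/24/28, and the
 * stage-2/3 butterflies degenerate into the one-input half_btf_0_sse4_1()
 * form.
 */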
4663
static void idct32x32_low16_sse4_1(__m128i *in, __m128i *out, int bit,
                                   int do_cols, int bd, int out_shift) {
4666 const int32_t *cospi = cospi_arr(bit);
4667 const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
4668 const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
4669 const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
4670 const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
4671 const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
4672 const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
4673 const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
4674 const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
4675 const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
4676 const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
4677 const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
4678 const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
4679 const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
4680 const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
4681 const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
4682 const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
4683 const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
4684 const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
4685 const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
4686 const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
4687 const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
4688 const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
4689 const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
4690 const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
4691 const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
4692 const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
4693 const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
4694 const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
4695 const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
4696 const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
4697 const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
4698 const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
4699 const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
4700 const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
4701 const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
4702 const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
4703 const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
4704 const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
4705 const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
4706 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
4707 const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
4708 const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
4709 __m128i bf1[32];
4710
4711 // stage 0
4712 // stage 1
4713
4714 bf1[0] = in[0];
4715 bf1[2] = in[8];
4716 bf1[4] = in[4];
4717 bf1[6] = in[12];
4718 bf1[8] = in[2];
4719 bf1[10] = in[10];
4720 bf1[12] = in[6];
4721 bf1[14] = in[14];
4722 bf1[16] = in[1];
4723 bf1[18] = in[9];
4724 bf1[20] = in[5];
4725 bf1[22] = in[13];
4726 bf1[24] = in[3];
4727 bf1[26] = in[11];
4728 bf1[28] = in[7];
4729 bf1[30] = in[15];
4730
4731 // stage 2
4732 bf1[31] = half_btf_0_sse4_1(&cospi2, &bf1[16], &rounding, bit);
4733 bf1[16] = half_btf_0_sse4_1(&cospi62, &bf1[16], &rounding, bit);
4734 bf1[17] = half_btf_0_sse4_1(&cospim34, &bf1[30], &rounding, bit);
4735 bf1[30] = half_btf_0_sse4_1(&cospi30, &bf1[30], &rounding, bit);
4736 bf1[29] = half_btf_0_sse4_1(&cospi18, &bf1[18], &rounding, bit);
4737 bf1[18] = half_btf_0_sse4_1(&cospi46, &bf1[18], &rounding, bit);
4738 bf1[19] = half_btf_0_sse4_1(&cospim50, &bf1[28], &rounding, bit);
4739 bf1[28] = half_btf_0_sse4_1(&cospi14, &bf1[28], &rounding, bit);
4740 bf1[27] = half_btf_0_sse4_1(&cospi10, &bf1[20], &rounding, bit);
4741 bf1[20] = half_btf_0_sse4_1(&cospi54, &bf1[20], &rounding, bit);
4742 bf1[21] = half_btf_0_sse4_1(&cospim42, &bf1[26], &rounding, bit);
4743 bf1[26] = half_btf_0_sse4_1(&cospi22, &bf1[26], &rounding, bit);
4744 bf1[25] = half_btf_0_sse4_1(&cospi26, &bf1[22], &rounding, bit);
4745 bf1[22] = half_btf_0_sse4_1(&cospi38, &bf1[22], &rounding, bit);
4746 bf1[23] = half_btf_0_sse4_1(&cospim58, &bf1[24], &rounding, bit);
4747 bf1[24] = half_btf_0_sse4_1(&cospi6, &bf1[24], &rounding, bit);
4748
4749 // stage 3
4750 bf1[15] = half_btf_0_sse4_1(&cospi4, &bf1[8], &rounding, bit);
4751 bf1[8] = half_btf_0_sse4_1(&cospi60, &bf1[8], &rounding, bit);
4752 bf1[9] = half_btf_0_sse4_1(&cospim36, &bf1[14], &rounding, bit);
4753 bf1[14] = half_btf_0_sse4_1(&cospi28, &bf1[14], &rounding, bit);
4754 bf1[13] = half_btf_0_sse4_1(&cospi20, &bf1[10], &rounding, bit);
4755 bf1[10] = half_btf_0_sse4_1(&cospi44, &bf1[10], &rounding, bit);
4756 bf1[11] = half_btf_0_sse4_1(&cospim52, &bf1[12], &rounding, bit);
4757 bf1[12] = half_btf_0_sse4_1(&cospi12, &bf1[12], &rounding, bit);
4758
4759 addsub_sse4_1(bf1[16], bf1[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
4760 addsub_sse4_1(bf1[19], bf1[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
4761 addsub_sse4_1(bf1[20], bf1[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
4762 addsub_sse4_1(bf1[23], bf1[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
4763 addsub_sse4_1(bf1[24], bf1[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
4764 addsub_sse4_1(bf1[27], bf1[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
4765 addsub_sse4_1(bf1[28], bf1[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
4766 addsub_sse4_1(bf1[31], bf1[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
4767 // stage 4
4768 bf1[7] = half_btf_0_sse4_1(&cospi8, &bf1[4], &rounding, bit);
4769 bf1[4] = half_btf_0_sse4_1(&cospi56, &bf1[4], &rounding, bit);
4770 bf1[5] = half_btf_0_sse4_1(&cospim40, &bf1[6], &rounding, bit);
4771 bf1[6] = half_btf_0_sse4_1(&cospi24, &bf1[6], &rounding, bit);
4772
4773 addsub_sse4_1(bf1[8], bf1[9], bf1 + 8, bf1 + 9, &clamp_lo, &clamp_hi);
4774 addsub_sse4_1(bf1[11], bf1[10], bf1 + 11, bf1 + 10, &clamp_lo, &clamp_hi);
4775 addsub_sse4_1(bf1[12], bf1[13], bf1 + 12, bf1 + 13, &clamp_lo, &clamp_hi);
4776 addsub_sse4_1(bf1[15], bf1[14], bf1 + 15, bf1 + 14, &clamp_lo, &clamp_hi);
4777
4778 idct32_stage4_sse4_1(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
4779 &cospi24, &cospi40, &cospim24, &rounding, bit);
4780
4781 // stage 5
4782 bf1[0] = half_btf_0_sse4_1(&cospi32, &bf1[0], &rounding, bit);
4783 bf1[1] = bf1[0];
4784 bf1[3] = half_btf_0_sse4_1(&cospi16, &bf1[2], &rounding, bit);
4785 bf1[2] = half_btf_0_sse4_1(&cospi48, &bf1[2], &rounding, bit);
4786
4787 addsub_sse4_1(bf1[4], bf1[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
4788 addsub_sse4_1(bf1[7], bf1[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
4789
4790 idct32_stage5_sse4_1(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
4791 &clamp_hi, &rounding, bit);
4792
4793 // stage 6
4794 addsub_sse4_1(bf1[0], bf1[3], bf1 + 0, bf1 + 3, &clamp_lo, &clamp_hi);
4795 addsub_sse4_1(bf1[1], bf1[2], bf1 + 1, bf1 + 2, &clamp_lo, &clamp_hi);
4796
4797 idct32_stage6_sse4_1(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
4798 &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);
4799
4800 // stage 7
4801 idct32_stage7_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
4802 &rounding, bit);
4803
4804 // stage 8
4805 idct32_stage8_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
4806 &rounding, bit);
4807
4808 // stage 9
4809 idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, log_range);
4810 }
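
/*
 * Stage 1 of the _low16 kernel above (and of the full kernel below) is the
 * 5-bit bit-reversal permutation bf1[j] = in[bitrev5(j)]. bitrev5_model()
 * spells that out; it is a hypothetical helper for illustration only.
 */
static INLINE int bitrev5_model(int j) {
  int r = 0;
  for (int b = 0; b < 5; ++b) r |= ((j >> b) & 1) << (4 - b);
  return r;
}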
4811
static void idct32x32_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                             int bd, int out_shift) {
4814 const int32_t *cospi = cospi_arr(bit);
4815 const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
4816 const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
4817 const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
4818 const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
4819 const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
4820 const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
4821 const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
4822 const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
4823 const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
4824 const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
4825 const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
4826 const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
4827 const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
4828 const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
4829 const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
4830 const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
4831 const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
4832 const __m128i cospim26 = _mm_set1_epi32(-cospi[26]);
4833 const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
4834 const __m128i cospim10 = _mm_set1_epi32(-cospi[10]);
4835 const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
4836 const __m128i cospim18 = _mm_set1_epi32(-cospi[18]);
4837 const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
4838 const __m128i cospim2 = _mm_set1_epi32(-cospi[2]);
4839 const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
4840 const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
4841 const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
4842 const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
4843 const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
4844 const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
4845 const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
4846 const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
4847 const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
4848 const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
4849 const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
4850 const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
4851 const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
4852 const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
4853 const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
4854 const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
4855 const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
4856 const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
4857 const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
4858 const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
4859 const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
4860 const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
4861 const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
4862 const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
4863 const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
4864 const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
4865 const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
4866 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
4867 const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
4868 const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
4869 __m128i bf1[32], bf0[32];
4870
4871 // stage 0
4872 // stage 1
4873 bf1[0] = in[0];
4874 bf1[1] = in[16];
4875 bf1[2] = in[8];
4876 bf1[3] = in[24];
4877 bf1[4] = in[4];
4878 bf1[5] = in[20];
4879 bf1[6] = in[12];
4880 bf1[7] = in[28];
4881 bf1[8] = in[2];
4882 bf1[9] = in[18];
4883 bf1[10] = in[10];
4884 bf1[11] = in[26];
4885 bf1[12] = in[6];
4886 bf1[13] = in[22];
4887 bf1[14] = in[14];
4888 bf1[15] = in[30];
4889 bf1[16] = in[1];
4890 bf1[17] = in[17];
4891 bf1[18] = in[9];
4892 bf1[19] = in[25];
4893 bf1[20] = in[5];
4894 bf1[21] = in[21];
4895 bf1[22] = in[13];
4896 bf1[23] = in[29];
4897 bf1[24] = in[3];
4898 bf1[25] = in[19];
4899 bf1[26] = in[11];
4900 bf1[27] = in[27];
4901 bf1[28] = in[7];
4902 bf1[29] = in[23];
4903 bf1[30] = in[15];
4904 bf1[31] = in[31];
4905
4906 // stage 2
4907 bf0[0] = bf1[0];
4908 bf0[1] = bf1[1];
4909 bf0[2] = bf1[2];
4910 bf0[3] = bf1[3];
4911 bf0[4] = bf1[4];
4912 bf0[5] = bf1[5];
4913 bf0[6] = bf1[6];
4914 bf0[7] = bf1[7];
4915 bf0[8] = bf1[8];
4916 bf0[9] = bf1[9];
4917 bf0[10] = bf1[10];
4918 bf0[11] = bf1[11];
4919 bf0[12] = bf1[12];
4920 bf0[13] = bf1[13];
4921 bf0[14] = bf1[14];
4922 bf0[15] = bf1[15];
4923 bf0[16] =
4924 half_btf_sse4_1(&cospi62, &bf1[16], &cospim2, &bf1[31], &rounding, bit);
4925 bf0[17] =
4926 half_btf_sse4_1(&cospi30, &bf1[17], &cospim34, &bf1[30], &rounding, bit);
4927 bf0[18] =
4928 half_btf_sse4_1(&cospi46, &bf1[18], &cospim18, &bf1[29], &rounding, bit);
4929 bf0[19] =
4930 half_btf_sse4_1(&cospi14, &bf1[19], &cospim50, &bf1[28], &rounding, bit);
4931 bf0[20] =
4932 half_btf_sse4_1(&cospi54, &bf1[20], &cospim10, &bf1[27], &rounding, bit);
4933 bf0[21] =
4934 half_btf_sse4_1(&cospi22, &bf1[21], &cospim42, &bf1[26], &rounding, bit);
4935 bf0[22] =
4936 half_btf_sse4_1(&cospi38, &bf1[22], &cospim26, &bf1[25], &rounding, bit);
4937 bf0[23] =
4938 half_btf_sse4_1(&cospi6, &bf1[23], &cospim58, &bf1[24], &rounding, bit);
4939 bf0[24] =
4940 half_btf_sse4_1(&cospi58, &bf1[23], &cospi6, &bf1[24], &rounding, bit);
4941 bf0[25] =
4942 half_btf_sse4_1(&cospi26, &bf1[22], &cospi38, &bf1[25], &rounding, bit);
4943 bf0[26] =
4944 half_btf_sse4_1(&cospi42, &bf1[21], &cospi22, &bf1[26], &rounding, bit);
4945 bf0[27] =
4946 half_btf_sse4_1(&cospi10, &bf1[20], &cospi54, &bf1[27], &rounding, bit);
4947 bf0[28] =
4948 half_btf_sse4_1(&cospi50, &bf1[19], &cospi14, &bf1[28], &rounding, bit);
4949 bf0[29] =
4950 half_btf_sse4_1(&cospi18, &bf1[18], &cospi46, &bf1[29], &rounding, bit);
4951 bf0[30] =
4952 half_btf_sse4_1(&cospi34, &bf1[17], &cospi30, &bf1[30], &rounding, bit);
4953 bf0[31] =
4954 half_btf_sse4_1(&cospi2, &bf1[16], &cospi62, &bf1[31], &rounding, bit);
4955
4956 // stage 3
4957 bf1[0] = bf0[0];
4958 bf1[1] = bf0[1];
4959 bf1[2] = bf0[2];
4960 bf1[3] = bf0[3];
4961 bf1[4] = bf0[4];
4962 bf1[5] = bf0[5];
4963 bf1[6] = bf0[6];
4964 bf1[7] = bf0[7];
4965 bf1[8] =
4966 half_btf_sse4_1(&cospi60, &bf0[8], &cospim4, &bf0[15], &rounding, bit);
4967 bf1[9] =
4968 half_btf_sse4_1(&cospi28, &bf0[9], &cospim36, &bf0[14], &rounding, bit);
4969 bf1[10] =
4970 half_btf_sse4_1(&cospi44, &bf0[10], &cospim20, &bf0[13], &rounding, bit);
4971 bf1[11] =
4972 half_btf_sse4_1(&cospi12, &bf0[11], &cospim52, &bf0[12], &rounding, bit);
4973 bf1[12] =
4974 half_btf_sse4_1(&cospi52, &bf0[11], &cospi12, &bf0[12], &rounding, bit);
4975 bf1[13] =
4976 half_btf_sse4_1(&cospi20, &bf0[10], &cospi44, &bf0[13], &rounding, bit);
4977 bf1[14] =
4978 half_btf_sse4_1(&cospi36, &bf0[9], &cospi28, &bf0[14], &rounding, bit);
4979 bf1[15] =
4980 half_btf_sse4_1(&cospi4, &bf0[8], &cospi60, &bf0[15], &rounding, bit);
4981
4982 addsub_sse4_1(bf0[16], bf0[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
4983 addsub_sse4_1(bf0[19], bf0[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
4984 addsub_sse4_1(bf0[20], bf0[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
4985 addsub_sse4_1(bf0[23], bf0[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
4986 addsub_sse4_1(bf0[24], bf0[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
4987 addsub_sse4_1(bf0[27], bf0[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
4988 addsub_sse4_1(bf0[28], bf0[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
4989 addsub_sse4_1(bf0[31], bf0[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
4990
4991 // stage 4
4992 bf0[0] = bf1[0];
4993 bf0[1] = bf1[1];
4994 bf0[2] = bf1[2];
4995 bf0[3] = bf1[3];
4996 bf0[4] =
4997 half_btf_sse4_1(&cospi56, &bf1[4], &cospim8, &bf1[7], &rounding, bit);
4998 bf0[5] =
4999 half_btf_sse4_1(&cospi24, &bf1[5], &cospim40, &bf1[6], &rounding, bit);
5000 bf0[6] =
5001 half_btf_sse4_1(&cospi40, &bf1[5], &cospi24, &bf1[6], &rounding, bit);
5002 bf0[7] = half_btf_sse4_1(&cospi8, &bf1[4], &cospi56, &bf1[7], &rounding, bit);
5003
5004 addsub_sse4_1(bf1[8], bf1[9], bf0 + 8, bf0 + 9, &clamp_lo, &clamp_hi);
5005 addsub_sse4_1(bf1[11], bf1[10], bf0 + 11, bf0 + 10, &clamp_lo, &clamp_hi);
5006 addsub_sse4_1(bf1[12], bf1[13], bf0 + 12, bf0 + 13, &clamp_lo, &clamp_hi);
5007 addsub_sse4_1(bf1[15], bf1[14], bf0 + 15, bf0 + 14, &clamp_lo, &clamp_hi);
5008
5009 bf0[16] = bf1[16];
5010 bf0[17] =
5011 half_btf_sse4_1(&cospim8, &bf1[17], &cospi56, &bf1[30], &rounding, bit);
5012 bf0[18] =
5013 half_btf_sse4_1(&cospim56, &bf1[18], &cospim8, &bf1[29], &rounding, bit);
5014 bf0[19] = bf1[19];
5015 bf0[20] = bf1[20];
5016 bf0[21] =
5017 half_btf_sse4_1(&cospim40, &bf1[21], &cospi24, &bf1[26], &rounding, bit);
5018 bf0[22] =
5019 half_btf_sse4_1(&cospim24, &bf1[22], &cospim40, &bf1[25], &rounding, bit);
5020 bf0[23] = bf1[23];
5021 bf0[24] = bf1[24];
5022 bf0[25] =
5023 half_btf_sse4_1(&cospim40, &bf1[22], &cospi24, &bf1[25], &rounding, bit);
5024 bf0[26] =
5025 half_btf_sse4_1(&cospi24, &bf1[21], &cospi40, &bf1[26], &rounding, bit);
5026 bf0[27] = bf1[27];
5027 bf0[28] = bf1[28];
5028 bf0[29] =
5029 half_btf_sse4_1(&cospim8, &bf1[18], &cospi56, &bf1[29], &rounding, bit);
5030 bf0[30] =
5031 half_btf_sse4_1(&cospi56, &bf1[17], &cospi8, &bf1[30], &rounding, bit);
5032 bf0[31] = bf1[31];
5033
5034 // stage 5
5035 bf1[0] =
5036 half_btf_sse4_1(&cospi32, &bf0[0], &cospi32, &bf0[1], &rounding, bit);
5037 bf1[1] =
5038 half_btf_sse4_1(&cospi32, &bf0[0], &cospim32, &bf0[1], &rounding, bit);
5039 bf1[2] =
5040 half_btf_sse4_1(&cospi48, &bf0[2], &cospim16, &bf0[3], &rounding, bit);
5041 bf1[3] =
5042 half_btf_sse4_1(&cospi16, &bf0[2], &cospi48, &bf0[3], &rounding, bit);
5043 addsub_sse4_1(bf0[4], bf0[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
5044 addsub_sse4_1(bf0[7], bf0[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
5045 bf1[8] = bf0[8];
5046 bf1[9] =
5047 half_btf_sse4_1(&cospim16, &bf0[9], &cospi48, &bf0[14], &rounding, bit);
5048 bf1[10] =
5049 half_btf_sse4_1(&cospim48, &bf0[10], &cospim16, &bf0[13], &rounding, bit);
5050 bf1[11] = bf0[11];
5051 bf1[12] = bf0[12];
5052 bf1[13] =
5053 half_btf_sse4_1(&cospim16, &bf0[10], &cospi48, &bf0[13], &rounding, bit);
5054 bf1[14] =
5055 half_btf_sse4_1(&cospi48, &bf0[9], &cospi16, &bf0[14], &rounding, bit);
5056 bf1[15] = bf0[15];
5057 addsub_sse4_1(bf0[16], bf0[19], bf1 + 16, bf1 + 19, &clamp_lo, &clamp_hi);
5058 addsub_sse4_1(bf0[17], bf0[18], bf1 + 17, bf1 + 18, &clamp_lo, &clamp_hi);
5059 addsub_sse4_1(bf0[23], bf0[20], bf1 + 23, bf1 + 20, &clamp_lo, &clamp_hi);
5060 addsub_sse4_1(bf0[22], bf0[21], bf1 + 22, bf1 + 21, &clamp_lo, &clamp_hi);
5061 addsub_sse4_1(bf0[24], bf0[27], bf1 + 24, bf1 + 27, &clamp_lo, &clamp_hi);
5062 addsub_sse4_1(bf0[25], bf0[26], bf1 + 25, bf1 + 26, &clamp_lo, &clamp_hi);
5063 addsub_sse4_1(bf0[31], bf0[28], bf1 + 31, bf1 + 28, &clamp_lo, &clamp_hi);
5064 addsub_sse4_1(bf0[30], bf0[29], bf1 + 30, bf1 + 29, &clamp_lo, &clamp_hi);
5065
5066 // stage 6
5067 addsub_sse4_1(bf1[0], bf1[3], bf0 + 0, bf0 + 3, &clamp_lo, &clamp_hi);
5068 addsub_sse4_1(bf1[1], bf1[2], bf0 + 1, bf0 + 2, &clamp_lo, &clamp_hi);
5069 bf0[4] = bf1[4];
5070 bf0[5] =
5071 half_btf_sse4_1(&cospim32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
5072 bf0[6] =
5073 half_btf_sse4_1(&cospi32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
5074 bf0[7] = bf1[7];
5075 addsub_sse4_1(bf1[8], bf1[11], bf0 + 8, bf0 + 11, &clamp_lo, &clamp_hi);
5076 addsub_sse4_1(bf1[9], bf1[10], bf0 + 9, bf0 + 10, &clamp_lo, &clamp_hi);
5077 addsub_sse4_1(bf1[15], bf1[12], bf0 + 15, bf0 + 12, &clamp_lo, &clamp_hi);
5078 addsub_sse4_1(bf1[14], bf1[13], bf0 + 14, bf0 + 13, &clamp_lo, &clamp_hi);
5079 bf0[16] = bf1[16];
5080 bf0[17] = bf1[17];
5081 bf0[18] =
5082 half_btf_sse4_1(&cospim16, &bf1[18], &cospi48, &bf1[29], &rounding, bit);
5083 bf0[19] =
5084 half_btf_sse4_1(&cospim16, &bf1[19], &cospi48, &bf1[28], &rounding, bit);
5085 bf0[20] =
5086 half_btf_sse4_1(&cospim48, &bf1[20], &cospim16, &bf1[27], &rounding, bit);
5087 bf0[21] =
5088 half_btf_sse4_1(&cospim48, &bf1[21], &cospim16, &bf1[26], &rounding, bit);
5089 bf0[22] = bf1[22];
5090 bf0[23] = bf1[23];
5091 bf0[24] = bf1[24];
5092 bf0[25] = bf1[25];
5093 bf0[26] =
5094 half_btf_sse4_1(&cospim16, &bf1[21], &cospi48, &bf1[26], &rounding, bit);
5095 bf0[27] =
5096 half_btf_sse4_1(&cospim16, &bf1[20], &cospi48, &bf1[27], &rounding, bit);
5097 bf0[28] =
5098 half_btf_sse4_1(&cospi48, &bf1[19], &cospi16, &bf1[28], &rounding, bit);
5099 bf0[29] =
5100 half_btf_sse4_1(&cospi48, &bf1[18], &cospi16, &bf1[29], &rounding, bit);
5101 bf0[30] = bf1[30];
5102 bf0[31] = bf1[31];
5103
5104 // stage 7
5105 addsub_sse4_1(bf0[0], bf0[7], bf1 + 0, bf1 + 7, &clamp_lo, &clamp_hi);
5106 addsub_sse4_1(bf0[1], bf0[6], bf1 + 1, bf1 + 6, &clamp_lo, &clamp_hi);
5107 addsub_sse4_1(bf0[2], bf0[5], bf1 + 2, bf1 + 5, &clamp_lo, &clamp_hi);
5108 addsub_sse4_1(bf0[3], bf0[4], bf1 + 3, bf1 + 4, &clamp_lo, &clamp_hi);
5109 bf1[8] = bf0[8];
5110 bf1[9] = bf0[9];
5111 bf1[10] =
5112 half_btf_sse4_1(&cospim32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
5113 bf1[11] =
5114 half_btf_sse4_1(&cospim32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
5115 bf1[12] =
5116 half_btf_sse4_1(&cospi32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
5117 bf1[13] =
5118 half_btf_sse4_1(&cospi32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
5119 bf1[14] = bf0[14];
5120 bf1[15] = bf0[15];
5121 addsub_sse4_1(bf0[16], bf0[23], bf1 + 16, bf1 + 23, &clamp_lo, &clamp_hi);
5122 addsub_sse4_1(bf0[17], bf0[22], bf1 + 17, bf1 + 22, &clamp_lo, &clamp_hi);
5123 addsub_sse4_1(bf0[18], bf0[21], bf1 + 18, bf1 + 21, &clamp_lo, &clamp_hi);
5124 addsub_sse4_1(bf0[19], bf0[20], bf1 + 19, bf1 + 20, &clamp_lo, &clamp_hi);
5125 addsub_sse4_1(bf0[31], bf0[24], bf1 + 31, bf1 + 24, &clamp_lo, &clamp_hi);
5126 addsub_sse4_1(bf0[30], bf0[25], bf1 + 30, bf1 + 25, &clamp_lo, &clamp_hi);
5127 addsub_sse4_1(bf0[29], bf0[26], bf1 + 29, bf1 + 26, &clamp_lo, &clamp_hi);
5128 addsub_sse4_1(bf0[28], bf0[27], bf1 + 28, bf1 + 27, &clamp_lo, &clamp_hi);
5129
5130 // stage 8
5131 addsub_sse4_1(bf1[0], bf1[15], bf0 + 0, bf0 + 15, &clamp_lo, &clamp_hi);
5132 addsub_sse4_1(bf1[1], bf1[14], bf0 + 1, bf0 + 14, &clamp_lo, &clamp_hi);
5133 addsub_sse4_1(bf1[2], bf1[13], bf0 + 2, bf0 + 13, &clamp_lo, &clamp_hi);
5134 addsub_sse4_1(bf1[3], bf1[12], bf0 + 3, bf0 + 12, &clamp_lo, &clamp_hi);
5135 addsub_sse4_1(bf1[4], bf1[11], bf0 + 4, bf0 + 11, &clamp_lo, &clamp_hi);
5136 addsub_sse4_1(bf1[5], bf1[10], bf0 + 5, bf0 + 10, &clamp_lo, &clamp_hi);
5137 addsub_sse4_1(bf1[6], bf1[9], bf0 + 6, bf0 + 9, &clamp_lo, &clamp_hi);
5138 addsub_sse4_1(bf1[7], bf1[8], bf0 + 7, bf0 + 8, &clamp_lo, &clamp_hi);
5139 bf0[16] = bf1[16];
5140 bf0[17] = bf1[17];
5141 bf0[18] = bf1[18];
5142 bf0[19] = bf1[19];
5143 bf0[20] =
5144 half_btf_sse4_1(&cospim32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
5145 bf0[21] =
5146 half_btf_sse4_1(&cospim32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
5147 bf0[22] =
5148 half_btf_sse4_1(&cospim32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
5149 bf0[23] =
5150 half_btf_sse4_1(&cospim32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
5151 bf0[24] =
5152 half_btf_sse4_1(&cospi32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
5153 bf0[25] =
5154 half_btf_sse4_1(&cospi32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
5155 bf0[26] =
5156 half_btf_sse4_1(&cospi32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
5157 bf0[27] =
5158 half_btf_sse4_1(&cospi32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
5159 bf0[28] = bf1[28];
5160 bf0[29] = bf1[29];
5161 bf0[30] = bf1[30];
5162 bf0[31] = bf1[31];
5163
5164 // stage 9
5165 if (do_cols) {
5166 addsub_no_clamp_sse4_1(bf0[0], bf0[31], out + 0, out + 31);
5167 addsub_no_clamp_sse4_1(bf0[1], bf0[30], out + 1, out + 30);
5168 addsub_no_clamp_sse4_1(bf0[2], bf0[29], out + 2, out + 29);
5169 addsub_no_clamp_sse4_1(bf0[3], bf0[28], out + 3, out + 28);
5170 addsub_no_clamp_sse4_1(bf0[4], bf0[27], out + 4, out + 27);
5171 addsub_no_clamp_sse4_1(bf0[5], bf0[26], out + 5, out + 26);
5172 addsub_no_clamp_sse4_1(bf0[6], bf0[25], out + 6, out + 25);
5173 addsub_no_clamp_sse4_1(bf0[7], bf0[24], out + 7, out + 24);
5174 addsub_no_clamp_sse4_1(bf0[8], bf0[23], out + 8, out + 23);
5175 addsub_no_clamp_sse4_1(bf0[9], bf0[22], out + 9, out + 22);
5176 addsub_no_clamp_sse4_1(bf0[10], bf0[21], out + 10, out + 21);
5177 addsub_no_clamp_sse4_1(bf0[11], bf0[20], out + 11, out + 20);
5178 addsub_no_clamp_sse4_1(bf0[12], bf0[19], out + 12, out + 19);
5179 addsub_no_clamp_sse4_1(bf0[13], bf0[18], out + 13, out + 18);
5180 addsub_no_clamp_sse4_1(bf0[14], bf0[17], out + 14, out + 17);
5181 addsub_no_clamp_sse4_1(bf0[15], bf0[16], out + 15, out + 16);
5182 } else {
5183 const int log_range_out = AOMMAX(16, bd + 6);
5184 const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
5185 -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
5186 const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
5187 (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
5188
5189 addsub_shift_sse4_1(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo_out,
5190 &clamp_hi_out, out_shift);
5191 addsub_shift_sse4_1(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo_out,
5192 &clamp_hi_out, out_shift);
5193 addsub_shift_sse4_1(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo_out,
5194 &clamp_hi_out, out_shift);
5195 addsub_shift_sse4_1(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo_out,
5196 &clamp_hi_out, out_shift);
5197 addsub_shift_sse4_1(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo_out,
5198 &clamp_hi_out, out_shift);
5199 addsub_shift_sse4_1(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo_out,
5200 &clamp_hi_out, out_shift);
5201 addsub_shift_sse4_1(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo_out,
5202 &clamp_hi_out, out_shift);
5203 addsub_shift_sse4_1(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo_out,
5204 &clamp_hi_out, out_shift);
5205 addsub_shift_sse4_1(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo_out,
5206 &clamp_hi_out, out_shift);
5207 addsub_shift_sse4_1(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo_out,
5208 &clamp_hi_out, out_shift);
5209 addsub_shift_sse4_1(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo_out,
5210 &clamp_hi_out, out_shift);
5211 addsub_shift_sse4_1(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo_out,
5212 &clamp_hi_out, out_shift);
5213 addsub_shift_sse4_1(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo_out,
5214 &clamp_hi_out, out_shift);
5215 addsub_shift_sse4_1(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo_out,
5216 &clamp_hi_out, out_shift);
5217 addsub_shift_sse4_1(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo_out,
5218 &clamp_hi_out, out_shift);
5219 addsub_shift_sse4_1(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo_out,
5220 &clamp_hi_out, out_shift);
5221 }
5222 }
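
/*
 * Note on ranges: each kernel clamps intermediates to
 * log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)) bits. For bd == 10, a row
 * pass (do_cols == 0) clamps to [-(1 << 17), (1 << 17) - 1], while the
 * final output is confined to the AOMMAX(16, bd + 6)-bit range.
 */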
5223
void av1_highbd_inv_txfm_add_8x8_sse4_1(const tran_low_t *input, uint8_t *dest,
                                        int stride,
                                        const TxfmParam *txfm_param) {
5227 int bd = txfm_param->bd;
5228 const TX_TYPE tx_type = txfm_param->tx_type;
5229 const int32_t *src = cast_to_int32(input);
5230 switch (tx_type) {
5231 case IDTX:
5232 case H_DCT:
5233 case H_ADST:
5234 case H_FLIPADST:
5235 case V_DCT:
5236 case V_ADST:
5237 case V_FLIPADST:
5238 av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
5239 txfm_param->tx_size,
5240 txfm_param->eob, bd);
5241 break;
5242 default:
5243 av1_inv_txfm2d_add_8x8_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride,
5244 tx_type, bd);
5245 break;
5246 }
5247 }
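
/*
 * Note: the 8x8 entry point above routes every identity-bearing tx_type
 * (IDTX and the H_/V_ hybrids) through the generic universe path, while the
 * pure two-dimensional trig transforms take the dedicated 8x8 kernel.
 */
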
void av1_highbd_inv_txfm_add_4x4_sse4_1(const tran_low_t *input, uint8_t *dest,
                                        int stride,
                                        const TxfmParam *txfm_param) {
5251 assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
5252 int eob = txfm_param->eob;
5253 int bd = txfm_param->bd;
5254 int lossless = txfm_param->lossless;
5255 const int32_t *src = cast_to_int32(input);
5256 const TX_TYPE tx_type = txfm_param->tx_type;
5257 if (lossless) {
5258 assert(tx_type == DCT_DCT);
5259 av1_highbd_iwht4x4_add(input, dest, stride, eob, bd);
5260 return;
5261 }
5262 av1_inv_txfm2d_add_4x4_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
5263 bd);
5264 }
static void iidentity32_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                               int bd, int out_shift) {
5267 (void)bit;
5268 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
5269 const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
5270 const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
5271 __m128i v[32];
  for (int i = 0; i < 32; ++i) v[i] = _mm_slli_epi32(in[i], 2);
5290
5291 if (!do_cols) {
5292 const int log_range_out = AOMMAX(16, bd + 6);
5293 const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
5294 -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
5295 const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
5296 (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
5297 shift_sse4_1(v, out, &clamp_lo_out, &clamp_hi_out, out_shift, 32);
5298 } else {
5299 highbd_clamp_epi32_sse4_1(v, out, &clamp_lo, &clamp_hi, 32);
5300 }
5301 }
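
/*
 * Note: the 32-point identity transform scales by exactly 4, so the body
 * above is a plain shift left by 2; the 4- and 16-point variants instead
 * carry sqrt(2) factors that require rounded multiplies.
 */
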
5302 static const transform_1d_sse4_1
5303 highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
5304 {
5305 { idct4x4_sse4_1, NULL, NULL, NULL },
5306 { iadst4x4_sse4_1, NULL, NULL, NULL },
5307 { iidentity4_sse4_1, iidentity4_sse4_1, iidentity4_sse4_1, NULL },
5308 },
5309 { { idct8x8_low1_sse4_1, idct8x8_new_sse4_1, NULL, NULL },
5310 { iadst8x8_low1_sse4_1, iadst8x8_new_sse4_1, NULL, NULL },
5311 { iidentity8_sse4_1, iidentity8_sse4_1, NULL, NULL } },
5312 {
5313 { idct16x16_low1_sse4_1, idct16x16_low8_sse4_1, idct16x16_sse4_1,
5314 NULL },
5315 { iadst16x16_low1_sse4_1, iadst16x16_low8_sse4_1, iadst16x16_sse4_1,
5316 NULL },
5317 { iidentity16_sse4_1, NULL, iidentity16_sse4_1, NULL },
5318 },
5319 { { idct32x32_low1_sse4_1, idct32x32_low8_sse4_1, idct32x32_low16_sse4_1,
5320 idct32x32_sse4_1 },
5321 { NULL, NULL, NULL, NULL },
5322 { iidentity32_sse4_1, NULL, NULL, NULL } },
5323 { { idct64x64_low1_sse4_1, idct64x64_low8_sse4_1, idct64x64_low16_sse4_1,
5324 idct64x64_sse4_1 },
5325 { NULL, NULL, NULL, NULL },
5326 { NULL, NULL, NULL, NULL } }
5327 };
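
/*
 * A sketch of how the callers below select a 1-D kernel from this table
 * (highbd_pick_row_kernel_model() is a hypothetical helper mirroring the
 * lookups in this file): the eob-derived fun_idx picks the cheapest kernel
 * whose all-zero tail assumption still holds.
 */
static INLINE transform_1d_sse4_1 highbd_pick_row_kernel_model(
    TX_SIZE tx_size, TX_TYPE tx_type, int eobx) {
  const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx];
  return highbd_txfm_all_1d_zeros_w8_arr[get_txw_idx(tx_size)]
                                        [hitx_1d_tab[tx_type]][fun_idx];
}
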
static void highbd_inv_txfm2d_add_h_identity_ssse41(const int32_t *input,
                                                    uint16_t *output,
                                                    int stride, TX_TYPE tx_type,
                                                    TX_SIZE tx_size, int eob,
                                                    const int bd) {
5333 __m128i buf1[64];
5334 int eobx, eoby;
5335 get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
5336 const int8_t *shift = inv_txfm_shift_ls[tx_size];
5337 const int txw_idx = get_txw_idx(tx_size);
5338 const int txh_idx = get_txh_idx(tx_size);
5339 const int txfm_size_col = tx_size_wide[tx_size];
5340 const int txfm_size_row = tx_size_high[tx_size];
5341 const int input_stride = AOMMIN(32, txfm_size_col);
5342 const int buf_size_w_div4 = input_stride >> 2;
5343 const int buf_size_h_div8 = (eoby + 8) >> 3;
5344 const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
5345 const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby];
5346 const transform_1d_sse4_1 row_txfm =
5347 highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
5348 const transform_1d_sse4_1 col_txfm =
5349 highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx];
5350 int ud_flip, lr_flip;
5351 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
5352
5353 for (int i = 0; i < (buf_size_h_div8 << 1); ++i) {
5354 __m128i buf0[16];
5355 const int32_t *input_row = input + i * input_stride * 4;
5356 for (int j = 0; j < buf_size_w_div4; ++j) {
5357 __m128i *buf0_cur = buf0 + j * 4;
5358 load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4);
5359 }
5360 if (rect_type == 1 || rect_type == -1) {
5361 av1_round_shift_rect_array_32_sse4_1(buf0, buf0, input_stride, 0,
5362 NewInvSqrt2);
5363 }
5364 row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
5365
5366 __m128i *_buf1 = buf1 + i * 4;
5367
5368 for (int j = 0; j < buf_size_w_div4; ++j) {
5369 _buf1[j * txfm_size_row + 0] = buf0[j * 4 + 0];
5370 _buf1[j * txfm_size_row + 1] = buf0[j * 4 + 1];
5371 _buf1[j * txfm_size_row + 2] = buf0[j * 4 + 2];
5372 _buf1[j * txfm_size_row + 3] = buf0[j * 4 + 3];
5373 }
5374 }
5375 for (int i = 0; i < buf_size_w_div4; i++) {
5376 col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
5377 inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
5378
5379 av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
5380 buf1 + i * txfm_size_row, txfm_size_row,
5381 -shift[1]);
5382 }
5383
5384 // write to buffer
5385 for (int i = 0; i < (txfm_size_col >> 3); i++) {
5386 highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2, output + 8 * i,
5387 stride, ud_flip, txfm_size_row, bd);
5388 }
5389 }
static void highbd_inv_txfm2d_add_v_identity_ssse41(const int32_t *input,
                                                    uint16_t *output,
                                                    int stride, TX_TYPE tx_type,
                                                    TX_SIZE tx_size, int eob,
                                                    const int bd) {
5395 __m128i buf1[64];
5396 int eobx, eoby;
5397 get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
5398 const int8_t *shift = inv_txfm_shift_ls[tx_size];
5399 const int txw_idx = get_txw_idx(tx_size);
5400 const int txh_idx = get_txh_idx(tx_size);
5401 const int txfm_size_col = tx_size_wide[tx_size];
5402 const int txfm_size_row = tx_size_high[tx_size];
5403 const int input_stride = AOMMIN(32, txfm_size_col);
5404 const int buf_size_w_div8 = input_stride >> 2;
5405 const int row_max = AOMMIN(32, txfm_size_row);
5406 const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
5407 const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
5408 const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx];
5409 const transform_1d_sse4_1 row_txfm =
5410 highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx];
5411 const transform_1d_sse4_1 col_txfm =
5412 highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
5413 int ud_flip, lr_flip;
5414 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
5415
5416 for (int i = 0; i < (row_max >> 2); ++i) {
5417 __m128i buf0[16];
5418 const int32_t *input_row = input + i * input_stride * 4;
5419 for (int j = 0; j < (buf_size_nonzero_w_div8 << 1); ++j) {
5420 __m128i *buf0_cur = buf0 + j * 4;
5421 load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4);
5422
5423 TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3],
5424 buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]);
5425 }
5426 if (rect_type == 1 || rect_type == -1) {
5427 av1_round_shift_rect_array_32_sse4_1(
5428 buf0, buf0, (buf_size_nonzero_w_div8 << 3), 0, NewInvSqrt2);
5429 }
5430 row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
5431
5432 __m128i *_buf1 = buf1 + i * 4;
5433 if (lr_flip) {
5434 for (int j = 0; j < buf_size_w_div8; ++j) {
5435 TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
5436 buf0[4 * j],
5437 _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 0],
5438 _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 1],
5439 _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 2],
5440 _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 3]);
5441 }
5442 } else {
5443 for (int j = 0; j < buf_size_w_div8; ++j) {
5444 TRANSPOSE_4X4(
5445 buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3],
5446 _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1],
5447 _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]);
5448 }
5449 }
5450 }
5451 for (int i = 0; i < buf_size_w_div8; i++) {
5452 col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
5453 inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
5454
5455 av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
5456 buf1 + i * txfm_size_row, txfm_size_row,
5457 -shift[1]);
5458 }
5459
5460 // write to buffer
5461 {
5462 for (int i = 0; i < (txfm_size_col >> 3); i++) {
5463 highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2,
5464 output + 8 * i, stride, ud_flip,
5465 txfm_size_row, bd);
5466 }
5467 }
5468 }
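
/*
 * Note: unlike the h_identity path, the v_identity variant above wraps its
 * row pass in TRANSPOSE_4X4 because that row transform is a genuine trig
 * kernel that needs whole rows in a vector, whereas a per-lane identity
 * scale is order-independent and can skip the transpose.
 */
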
static void highbd_inv_txfm2d_add_idtx_ssse41(const int32_t *input,
                                              uint16_t *output, int stride,
                                              TX_TYPE tx_type, TX_SIZE tx_size,
                                              int eob, const int bd) {
5473 (void)eob;
5474 __m128i buf1[64 * 4];
5475 const int8_t *shift = inv_txfm_shift_ls[tx_size];
5476 const int txw_idx = get_txw_idx(tx_size);
5477 const int txh_idx = get_txh_idx(tx_size);
5478 const int txfm_size_col = tx_size_wide[tx_size];
5479 const int txfm_size_row = tx_size_high[tx_size];
5480 const int input_stride = AOMMIN(32, txfm_size_col);
5481 const int row_max = AOMMIN(32, txfm_size_row);
5482 const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
5483 const transform_1d_sse4_1 row_txfm =
5484 highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
5485 const transform_1d_sse4_1 col_txfm =
5486 highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
5487
5488 for (int i = 0; i < (row_max >> 2); ++i) {
5489 __m128i buf0[32];
5490 const int32_t *input_row = input + i * input_stride * 4;
5491 for (int j = 0; j < (input_stride >> 2); ++j) {
5492 __m128i *buf0_cur = buf0 + j * 4;
5493 load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4);
5494 }
5495 if (rect_type == 1 || rect_type == -1) {
5496 av1_round_shift_rect_array_32_sse4_1(buf0, buf0, input_stride, 0,
5497 NewInvSqrt2);
5498 }
5499 row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
5500
5501 __m128i *_buf1 = buf1 + i * 4;
5502 for (int j = 0; j < (input_stride >> 2); ++j) {
5503 _buf1[j * txfm_size_row + 0] = buf0[j * 4 + 0];
5504 _buf1[j * txfm_size_row + 1] = buf0[j * 4 + 1];
5505 _buf1[j * txfm_size_row + 2] = buf0[j * 4 + 2];
5506 _buf1[j * txfm_size_row + 3] = buf0[j * 4 + 3];
5507 }
5508 }
5509 for (int i = 0; i < (input_stride >> 2); i++) {
5510 col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
5511 inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
5512
5513 av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
5514 buf1 + i * txfm_size_row, txfm_size_row,
5515 -shift[1]);
5516 }
5517
5518 // write to buffer
5519 {
5520 for (int i = 0; i < (txfm_size_col >> 3); i++) {
5521 highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2,
5522 output + 8 * i, stride, 0, txfm_size_row,
5523 bd);
5524 }
5525 }
5526 }
static void highbd_inv_txfm2d_add_no_identity_sse41(const int32_t *input,
                                                    uint16_t *output,
                                                    int stride, TX_TYPE tx_type,
                                                    TX_SIZE tx_size, int eob,
                                                    const int bd) {
5532 __m128i buf1[64 * 16];
5533 int eobx, eoby;
5534 get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
5535 const int8_t *shift = inv_txfm_shift_ls[tx_size];
5536 const int txw_idx = get_txw_idx(tx_size);
5537 const int txh_idx = get_txh_idx(tx_size);
5538 const int txfm_size_col = tx_size_wide[tx_size];
5539 const int txfm_size_row = tx_size_high[tx_size];
5540 const int buf_size_w_div8 = txfm_size_col >> 2;
5541 const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
5542 const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
5543 const int input_stride = AOMMIN(32, txfm_size_col);
5544 const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
5545
5546 const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
5547 const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
5548 const transform_1d_sse4_1 row_txfm =
5549 highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
5550 const transform_1d_sse4_1 col_txfm =
5551 highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
5552
5553 assert(col_txfm != NULL);
5554 assert(row_txfm != NULL);
5555 int ud_flip, lr_flip;
5556 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
5557
5558 // 1st stage: column transform
5559 for (int i = 0; i < buf_size_nonzero_h_div8 << 1; i++) {
5560 __m128i buf0[64];
5561 const int32_t *input_row = input + i * input_stride * 4;
5562 for (int j = 0; j < buf_size_nonzero_w_div8 << 1; ++j) {
5563 __m128i *buf0_cur = buf0 + j * 4;
5564 load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4);
5565
5566 TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3],
5567 buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]);
5568 }
5569 if (rect_type == 1 || rect_type == -1) {
5570 av1_round_shift_rect_array_32_sse4_1(
5571 buf0, buf0, buf_size_nonzero_w_div8 << 3, 0, NewInvSqrt2);
5572 }
5573 row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
5574
5575 __m128i *_buf1 = buf1 + i * 4;
5576 if (lr_flip) {
5577 for (int j = 0; j < buf_size_w_div8; ++j) {
5578 TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
5579 buf0[4 * j],
5580 _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 0],
5581 _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 1],
5582 _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 2],
5583 _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 3]);
5584 }
5585 } else {
5586 for (int j = 0; j < buf_size_w_div8; ++j) {
5587 TRANSPOSE_4X4(
5588 buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3],
5589 _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1],
5590 _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]);
5591 }
5592 }
5593 }
5594 // 2nd stage: column transform
5595 for (int i = 0; i < buf_size_w_div8; i++) {
5596 col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
5597 inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
5598
5599 av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
5600 buf1 + i * txfm_size_row, txfm_size_row,
5601 -shift[1]);
5602 }
5603
5604 // write to buffer
5605 {
5606 for (int i = 0; i < (txfm_size_col >> 3); i++) {
5607 highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2,
5608 output + 8 * i, stride, ud_flip,
5609 txfm_size_row, bd);
5610 }
5611 }
5612 }
5613
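// Scalar model of the 4x4 transpose shuffles in
// highbd_inv_txfm2d_add_no_identity_sse41() (illustrative sketch): together
// with the j * txfm_size_row indexing they store the row-transform output so
// that each 4-column group is contiguous over all rows, ready for the
// second-stage column transform.
static INLINE void transpose_4x4_scalar_sketch(const int32_t in[16],
                                               int32_t out[16]) {
  for (int r = 0; r < 4; ++r) {
    for (int c = 0; c < 4; ++c) out[c * 4 + r] = in[r * 4 + c];
  }
}
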
static void highbd_inv_txfm2d_add_4x8_sse41(const int32_t *input,
                                            uint16_t *output, int stride,
                                            TX_TYPE tx_type, TX_SIZE tx_size,
                                            int eob, const int bd) {
  (void)eob;
  __m128i buf1[8];
  const int8_t *shift = inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const transform_1d_sse4_1 row_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
  const transform_1d_sse4_1 col_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][1];
  const int input_stride = AOMMIN(32, txfm_size_col);

  assert(col_txfm != NULL);
  assert(row_txfm != NULL);
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  // 1st stage: row transform
  __m128i buf0[8];
  const int32_t *input_row = input;
  __m128i *buf0_cur = buf0;
  load_buffer_32bit_input(input_row, input_stride, buf0_cur, txfm_size_row);
  av1_round_shift_rect_array_32_sse4_1(buf0, buf0, txfm_size_row, 0,
                                       NewInvSqrt2);
  row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
  row_txfm(buf0 + 4, buf0 + 4, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
           -shift[0]);

  if (lr_flip) {
    TRANSPOSE_4X4(buf0[3], buf0[2], buf0[1], buf0[0], buf1[0], buf1[1], buf1[2],
                  buf1[3]);

    TRANSPOSE_4X4(buf0[7], buf0[6], buf0[5], buf0[4], buf1[4], buf1[5], buf1[6],
                  buf1[7]);
  } else {
    TRANSPOSE_4X4(buf0[0], buf0[1], buf0[2], buf0[3], buf1[0], buf1[1], buf1[2],
                  buf1[3]);

    TRANSPOSE_4X4(buf0[4], buf0[5], buf0[6], buf0[7], buf1[4], buf1[5], buf1[6],
                  buf1[7]);
  }

  // 2nd stage: column transform
  col_txfm(buf1, buf1, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);

  av1_round_shift_array_32_sse4_1(buf1, buf1, txfm_size_row, -shift[1]);

  // write to buffer
  highbd_write_buffer_4xn_sse4_1(buf1, output, stride, ud_flip, txfm_size_row,
                                 bd);
}
5670
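// 4x8 (and 8x4 below) are 2:1 rectangular sizes, so the coefficients are
// pre-scaled by 1/sqrt(2) in Q12 fixed point (NewInvSqrt2 = 2896 with
// NewSqrt2Bits = 12, per av1_txfm.h). A scalar model of that step, using a
// 64-bit intermediate for clarity (sketch; the SIMD helper works in 32 bits):
static INLINE int32_t rect_inv_sqrt2_scalar_sketch(int32_t x) {
  const int64_t scaled = (int64_t)x * 2896;      // x * (1/sqrt(2)) in Q12
  return (int32_t)((scaled + (1 << 11)) >> 12);  // round back out of Q12
}
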
static void highbd_inv_txfm2d_add_8x4_sse41(const int32_t *input,
                                            uint16_t *output, int stride,
                                            TX_TYPE tx_type, TX_SIZE tx_size,
                                            int eob, const int bd) {
  (void)eob;
  __m128i buf1[8];
  const int8_t *shift = inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const transform_1d_sse4_1 row_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][1];
  const transform_1d_sse4_1 col_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];

  assert(col_txfm != NULL);
  assert(row_txfm != NULL);
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  // 1st stage: row transform
  __m128i buf0[8];
  const int32_t *input_row = input;
  load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col);

  TRANSPOSE_4X4(buf0[0], buf0[2], buf0[4], buf0[6], buf1[0], buf1[1], buf1[2],
                buf1[3]);
  TRANSPOSE_4X4(buf0[1], buf0[3], buf0[5], buf0[7], buf1[4], buf1[5], buf1[6],
                buf1[7]);

  av1_round_shift_rect_array_32_sse4_1(buf1, buf0, txfm_size_col, 0,
                                       NewInvSqrt2);
  row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);

  __m128i *buf1_ptr;
  if (lr_flip) {
    flip_buf_sse2(buf0, buf1, txfm_size_col);
    buf1_ptr = buf1;
  } else {
    buf1_ptr = buf0;
  }

  // 2nd stage: column transform
  for (int i = 0; i < 2; i++) {
    col_txfm(buf1_ptr + i * txfm_size_row, buf1_ptr + i * txfm_size_row,
             inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
  }
  av1_round_shift_array_32_sse4_1(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]);
  // write to buffer
  highbd_write_buffer_8xn_sse4_1(buf1_ptr, output, stride, ud_flip,
                                 txfm_size_row, bd);
}
5724
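// For the FLIPADST-derived tx_types, lr_flip mirrors the block horizontally.
// Where flip_buf_sse2() is called above, each __m128i holds four rows of a
// single column, so reversing the register order reverses the columns. A
// scalar model of the effect (sketch):
static INLINE void flip_cols_scalar_sketch(const int32_t *in, int32_t *out,
                                           int w, int h) {
  for (int r = 0; r < h; ++r) {
    for (int c = 0; c < w; ++c) out[r * w + (w - 1 - c)] = in[r * w + c];
  }
}
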
static void highbd_inv_txfm2d_add_4x16_sse4_1(const int32_t *input,
                                              uint16_t *output, int stride,
                                              TX_TYPE tx_type, TX_SIZE tx_size,
                                              int eob, const int bd) {
  (void)eob;
  __m128i buf1[16];
  const int8_t *shift = inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_size_h_div8 = txfm_size_row >> 2;
  const transform_1d_sse4_1 row_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
  const transform_1d_sse4_1 col_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][2];
  const int input_stride = AOMMIN(32, txfm_size_col);

  assert(col_txfm != NULL);
  assert(row_txfm != NULL);
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  // 1st stage: row transform
  __m128i buf0[16];
  const int32_t *input_row = input;
  __m128i *buf0_cur = buf0;
  load_buffer_32bit_input(input_row, input_stride, buf0_cur, txfm_size_row);
  for (int i = 0; i < (txfm_size_row >> 2); i++) {
    row_txfm(buf0 + (i << 2), buf0 + (i << 2),
             inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
  }

  // Note: -shift[0] is also passed to row_txfm above; see the TODO in
  // av1_highbd_inv_txfm_add_sse4_1() (crbug.com/aomedia/2350) -- the default
  // dispatcher does not select this path.
  av1_round_shift_array_32_sse4_1(buf0, buf0, txfm_size_row, -shift[0]);

  if (lr_flip) {
    for (int j = 0; j < buf_size_h_div8; ++j) {
      TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
                    buf0[4 * j], buf1[4 * j], buf1[4 * j + 1], buf1[4 * j + 2],
                    buf1[4 * j + 3]);
    }
  } else {
    for (int j = 0; j < buf_size_h_div8; ++j) {
      TRANSPOSE_4X4(buf0[4 * j], buf0[4 * j + 1], buf0[4 * j + 2],
                    buf0[4 * j + 3], buf1[4 * j], buf1[4 * j + 1],
                    buf1[4 * j + 2], buf1[4 * j + 3]);
    }
  }

  // 2nd stage: column transform
  col_txfm(buf1, buf1, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);

  av1_round_shift_array_32_sse4_1(buf1, buf1, txfm_size_row, -shift[1]);

  // write to buffer
  highbd_write_buffer_4xn_sse4_1(buf1, output, stride, ud_flip, txfm_size_row,
                                 bd);
}
5783
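// 4x16 and 16x4 have a 4:1 aspect ratio, so unlike the 2:1 sizes they need
// no 1/sqrt(2) pre-scale: sqrt(w * h) is already an integer. A scalar model
// of get_rect_tx_log_ratio() and the rect_type check used by the kernels in
// this file (sketch):
static INLINE int rect_tx_log_ratio_sketch(int col, int row) {
  if (col == row) return 0;
  if (col > row) return (col == 2 * row) ? 1 : 2;  // 2:1 -> 1, 4:1 -> 2
  return (row == 2 * col) ? -1 : -2;               // 1:2 -> -1, 1:4 -> -2
}
// The NewInvSqrt2 scaling step runs only when the result is +1 or -1.
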
static void highbd_inv_txfm2d_add_16x4_sse4_1(const int32_t *input,
                                              uint16_t *output, int stride,
                                              TX_TYPE tx_type, TX_SIZE tx_size,
                                              int eob, const int bd) {
  (void)eob;
  __m128i buf1[16];
  const int8_t *shift = inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_size_w_div8 = txfm_size_col >> 2;
  const transform_1d_sse4_1 row_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][2];
  const transform_1d_sse4_1 col_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];

  assert(col_txfm != NULL);
  assert(row_txfm != NULL);
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  // 1st stage: row transform
  __m128i buf0[16];
  const int32_t *input_row = input;
  load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col);

  for (int j = 0; j < buf_size_w_div8; j++) {
    TRANSPOSE_4X4(buf0[j], buf0[j + 4], buf0[j + 8], buf0[j + 12], buf1[4 * j],
                  buf1[4 * j + 1], buf1[4 * j + 2], buf1[4 * j + 3]);
  }
  row_txfm(buf1, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);

  __m128i *buf1_ptr;
  if (lr_flip) {
    flip_buf_sse2(buf0, buf1, txfm_size_col);
    buf1_ptr = buf1;
  } else {
    buf1_ptr = buf0;
  }

  // 2nd stage: column transform
  for (int i = 0; i < buf_size_w_div8; i++) {
    col_txfm(buf1_ptr + i * txfm_size_row, buf1_ptr + i * txfm_size_row,
             inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
  }
  av1_round_shift_array_32_sse4_1(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]);

  // write to buffer
  for (int i = 0; i < (txfm_size_col >> 3); i++) {
    highbd_write_buffer_8xn_sse4_1(buf1_ptr + i * txfm_size_row * 2,
                                   output + 8 * i, stride, ud_flip,
                                   txfm_size_row, bd);
  }
}
5839
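// Address model for the 8-wide stores used above (sketch): buf1 keeps the
// block as 4-column groups, each holding all txfm_size_row rows contiguously
// with the group's four columns in the SIMD lanes. The register holding
// element (row r, column c) is therefore:
static INLINE int buf1_reg_index_sketch(int r, int c, int rows) {
  return (c >> 2) * rows + r;  // the lane within that register is c & 3
}
// The i-th 8-column slice spans two such groups starting at i * rows * 2,
// which is the buf1 + i * txfm_size_row * 2 offset passed to the writer.
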
void av1_highbd_inv_txfm2d_add_universe_sse4_1(const int32_t *input,
                                               uint8_t *output, int stride,
                                               TX_TYPE tx_type, TX_SIZE tx_size,
                                               int eob, const int bd) {
  switch (tx_type) {
    // Non-identity transform in both directions.
    case DCT_DCT:
    case ADST_DCT:
    case DCT_ADST:
    case ADST_ADST:
    case FLIPADST_DCT:
    case DCT_FLIPADST:
    case FLIPADST_FLIPADST:
    case ADST_FLIPADST:
    case FLIPADST_ADST:
      highbd_inv_txfm2d_add_no_identity_sse41(
          input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob,
          bd);
      break;
    // Vertical 1-D transform paired with horizontal identity.
    case V_DCT:
    case V_ADST:
    case V_FLIPADST:
      highbd_inv_txfm2d_add_h_identity_ssse41(
          input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob,
          bd);
      break;
    // Horizontal 1-D transform paired with vertical identity.
    case H_DCT:
    case H_ADST:
    case H_FLIPADST:
      highbd_inv_txfm2d_add_v_identity_ssse41(
          input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob,
          bd);
      break;
    // Identity in both directions.
    case IDTX:
      highbd_inv_txfm2d_add_idtx_ssse41(input, CONVERT_TO_SHORTPTR(output),
                                        stride, tx_type, tx_size, eob, bd);
      break;
    default: assert(0); break;
  }
}
5879
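// In high bit depth, `output`/`dest` is declared uint8_t * but actually tags
// a uint16_t * pixel buffer: assuming the aom_dsp macros, CONVERT_TO_BYTEPTR
// shifts the address right by one bit and CONVERT_TO_SHORTPTR shifts it back
// left, as the calls below rely on. A model of the round trip (sketch):
static INLINE uint16_t *roundtrip_highbd_ptr_sketch(uint16_t *buf16) {
  uint8_t *tagged = (uint8_t *)(((uintptr_t)buf16) >> 1);  // CONVERT_TO_BYTEPTR
  return (uint16_t *)(((uintptr_t)tagged) << 1);           // CONVERT_TO_SHORTPTR
}
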
void av1_highbd_inv_txfm_add_4x8_sse4_1(const tran_low_t *input, uint8_t *dest,
                                        int stride,
                                        const TxfmParam *txfm_param) {
  int bd = txfm_param->bd;
  const TX_TYPE tx_type = txfm_param->tx_type;
  const TX_SIZE tx_size = txfm_param->tx_size;
  int eob = txfm_param->eob;
  highbd_inv_txfm2d_add_4x8_sse41(input, CONVERT_TO_SHORTPTR(dest), stride,
                                  tx_type, tx_size, eob, bd);
}

void av1_highbd_inv_txfm_add_8x4_sse4_1(const tran_low_t *input, uint8_t *dest,
                                        int stride,
                                        const TxfmParam *txfm_param) {
  int bd = txfm_param->bd;
  const TX_TYPE tx_type = txfm_param->tx_type;
  const TX_SIZE tx_size = txfm_param->tx_size;
  int eob = txfm_param->eob;
  highbd_inv_txfm2d_add_8x4_sse41(input, CONVERT_TO_SHORTPTR(dest), stride,
                                  tx_type, tx_size, eob, bd);
}

void av1_highbd_inv_txfm_add_4x16_sse4_1(const tran_low_t *input,
                                         uint8_t *dest, int stride,
                                         const TxfmParam *txfm_param) {
  int bd = txfm_param->bd;
  const TX_TYPE tx_type = txfm_param->tx_type;
  const TX_SIZE tx_size = txfm_param->tx_size;
  int eob = txfm_param->eob;
  highbd_inv_txfm2d_add_4x16_sse4_1(input, CONVERT_TO_SHORTPTR(dest), stride,
                                    tx_type, tx_size, eob, bd);
}

void av1_highbd_inv_txfm_add_16x4_sse4_1(const tran_low_t *input,
                                         uint8_t *dest, int stride,
                                         const TxfmParam *txfm_param) {
  int bd = txfm_param->bd;
  const TX_TYPE tx_type = txfm_param->tx_type;
  const TX_SIZE tx_size = txfm_param->tx_size;
  int eob = txfm_param->eob;
  highbd_inv_txfm2d_add_16x4_sse4_1(input, CONVERT_TO_SHORTPTR(dest), stride,
                                    tx_type, tx_size, eob, bd);
}

void av1_highbd_inv_txfm_add_sse4_1(const tran_low_t *input, uint8_t *dest,
                                    int stride, const TxfmParam *txfm_param) {
  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
  const TX_SIZE tx_size = txfm_param->tx_size;
  switch (tx_size) {
    case TX_4X8:
      av1_highbd_inv_txfm_add_4x8_sse4_1(input, dest, stride, txfm_param);
      break;
    case TX_8X4:
      av1_highbd_inv_txfm_add_8x4_sse4_1(input, dest, stride, txfm_param);
      break;
    default:
      // TODO(http://crbug.com/aomedia/2350): the remaining sse4_1 versions
      // cause test vector mismatches.
      av1_highbd_inv_txfm_add_c(input, dest, stride, txfm_param);
      break;
  }
}
5942
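// Minimal call sketch for av1_highbd_inv_txfm_add_sse4_1() (hypothetical
// driver; names invented for illustration). It fills only the TxfmParam
// fields this file reads (bd, tx_type, tx_size, eob, tx_set_type) and passes
// a CONVERT_TO_BYTEPTR-style destination. Per the TODO above, only TX_4X8 and
// TX_8X4 take the SIMD path; everything else falls back to the C version.
static void run_inv_txfm_4x8_sketch(const tran_low_t *coeffs,
                                    uint16_t *frame16, int stride) {
  TxfmParam param;
  param.tx_type = DCT_DCT;
  param.tx_size = TX_4X8;
  param.eob = 32;  // all 4 * 8 coefficients present
  param.bd = 10;   // 10-bit prediction already in frame16
  param.tx_set_type = EXT_TX_SET_DCTONLY;  // DCT_DCT is legal in every set
  uint8_t *dest = (uint8_t *)(((uintptr_t)frame16) >> 1);  // CONVERT_TO_BYTEPTR
  av1_highbd_inv_txfm_add_sse4_1(coeffs, dest, stride, &param);
}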