1 /*
2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <emmintrin.h> // SSE2
12
13 #include "./vpx_dsp_rtcd.h"
14 #include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
15 #include "vpx_dsp/x86/inv_txfm_sse2.h"
16 #include "vpx_dsp/x86/transpose_sse2.h"
17
highbd_idct8x8_half1d(__m128i * const io)18 static void highbd_idct8x8_half1d(__m128i *const io) {
19 __m128i step1[8], step2[8];
20
21 transpose_32bit_4x4x2(io, io);
22
23 // stage 1
24 step1[0] = io[0];
25 step1[2] = io[4];
26 step1[1] = io[2];
27 step1[3] = io[6];
28 highbd_butterfly_sse2(io[1], io[7], cospi_28_64, cospi_4_64, &step1[4],
29 &step1[7]);
30 highbd_butterfly_sse2(io[5], io[3], cospi_12_64, cospi_20_64, &step1[5],
31 &step1[6]);
32
33 // stage 2
34 highbd_butterfly_cospi16_sse2(step1[0], step1[2], &step2[0], &step2[1]);
35 highbd_butterfly_sse2(step1[1], step1[3], cospi_24_64, cospi_8_64, &step2[2],
36 &step2[3]);
37 step2[4] = _mm_add_epi32(step1[4], step1[5]);
38 step2[5] = _mm_sub_epi32(step1[4], step1[5]);
39 step2[6] = _mm_sub_epi32(step1[7], step1[6]);
40 step2[7] = _mm_add_epi32(step1[7], step1[6]);
41
42 // stage 3
43 step1[0] = _mm_add_epi32(step2[0], step2[3]);
44 step1[1] = _mm_add_epi32(step2[1], step2[2]);
45 step1[2] = _mm_sub_epi32(step2[1], step2[2]);
46 step1[3] = _mm_sub_epi32(step2[0], step2[3]);
47 step1[4] = step2[4];
48 highbd_butterfly_cospi16_sse2(step2[6], step2[5], &step1[6], &step1[5]);
49 step1[7] = step2[7];
50
51 // stage 4
52 highbd_idct8_stage4(step1, io);
53 }
54
highbd_idct8x8_12_half1d(__m128i * const io)55 static void highbd_idct8x8_12_half1d(__m128i *const io) {
56 __m128i temp1[4], sign[2], step1[8], step2[8];
57
58 transpose_32bit_4x4(io, io);
59
60 // stage 1
61 step1[0] = io[0];
62 step1[1] = io[2];
63 abs_extend_64bit_sse2(io[1], temp1, sign);
64 step1[4] = multiplication_round_shift_sse2(temp1, sign, cospi_28_64);
65 step1[7] = multiplication_round_shift_sse2(temp1, sign, cospi_4_64);
66 abs_extend_64bit_sse2(io[3], temp1, sign);
67 step1[5] = multiplication_neg_round_shift_sse2(temp1, sign, cospi_20_64);
68 step1[6] = multiplication_round_shift_sse2(temp1, sign, cospi_12_64);
69
70 // stage 2
71 abs_extend_64bit_sse2(step1[0], temp1, sign);
72 step2[0] = multiplication_round_shift_sse2(temp1, sign, cospi_16_64);
73 abs_extend_64bit_sse2(step1[1], temp1, sign);
74 step2[2] = multiplication_round_shift_sse2(temp1, sign, cospi_24_64);
75 step2[3] = multiplication_round_shift_sse2(temp1, sign, cospi_8_64);
76 step2[4] = _mm_add_epi32(step1[4], step1[5]);
77 step2[5] = _mm_sub_epi32(step1[4], step1[5]);
78 step2[6] = _mm_sub_epi32(step1[7], step1[6]);
79 step2[7] = _mm_add_epi32(step1[7], step1[6]);
80
81 // stage 3
82 step1[0] = _mm_add_epi32(step2[0], step2[3]);
83 step1[1] = _mm_add_epi32(step2[0], step2[2]);
84 step1[2] = _mm_sub_epi32(step2[0], step2[2]);
85 step1[3] = _mm_sub_epi32(step2[0], step2[3]);
86 step1[4] = step2[4];
87 highbd_butterfly_cospi16_sse2(step2[6], step2[5], &step1[6], &step1[5]);
88 step1[7] = step2[7];
89
90 // stage 4
91 highbd_idct8_stage4(step1, io);
92 }
93
// Inverse-transforms an 8x8 block of coefficients and hands the result to
// recon_and_store_8x8() together with dest, stride and bd.
//
// input is read as 8 rows of 8 coefficients with aligned 128-bit loads, so it
// must be 16-byte aligned (assumes tran_low_t is 32 bits wide, i.e. four
// coefficients per load -- TODO confirm). Row r is split into a left half
// (columns 0..3) and a right half (columns 4..7).
//
// When bd == 8 the coefficients are packed to 16 bits with signed saturation
// and processed by two calls to vpx_idct8_sse2(); otherwise the 32-bit
// highbd_idct8x8_half1d() helper is applied four times.
void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint16_t *dest,
                                    int stride, int bd) {
  __m128i io[16];
  int i;

  // Rows 0..3: left halves in io[0..3], right halves in io[4..7].
  for (i = 0; i < 4; ++i) {
    io[i] = _mm_load_si128((const __m128i *)(input + i * 8));
    io[i + 4] = _mm_load_si128((const __m128i *)(input + i * 8 + 4));
  }

  if (bd != 8) {
    __m128i saved[4];

    // The first pass over rows 0..3 runs before rows 4..7 are loaded.
    highbd_idct8x8_half1d(io);

    // Rows 4..7: left halves in io[8..11], right halves in io[12..15].
    for (i = 0; i < 4; ++i) {
      io[i + 8] = _mm_load_si128((const __m128i *)(input + (i + 4) * 8));
      io[i + 12] = _mm_load_si128((const __m128i *)(input + (i + 4) * 8 + 4));
    }
    highbd_idct8x8_half1d(&io[8]);

    // Park io[4..7] and pull io[8..11] next to io[0..3] so the third pass sees
    // them as one contiguous group of eight vectors.
    for (i = 0; i < 4; ++i) {
      saved[i] = io[i + 4];
      io[i + 4] = io[i + 8];
    }
    highbd_idct8x8_half1d(io);

    // The parked vectors join io[12..15] for the fourth pass.
    for (i = 0; i < 4; ++i) {
      io[i + 8] = saved[i];
    }
    highbd_idct8x8_half1d(&io[8]);

    highbd_idct8x8_final_round(io);
  } else {
    __m128i io_short[8];

    // Rows 4..7: left halves in io[8..11], right halves in io[12..15].
    for (i = 0; i < 4; ++i) {
      io[i + 8] = _mm_load_si128((const __m128i *)(input + (i + 4) * 8));
      io[i + 12] = _mm_load_si128((const __m128i *)(input + (i + 4) * 8 + 4));
    }

    // Join the two halves of every row into a single vector of eight 16-bit
    // values.
    for (i = 0; i < 4; ++i) {
      io_short[i] = _mm_packs_epi32(io[i], io[i + 4]);
      io_short[i + 4] = _mm_packs_epi32(io[i + 8], io[i + 12]);
    }

    vpx_idct8_sse2(io_short);
    vpx_idct8_sse2(io_short);
    round_shift_8x8(io_short, io);
  }

  recon_and_store_8x8(io, dest, stride, bd);
}
166
// Reduced variant of the 8x8 inverse transform: only the first four
// coefficients of rows 0..3 are loaded from input (aligned 128-bit loads, so
// input must be 16-byte aligned). The result is passed to
// recon_and_store_8x8() together with dest, stride and bd.
//
// When bd == 8 the loaded values are packed to 16 bits (upper halves zero)
// and processed by idct8x8_12_add_kernel_sse2(); otherwise the 32-bit
// highbd_idct8x8_12_half1d() helper is applied three times.
void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint16_t *dest,
                                    int stride, int bd) {
  const __m128i zero = _mm_setzero_si128();
  __m128i io[16];
  int i;

  for (i = 0; i < 4; ++i) {
    io[i] = _mm_load_si128((const __m128i *)(input + i * 8));
  }

  if (bd != 8) {
    __m128i saved[4];

    highbd_idct8x8_12_half1d(io);

    // io[4..7] are read here without having been loaded above, so they are
    // presumably produced by the helper call. Park them while io[0..3] go
    // through the helper again.
    for (i = 0; i < 4; ++i) {
      saved[i] = io[i + 4];
    }
    highbd_idct8x8_12_half1d(io);

    // Run the parked vectors through the helper from io[8] onwards.
    for (i = 0; i < 4; ++i) {
      io[i + 8] = saved[i];
    }
    highbd_idct8x8_12_half1d(&io[8]);

    highbd_idct8x8_final_round(io);
  } else {
    __m128i io_short[8];

    // Only io_short[0..3] are populated before the kernel call.
    for (i = 0; i < 4; ++i) {
      io_short[i] = _mm_packs_epi32(io[i], zero);
    }

    idct8x8_12_add_kernel_sse2(io_short);
    round_shift_8x8(io_short, io);
  }

  recon_and_store_8x8(io, dest, stride, bd);
}
209
// Thin wrapper: forwards all four arguments to highbd_idct_1_add_kernel()
// with a fifth argument of 8 (presumably the block dimension -- confirm in
// the kernel's definition).
void vpx_highbd_idct8x8_1_add_sse2(const tran_low_t *input, uint16_t *dest,
                                   int stride, int bd) {
  const int size = 8;

  highbd_idct_1_add_kernel(input, dest, stride, bd, size);
}
214