1 /*
2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "vpx_dsp/x86/inv_txfm_sse2.h"
12 #include "vpx_dsp/x86/txfm_common_sse2.h"
13 #include "vpx_ports/mem.h"
14
// Inverse 4x4 hybrid transform (DCT/ADST per dimension, selected by
// tx_type), with the reconstructed residual added into dest in place.
void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
                            int tx_type) {
  __m128i in[2];
  const __m128i zero = _mm_setzero_si128();
  const __m128i rounding = _mm_set1_epi16(8);
  int i;

  // The whole 4x4 block (16 coefficients) fits in two 128-bit registers.
  in[0] = _mm_loadu_si128((const __m128i *)(input));
  in[1] = _mm_loadu_si128((const __m128i *)(input + 8));

  // First call transforms one dimension, second call the other.
  switch (tx_type) {
    case 0:  // DCT_DCT
      idct4_sse2(in);
      idct4_sse2(in);
      break;
    case 1:  // ADST_DCT
      idct4_sse2(in);
      iadst4_sse2(in);
      break;
    case 2:  // DCT_ADST
      iadst4_sse2(in);
      idct4_sse2(in);
      break;
    case 3:  // ADST_ADST
      iadst4_sse2(in);
      iadst4_sse2(in);
      break;
    default:
      assert(0);
      break;
  }

  // Final rounding: add the bias (8 == 1 << 3) and shift right by 4.
  for (i = 0; i < 2; ++i) {
    in[i] = _mm_add_epi16(in[i], rounding);
    in[i] = _mm_srai_epi16(in[i], 4);
  }

  // Reconstruction: widen the predictor pixels to 16 bits, add the
  // residual, saturate back to 8 bits, and store one row at a time.
  {
    __m128i rows01 = _mm_cvtsi32_si128(*(const int *)(dest));
    __m128i rows23 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
    __m128i out;
    rows01 = _mm_unpacklo_epi32(
        rows01, _mm_cvtsi32_si128(*(const int *)(dest + stride)));
    rows23 = _mm_unpacklo_epi32(
        rows23, _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)));
    rows01 = _mm_add_epi16(_mm_unpacklo_epi8(rows01, zero), in[0]);
    rows23 = _mm_add_epi16(_mm_unpacklo_epi8(rows23, zero), in[1]);
    out = _mm_packus_epi16(rows01, rows23);
    // Each 4-byte lane of `out` is one output row; shift the next row
    // down into lane 0 after every store.
    for (i = 0; i < 4; ++i) {
      *(int *)(dest + stride * i) = _mm_cvtsi128_si32(out);
      out = _mm_srli_si128(out, 4);
    }
  }
}
79
// Inverse 8x8 hybrid transform (DCT/ADST per dimension, selected by
// tx_type), with the reconstructed residual added into dest in place.
void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride,
                            int tx_type) {
  __m128i in[8];
  const __m128i zero = _mm_setzero_si128();  // used by RECON_AND_STORE
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
  int i;

  // Each of the eight coefficient rows fills exactly one 128-bit register.
  for (i = 0; i < 8; ++i) {
    in[i] = _mm_load_si128((const __m128i *)(input + 8 * i));
  }

  // First call transforms one dimension, second call the other.
  switch (tx_type) {
    case 0:  // DCT_DCT
      idct8_sse2(in);
      idct8_sse2(in);
      break;
    case 1:  // ADST_DCT
      idct8_sse2(in);
      iadst8_sse2(in);
      break;
    case 2:  // DCT_ADST
      iadst8_sse2(in);
      idct8_sse2(in);
      break;
    case 3:  // ADST_ADST
      iadst8_sse2(in);
      iadst8_sse2(in);
      break;
    default:
      assert(0);
      break;
  }

  // Per row: add the rounding bias (1 << 4), shift right by 5, then
  // blend the residual into the destination pixels.
  for (i = 0; i < 8; ++i) {
    in[i] = _mm_adds_epi16(in[i], final_rounding);
    in[i] = _mm_srai_epi16(in[i], 5);
    RECON_AND_STORE(dest + i * stride, in[i]);
  }
}
146
// Inverse 16x16 hybrid transform (DCT/ADST per dimension, selected by
// tx_type), with the reconstructed residual added into dest in place.
void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride,
                               int tx_type) {
  __m128i in0[16], in1[16];

  // The 16x16 block is processed as a left and a right 8x16 half.
  load_buffer_8x16(input, in0);
  load_buffer_8x16(input + 8, in1);

  // First call transforms one dimension, second call the other.
  if (tx_type == 0) {  // DCT_DCT
    idct16_sse2(in0, in1);
    idct16_sse2(in0, in1);
  } else if (tx_type == 1) {  // ADST_DCT
    idct16_sse2(in0, in1);
    iadst16_sse2(in0, in1);
  } else if (tx_type == 2) {  // DCT_ADST
    iadst16_sse2(in0, in1);
    idct16_sse2(in0, in1);
  } else if (tx_type == 3) {  // ADST_ADST
    iadst16_sse2(in0, in1);
    iadst16_sse2(in0, in1);
  } else {
    assert(0);
  }

  write_buffer_8x16(dest, in0, stride);
  write_buffer_8x16(dest + 8, in1, stride);
}
181