/*
 * Copyright (c) 2022 Samsung Electronics Co., Ltd.
 * All Rights Reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * - Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * - Neither the name of the copyright owner, nor the names of its contributors
 *   may be used to endorse or promote products derived from this software
 *   without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "oapv_def.h"
#include "oapv_tq_avx.h"

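/* Fallback definitions for toolchains whose immintrin.h does not provide these
 * composite helpers: _mm256_set_m128i builds a 256-bit vector from two 128-bit
 * halves, and _mm256_loadu2_m128i performs two unaligned 128-bit loads and
 * packs them into one 256-bit register. */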
#ifndef _mm256_set_m128i
#define _mm256_set_m128i(/* __m128i */ hi, /* __m128i */ lo) \
    _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
#endif // !_mm256_set_m128i

#ifndef _mm256_loadu2_m128i
#define _mm256_loadu2_m128i(/* __m128i const* */ hiaddr, \
                            /* __m128i const* */ loaddr) \
    _mm256_set_m128i(_mm_loadu_si128(hiaddr), _mm_loadu_si128(loaddr))
#endif // !_mm256_loadu2_m128i

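/* Forward transform of one 8x8 block of 16-bit residuals. The constants in
 * coeff[] (64 and 84/35 for the even part, 89/75/50/18 for the odd part)
 * match the common 8-point integer DCT-II basis; dst receives the result in
 * coefficient-major order, i.e. output row k is written at dst + k * line. */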
static void oapv_tx_part_avx(s16 *src, s16 *dst, int shift, int line)
{
    __m256i v0, v1, v2, v3, v4, v5, v6, v7;
    __m256i d0, d1, d2, d3;
    __m256i coeff[8];
    coeff[0] = _mm256_set1_epi16(64);
    coeff[1] = _mm256_set_epi16(64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64);
    coeff[2] = _mm256_set_epi16(84, 35, -35, -84, -84, -35, 35, 84, 84, 35, -35, -84, -84, -35, 35, 84);
    coeff[3] = _mm256_set_epi16(35, -84, 84, -35, -35, 84, -84, 35, 35, -84, 84, -35, -35, 84, -84, 35);
    coeff[4] = _mm256_set_epi16(-89, -75, -50, -18, 18, 50, 75, 89, -89, -75, -50, -18, 18, 50, 75, 89);
    coeff[5] = _mm256_set_epi16(-75, 18, 89, 50, -50, -89, -18, 75, -75, 18, 89, 50, -50, -89, -18, 75);
    coeff[6] = _mm256_set_epi16(-50, 89, -18, -75, 75, 18, -89, 50, -50, 89, -18, -75, 75, 18, -89, 50);
    coeff[7] = _mm256_set_epi16(-18, 50, -75, 89, -89, 75, -50, 18, -18, 50, -75, 89, -89, 75, -50, 18);
    __m256i add = _mm256_set1_epi32(1 << (shift - 1));

    __m256i s0, s1, s2, s3;

    s0 = _mm256_loadu2_m128i((const __m128i *)&src[32], (const __m128i *)&src[0]);
    s1 = _mm256_loadu2_m128i((const __m128i *)&src[40], (const __m128i *)&src[8]);
    s2 = _mm256_loadu2_m128i((const __m128i *)&src[48], (const __m128i *)&src[16]);
    s3 = _mm256_loadu2_m128i((const __m128i *)&src[56], (const __m128i *)&src[24]);

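    /* The CALCU_2x8() helpers (defined in oapv_tq_avx.h, not shown here) appear
     * to multiply the loaded source rows by one pair of basis vectors each and
     * accumulate the products into 32-bit sums; CALCU_2x8_ADD_SHIFT then adds
     * the rounding term and shifts. The results are packed back to 16 bits and
     * stored as transform rows 0-3 below, and rows 4-7 further down. */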
    CALCU_2x8(coeff[0], coeff[4], d0, d1);
    CALCU_2x8(coeff[2], coeff[5], d2, d3);
    CALCU_2x8_ADD_SHIFT(d0, d1, d2, d3, add, shift);

    d0 = _mm256_packs_epi32(d0, d1);
    d1 = _mm256_packs_epi32(d2, d3);

    d0 = _mm256_permute4x64_epi64(d0, 0xd8);
    d1 = _mm256_permute4x64_epi64(d1, 0xd8);

    _mm_store_si128((__m128i *)dst, _mm256_castsi256_si128(d0));
    _mm_store_si128((__m128i *)(dst + 1 * line), _mm256_extracti128_si256(d0, 1));
    _mm_store_si128((__m128i *)(dst + 2 * line), _mm256_castsi256_si128(d1));
    _mm_store_si128((__m128i *)(dst + 3 * line), _mm256_extracti128_si256(d1, 1));

    CALCU_2x8(coeff[1], coeff[6], d0, d1);
    CALCU_2x8(coeff[3], coeff[7], d2, d3);
    CALCU_2x8_ADD_SHIFT(d0, d1, d2, d3, add, shift);

    d0 = _mm256_packs_epi32(d0, d1);
    d1 = _mm256_packs_epi32(d2, d3);

    d0 = _mm256_permute4x64_epi64(d0, 0xd8);
    d1 = _mm256_permute4x64_epi64(d1, 0xd8);

    _mm_store_si128((__m128i *)(dst + 4 * line), _mm256_castsi256_si128(d0));
    _mm_store_si128((__m128i *)(dst + 5 * line), _mm256_extracti128_si256(d0, 1));
    _mm_store_si128((__m128i *)(dst + 6 * line), _mm256_castsi256_si128(d1));
    _mm_store_si128((__m128i *)(dst + 7 * line), _mm256_extracti128_si256(d1, 1));
}

const oapv_fn_tx_t oapv_tbl_fn_txb_avx[2] =
{
    oapv_tx_part_avx,
    NULL
};

///////////////////////////////////////////////////////////////////////////////
// end of encoder code
// ENABLE_ENCODER
///////////////////////////////////////////////////////////////////////////////

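// Transposes the low four 16-bit lanes of eight 128-bit rows (I0..I7) into
// four 128-bit outputs (O0..O3); the caller must declare the tr0_0..tr0_3 and
// tr1_0..tr1_3 temporaries used by this macro.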
#define TRANSPOSE_8x4_16BIT(I0, I1, I2, I3, I4, I5, I6, I7, O0, O1, O2, O3) \
    tr0_0 = _mm_unpacklo_epi16(I0, I1); \
    tr0_1 = _mm_unpacklo_epi16(I2, I3); \
    tr0_2 = _mm_unpacklo_epi16(I4, I5); \
    tr0_3 = _mm_unpacklo_epi16(I6, I7); \
    tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
    tr1_1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
    tr1_2 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
    tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
    O0 = _mm_unpacklo_epi64(tr1_0, tr1_2); \
    O1 = _mm_unpackhi_epi64(tr1_0, tr1_2); \
    O2 = _mm_unpacklo_epi64(tr1_1, tr1_3); \
    O3 = _mm_unpackhi_epi64(tr1_1, tr1_3);

// transpose 8x8: 8 x 8(32bit) --> 8 x 8(16bit)
// O0: row0, row4
// O1: row1, row5
// O2: row2, row6
// O3: row3, row7
#define TRANSPOSE_8x8_32BIT_16BIT(I0, I1, I2, I3, I4, I5, I6, I7, O0, O1, O2, O3) \
    I0 = _mm256_packs_epi32(I0, I4); \
    I1 = _mm256_packs_epi32(I1, I5); \
    I2 = _mm256_packs_epi32(I2, I6); \
    I3 = _mm256_packs_epi32(I3, I7); \
    I4 = _mm256_unpacklo_epi16(I0, I2); \
    I5 = _mm256_unpackhi_epi16(I0, I2); \
    I6 = _mm256_unpacklo_epi16(I1, I3); \
    I7 = _mm256_unpackhi_epi16(I1, I3); \
    I0 = _mm256_unpacklo_epi16(I4, I6); \
    I1 = _mm256_unpackhi_epi16(I4, I6); \
    I2 = _mm256_unpacklo_epi16(I5, I7); \
    I3 = _mm256_unpackhi_epi16(I5, I7); \
    O0 = _mm256_unpacklo_epi64(I0, I2); \
    O1 = _mm256_unpackhi_epi64(I0, I2); \
    O2 = _mm256_unpacklo_epi64(I1, I3); \
    O3 = _mm256_unpackhi_epi64(I1, I3)

// transpose 16x8: 16 x 8(32bit) --> 8 x 16(16bit)
#define TRANSPOSE_16x8_32BIT_16BIT(I00, I01, I02, I03, I04, I05, I06, I07, I08, I09, I10, I11, I12, I13, I14, I15, O0, O1, O2, O3, O4, O5, O6, O7) \
    TRANSPOSE_8x8_32BIT_16BIT(I00, I01, I02, I03, I04, I05, I06, I07, I04, I05, I06, I07); \
    TRANSPOSE_8x8_32BIT_16BIT(I08, I09, I10, I11, I12, I13, I14, I15, I12, I13, I14, I15); \
    O0 = _mm256_insertf128_si256(I04, _mm256_castsi256_si128(I12), 1); \
    O1 = _mm256_insertf128_si256(I05, _mm256_castsi256_si128(I13), 1); \
    O2 = _mm256_insertf128_si256(I06, _mm256_castsi256_si128(I14), 1); \
    O3 = _mm256_insertf128_si256(I07, _mm256_castsi256_si128(I15), 1); \
    O4 = _mm256_insertf128_si256(I12, _mm256_extracti128_si256(I04, 1), 0); \
    O5 = _mm256_insertf128_si256(I13, _mm256_extracti128_si256(I05, 1), 0); \
    O6 = _mm256_insertf128_si256(I14, _mm256_extracti128_si256(I06, 1), 0); \
    O7 = _mm256_insertf128_si256(I15, _mm256_extracti128_si256(I07, 1), 0)

#define set_vals(a, b)  b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a
#define set_vals1(a, b) b, a, b, a, b, a, b, a

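/* One pass of the inverse transform over 'line' columns of 8-point data.
 * Coefficients are split into even (rows 0, 2, 4, 6) and odd (rows 1, 3, 5, 7)
 * parts, combined with the paired constants below via _mm256_madd_epi16 in
 * partial-butterfly form, rounded with 'offset', shifted, transposed and
 * stored to dst, eight columns per loop iteration. */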
static void oapv_itx_part_avx(s16* src, s16* dst, int shift, int line)
{
    const __m256i coeff_p89_p75 = _mm256_setr_epi16(89, 75, 89, 75, 89, 75, 89, 75, 89, 75, 89, 75, 89, 75, 89, 75); // 89, 75
    const __m256i coeff_p50_p18 = _mm256_setr_epi16(50, 18, 50, 18, 50, 18, 50, 18, 50, 18, 50, 18, 50, 18, 50, 18); // 50, 18
    const __m256i coeff_p75_n18 = _mm256_setr_epi16(75, -18, 75, -18, 75, -18, 75, -18, 75, -18, 75, -18, 75, -18, 75, -18); // 75, -18
    const __m256i coeff_n89_n50 = _mm256_setr_epi16(-89, -50, -89, -50, -89, -50, -89, -50, -89, -50, -89, -50, -89, -50, -89, -50); // -89, -50
    const __m256i coeff_p50_n89 = _mm256_setr_epi16(50, -89, 50, -89, 50, -89, 50, -89, 50, -89, 50, -89, 50, -89, 50, -89); // 50, -89
    const __m256i coeff_p18_p75 = _mm256_setr_epi16(18, 75, 18, 75, 18, 75, 18, 75, 18, 75, 18, 75, 18, 75, 18, 75); // 18, 75
    const __m256i coeff_p18_n50 = _mm256_setr_epi16(18, -50, 18, -50, 18, -50, 18, -50, 18, -50, 18, -50, 18, -50, 18, -50); // 18, -50
    const __m256i coeff_p75_n89 = _mm256_setr_epi16(75, -89, 75, -89, 75, -89, 75, -89, 75, -89, 75, -89, 75, -89, 75, -89); // 75, -89
    const __m256i coeff_p64_p64 = _mm256_setr_epi16(64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64); // 64, 64
    const __m256i coeff_p64_n64 = _mm256_setr_epi16(64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64); // 64, -64
    const __m256i coeff_p84_n35 = _mm256_setr_epi16(84, 35, 84, 35, 84, 35, 84, 35, 84, 35, 84, 35, 84, 35, 84, 35); // 84, 35
    const __m256i coeff_p35_n84 = _mm256_setr_epi16(35, -84, 35, -84, 35, -84, 35, -84, 35, -84, 35, -84, 35, -84, 35, -84); // 35, -84

    __m128i s0, s1, s2, s3, s4, s5, s6, s7;
    __m128i ss0, ss1, ss2, ss3;
    __m256i e0, e1, e2, e3, o0, o1, o2, o3, ee0, ee1, eo0, eo1;
    __m256i t0, t1, t2, t3;
    __m256i d0, d1, d2, d3, d4, d5, d6, d7;
    __m256i offset = _mm256_set1_epi32(1 << (shift - 1));
    int j;
    int i_src = line;
    int i_src2 = line << 1;
    int i_src3 = i_src + i_src2;
    int i_src4 = i_src << 2;
    int i_src5 = i_src2 + i_src3;
    int i_src6 = i_src3 << 1;
    int i_src7 = i_src3 + i_src4;
    for (j = 0; j < line; j += 8)
    {
        // O[0] - O[3]
        s1 = _mm_loadu_si128((__m128i*)(src + i_src + j));
        s3 = _mm_loadu_si128((__m128i*)(src + i_src3 + j));
        s5 = _mm_loadu_si128((__m128i*)(src + i_src5 + j));
        s7 = _mm_loadu_si128((__m128i*)(src + i_src7 + j));

        ss0 = _mm_unpacklo_epi16(s1, s3);
        ss1 = _mm_unpackhi_epi16(s1, s3);
        ss2 = _mm_unpacklo_epi16(s5, s7);
        ss3 = _mm_unpackhi_epi16(s5, s7);

        e0 = _mm256_set_m128i(ss1, ss0);
        e1 = _mm256_set_m128i(ss3, ss2);

        t0 = _mm256_madd_epi16(e0, coeff_p89_p75);
        t1 = _mm256_madd_epi16(e1, coeff_p50_p18);
        t2 = _mm256_madd_epi16(e0, coeff_p75_n18);
        t3 = _mm256_madd_epi16(e1, coeff_n89_n50);
        o0 = _mm256_add_epi32(t0, t1);
        o1 = _mm256_add_epi32(t2, t3);

        t0 = _mm256_madd_epi16(e0, coeff_p50_n89);
        t1 = _mm256_madd_epi16(e1, coeff_p18_p75);
        t2 = _mm256_madd_epi16(e0, coeff_p18_n50);
        t3 = _mm256_madd_epi16(e1, coeff_p75_n89);

        o2 = _mm256_add_epi32(t0, t1);
        o3 = _mm256_add_epi32(t2, t3);

        // E[0] - E[3]
        s0 = _mm_loadu_si128((__m128i*)(src + j));
        s2 = _mm_loadu_si128((__m128i*)(src + i_src2 + j));
        s4 = _mm_loadu_si128((__m128i*)(src + i_src4 + j));
        s6 = _mm_loadu_si128((__m128i*)(src + i_src6 + j));

        ss0 = _mm_unpacklo_epi16(s0, s4);
        ss1 = _mm_unpackhi_epi16(s0, s4);
        ss2 = _mm_unpacklo_epi16(s2, s6);
        ss3 = _mm_unpackhi_epi16(s2, s6);

        e0 = _mm256_set_m128i(ss1, ss0);
        e1 = _mm256_set_m128i(ss3, ss2);

        ee0 = _mm256_madd_epi16(e0, coeff_p64_p64);
        ee1 = _mm256_madd_epi16(e0, coeff_p64_n64);
        eo0 = _mm256_madd_epi16(e1, coeff_p84_n35);
        eo1 = _mm256_madd_epi16(e1, coeff_p35_n84);

        e0 = _mm256_add_epi32(ee0, eo0);
        e3 = _mm256_sub_epi32(ee0, eo0);
        e1 = _mm256_add_epi32(ee1, eo1);
        e2 = _mm256_sub_epi32(ee1, eo1);

        e0 = _mm256_add_epi32(e0, offset);
        e3 = _mm256_add_epi32(e3, offset);
        e1 = _mm256_add_epi32(e1, offset);
        e2 = _mm256_add_epi32(e2, offset);

        d0 = _mm256_add_epi32(e0, o0);
        d7 = _mm256_sub_epi32(e0, o0);
        d1 = _mm256_add_epi32(e1, o1);
        d6 = _mm256_sub_epi32(e1, o1);
        d2 = _mm256_add_epi32(e2, o2);
        d5 = _mm256_sub_epi32(e2, o2);
        d3 = _mm256_add_epi32(e3, o3);
        d4 = _mm256_sub_epi32(e3, o3);

        d0 = _mm256_srai_epi32(d0, shift);
        d7 = _mm256_srai_epi32(d7, shift);
        d1 = _mm256_srai_epi32(d1, shift);
        d6 = _mm256_srai_epi32(d6, shift);
        d2 = _mm256_srai_epi32(d2, shift);
        d5 = _mm256_srai_epi32(d5, shift);
        d3 = _mm256_srai_epi32(d3, shift);
        d4 = _mm256_srai_epi32(d4, shift);

        // transpose 8x8 : 8 x 8(32bit) --> 4 x 16(16bit)
        TRANSPOSE_8x8_32BIT_16BIT(d0, d1, d2, d3, d4, d5, d6, d7, d4, d5, d6, d7);
        d0 = _mm256_insertf128_si256(d4, _mm256_castsi256_si128(d5), 1);
        d1 = _mm256_insertf128_si256(d6, _mm256_castsi256_si128(d7), 1);
        d2 = _mm256_insertf128_si256(d5, _mm256_extracti128_si256(d4, 1), 0);
        d3 = _mm256_insertf128_si256(d7, _mm256_extracti128_si256(d6, 1), 0);
        // store line x 8
        _mm256_storeu_si256((__m256i*)dst, d0);
        _mm256_storeu_si256((__m256i*)(dst + 16), d1);
        _mm256_storeu_si256((__m256i*)(dst + 32), d2);
        _mm256_storeu_si256((__m256i*)(dst + 48), d3);
        dst += 64;
    }
}

const oapv_fn_itx_part_t oapv_tbl_fn_itx_part_avx[2] =
{
    oapv_itx_part_avx,
    NULL
};

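/* Complete two-pass inverse transform of one 8x8 block, kept in registers
 * between passes: the first butterfly pass rounds and shifts by shift1, the
 * intermediate result is transposed in place, the second pass repeats the
 * butterfly with shift2, and the final 16-bit samples are written back over
 * src. */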
static void oapv_itx_avx(s16* src, int shift1, int shift2, int line)
{
    const __m256i coeff_p89_p75 = _mm256_setr_epi16(89, 75, 89, 75, 89, 75, 89, 75, 89, 75, 89, 75, 89, 75, 89, 75); // 89, 75
    const __m256i coeff_p50_p18 = _mm256_setr_epi16(50, 18, 50, 18, 50, 18, 50, 18, 50, 18, 50, 18, 50, 18, 50, 18); // 50, 18
    const __m256i coeff_p75_n18 = _mm256_setr_epi16(75, -18, 75, -18, 75, -18, 75, -18, 75, -18, 75, -18, 75, -18, 75, -18); // 75, -18
    const __m256i coeff_n89_n50 = _mm256_setr_epi16(-89, -50, -89, -50, -89, -50, -89, -50, -89, -50, -89, -50, -89, -50, -89, -50); // -89, -50
    const __m256i coeff_p50_n89 = _mm256_setr_epi16(50, -89, 50, -89, 50, -89, 50, -89, 50, -89, 50, -89, 50, -89, 50, -89); // 50, -89
    const __m256i coeff_p18_p75 = _mm256_setr_epi16(18, 75, 18, 75, 18, 75, 18, 75, 18, 75, 18, 75, 18, 75, 18, 75); // 18, 75
    const __m256i coeff_p18_n50 = _mm256_setr_epi16(18, -50, 18, -50, 18, -50, 18, -50, 18, -50, 18, -50, 18, -50, 18, -50); // 18, -50
    const __m256i coeff_p75_n89 = _mm256_setr_epi16(75, -89, 75, -89, 75, -89, 75, -89, 75, -89, 75, -89, 75, -89, 75, -89); // 75, -89
    const __m256i coeff_p64_p64 = _mm256_setr_epi16(64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64); // 64, 64
    const __m256i coeff_p64_n64 = _mm256_setr_epi16(64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64); // 64, -64
    const __m256i coeff_p84_n35 = _mm256_setr_epi16(84, 35, 84, 35, 84, 35, 84, 35, 84, 35, 84, 35, 84, 35, 84, 35); // 84, 35
    const __m256i coeff_p35_n84 = _mm256_setr_epi16(35, -84, 35, -84, 35, -84, 35, -84, 35, -84, 35, -84, 35, -84, 35, -84); // 35, -84

    __m128i s0, s1, s2, s3, s4, s5, s6, s7;
    __m128i ss0, ss1, ss2, ss3;
    __m256i e0, e1, e2, e3, o0, o1, o2, o3, ee0, ee1, eo0, eo1;
    __m256i t0, t1, t2, t3;
    __m256i d0, d1, d2, d3, d4, d5, d6, d7;
    __m256i offset1 = _mm256_set1_epi32(1 << (shift1 - 1));
    __m256i offset2 = _mm256_set1_epi32(1 << (shift2 - 1));
    int i_src = line;
    int i_src2 = line << 1;
    int i_src3 = i_src + i_src2;
    int i_src4 = i_src << 2;
    int i_src5 = i_src2 + i_src3;
    int i_src6 = i_src3 << 1;
    int i_src7 = i_src3 + i_src4;
    {
        // O[0] - O[3]
        s1 = _mm_loadu_si128((__m128i*)(src + i_src));
        s3 = _mm_loadu_si128((__m128i*)(src + i_src3));
        s5 = _mm_loadu_si128((__m128i*)(src + i_src5));
        s7 = _mm_loadu_si128((__m128i*)(src + i_src7));

        ss0 = _mm_unpacklo_epi16(s1, s3);
        ss1 = _mm_unpackhi_epi16(s1, s3);
        ss2 = _mm_unpacklo_epi16(s5, s7);
        ss3 = _mm_unpackhi_epi16(s5, s7);

        e0 = _mm256_set_m128i(ss1, ss0);
        e1 = _mm256_set_m128i(ss3, ss2);

        t0 = _mm256_madd_epi16(e0, coeff_p89_p75);
        t1 = _mm256_madd_epi16(e1, coeff_p50_p18);
        t2 = _mm256_madd_epi16(e0, coeff_p75_n18);
        t3 = _mm256_madd_epi16(e1, coeff_n89_n50);
        o0 = _mm256_add_epi32(t0, t1);
        o1 = _mm256_add_epi32(t2, t3);

        t0 = _mm256_madd_epi16(e0, coeff_p50_n89);
        t1 = _mm256_madd_epi16(e1, coeff_p18_p75);
        t2 = _mm256_madd_epi16(e0, coeff_p18_n50);
        t3 = _mm256_madd_epi16(e1, coeff_p75_n89);

        o2 = _mm256_add_epi32(t0, t1);
        o3 = _mm256_add_epi32(t2, t3);

        // E[0] - E[3]
        s0 = _mm_loadu_si128((__m128i*)(src));
        s2 = _mm_loadu_si128((__m128i*)(src + i_src2));
        s4 = _mm_loadu_si128((__m128i*)(src + i_src4));
        s6 = _mm_loadu_si128((__m128i*)(src + i_src6));

        ss0 = _mm_unpacklo_epi16(s0, s4);
        ss1 = _mm_unpackhi_epi16(s0, s4);
        ss2 = _mm_unpacklo_epi16(s2, s6);
        ss3 = _mm_unpackhi_epi16(s2, s6);

        e0 = _mm256_set_m128i(ss1, ss0);
        e1 = _mm256_set_m128i(ss3, ss2);

        ee0 = _mm256_madd_epi16(e0, coeff_p64_p64);
        ee1 = _mm256_madd_epi16(e0, coeff_p64_n64);
        eo0 = _mm256_madd_epi16(e1, coeff_p84_n35);
        eo1 = _mm256_madd_epi16(e1, coeff_p35_n84);

        e0 = _mm256_add_epi32(ee0, eo0);
        e3 = _mm256_sub_epi32(ee0, eo0);
        e1 = _mm256_add_epi32(ee1, eo1);
        e2 = _mm256_sub_epi32(ee1, eo1);

        e0 = _mm256_add_epi32(e0, offset1);
        e3 = _mm256_add_epi32(e3, offset1);
        e1 = _mm256_add_epi32(e1, offset1);
        e2 = _mm256_add_epi32(e2, offset1);

        d0 = _mm256_add_epi32(e0, o0);
        d7 = _mm256_sub_epi32(e0, o0);
        d1 = _mm256_add_epi32(e1, o1);
        d6 = _mm256_sub_epi32(e1, o1);
        d2 = _mm256_add_epi32(e2, o2);
        d5 = _mm256_sub_epi32(e2, o2);
        d3 = _mm256_add_epi32(e3, o3);
        d4 = _mm256_sub_epi32(e3, o3);

        d0 = _mm256_srai_epi32(d0, shift1);
        d7 = _mm256_srai_epi32(d7, shift1);
        d1 = _mm256_srai_epi32(d1, shift1);
        d6 = _mm256_srai_epi32(d6, shift1);
        d2 = _mm256_srai_epi32(d2, shift1);
        d5 = _mm256_srai_epi32(d5, shift1);
        d3 = _mm256_srai_epi32(d3, shift1);
        d4 = _mm256_srai_epi32(d4, shift1);

        // transpose 8x8 : 8 x 8(32bit) --> 4 x 16(16bit)
        TRANSPOSE_8x8_32BIT_16BIT(d0, d1, d2, d3, d4, d5, d6, d7, d4, d5, d6, d7);
        d0 = _mm256_insertf128_si256(d4, _mm256_castsi256_si128(d5), 1);
        d1 = _mm256_insertf128_si256(d6, _mm256_castsi256_si128(d7), 1);
        d2 = _mm256_insertf128_si256(d5, _mm256_extracti128_si256(d4, 1), 0);
        d3 = _mm256_insertf128_si256(d7, _mm256_extracti128_si256(d6, 1), 0);
    }
    {
        // O[0] - O[3]
        s1 = _mm256_extracti128_si256(d0, 1);
        s3 = _mm256_extracti128_si256(d1, 1);
        s5 = _mm256_extracti128_si256(d2, 1);
        s7 = _mm256_extracti128_si256(d3, 1);

        ss0 = _mm_unpacklo_epi16(s1, s3);
        ss1 = _mm_unpackhi_epi16(s1, s3);
        ss2 = _mm_unpacklo_epi16(s5, s7);
        ss3 = _mm_unpackhi_epi16(s5, s7);

        e0 = _mm256_set_m128i(ss1, ss0);
        e1 = _mm256_set_m128i(ss3, ss2);

        t0 = _mm256_madd_epi16(e0, coeff_p89_p75);
        t1 = _mm256_madd_epi16(e1, coeff_p50_p18);
        t2 = _mm256_madd_epi16(e0, coeff_p75_n18);
        t3 = _mm256_madd_epi16(e1, coeff_n89_n50);
        o0 = _mm256_add_epi32(t0, t1);
        o1 = _mm256_add_epi32(t2, t3);

        t0 = _mm256_madd_epi16(e0, coeff_p50_n89);
        t1 = _mm256_madd_epi16(e1, coeff_p18_p75);
        t2 = _mm256_madd_epi16(e0, coeff_p18_n50);
        t3 = _mm256_madd_epi16(e1, coeff_p75_n89);

        o2 = _mm256_add_epi32(t0, t1);
        o3 = _mm256_add_epi32(t2, t3);

        // E[0] - E[3]
        s0 = _mm256_extracti128_si256(d0, 0);
        s2 = _mm256_extracti128_si256(d1, 0);
        s4 = _mm256_extracti128_si256(d2, 0);
        s6 = _mm256_extracti128_si256(d3, 0);

        ss0 = _mm_unpacklo_epi16(s0, s4);
        ss1 = _mm_unpackhi_epi16(s0, s4);
        ss2 = _mm_unpacklo_epi16(s2, s6);
        ss3 = _mm_unpackhi_epi16(s2, s6);

        e0 = _mm256_set_m128i(ss1, ss0);
        e1 = _mm256_set_m128i(ss3, ss2);

        ee0 = _mm256_madd_epi16(e0, coeff_p64_p64);
        ee1 = _mm256_madd_epi16(e0, coeff_p64_n64);
        eo0 = _mm256_madd_epi16(e1, coeff_p84_n35);
        eo1 = _mm256_madd_epi16(e1, coeff_p35_n84);

        e0 = _mm256_add_epi32(ee0, eo0);
        e3 = _mm256_sub_epi32(ee0, eo0);
        e1 = _mm256_add_epi32(ee1, eo1);
        e2 = _mm256_sub_epi32(ee1, eo1);

        e0 = _mm256_add_epi32(e0, offset2);
        e3 = _mm256_add_epi32(e3, offset2);
        e1 = _mm256_add_epi32(e1, offset2);
        e2 = _mm256_add_epi32(e2, offset2);

        d0 = _mm256_add_epi32(e0, o0);
        d7 = _mm256_sub_epi32(e0, o0);
        d1 = _mm256_add_epi32(e1, o1);
        d6 = _mm256_sub_epi32(e1, o1);
        d2 = _mm256_add_epi32(e2, o2);
        d5 = _mm256_sub_epi32(e2, o2);
        d3 = _mm256_add_epi32(e3, o3);
        d4 = _mm256_sub_epi32(e3, o3);

        d0 = _mm256_srai_epi32(d0, shift2);
        d7 = _mm256_srai_epi32(d7, shift2);
        d1 = _mm256_srai_epi32(d1, shift2);
        d6 = _mm256_srai_epi32(d6, shift2);
        d2 = _mm256_srai_epi32(d2, shift2);
        d5 = _mm256_srai_epi32(d5, shift2);
        d3 = _mm256_srai_epi32(d3, shift2);
        d4 = _mm256_srai_epi32(d4, shift2);

        // transpose 8x8 : 8 x 8(32bit) --> 4 x 16(16bit)
        TRANSPOSE_8x8_32BIT_16BIT(d0, d1, d2, d3, d4, d5, d6, d7, d4, d5, d6, d7);
        d0 = _mm256_insertf128_si256(d4, _mm256_castsi256_si128(d5), 1);
        d1 = _mm256_insertf128_si256(d6, _mm256_castsi256_si128(d7), 1);
        d2 = _mm256_insertf128_si256(d5, _mm256_extracti128_si256(d4, 1), 0);
        d3 = _mm256_insertf128_si256(d7, _mm256_extracti128_si256(d6, 1), 0);

        // store line x 8
        _mm256_storeu_si256((__m256i*)src, d0);
        _mm256_storeu_si256((__m256i*)(src + 16), d1);
        _mm256_storeu_si256((__m256i*)(src + 32), d2);
        _mm256_storeu_si256((__m256i*)(src + 48), d3);
    }
}

const oapv_fn_itx_t oapv_tbl_fn_itx_avx[2] =
{
    oapv_itx_avx,
    NULL
};

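/* Sign-extends four 32-bit lanes of 'a' and 'b' to 64 bits, multiplies them
 * pairwise (_mm256_mul_epi32 uses the low 32 bits of each 64-bit lane, which
 * hold the original values after _mm256_cvtepi32_epi64) and adds the 64-bit
 * rounding offset, keeping the full precision needed by the quantizer. */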
__m256i mul_128i_to_256i_and_add(__m256i offset_vector, __m128i a, __m128i b)
{
    __m256i a_64 = _mm256_cvtepi32_epi64(a);
    __m256i b_64 = _mm256_cvtepi32_epi64(b);
    __m256i result = _mm256_mul_epi32(a_64, b_64);
    result = _mm256_add_epi64(result, offset_vector);
    return result;
}

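/* Forward quantization: for each coefficient,
 *     lev = (abs(coef) * q_matrix + (deadzone_offset << (shift - 9))) >> shift
 * with shift = QUANT_SHIFT + tr_shift + qp / 6, the sign restored afterwards
 * and the result clipped to the s16 range before being stored back in place.
 * The multiply/add runs in 64-bit lanes (see the helper above) and the 64-bit
 * results are repacked into eight 16-bit levels per loop iteration. */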
static int oapv_quant_avx(s16* coef, u8 qp, int q_matrix[OAPV_BLK_D], int log2_w, int log2_h, int bit_depth, int deadzone_offset)
{
    s64 offset;
    int shift;
    int tr_shift;

    int log2_size = (log2_w + log2_h) >> 1;
    tr_shift = MAX_TX_DYNAMIC_RANGE - bit_depth - log2_size;
    shift = QUANT_SHIFT + tr_shift + (qp / 6);
    offset = (s64)deadzone_offset << (shift - 9);
    __m256i offset_vector = _mm256_set1_epi64x(offset);
    __m256i reg_minval_int16 = _mm256_set1_epi32(-32768);
    __m256i reg_maxval_int16 = _mm256_set1_epi32(32767);

    int pixels = (1 << (log2_w + log2_h));
    int i;
    __m256i shuffle0 = _mm256_setr_epi32(1, 3, 5, 7, 0, 2, 4, 6);
    __m256i shuffle1 = _mm256_setr_epi8(
        0, 1, 4, 5, 8, 9, 12, 13,
        -128, -128, -128, -128, -128, -128, -128, -128,
        -128, -128, -128, -128, -128, -128, -128, -128,
        -128, -128, -128, -128, -128, -128, -128, -128);
    __m256i shuffle2 = _mm256_setr_epi8(
        -128, -128, -128, -128, -128, -128, -128, -128,
        0, 1, 4, 5, 8, 9, 12, 13,
        -128, -128, -128, -128, -128, -128, -128, -128,
        -128, -128, -128, -128, -128, -128, -128, -128);

    for (i = 0; i < pixels; i += 8)
    {
        // Load first row
        __m256i quant_matrix = _mm256_lddqu_si256((__m256i*)(q_matrix + i));
        __m128i coef_row = _mm_lddqu_si128((__m128i*)(coef + i));

        // Extract sign
        __m128i sign_mask = _mm_srai_epi16(coef_row, 15);
        __m256i sign_mask_ext = _mm256_cvtepi16_epi32(sign_mask);

        // Convert to 32 bits and take abs()
        __m256i coef_row_ext = _mm256_cvtepi16_epi32(coef_row);
        __m256i coef_row_abs = _mm256_abs_epi32(coef_row_ext);

        // Multiply coeff with quant values, add offset to result and shift
        __m256i lev1_low = mul_128i_to_256i_and_add(offset_vector, _mm256_castsi256_si128(coef_row_abs), _mm256_castsi256_si128(quant_matrix));
        __m256i lev1_high = mul_128i_to_256i_and_add(offset_vector, _mm256_extracti128_si256(coef_row_abs, 1), _mm256_extracti128_si256(quant_matrix, 1));
        __m256i lev2_low = _mm256_srli_epi64(lev1_low, shift);
        __m256i lev2_high = _mm256_srli_epi64(lev1_high, shift);

        // First level of combination
        lev2_low = _mm256_slli_epi64(lev2_low, 32);
        __m256i combined = _mm256_or_si256(lev2_low, lev2_high);
        __m256i levx = _mm256_permutevar8x32_epi32(combined, shuffle0);

        // Apply sign and clipping
        levx = _mm256_sub_epi32(_mm256_xor_si256(levx, sign_mask_ext), sign_mask_ext);
        levx = _mm256_max_epi32(levx, reg_minval_int16);
        levx = _mm256_min_epi32(levx, reg_maxval_int16);

        // Second level of combination
        __m256i levx_low_sh = _mm256_shuffle_epi8(levx, shuffle1);
        __m128i levx_high = _mm256_extracti128_si256(levx, 1);
        __m256i levx_high_ext = _mm256_castsi128_si256(levx_high);
        __m256i levx_high_sh = _mm256_shuffle_epi8(levx_high_ext, shuffle2);
        levx = _mm256_or_si256(levx_high_sh, levx_low_sh);

        // store in coef
        __m128i lev4 = _mm256_castsi256_si128(levx);
        _mm_storeu_si128((__m128i*)(coef + i), lev4);
    }
    return OAPV_OK;
}

const oapv_fn_quant_t oapv_tbl_fn_quant_avx[2] =
{
    oapv_quant_avx,
    NULL
};

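/* Inverse quantization: coef = clip_s16((coef * q_matrix + round) >> shift)
 * when shift > 0, or clip_s16((coef * q_matrix) << -shift) otherwise. Eight
 * coefficients are processed per iteration in 32-bit lanes and packed back to
 * 16 bits through the byte shuffle and OR at the end of each loop body. */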
static void oapv_dquant_avx(s16 *coef, s16 q_matrix[OAPV_BLK_D], int log2_w, int log2_h, s8 shift)
{
    int i;
    int pixels = (1 << (log2_w + log2_h));
    __m256i shuffle = _mm256_setr_epi8(
        0, 1, 4, 5, 8, 9, 12, 13,
        -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1,
        0, 1, 4, 5, 8, 9, 12, 13);
    __m256i reg_minval_int16 = _mm256_set1_epi32(-32768);
    __m256i reg_maxval_int16 = _mm256_set1_epi32(32767);
    if (shift > 0)
    {
        s32 offset = (1 << (shift - 1));
        __m256i offset_1 = _mm256_set1_epi32(offset);
        for (i = 0; i < pixels; i += 8)
        {
            __m256i cur_q_matrix = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)(q_matrix + i)));
            __m256i coef_8_val_act = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)(coef + i)));

            __m256i lev1 = _mm256_mullo_epi32(coef_8_val_act, cur_q_matrix);
            __m256i lev2 = _mm256_add_epi32(lev1, offset_1);
            __m256i lev3 = _mm256_srai_epi32(lev2, shift);

            lev3 = _mm256_max_epi32(lev3, reg_minval_int16);
            lev3 = _mm256_min_epi32(lev3, reg_maxval_int16);

            lev3 = _mm256_shuffle_epi8(lev3, shuffle);
            __m128i low = _mm256_castsi256_si128(lev3);
            __m128i high = _mm256_extracti128_si256(lev3, 1);
            __m128i lev4 = _mm_or_si128(low, high);

            _mm_storeu_si128((__m128i *)(coef + i), lev4);
        }
    }
    else
    {
        int left_shift = -shift;
        for (i = 0; i < pixels; i += 8)
        {
            __m256i cur_q_matrix = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)(q_matrix + i)));
            __m256i coef_8_val_act = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)(coef + i)));

            __m256i lev1 = _mm256_mullo_epi32(coef_8_val_act, cur_q_matrix);
            __m256i lev3 = _mm256_slli_epi32(lev1, left_shift);

            lev3 = _mm256_max_epi32(lev3, reg_minval_int16);
            lev3 = _mm256_min_epi32(lev3, reg_maxval_int16);

            lev3 = _mm256_shuffle_epi8(lev3, shuffle);
            __m128i low = _mm256_castsi256_si128(lev3);
            __m128i high = _mm256_extracti128_si256(lev3, 1);
            __m128i lev4 = _mm_or_si128(low, high);

            _mm_storeu_si128((__m128i *)(coef + i), lev4);
        }
    }
}

const oapv_fn_dquant_t oapv_tbl_fn_dquant_avx[2] =
{
    oapv_dquant_avx,
    NULL,
};

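/* Adds a scaled correction term to 64 inverse-transform samples:
 * dst[j] = src[j] + ((oapv_itrans_diff[itrans_diff_idx][j] * diff_step +
 * (1 << (shift - 1))) >> shift), where oapv_itrans_diff is a lookup table
 * presumably declared via oapv_def.h. */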
void oapv_adjust_itrans_avx(int* src, int* dst, int itrans_diff_idx, int diff_step, int shift)
{
    __m256i v0 = _mm256_set1_epi32(diff_step);
    __m256i v1 = _mm256_set1_epi32(1 << (shift - 1));
    __m256i s0, s1;

    for (int j = 0; j < 64; j += 8) {
        s0 = _mm256_loadu_si256((const __m256i*)(src + j));
        s1 = _mm256_loadu_si256((const __m256i*)(oapv_itrans_diff[itrans_diff_idx] + j));
        s1 = _mm256_mullo_epi32(s1, v0);
        s1 = _mm256_add_epi32(s1, v1);
        s1 = _mm256_srai_epi32(s1, shift);
        s1 = _mm256_add_epi32(s0, s1);
        _mm256_storeu_si256((__m256i*)(dst + j), s1);
    }
}

const oapv_fn_itx_adj_t oapv_tbl_fn_itx_adj_avx[2] =
{
    oapv_adjust_itrans_avx,
    NULL,
};