/*
 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <math.h>
#include <stdlib.h>
#include <string.h>

#include "vpx_dsp/ppc/bitdepth_conversion_vsx.h"
#include "vpx_dsp/ppc/types_vsx.h"
#include "vpx_dsp/ppc/inv_txfm_vsx.h"

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/inv_txfm.h"

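// Per-lane copies of the cospi_N_64 constants from vpx_dsp/txfm_common.h,
// i.e. round(16384 * cos(N * Pi / 64)), splatted across all eight 16-bit
// lanes; the *m_v variants hold the negated value so it can be fed directly
// to vec_mule/vec_mulo.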
static const int16x8_t cospi1_v = { 16364, 16364, 16364, 16364,
                                    16364, 16364, 16364, 16364 };
static const int16x8_t cospi1m_v = { -16364, -16364, -16364, -16364,
                                     -16364, -16364, -16364, -16364 };
static const int16x8_t cospi2_v = { 16305, 16305, 16305, 16305,
                                    16305, 16305, 16305, 16305 };
static const int16x8_t cospi2m_v = { -16305, -16305, -16305, -16305,
                                     -16305, -16305, -16305, -16305 };
static const int16x8_t cospi3_v = { 16207, 16207, 16207, 16207,
                                    16207, 16207, 16207, 16207 };
static const int16x8_t cospi4_v = { 16069, 16069, 16069, 16069,
                                    16069, 16069, 16069, 16069 };
static const int16x8_t cospi4m_v = { -16069, -16069, -16069, -16069,
                                     -16069, -16069, -16069, -16069 };
static const int16x8_t cospi5_v = { 15893, 15893, 15893, 15893,
                                    15893, 15893, 15893, 15893 };
static const int16x8_t cospi5m_v = { -15893, -15893, -15893, -15893,
                                     -15893, -15893, -15893, -15893 };
static const int16x8_t cospi6_v = { 15679, 15679, 15679, 15679,
                                    15679, 15679, 15679, 15679 };
static const int16x8_t cospi7_v = { 15426, 15426, 15426, 15426,
                                    15426, 15426, 15426, 15426 };
static const int16x8_t cospi8_v = { 15137, 15137, 15137, 15137,
                                    15137, 15137, 15137, 15137 };
static const int16x8_t cospi8m_v = { -15137, -15137, -15137, -15137,
                                     -15137, -15137, -15137, -15137 };
static const int16x8_t cospi9_v = { 14811, 14811, 14811, 14811,
                                    14811, 14811, 14811, 14811 };
static const int16x8_t cospi9m_v = { -14811, -14811, -14811, -14811,
                                     -14811, -14811, -14811, -14811 };
static const int16x8_t cospi10_v = { 14449, 14449, 14449, 14449,
                                     14449, 14449, 14449, 14449 };
static const int16x8_t cospi10m_v = { -14449, -14449, -14449, -14449,
                                      -14449, -14449, -14449, -14449 };
static const int16x8_t cospi11_v = { 14053, 14053, 14053, 14053,
                                     14053, 14053, 14053, 14053 };
static const int16x8_t cospi12_v = { 13623, 13623, 13623, 13623,
                                     13623, 13623, 13623, 13623 };
static const int16x8_t cospi12m_v = { -13623, -13623, -13623, -13623,
                                      -13623, -13623, -13623, -13623 };
static const int16x8_t cospi13_v = { 13160, 13160, 13160, 13160,
                                     13160, 13160, 13160, 13160 };
static const int16x8_t cospi13m_v = { -13160, -13160, -13160, -13160,
                                      -13160, -13160, -13160, -13160 };
static const int16x8_t cospi14_v = { 12665, 12665, 12665, 12665,
                                     12665, 12665, 12665, 12665 };
static const int16x8_t cospi15_v = { 12140, 12140, 12140, 12140,
                                     12140, 12140, 12140, 12140 };
static const int16x8_t cospi16_v = { 11585, 11585, 11585, 11585,
                                     11585, 11585, 11585, 11585 };
static const int16x8_t cospi16m_v = { -11585, -11585, -11585, -11585,
                                      -11585, -11585, -11585, -11585 };
static const int16x8_t cospi17_v = { 11003, 11003, 11003, 11003,
                                     11003, 11003, 11003, 11003 };
static const int16x8_t cospi17m_v = { -11003, -11003, -11003, -11003,
                                      -11003, -11003, -11003, -11003 };
static const int16x8_t cospi18_v = { 10394, 10394, 10394, 10394,
                                     10394, 10394, 10394, 10394 };
static const int16x8_t cospi18m_v = { -10394, -10394, -10394, -10394,
                                      -10394, -10394, -10394, -10394 };
static const int16x8_t cospi19_v = { 9760, 9760, 9760, 9760,
                                     9760, 9760, 9760, 9760 };
static const int16x8_t cospi20_v = { 9102, 9102, 9102, 9102,
                                     9102, 9102, 9102, 9102 };
static const int16x8_t cospi20m_v = { -9102, -9102, -9102, -9102,
                                      -9102, -9102, -9102, -9102 };
static const int16x8_t cospi21_v = { 8423, 8423, 8423, 8423,
                                     8423, 8423, 8423, 8423 };
static const int16x8_t cospi21m_v = { -8423, -8423, -8423, -8423,
                                      -8423, -8423, -8423, -8423 };
static const int16x8_t cospi22_v = { 7723, 7723, 7723, 7723,
                                     7723, 7723, 7723, 7723 };
static const int16x8_t cospi23_v = { 7005, 7005, 7005, 7005,
                                     7005, 7005, 7005, 7005 };
static const int16x8_t cospi24_v = { 6270, 6270, 6270, 6270,
                                     6270, 6270, 6270, 6270 };
static const int16x8_t cospi24m_v = { -6270, -6270, -6270, -6270,
                                      -6270, -6270, -6270, -6270 };
static const int16x8_t cospi25_v = { 5520, 5520, 5520, 5520,
                                     5520, 5520, 5520, 5520 };
static const int16x8_t cospi25m_v = { -5520, -5520, -5520, -5520,
                                      -5520, -5520, -5520, -5520 };
static const int16x8_t cospi26_v = { 4756, 4756, 4756, 4756,
                                     4756, 4756, 4756, 4756 };
static const int16x8_t cospi26m_v = { -4756, -4756, -4756, -4756,
                                      -4756, -4756, -4756, -4756 };
static const int16x8_t cospi27_v = { 3981, 3981, 3981, 3981,
                                     3981, 3981, 3981, 3981 };
static const int16x8_t cospi28_v = { 3196, 3196, 3196, 3196,
                                     3196, 3196, 3196, 3196 };
static const int16x8_t cospi28m_v = { -3196, -3196, -3196, -3196,
                                      -3196, -3196, -3196, -3196 };
static const int16x8_t cospi29_v = { 2404, 2404, 2404, 2404,
                                     2404, 2404, 2404, 2404 };
static const int16x8_t cospi29m_v = { -2404, -2404, -2404, -2404,
                                      -2404, -2404, -2404, -2404 };
static const int16x8_t cospi30_v = { 1606, 1606, 1606, 1606,
                                     1606, 1606, 1606, 1606 };
static const int16x8_t cospi31_v = { 804, 804, 804, 804, 804, 804, 804, 804 };

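// sinpi_k_9 constants for the 4-point ADST, splatted across all lanes:
// round(16384 * sqrt(2) * sin(k * Pi / 9) * 2 / 3), as in
// vpx_dsp/txfm_common.h.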
static const int16x8_t sinpi_1_9_v = { 5283, 5283, 5283, 5283,
                                       5283, 5283, 5283, 5283 };
static const int16x8_t sinpi_2_9_v = { 9929, 9929, 9929, 9929,
                                       9929, 9929, 9929, 9929 };
static const int16x8_t sinpi_3_9_v = { 13377, 13377, 13377, 13377,
                                       13377, 13377, 13377, 13377 };
static const int16x8_t sinpi_4_9_v = { 15212, 15212, 15212, 15212,
                                       15212, 15212, 15212, 15212 };

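// vec_perm selectors used by the transposes: byte indexes 0x00-0x0F pick from
// the first operand, 0x10-0x1F from the second, so tr8_mask0 concatenates the
// first eight bytes of each operand and tr8_mask1 the last eight.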
static uint8x16_t tr8_mask0 = {
  0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
};

static uint8x16_t tr8_mask1 = {
  0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF,
  0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F
};

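// Fixed-point rounding matching dct_const_round_shift(): add 2^13, then
// arithmetic-shift right by DCT_CONST_BITS (14).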
#define ROUND_SHIFT_INIT                                                \
  const int32x4_t shift = vec_sl(vec_splat_s32(1), vec_splat_u32(13));  \
  const uint32x4_t shift14 = vec_splat_u32(14);

#define DCT_CONST_ROUND_SHIFT(vec) vec = vec_sra(vec_add(vec, shift), shift14);

#define PIXEL_ADD_INIT               \
  int16x8_t add8 = vec_splat_s16(8); \
  uint16x8_t shift4 = vec_splat_u16(4);

#define PIXEL_ADD4(out, in) out = vec_sra(vec_add(in, add8), shift4);

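// 4-point IDCT over two registers, each holding two transposed 4-sample rows.
// vec_mergeh/vec_mergel interleave the data so one vec_mule/vec_mulo pass
// yields the 32-bit butterfly products: (x0 +/- x2) * cospi_16_64 for the
// even half and x1 * cospi_24_64 -/+ x3 * cospi_8_64 for the odd half, each
// rounded with DCT_CONST_ROUND_SHIFT before packing back to 16 bits.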
#define IDCT4(in0, in1, out0, out1)                                           \
  t0 = vec_add(in0, in1);                                                     \
  t1 = vec_sub(in0, in1);                                                     \
  tmp16_0 = vec_mergeh(t0, t1);                                               \
  temp1 = vec_sra(vec_add(vec_mule(tmp16_0, cospi16_v), shift), shift14);     \
  temp2 = vec_sra(vec_add(vec_mulo(tmp16_0, cospi16_v), shift), shift14);     \
                                                                              \
  tmp16_0 = vec_mergel(in0, in1);                                             \
  temp3 = vec_sub(vec_mule(tmp16_0, cospi24_v), vec_mulo(tmp16_0, cospi8_v)); \
  DCT_CONST_ROUND_SHIFT(temp3);                                               \
  temp4 = vec_add(vec_mule(tmp16_0, cospi8_v), vec_mulo(tmp16_0, cospi24_v)); \
  DCT_CONST_ROUND_SHIFT(temp4);                                               \
                                                                              \
  step0 = vec_packs(temp1, temp2);                                            \
  step1 = vec_packs(temp4, temp3);                                            \
  out0 = vec_add(step0, step1);                                               \
  out1 = vec_sub(step0, step1);                                               \
  out1 = vec_perm(out1, out1, mask0);

#define PACK_STORE(v0, v1)                                  \
  tmp16_0 = vec_add(vec_perm(d_u0, d_u1, tr8_mask0), v0);   \
  tmp16_1 = vec_add(vec_perm(d_u2, d_u3, tr8_mask0), v1);   \
  output_v = vec_packsu(tmp16_0, tmp16_1);                  \
                                                            \
  vec_vsx_st(output_v, 0, tmp_dest);                        \
  for (i = 0; i < 4; i++)                                   \
    for (j = 0; j < 4; j++) dest[j * stride + i] = tmp_dest[j * 4 + i];

void vpx_round_store4x4_vsx(int16x8_t *in, int16x8_t *out, uint8_t *dest,
                            int stride) {
  int i, j;
  uint8x16_t dest0 = vec_vsx_ld(0, dest);
  uint8x16_t dest1 = vec_vsx_ld(stride, dest);
  uint8x16_t dest2 = vec_vsx_ld(2 * stride, dest);
  uint8x16_t dest3 = vec_vsx_ld(3 * stride, dest);
  uint8x16_t zerov = vec_splat_u8(0);
  int16x8_t d_u0 = (int16x8_t)vec_mergeh(dest0, zerov);
  int16x8_t d_u1 = (int16x8_t)vec_mergeh(dest1, zerov);
  int16x8_t d_u2 = (int16x8_t)vec_mergeh(dest2, zerov);
  int16x8_t d_u3 = (int16x8_t)vec_mergeh(dest3, zerov);
  int16x8_t tmp16_0, tmp16_1;
  uint8x16_t output_v;
  uint8_t tmp_dest[16];
  PIXEL_ADD_INIT;

  PIXEL_ADD4(out[0], in[0]);
  PIXEL_ADD4(out[1], in[1]);

  PACK_STORE(out[0], out[1]);
}

void vpx_idct4_vsx(int16x8_t *in, int16x8_t *out) {
  int32x4_t temp1, temp2, temp3, temp4;
  int16x8_t step0, step1, tmp16_0;
  uint8x16_t mask0 = { 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF,
                       0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 };
  int16x8_t t0 = vec_mergeh(in[0], in[1]);
  int16x8_t t1 = vec_mergel(in[0], in[1]);
  ROUND_SHIFT_INIT

  in[0] = vec_mergeh(t0, t1);
  in[1] = vec_mergel(t0, t1);

  IDCT4(in[0], in[1], out[0], out[1]);
}

void vpx_idct4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  int16x8_t in[2], out[2];

  in[0] = load_tran_low(0, input);
  in[1] = load_tran_low(8 * sizeof(*input), input);
  // Rows
  vpx_idct4_vsx(in, out);

  // Columns
  vpx_idct4_vsx(out, in);

  vpx_round_store4x4_vsx(in, out, dest, stride);
}

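// Full 8x8 transpose in three rounds of interleaves: 16-bit merges, 32-bit
// merges, then a 64-bit gather through tr8_mask0/tr8_mask1.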
#define TRANSPOSE8x8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
                     out3, out4, out5, out6, out7)                             \
  out0 = vec_mergeh(in0, in1);                                                 \
  out1 = vec_mergel(in0, in1);                                                 \
  out2 = vec_mergeh(in2, in3);                                                 \
  out3 = vec_mergel(in2, in3);                                                 \
  out4 = vec_mergeh(in4, in5);                                                 \
  out5 = vec_mergel(in4, in5);                                                 \
  out6 = vec_mergeh(in6, in7);                                                 \
  out7 = vec_mergel(in6, in7);                                                 \
  in0 = (int16x8_t)vec_mergeh((int32x4_t)out0, (int32x4_t)out2);               \
  in1 = (int16x8_t)vec_mergel((int32x4_t)out0, (int32x4_t)out2);               \
  in2 = (int16x8_t)vec_mergeh((int32x4_t)out1, (int32x4_t)out3);               \
  in3 = (int16x8_t)vec_mergel((int32x4_t)out1, (int32x4_t)out3);               \
  in4 = (int16x8_t)vec_mergeh((int32x4_t)out4, (int32x4_t)out6);               \
  in5 = (int16x8_t)vec_mergel((int32x4_t)out4, (int32x4_t)out6);               \
  in6 = (int16x8_t)vec_mergeh((int32x4_t)out5, (int32x4_t)out7);               \
  in7 = (int16x8_t)vec_mergel((int32x4_t)out5, (int32x4_t)out7);               \
  out0 = vec_perm(in0, in4, tr8_mask0);                                        \
  out1 = vec_perm(in0, in4, tr8_mask1);                                        \
  out2 = vec_perm(in1, in5, tr8_mask0);                                        \
  out3 = vec_perm(in1, in5, tr8_mask1);                                        \
  out4 = vec_perm(in2, in6, tr8_mask0);                                        \
  out5 = vec_perm(in2, in6, tr8_mask1);                                        \
  out6 = vec_perm(in3, in7, tr8_mask0);                                        \
  out7 = vec_perm(in3, in7, tr8_mask1);

/* Computes temp1 = step[x] * cospi_q - step[y] * cospi_z and
 * temp2 = step[x] * cospi_z + step[y] * cospi_q. */
#define STEP8_0(inpt0, inpt1, outpt0, outpt1, cospi0, cospi1)             \
  tmp16_0 = vec_mergeh(inpt0, inpt1);                                     \
  tmp16_1 = vec_mergel(inpt0, inpt1);                                     \
  temp10 = vec_sub(vec_mule(tmp16_0, cospi0), vec_mulo(tmp16_0, cospi1)); \
  temp11 = vec_sub(vec_mule(tmp16_1, cospi0), vec_mulo(tmp16_1, cospi1)); \
  DCT_CONST_ROUND_SHIFT(temp10);                                          \
  DCT_CONST_ROUND_SHIFT(temp11);                                          \
  outpt0 = vec_packs(temp10, temp11);                                     \
  temp10 = vec_add(vec_mule(tmp16_0, cospi1), vec_mulo(tmp16_0, cospi0)); \
  temp11 = vec_add(vec_mule(tmp16_1, cospi1), vec_mulo(tmp16_1, cospi0)); \
  DCT_CONST_ROUND_SHIFT(temp10);                                          \
  DCT_CONST_ROUND_SHIFT(temp11);                                          \
  outpt1 = vec_packs(temp10, temp11);

#define STEP8_1(inpt0, inpt1, outpt0, outpt1, cospi) \
  tmp16_2 = vec_sub(inpt0, inpt1);                   \
  tmp16_3 = vec_add(inpt0, inpt1);                   \
  tmp16_0 = vec_mergeh(tmp16_2, tmp16_3);            \
  tmp16_1 = vec_mergel(tmp16_2, tmp16_3);            \
  temp10 = vec_mule(tmp16_0, cospi);                 \
  temp11 = vec_mule(tmp16_1, cospi);                 \
  DCT_CONST_ROUND_SHIFT(temp10);                     \
  DCT_CONST_ROUND_SHIFT(temp11);                     \
  outpt0 = vec_packs(temp10, temp11);                \
  temp10 = vec_mulo(tmp16_0, cospi);                 \
  temp11 = vec_mulo(tmp16_1, cospi);                 \
  DCT_CONST_ROUND_SHIFT(temp10);                     \
  DCT_CONST_ROUND_SHIFT(temp11);                     \
  outpt1 = vec_packs(temp10, temp11);

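// 8-point IDCT matching idct8_c(): stage 1 rotates the odd inputs by the
// (cospi_28_64, cospi_4_64) and (cospi_12_64, cospi_20_64) pairs, stage 2
// handles the even half, and stages 3-4 are the closing butterflies.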
#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7)    \
  /* stage 1 */                                          \
  step0 = in0;                                           \
  step2 = in4;                                           \
  step1 = in2;                                           \
  step3 = in6;                                           \
                                                         \
  STEP8_0(in1, in7, step4, step7, cospi28_v, cospi4_v);  \
  STEP8_0(in5, in3, step5, step6, cospi12_v, cospi20_v); \
                                                         \
  /* stage 2 */                                          \
  STEP8_1(step0, step2, in1, in0, cospi16_v);            \
  STEP8_0(step1, step3, in2, in3, cospi24_v, cospi8_v);  \
  in4 = vec_add(step4, step5);                           \
  in5 = vec_sub(step4, step5);                           \
  in6 = vec_sub(step7, step6);                           \
  in7 = vec_add(step6, step7);                           \
                                                         \
  /* stage 3 */                                          \
  step0 = vec_add(in0, in3);                             \
  step1 = vec_add(in1, in2);                             \
  step2 = vec_sub(in1, in2);                             \
  step3 = vec_sub(in0, in3);                             \
  step4 = in4;                                           \
  STEP8_1(in6, in5, step5, step6, cospi16_v);            \
  step7 = in7;                                           \
                                                         \
  /* stage 4 */                                          \
  in0 = vec_add(step0, step7);                           \
  in1 = vec_add(step1, step6);                           \
  in2 = vec_add(step2, step5);                           \
  in3 = vec_add(step3, step4);                           \
  in4 = vec_sub(step3, step4);                           \
  in5 = vec_sub(step2, step5);                           \
  in6 = vec_sub(step1, step6);                           \
  in7 = vec_sub(step0, step7);

#define PIXEL_ADD(in, out, add, shiftx) \
  out = vec_add(vec_sra(vec_add(in, add), shiftx), out);

void vpx_idct8_vsx(int16x8_t *in, int16x8_t *out) {
  int16x8_t step0, step1, step2, step3, step4, step5, step6, step7;
  int16x8_t tmp16_0, tmp16_1, tmp16_2, tmp16_3;
  int32x4_t temp10, temp11;
  ROUND_SHIFT_INIT;

  TRANSPOSE8x8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], out[0],
               out[1], out[2], out[3], out[4], out[5], out[6], out[7]);

  IDCT8(out[0], out[1], out[2], out[3], out[4], out[5], out[6], out[7]);
}

void vpx_round_store8x8_vsx(int16x8_t *in, uint8_t *dest, int stride) {
  uint8x16_t zerov = vec_splat_u8(0);
  uint8x16_t dest0 = vec_vsx_ld(0, dest);
  uint8x16_t dest1 = vec_vsx_ld(stride, dest);
  uint8x16_t dest2 = vec_vsx_ld(2 * stride, dest);
  uint8x16_t dest3 = vec_vsx_ld(3 * stride, dest);
  uint8x16_t dest4 = vec_vsx_ld(4 * stride, dest);
  uint8x16_t dest5 = vec_vsx_ld(5 * stride, dest);
  uint8x16_t dest6 = vec_vsx_ld(6 * stride, dest);
  uint8x16_t dest7 = vec_vsx_ld(7 * stride, dest);
  int16x8_t d_u0 = (int16x8_t)vec_mergeh(dest0, zerov);
  int16x8_t d_u1 = (int16x8_t)vec_mergeh(dest1, zerov);
  int16x8_t d_u2 = (int16x8_t)vec_mergeh(dest2, zerov);
  int16x8_t d_u3 = (int16x8_t)vec_mergeh(dest3, zerov);
  int16x8_t d_u4 = (int16x8_t)vec_mergeh(dest4, zerov);
  int16x8_t d_u5 = (int16x8_t)vec_mergeh(dest5, zerov);
  int16x8_t d_u6 = (int16x8_t)vec_mergeh(dest6, zerov);
  int16x8_t d_u7 = (int16x8_t)vec_mergeh(dest7, zerov);
  int16x8_t add = vec_sl(vec_splat_s16(8), vec_splat_u16(1));
  uint16x8_t shift5 = vec_splat_u16(5);
  uint8x16_t output0, output1, output2, output3;

  PIXEL_ADD(in[0], d_u0, add, shift5);
  PIXEL_ADD(in[1], d_u1, add, shift5);
  PIXEL_ADD(in[2], d_u2, add, shift5);
  PIXEL_ADD(in[3], d_u3, add, shift5);
  PIXEL_ADD(in[4], d_u4, add, shift5);
  PIXEL_ADD(in[5], d_u5, add, shift5);
  PIXEL_ADD(in[6], d_u6, add, shift5);
  PIXEL_ADD(in[7], d_u7, add, shift5);
  output0 = vec_packsu(d_u0, d_u1);
  output1 = vec_packsu(d_u2, d_u3);
  output2 = vec_packsu(d_u4, d_u5);
  output3 = vec_packsu(d_u6, d_u7);

  vec_vsx_st(xxpermdi(output0, dest0, 1), 0, dest);
  vec_vsx_st(xxpermdi(output0, dest1, 3), stride, dest);
  vec_vsx_st(xxpermdi(output1, dest2, 1), 2 * stride, dest);
  vec_vsx_st(xxpermdi(output1, dest3, 3), 3 * stride, dest);
  vec_vsx_st(xxpermdi(output2, dest4, 1), 4 * stride, dest);
  vec_vsx_st(xxpermdi(output2, dest5, 3), 5 * stride, dest);
  vec_vsx_st(xxpermdi(output3, dest6, 1), 6 * stride, dest);
  vec_vsx_st(xxpermdi(output3, dest7, 3), 7 * stride, dest);
}

void vpx_idct8x8_64_add_vsx(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  int16x8_t src[8], tmp[8];

  src[0] = load_tran_low(0, input);
  src[1] = load_tran_low(8 * sizeof(*input), input);
  src[2] = load_tran_low(16 * sizeof(*input), input);
  src[3] = load_tran_low(24 * sizeof(*input), input);
  src[4] = load_tran_low(32 * sizeof(*input), input);
  src[5] = load_tran_low(40 * sizeof(*input), input);
  src[6] = load_tran_low(48 * sizeof(*input), input);
  src[7] = load_tran_low(56 * sizeof(*input), input);

  vpx_idct8_vsx(src, tmp);
  vpx_idct8_vsx(tmp, src);

  vpx_round_store8x8_vsx(src, dest, stride);
}

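// Like STEP8_1 but with a single multiply pass: one vec_mule/vec_mulo pair
// gives inpt0 * cospi and inpt1 * cospi, and the sums/differences of those
// partial products form (inpt0 - inpt1) * cospi and (inpt0 + inpt1) * cospi.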
#define STEP16_1(inpt0, inpt1, outpt0, outpt1, cospi) \
  tmp16_0 = vec_mergeh(inpt0, inpt1);                 \
  tmp16_1 = vec_mergel(inpt0, inpt1);                 \
  temp10 = vec_mule(tmp16_0, cospi);                  \
  temp11 = vec_mule(tmp16_1, cospi);                  \
  temp20 = vec_mulo(tmp16_0, cospi);                  \
  temp21 = vec_mulo(tmp16_1, cospi);                  \
  temp30 = vec_sub(temp10, temp20);                   \
  temp10 = vec_add(temp10, temp20);                   \
  temp20 = vec_sub(temp11, temp21);                   \
  temp21 = vec_add(temp11, temp21);                   \
  DCT_CONST_ROUND_SHIFT(temp30);                      \
  DCT_CONST_ROUND_SHIFT(temp20);                      \
  outpt0 = vec_packs(temp30, temp20);                 \
  DCT_CONST_ROUND_SHIFT(temp10);                      \
  DCT_CONST_ROUND_SHIFT(temp21);                      \
  outpt1 = vec_packs(temp10, temp21);

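// 16-point IDCT following the seven stages of idct16_c(). The in*/out*
// registers swap roles from stage to stage so no extra temporaries are
// needed beyond tmp16_* and temp*.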
#define IDCT16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, inA, inB,    \
               inC, inD, inE, inF, out0, out1, out2, out3, out4, out5, out6,  \
               out7, out8, out9, outA, outB, outC, outD, outE, outF)          \
  /* stage 1 */                                                               \
  /* out0 = in0; */                                                           \
  out1 = in8;                                                                 \
  out2 = in4;                                                                 \
  out3 = inC;                                                                 \
  out4 = in2;                                                                 \
  out5 = inA;                                                                 \
  out6 = in6;                                                                 \
  out7 = inE;                                                                 \
  out8 = in1;                                                                 \
  out9 = in9;                                                                 \
  outA = in5;                                                                 \
  outB = inD;                                                                 \
  outC = in3;                                                                 \
  outD = inB;                                                                 \
  outE = in7;                                                                 \
  outF = inF;                                                                 \
                                                                              \
  /* stage 2 */                                                               \
  /* in0 = out0; */                                                           \
  in1 = out1;                                                                 \
  in2 = out2;                                                                 \
  in3 = out3;                                                                 \
  in4 = out4;                                                                 \
  in5 = out5;                                                                 \
  in6 = out6;                                                                 \
  in7 = out7;                                                                 \
                                                                              \
  STEP8_0(out8, outF, in8, inF, cospi30_v, cospi2_v);                         \
  STEP8_0(out9, outE, in9, inE, cospi14_v, cospi18_v);                        \
  STEP8_0(outA, outD, inA, inD, cospi22_v, cospi10_v);                        \
  STEP8_0(outB, outC, inB, inC, cospi6_v, cospi26_v);                         \
                                                                              \
  /* stage 3 */                                                               \
  out0 = in0;                                                                 \
  out1 = in1;                                                                 \
  out2 = in2;                                                                 \
  out3 = in3;                                                                 \
                                                                              \
  STEP8_0(in4, in7, out4, out7, cospi28_v, cospi4_v);                         \
  STEP8_0(in5, in6, out5, out6, cospi12_v, cospi20_v);                        \
                                                                              \
  out8 = vec_add(in8, in9);                                                   \
  out9 = vec_sub(in8, in9);                                                   \
  outA = vec_sub(inB, inA);                                                   \
  outB = vec_add(inA, inB);                                                   \
  outC = vec_add(inC, inD);                                                   \
  outD = vec_sub(inC, inD);                                                   \
  outE = vec_sub(inF, inE);                                                   \
  outF = vec_add(inE, inF);                                                   \
                                                                              \
  /* stage 4 */                                                               \
  STEP16_1(out0, out1, in1, in0, cospi16_v);                                  \
  STEP8_0(out2, out3, in2, in3, cospi24_v, cospi8_v);                         \
  in4 = vec_add(out4, out5);                                                  \
  in5 = vec_sub(out4, out5);                                                  \
  in6 = vec_sub(out7, out6);                                                  \
  in7 = vec_add(out6, out7);                                                  \
                                                                              \
  in8 = out8;                                                                 \
  inF = outF;                                                                 \
  tmp16_0 = vec_mergeh(out9, outE);                                           \
  tmp16_1 = vec_mergel(out9, outE);                                           \
  temp10 = vec_sub(vec_mulo(tmp16_0, cospi24_v), vec_mule(tmp16_0, cospi8_v)); \
  temp11 = vec_sub(vec_mulo(tmp16_1, cospi24_v), vec_mule(tmp16_1, cospi8_v)); \
  DCT_CONST_ROUND_SHIFT(temp10);                                              \
  DCT_CONST_ROUND_SHIFT(temp11);                                              \
  in9 = vec_packs(temp10, temp11);                                            \
  temp10 = vec_add(vec_mule(tmp16_0, cospi24_v), vec_mulo(tmp16_0, cospi8_v)); \
  temp11 = vec_add(vec_mule(tmp16_1, cospi24_v), vec_mulo(tmp16_1, cospi8_v)); \
  DCT_CONST_ROUND_SHIFT(temp10);                                              \
  DCT_CONST_ROUND_SHIFT(temp11);                                              \
  inE = vec_packs(temp10, temp11);                                            \
                                                                              \
  tmp16_0 = vec_mergeh(outA, outD);                                           \
  tmp16_1 = vec_mergel(outA, outD);                                           \
  temp10 =                                                                    \
      vec_sub(vec_mule(tmp16_0, cospi24m_v), vec_mulo(tmp16_0, cospi8_v));    \
  temp11 =                                                                    \
      vec_sub(vec_mule(tmp16_1, cospi24m_v), vec_mulo(tmp16_1, cospi8_v));    \
  DCT_CONST_ROUND_SHIFT(temp10);                                              \
  DCT_CONST_ROUND_SHIFT(temp11);                                              \
  inA = vec_packs(temp10, temp11);                                            \
  temp10 = vec_sub(vec_mulo(tmp16_0, cospi24_v), vec_mule(tmp16_0, cospi8_v)); \
  temp11 = vec_sub(vec_mulo(tmp16_1, cospi24_v), vec_mule(tmp16_1, cospi8_v)); \
  DCT_CONST_ROUND_SHIFT(temp10);                                              \
  DCT_CONST_ROUND_SHIFT(temp11);                                              \
  inD = vec_packs(temp10, temp11);                                            \
                                                                              \
  inB = outB;                                                                 \
  inC = outC;                                                                 \
                                                                              \
  /* stage 5 */                                                               \
  out0 = vec_add(in0, in3);                                                   \
  out1 = vec_add(in1, in2);                                                   \
  out2 = vec_sub(in1, in2);                                                   \
  out3 = vec_sub(in0, in3);                                                   \
  out4 = in4;                                                                 \
  STEP16_1(in6, in5, out5, out6, cospi16_v);                                  \
  out7 = in7;                                                                 \
                                                                              \
  out8 = vec_add(in8, inB);                                                   \
  out9 = vec_add(in9, inA);                                                   \
  outA = vec_sub(in9, inA);                                                   \
  outB = vec_sub(in8, inB);                                                   \
  outC = vec_sub(inF, inC);                                                   \
  outD = vec_sub(inE, inD);                                                   \
  outE = vec_add(inD, inE);                                                   \
  outF = vec_add(inC, inF);                                                   \
                                                                              \
  /* stage 6 */                                                               \
  in0 = vec_add(out0, out7);                                                  \
  in1 = vec_add(out1, out6);                                                  \
  in2 = vec_add(out2, out5);                                                  \
  in3 = vec_add(out3, out4);                                                  \
  in4 = vec_sub(out3, out4);                                                  \
  in5 = vec_sub(out2, out5);                                                  \
  in6 = vec_sub(out1, out6);                                                  \
  in7 = vec_sub(out0, out7);                                                  \
  in8 = out8;                                                                 \
  in9 = out9;                                                                 \
  STEP16_1(outD, outA, inA, inD, cospi16_v);                                  \
  STEP16_1(outC, outB, inB, inC, cospi16_v);                                  \
  inE = outE;                                                                 \
  inF = outF;                                                                 \
                                                                              \
  /* stage 7 */                                                               \
  out0 = vec_add(in0, inF);                                                   \
  out1 = vec_add(in1, inE);                                                   \
  out2 = vec_add(in2, inD);                                                   \
  out3 = vec_add(in3, inC);                                                   \
  out4 = vec_add(in4, inB);                                                   \
  out5 = vec_add(in5, inA);                                                   \
  out6 = vec_add(in6, in9);                                                   \
  out7 = vec_add(in7, in8);                                                   \
  out8 = vec_sub(in7, in8);                                                   \
  out9 = vec_sub(in6, in9);                                                   \
  outA = vec_sub(in5, inA);                                                   \
  outB = vec_sub(in4, inB);                                                   \
  outC = vec_sub(in3, inC);                                                   \
  outD = vec_sub(in2, inD);                                                   \
  outE = vec_sub(in1, inE);                                                   \
  outF = vec_sub(in0, inF);

#define PIXEL_ADD_STORE16(in0, in1, dst, offset) \
  d_uh = (int16x8_t)vec_mergeh(dst, zerov);      \
  d_ul = (int16x8_t)vec_mergel(dst, zerov);      \
  PIXEL_ADD(in0, d_uh, add, shift6);             \
  PIXEL_ADD(in1, d_ul, add, shift6);             \
  vec_vsx_st(vec_packsu(d_uh, d_ul), offset, dest);

static void half_idct16x8_vsx(int16x8_t *src) {
  int16x8_t tmp0[8], tmp1[8];
  int32x4_t temp10, temp11, temp20, temp21, temp30;
  int16x8_t tmp16_0, tmp16_1;
  ROUND_SHIFT_INIT;

  TRANSPOSE8x8(src[0], src[2], src[4], src[6], src[8], src[10], src[12],
               src[14], tmp0[0], tmp0[1], tmp0[2], tmp0[3], tmp0[4], tmp0[5],
               tmp0[6], tmp0[7]);
  TRANSPOSE8x8(src[1], src[3], src[5], src[7], src[9], src[11], src[13],
               src[15], tmp1[0], tmp1[1], tmp1[2], tmp1[3], tmp1[4], tmp1[5],
               tmp1[6], tmp1[7]);
  IDCT16(tmp0[0], tmp0[1], tmp0[2], tmp0[3], tmp0[4], tmp0[5], tmp0[6], tmp0[7],
         tmp1[0], tmp1[1], tmp1[2], tmp1[3], tmp1[4], tmp1[5], tmp1[6], tmp1[7],
         src[0], src[2], src[4], src[6], src[8], src[10], src[12], src[14],
         src[1], src[3], src[5], src[7], src[9], src[11], src[13], src[15]);
}

void vpx_idct16_vsx(int16x8_t *src0, int16x8_t *src1) {
  int16x8_t tmp0[8], tmp1[8], tmp2[8], tmp3[8];
  int32x4_t temp10, temp11, temp20, temp21, temp30;
  int16x8_t tmp16_0, tmp16_1;
  ROUND_SHIFT_INIT;

  TRANSPOSE8x8(src0[0], src0[2], src0[4], src0[6], src0[8], src0[10], src0[12],
               src0[14], tmp0[0], tmp0[1], tmp0[2], tmp0[3], tmp0[4], tmp0[5],
               tmp0[6], tmp0[7]);
  TRANSPOSE8x8(src0[1], src0[3], src0[5], src0[7], src0[9], src0[11], src0[13],
               src0[15], tmp1[0], tmp1[1], tmp1[2], tmp1[3], tmp1[4], tmp1[5],
               tmp1[6], tmp1[7]);
  TRANSPOSE8x8(src1[0], src1[2], src1[4], src1[6], src1[8], src1[10], src1[12],
               src1[14], tmp2[0], tmp2[1], tmp2[2], tmp2[3], tmp2[4], tmp2[5],
               tmp2[6], tmp2[7]);
  TRANSPOSE8x8(src1[1], src1[3], src1[5], src1[7], src1[9], src1[11], src1[13],
               src1[15], tmp3[0], tmp3[1], tmp3[2], tmp3[3], tmp3[4], tmp3[5],
               tmp3[6], tmp3[7]);

  IDCT16(tmp0[0], tmp0[1], tmp0[2], tmp0[3], tmp0[4], tmp0[5], tmp0[6], tmp0[7],
         tmp1[0], tmp1[1], tmp1[2], tmp1[3], tmp1[4], tmp1[5], tmp1[6], tmp1[7],
         src0[0], src0[2], src0[4], src0[6], src0[8], src0[10], src0[12],
         src0[14], src1[0], src1[2], src1[4], src1[6], src1[8], src1[10],
         src1[12], src1[14]);

  IDCT16(tmp2[0], tmp2[1], tmp2[2], tmp2[3], tmp2[4], tmp2[5], tmp2[6], tmp2[7],
         tmp3[0], tmp3[1], tmp3[2], tmp3[3], tmp3[4], tmp3[5], tmp3[6], tmp3[7],
         src0[1], src0[3], src0[5], src0[7], src0[9], src0[11], src0[13],
         src0[15], src1[1], src1[3], src1[5], src1[7], src1[9], src1[11],
         src1[13], src1[15]);
}

void vpx_round_store16x16_vsx(int16x8_t *src0, int16x8_t *src1, uint8_t *dest,
                              int stride) {
  uint8x16_t destv[16];
  int16x8_t d_uh, d_ul;
  uint8x16_t zerov = vec_splat_u8(0);
  uint16x8_t shift6 = vec_splat_u16(6);
  int16x8_t add = vec_sl(vec_splat_s16(8), vec_splat_u16(2));

  // load dest
  LOAD_INPUT16(vec_vsx_ld, dest, 0, stride, destv);

  PIXEL_ADD_STORE16(src0[0], src0[1], destv[0], 0);
  PIXEL_ADD_STORE16(src0[2], src0[3], destv[1], stride);
  PIXEL_ADD_STORE16(src0[4], src0[5], destv[2], 2 * stride);
  PIXEL_ADD_STORE16(src0[6], src0[7], destv[3], 3 * stride);
  PIXEL_ADD_STORE16(src0[8], src0[9], destv[4], 4 * stride);
  PIXEL_ADD_STORE16(src0[10], src0[11], destv[5], 5 * stride);
  PIXEL_ADD_STORE16(src0[12], src0[13], destv[6], 6 * stride);
  PIXEL_ADD_STORE16(src0[14], src0[15], destv[7], 7 * stride);

  PIXEL_ADD_STORE16(src1[0], src1[1], destv[8], 8 * stride);
  PIXEL_ADD_STORE16(src1[2], src1[3], destv[9], 9 * stride);
  PIXEL_ADD_STORE16(src1[4], src1[5], destv[10], 10 * stride);
  PIXEL_ADD_STORE16(src1[6], src1[7], destv[11], 11 * stride);
  PIXEL_ADD_STORE16(src1[8], src1[9], destv[12], 12 * stride);
  PIXEL_ADD_STORE16(src1[10], src1[11], destv[13], 13 * stride);
  PIXEL_ADD_STORE16(src1[12], src1[13], destv[14], 14 * stride);
  PIXEL_ADD_STORE16(src1[14], src1[15], destv[15], 15 * stride);
}

void vpx_idct16x16_256_add_vsx(const tran_low_t *input, uint8_t *dest,
                               int stride) {
  int16x8_t src0[16], src1[16];
  int16x8_t tmp0[8], tmp1[8], tmp2[8], tmp3[8];
  int32x4_t temp10, temp11, temp20, temp21, temp30;
  int16x8_t tmp16_0, tmp16_1;
  ROUND_SHIFT_INIT;

  LOAD_INPUT16(load_tran_low, input, 0, 8 * sizeof(*input), src0);
  LOAD_INPUT16(load_tran_low, input, 8 * 8 * 2 * sizeof(*input),
               8 * sizeof(*input), src1);

  // transform rows
  // transform the upper half of 16x16 matrix
  half_idct16x8_vsx(src0);
  TRANSPOSE8x8(src0[0], src0[2], src0[4], src0[6], src0[8], src0[10], src0[12],
               src0[14], tmp0[0], tmp0[1], tmp0[2], tmp0[3], tmp0[4], tmp0[5],
               tmp0[6], tmp0[7]);
  TRANSPOSE8x8(src0[1], src0[3], src0[5], src0[7], src0[9], src0[11], src0[13],
               src0[15], tmp1[0], tmp1[1], tmp1[2], tmp1[3], tmp1[4], tmp1[5],
               tmp1[6], tmp1[7]);

  // transform the lower half of 16x16 matrix
  half_idct16x8_vsx(src1);
  TRANSPOSE8x8(src1[0], src1[2], src1[4], src1[6], src1[8], src1[10], src1[12],
               src1[14], tmp2[0], tmp2[1], tmp2[2], tmp2[3], tmp2[4], tmp2[5],
               tmp2[6], tmp2[7]);
  TRANSPOSE8x8(src1[1], src1[3], src1[5], src1[7], src1[9], src1[11], src1[13],
               src1[15], tmp3[0], tmp3[1], tmp3[2], tmp3[3], tmp3[4], tmp3[5],
               tmp3[6], tmp3[7]);

  // transform columns
  // left half first
  IDCT16(tmp0[0], tmp0[1], tmp0[2], tmp0[3], tmp0[4], tmp0[5], tmp0[6], tmp0[7],
         tmp2[0], tmp2[1], tmp2[2], tmp2[3], tmp2[4], tmp2[5], tmp2[6], tmp2[7],
         src0[0], src0[2], src0[4], src0[6], src0[8], src0[10], src0[12],
         src0[14], src1[0], src1[2], src1[4], src1[6], src1[8], src1[10],
         src1[12], src1[14]);
  // right half
  IDCT16(tmp1[0], tmp1[1], tmp1[2], tmp1[3], tmp1[4], tmp1[5], tmp1[6], tmp1[7],
         tmp3[0], tmp3[1], tmp3[2], tmp3[3], tmp3[4], tmp3[5], tmp3[6], tmp3[7],
         src0[1], src0[3], src0[5], src0[7], src0[9], src0[11], src0[13],
         src0[15], src1[1], src1[3], src1[5], src1[7], src1[9], src1[11],
         src1[13], src1[15]);

  vpx_round_store16x16_vsx(src0, src1, dest, stride);
}

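// LOAD_8x32 below pulls in an 8x32 block of coefficients: eight rows of 32,
// four int16x8_t registers per row. `offset` addresses the first row in
// load_tran_low's byte-offset units.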
#define LOAD_8x32(load, in00, in01, in02, in03, in10, in11, in12, in13, in20, \
                  in21, in22, in23, in30, in31, in32, in33, in40, in41, in42, \
                  in43, in50, in51, in52, in53, in60, in61, in62, in63, in70, \
                  in71, in72, in73, offset)                                   \
  /* load the first row of the 8x32 block */                                  \
  in00 = load(offset, input);                                                 \
  in01 = load(offset + 16, input);                                            \
  in02 = load(offset + 2 * 16, input);                                        \
  in03 = load(offset + 3 * 16, input);                                        \
                                                                              \
  in10 = load(offset + 4 * 16, input);                                        \
  in11 = load(offset + 5 * 16, input);                                        \
  in12 = load(offset + 6 * 16, input);                                        \
  in13 = load(offset + 7 * 16, input);                                        \
                                                                              \
  in20 = load(offset + 8 * 16, input);                                        \
  in21 = load(offset + 9 * 16, input);                                        \
  in22 = load(offset + 10 * 16, input);                                       \
  in23 = load(offset + 11 * 16, input);                                       \
                                                                              \
  in30 = load(offset + 12 * 16, input);                                       \
  in31 = load(offset + 13 * 16, input);                                       \
  in32 = load(offset + 14 * 16, input);                                       \
  in33 = load(offset + 15 * 16, input);                                       \
                                                                              \
  in40 = load(offset + 16 * 16, input);                                       \
  in41 = load(offset + 17 * 16, input);                                       \
  in42 = load(offset + 18 * 16, input);                                       \
  in43 = load(offset + 19 * 16, input);                                       \
                                                                              \
  in50 = load(offset + 20 * 16, input);                                       \
  in51 = load(offset + 21 * 16, input);                                       \
  in52 = load(offset + 22 * 16, input);                                       \
  in53 = load(offset + 23 * 16, input);                                       \
                                                                              \
  in60 = load(offset + 24 * 16, input);                                       \
  in61 = load(offset + 25 * 16, input);                                       \
  in62 = load(offset + 26 * 16, input);                                       \
  in63 = load(offset + 27 * 16, input);                                       \
                                                                              \
  /* load the last row of the 8x32 block */                                   \
  in70 = load(offset + 28 * 16, input);                                       \
  in71 = load(offset + 29 * 16, input);                                       \
  in72 = load(offset + 30 * 16, input);                                       \
  in73 = load(offset + 31 * 16, input);

/* Computes temp1 = -step[x] * cospi_q + step[y] * cospi_z and
 * temp2 = step[x] * cospi_z + step[y] * cospi_q. */
#define STEP32(inpt0, inpt1, outpt0, outpt1, cospi0, cospi1)              \
  tmp16_0 = vec_mergeh(inpt0, inpt1);                                     \
  tmp16_1 = vec_mergel(inpt0, inpt1);                                     \
  temp10 = vec_sub(vec_mulo(tmp16_0, cospi1), vec_mule(tmp16_0, cospi0)); \
  temp11 = vec_sub(vec_mulo(tmp16_1, cospi1), vec_mule(tmp16_1, cospi0)); \
  DCT_CONST_ROUND_SHIFT(temp10);                                          \
  DCT_CONST_ROUND_SHIFT(temp11);                                          \
  outpt0 = vec_packs(temp10, temp11);                                     \
  temp10 = vec_add(vec_mule(tmp16_0, cospi1), vec_mulo(tmp16_0, cospi0)); \
  temp11 = vec_add(vec_mule(tmp16_1, cospi1), vec_mulo(tmp16_1, cospi0)); \
  DCT_CONST_ROUND_SHIFT(temp10);                                          \
  DCT_CONST_ROUND_SHIFT(temp11);                                          \
  outpt1 = vec_packs(temp10, temp11);

/* Computes temp1 = -step[x] * cospi_q - step[y] * cospi_z and
 * temp2 = -step[x] * cospi_z + step[y] * cospi_q. */
#define STEP32_1(inpt0, inpt1, outpt0, outpt1, cospi0, cospi1, cospi1m)    \
  tmp16_0 = vec_mergeh(inpt0, inpt1);                                      \
  tmp16_1 = vec_mergel(inpt0, inpt1);                                      \
  temp10 = vec_sub(vec_mulo(tmp16_0, cospi1m), vec_mule(tmp16_0, cospi0)); \
  temp11 = vec_sub(vec_mulo(tmp16_1, cospi1m), vec_mule(tmp16_1, cospi0)); \
  DCT_CONST_ROUND_SHIFT(temp10);                                           \
  DCT_CONST_ROUND_SHIFT(temp11);                                           \
  outpt0 = vec_packs(temp10, temp11);                                      \
  temp10 = vec_sub(vec_mulo(tmp16_0, cospi0), vec_mule(tmp16_0, cospi1));  \
  temp11 = vec_sub(vec_mulo(tmp16_1, cospi0), vec_mule(tmp16_1, cospi1));  \
  DCT_CONST_ROUND_SHIFT(temp10);                                           \
  DCT_CONST_ROUND_SHIFT(temp11);                                           \
  outpt1 = vec_packs(temp10, temp11);

#define IDCT32(in0, in1, in2, in3, out)                                 \
                                                                        \
  /* stage 1 */                                                         \
  /* out[0][0] = in[0][0]; */                                           \
  out[0][1] = in2[0];                                                   \
  out[0][2] = in1[0];                                                   \
  out[0][3] = in3[0];                                                   \
  out[0][4] = in0[4];                                                   \
  out[0][5] = in2[4];                                                   \
  out[0][6] = in1[4];                                                   \
  out[0][7] = in3[4];                                                   \
  out[1][0] = in0[2];                                                   \
  out[1][1] = in2[2];                                                   \
  out[1][2] = in1[2];                                                   \
  out[1][3] = in3[2];                                                   \
  out[1][4] = in0[6];                                                   \
  out[1][5] = in2[6];                                                   \
  out[1][6] = in1[6];                                                   \
  out[1][7] = in3[6];                                                   \
                                                                        \
  STEP8_0(in0[1], in3[7], out[2][0], out[3][7], cospi31_v, cospi1_v);   \
  STEP8_0(in2[1], in1[7], out[2][1], out[3][6], cospi15_v, cospi17_v);  \
  STEP8_0(in1[1], in2[7], out[2][2], out[3][5], cospi23_v, cospi9_v);   \
  STEP8_0(in3[1], in0[7], out[2][3], out[3][4], cospi7_v, cospi25_v);   \
  STEP8_0(in0[5], in3[3], out[2][4], out[3][3], cospi27_v, cospi5_v);   \
  STEP8_0(in2[5], in1[3], out[2][5], out[3][2], cospi11_v, cospi21_v);  \
  STEP8_0(in1[5], in2[3], out[2][6], out[3][1], cospi19_v, cospi13_v);  \
  STEP8_0(in3[5], in0[3], out[2][7], out[3][0], cospi3_v, cospi29_v);   \
                                                                        \
  /* stage 2 */                                                         \
  /* in0[0] = out[0][0]; */                                             \
  in0[1] = out[0][1];                                                   \
  in0[2] = out[0][2];                                                   \
  in0[3] = out[0][3];                                                   \
  in0[4] = out[0][4];                                                   \
  in0[5] = out[0][5];                                                   \
  in0[6] = out[0][6];                                                   \
  in0[7] = out[0][7];                                                   \
                                                                        \
  STEP8_0(out[1][0], out[1][7], in1[0], in1[7], cospi30_v, cospi2_v);   \
  STEP8_0(out[1][1], out[1][6], in1[1], in1[6], cospi14_v, cospi18_v);  \
  STEP8_0(out[1][2], out[1][5], in1[2], in1[5], cospi22_v, cospi10_v);  \
  STEP8_0(out[1][3], out[1][4], in1[3], in1[4], cospi6_v, cospi26_v);   \
                                                                        \
  in2[0] = vec_add(out[2][0], out[2][1]);                               \
  in2[1] = vec_sub(out[2][0], out[2][1]);                               \
  in2[2] = vec_sub(out[2][3], out[2][2]);                               \
  in2[3] = vec_add(out[2][3], out[2][2]);                               \
  in2[4] = vec_add(out[2][4], out[2][5]);                               \
  in2[5] = vec_sub(out[2][4], out[2][5]);                               \
  in2[6] = vec_sub(out[2][7], out[2][6]);                               \
  in2[7] = vec_add(out[2][7], out[2][6]);                               \
  in3[0] = vec_add(out[3][0], out[3][1]);                               \
  in3[1] = vec_sub(out[3][0], out[3][1]);                               \
  in3[2] = vec_sub(out[3][3], out[3][2]);                               \
  in3[3] = vec_add(out[3][3], out[3][2]);                               \
  in3[4] = vec_add(out[3][4], out[3][5]);                               \
  in3[5] = vec_sub(out[3][4], out[3][5]);                               \
  in3[6] = vec_sub(out[3][7], out[3][6]);                               \
  in3[7] = vec_add(out[3][6], out[3][7]);                               \
                                                                        \
  /* stage 3 */                                                         \
  out[0][0] = in0[0];                                                   \
  out[0][1] = in0[1];                                                   \
  out[0][2] = in0[2];                                                   \
  out[0][3] = in0[3];                                                   \
                                                                        \
  STEP8_0(in0[4], in0[7], out[0][4], out[0][7], cospi28_v, cospi4_v);   \
  STEP8_0(in0[5], in0[6], out[0][5], out[0][6], cospi12_v, cospi20_v);  \
                                                                        \
  out[1][0] = vec_add(in1[0], in1[1]);                                  \
  out[1][1] = vec_sub(in1[0], in1[1]);                                  \
  out[1][2] = vec_sub(in1[3], in1[2]);                                  \
  out[1][3] = vec_add(in1[2], in1[3]);                                  \
  out[1][4] = vec_add(in1[4], in1[5]);                                  \
  out[1][5] = vec_sub(in1[4], in1[5]);                                  \
  out[1][6] = vec_sub(in1[7], in1[6]);                                  \
  out[1][7] = vec_add(in1[6], in1[7]);                                  \
                                                                        \
  out[2][0] = in2[0];                                                   \
  out[3][7] = in3[7];                                                   \
  STEP32(in2[1], in3[6], out[2][1], out[3][6], cospi4_v, cospi28_v);    \
  STEP32_1(in2[2], in3[5], out[2][2], out[3][5], cospi28_v, cospi4_v,   \
           cospi4m_v);                                                  \
  out[2][3] = in2[3];                                                   \
  out[2][4] = in2[4];                                                   \
  STEP32(in2[5], in3[2], out[2][5], out[3][2], cospi20_v, cospi12_v);   \
  STEP32_1(in2[6], in3[1], out[2][6], out[3][1], cospi12_v, cospi20_v,  \
           cospi20m_v);                                                 \
  out[2][7] = in2[7];                                                   \
  out[3][0] = in3[0];                                                   \
  out[3][3] = in3[3];                                                   \
  out[3][4] = in3[4];                                                   \
                                                                        \
  /* stage 4 */                                                         \
  STEP16_1(out[0][0], out[0][1], in0[1], in0[0], cospi16_v);            \
  STEP8_0(out[0][2], out[0][3], in0[2], in0[3], cospi24_v, cospi8_v);   \
  in0[4] = vec_add(out[0][4], out[0][5]);                               \
  in0[5] = vec_sub(out[0][4], out[0][5]);                               \
  in0[6] = vec_sub(out[0][7], out[0][6]);                               \
  in0[7] = vec_add(out[0][7], out[0][6]);                               \
                                                                        \
  in1[0] = out[1][0];                                                   \
  in1[7] = out[1][7];                                                   \
  STEP32(out[1][1], out[1][6], in1[1], in1[6], cospi8_v, cospi24_v);    \
  STEP32_1(out[1][2], out[1][5], in1[2], in1[5], cospi24_v, cospi8_v,   \
           cospi8m_v);                                                  \
  in1[3] = out[1][3];                                                   \
  in1[4] = out[1][4];                                                   \
                                                                        \
  in2[0] = vec_add(out[2][0], out[2][3]);                               \
  in2[1] = vec_add(out[2][1], out[2][2]);                               \
  in2[2] = vec_sub(out[2][1], out[2][2]);                               \
  in2[3] = vec_sub(out[2][0], out[2][3]);                               \
  in2[4] = vec_sub(out[2][7], out[2][4]);                               \
  in2[5] = vec_sub(out[2][6], out[2][5]);                               \
  in2[6] = vec_add(out[2][5], out[2][6]);                               \
  in2[7] = vec_add(out[2][4], out[2][7]);                               \
                                                                        \
  in3[0] = vec_add(out[3][0], out[3][3]);                               \
  in3[1] = vec_add(out[3][1], out[3][2]);                               \
  in3[2] = vec_sub(out[3][1], out[3][2]);                               \
  in3[3] = vec_sub(out[3][0], out[3][3]);                               \
  in3[4] = vec_sub(out[3][7], out[3][4]);                               \
  in3[5] = vec_sub(out[3][6], out[3][5]);                               \
  in3[6] = vec_add(out[3][5], out[3][6]);                               \
  in3[7] = vec_add(out[3][4], out[3][7]);                               \
                                                                        \
  /* stage 5 */                                                         \
  out[0][0] = vec_add(in0[0], in0[3]);                                  \
  out[0][1] = vec_add(in0[1], in0[2]);                                  \
  out[0][2] = vec_sub(in0[1], in0[2]);                                  \
  out[0][3] = vec_sub(in0[0], in0[3]);                                  \
  out[0][4] = in0[4];                                                   \
  STEP16_1(in0[6], in0[5], out[0][5], out[0][6], cospi16_v);            \
  out[0][7] = in0[7];                                                   \
                                                                        \
  out[1][0] = vec_add(in1[0], in1[3]);                                  \
  out[1][1] = vec_add(in1[1], in1[2]);                                  \
  out[1][2] = vec_sub(in1[1], in1[2]);                                  \
  out[1][3] = vec_sub(in1[0], in1[3]);                                  \
  out[1][4] = vec_sub(in1[7], in1[4]);                                  \
  out[1][5] = vec_sub(in1[6], in1[5]);                                  \
  out[1][6] = vec_add(in1[5], in1[6]);                                  \
  out[1][7] = vec_add(in1[4], in1[7]);                                  \
                                                                        \
  out[2][0] = in2[0];                                                   \
  out[2][1] = in2[1];                                                   \
  STEP32(in2[2], in3[5], out[2][2], out[3][5], cospi8_v, cospi24_v);    \
  STEP32(in2[3], in3[4], out[2][3], out[3][4], cospi8_v, cospi24_v);    \
  STEP32_1(in2[4], in3[3], out[2][4], out[3][3], cospi24_v, cospi8_v,   \
           cospi8m_v);                                                  \
  STEP32_1(in2[5], in3[2], out[2][5], out[3][2], cospi24_v, cospi8_v,   \
           cospi8m_v);                                                  \
  out[2][6] = in2[6];                                                   \
  out[2][7] = in2[7];                                                   \
  out[3][0] = in3[0];                                                   \
  out[3][1] = in3[1];                                                   \
  out[3][6] = in3[6];                                                   \
  out[3][7] = in3[7];                                                   \
                                                                        \
  /* stage 6 */                                                         \
  in0[0] = vec_add(out[0][0], out[0][7]);                               \
  in0[1] = vec_add(out[0][1], out[0][6]);                               \
  in0[2] = vec_add(out[0][2], out[0][5]);                               \
  in0[3] = vec_add(out[0][3], out[0][4]);                               \
  in0[4] = vec_sub(out[0][3], out[0][4]);                               \
  in0[5] = vec_sub(out[0][2], out[0][5]);                               \
  in0[6] = vec_sub(out[0][1], out[0][6]);                               \
  in0[7] = vec_sub(out[0][0], out[0][7]);                               \
  in1[0] = out[1][0];                                                   \
  in1[1] = out[1][1];                                                   \
  STEP16_1(out[1][5], out[1][2], in1[2], in1[5], cospi16_v);            \
  STEP16_1(out[1][4], out[1][3], in1[3], in1[4], cospi16_v);            \
  in1[6] = out[1][6];                                                   \
  in1[7] = out[1][7];                                                   \
                                                                        \
  in2[0] = vec_add(out[2][0], out[2][7]);                               \
  in2[1] = vec_add(out[2][1], out[2][6]);                               \
  in2[2] = vec_add(out[2][2], out[2][5]);                               \
  in2[3] = vec_add(out[2][3], out[2][4]);                               \
  in2[4] = vec_sub(out[2][3], out[2][4]);                               \
  in2[5] = vec_sub(out[2][2], out[2][5]);                               \
  in2[6] = vec_sub(out[2][1], out[2][6]);                               \
  in2[7] = vec_sub(out[2][0], out[2][7]);                               \
                                                                        \
  in3[0] = vec_sub(out[3][7], out[3][0]);                               \
  in3[1] = vec_sub(out[3][6], out[3][1]);                               \
  in3[2] = vec_sub(out[3][5], out[3][2]);                               \
  in3[3] = vec_sub(out[3][4], out[3][3]);                               \
  in3[4] = vec_add(out[3][4], out[3][3]);                               \
  in3[5] = vec_add(out[3][5], out[3][2]);                               \
  in3[6] = vec_add(out[3][6], out[3][1]);                               \
  in3[7] = vec_add(out[3][7], out[3][0]);                               \
                                                                        \
  /* stage 7 */                                                         \
  out[0][0] = vec_add(in0[0], in1[7]);                                  \
  out[0][1] = vec_add(in0[1], in1[6]);                                  \
  out[0][2] = vec_add(in0[2], in1[5]);                                  \
  out[0][3] = vec_add(in0[3], in1[4]);                                  \
  out[0][4] = vec_add(in0[4], in1[3]);                                  \
  out[0][5] = vec_add(in0[5], in1[2]);                                  \
  out[0][6] = vec_add(in0[6], in1[1]);                                  \
  out[0][7] = vec_add(in0[7], in1[0]);                                  \
  out[1][0] = vec_sub(in0[7], in1[0]);                                  \
  out[1][1] = vec_sub(in0[6], in1[1]);                                  \
  out[1][2] = vec_sub(in0[5], in1[2]);                                  \
  out[1][3] = vec_sub(in0[4], in1[3]);                                  \
  out[1][4] = vec_sub(in0[3], in1[4]);                                  \
  out[1][5] = vec_sub(in0[2], in1[5]);                                  \
  out[1][6] = vec_sub(in0[1], in1[6]);                                  \
  out[1][7] = vec_sub(in0[0], in1[7]);                                  \
                                                                        \
  out[2][0] = in2[0];                                                   \
  out[2][1] = in2[1];                                                   \
  out[2][2] = in2[2];                                                   \
  out[2][3] = in2[3];                                                   \
  STEP16_1(in3[3], in2[4], out[2][4], out[3][3], cospi16_v);            \
  STEP16_1(in3[2], in2[5], out[2][5], out[3][2], cospi16_v);            \
  STEP16_1(in3[1], in2[6], out[2][6], out[3][1], cospi16_v);            \
  STEP16_1(in3[0], in2[7], out[2][7], out[3][0], cospi16_v);            \
  out[3][4] = in3[4];                                                   \
  out[3][5] = in3[5];                                                   \
  out[3][6] = in3[6];                                                   \
  out[3][7] = in3[7];                                                   \
                                                                        \
  /* final */                                                           \
  in0[0] = vec_add(out[0][0], out[3][7]);                               \
  in0[1] = vec_add(out[0][1], out[3][6]);                               \
  in0[2] = vec_add(out[0][2], out[3][5]);                               \
  in0[3] = vec_add(out[0][3], out[3][4]);                               \
  in0[4] = vec_add(out[0][4], out[3][3]);                               \
  in0[5] = vec_add(out[0][5], out[3][2]);                               \
  in0[6] = vec_add(out[0][6], out[3][1]);                               \
  in0[7] = vec_add(out[0][7], out[3][0]);                               \
  in1[0] = vec_add(out[1][0], out[2][7]);                               \
  in1[1] = vec_add(out[1][1], out[2][6]);                               \
  in1[2] = vec_add(out[1][2], out[2][5]);                               \
  in1[3] = vec_add(out[1][3], out[2][4]);                               \
  in1[4] = vec_add(out[1][4], out[2][3]);                               \
  in1[5] = vec_add(out[1][5], out[2][2]);                               \
  in1[6] = vec_add(out[1][6], out[2][1]);                               \
  in1[7] = vec_add(out[1][7], out[2][0]);                               \
  in2[0] = vec_sub(out[1][7], out[2][0]);                               \
  in2[1] = vec_sub(out[1][6], out[2][1]);                               \
  in2[2] = vec_sub(out[1][5], out[2][2]);                               \
  in2[3] = vec_sub(out[1][4], out[2][3]);                               \
  in2[4] = vec_sub(out[1][3], out[2][4]);                               \
  in2[5] = vec_sub(out[1][2], out[2][5]);                               \
  in2[6] = vec_sub(out[1][1], out[2][6]);                               \
  in2[7] = vec_sub(out[1][0], out[2][7]);                               \
  in3[0] = vec_sub(out[0][7], out[3][0]);                               \
  in3[1] = vec_sub(out[0][6], out[3][1]);                               \
  in3[2] = vec_sub(out[0][5], out[3][2]);                               \
  in3[3] = vec_sub(out[0][4], out[3][3]);                               \
  in3[4] = vec_sub(out[0][3], out[3][4]);                               \
  in3[5] = vec_sub(out[0][2], out[3][5]);                               \
  in3[6] = vec_sub(out[0][1], out[3][6]);                               \
  in3[7] = vec_sub(out[0][0], out[3][7]);

// NOT A FULL TRANSPOSE! Transposes just each 8x8 block in each row;
// does not transpose rows.
#define TRANSPOSE_8x32(in, out)                                               \
  /* transpose 4 of 8x8 blocks */                                             \
  TRANSPOSE8x8(in[0][0], in[0][1], in[0][2], in[0][3], in[0][4], in[0][5],    \
               in[0][6], in[0][7], out[0][0], out[0][1], out[0][2], out[0][3], \
               out[0][4], out[0][5], out[0][6], out[0][7]);                   \
  TRANSPOSE8x8(in[1][0], in[1][1], in[1][2], in[1][3], in[1][4], in[1][5],    \
               in[1][6], in[1][7], out[1][0], out[1][1], out[1][2], out[1][3], \
               out[1][4], out[1][5], out[1][6], out[1][7]);                   \
  TRANSPOSE8x8(in[2][0], in[2][1], in[2][2], in[2][3], in[2][4], in[2][5],    \
               in[2][6], in[2][7], out[2][0], out[2][1], out[2][2], out[2][3], \
               out[2][4], out[2][5], out[2][6], out[2][7]);                   \
  TRANSPOSE8x8(in[3][0], in[3][1], in[3][2], in[3][3], in[3][4], in[3][5],    \
               in[3][6], in[3][7], out[3][0], out[3][1], out[3][2], out[3][3], \
               out[3][4], out[3][5], out[3][6], out[3][7]);

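// Adds the rounded residual to one 32-pixel row of dest; the row is handled
// as two 16-byte load/store pairs at (step) * stride and (step) * stride + 16.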
#define PIXEL_ADD_STORE32(in0, in1, in2, in3, step)        \
  dst = vec_vsx_ld((step)*stride, dest);                   \
  d_uh = (int16x8_t)vec_mergeh(dst, zerov);                \
  d_ul = (int16x8_t)vec_mergel(dst, zerov);                \
  PIXEL_ADD(in0, d_uh, add, shift6);                       \
  PIXEL_ADD(in1, d_ul, add, shift6);                       \
  vec_vsx_st(vec_packsu(d_uh, d_ul), (step)*stride, dest); \
  dst = vec_vsx_ld((step)*stride + 16, dest);              \
  d_uh = (int16x8_t)vec_mergeh(dst, zerov);                \
  d_ul = (int16x8_t)vec_mergel(dst, zerov);                \
  PIXEL_ADD(in2, d_uh, add, shift6);                       \
  PIXEL_ADD(in3, d_ul, add, shift6);                       \
  vec_vsx_st(vec_packsu(d_uh, d_ul), (step)*stride + 16, dest);

#define ADD_STORE_BLOCK(in, offset)                                     \
  PIXEL_ADD_STORE32(in[0][0], in[1][0], in[2][0], in[3][0], offset + 0); \
  PIXEL_ADD_STORE32(in[0][1], in[1][1], in[2][1], in[3][1], offset + 1); \
  PIXEL_ADD_STORE32(in[0][2], in[1][2], in[2][2], in[3][2], offset + 2); \
  PIXEL_ADD_STORE32(in[0][3], in[1][3], in[2][3], in[3][3], offset + 3); \
  PIXEL_ADD_STORE32(in[0][4], in[1][4], in[2][4], in[3][4], offset + 4); \
  PIXEL_ADD_STORE32(in[0][5], in[1][5], in[2][5], in[3][5], offset + 5); \
  PIXEL_ADD_STORE32(in[0][6], in[1][6], in[2][6], in[3][6], offset + 6); \
  PIXEL_ADD_STORE32(in[0][7], in[1][7], in[2][7], in[3][7], offset + 7);

void vpx_idct32x32_1024_add_vsx(const tran_low_t *input, uint8_t *dest,
                                int stride) {
  int16x8_t src0[4][8], src1[4][8], src2[4][8], src3[4][8], tmp[4][8];
  int16x8_t tmp16_0, tmp16_1;
  int32x4_t temp10, temp11, temp20, temp21, temp30;
  uint8x16_t dst;
  int16x8_t d_uh, d_ul;
  int16x8_t add = vec_sl(vec_splat_s16(8), vec_splat_u16(2));
  uint16x8_t shift6 = vec_splat_u16(6);
  uint8x16_t zerov = vec_splat_u8(0);

  ROUND_SHIFT_INIT;

  LOAD_8x32(load_tran_low, src0[0][0], src0[1][0], src0[2][0], src0[3][0],
            src0[0][1], src0[1][1], src0[2][1], src0[3][1], src0[0][2],
            src0[1][2], src0[2][2], src0[3][2], src0[0][3], src0[1][3],
            src0[2][3], src0[3][3], src0[0][4], src0[1][4], src0[2][4],
            src0[3][4], src0[0][5], src0[1][5], src0[2][5], src0[3][5],
            src0[0][6], src0[1][6], src0[2][6], src0[3][6], src0[0][7],
            src0[1][7], src0[2][7], src0[3][7], 0);
  // Rows
  // transpose the first row of 8x8 blocks
  TRANSPOSE_8x32(src0, tmp);
  // transform the 32x8 column
  IDCT32(tmp[0], tmp[1], tmp[2], tmp[3], src0);
  TRANSPOSE_8x32(tmp, src0);

  LOAD_8x32(load_tran_low, src1[0][0], src1[1][0], src1[2][0], src1[3][0],
            src1[0][1], src1[1][1], src1[2][1], src1[3][1], src1[0][2],
            src1[1][2], src1[2][2], src1[3][2], src1[0][3], src1[1][3],
            src1[2][3], src1[3][3], src1[0][4], src1[1][4], src1[2][4],
            src1[3][4], src1[0][5], src1[1][5], src1[2][5], src1[3][5],
            src1[0][6], src1[1][6], src1[2][6], src1[3][6], src1[0][7],
            src1[1][7], src1[2][7], src1[3][7], 512);
  TRANSPOSE_8x32(src1, tmp);
  IDCT32(tmp[0], tmp[1], tmp[2], tmp[3], src1);
  TRANSPOSE_8x32(tmp, src1);

  LOAD_8x32(load_tran_low, src2[0][0], src2[1][0], src2[2][0], src2[3][0],
            src2[0][1], src2[1][1], src2[2][1], src2[3][1], src2[0][2],
            src2[1][2], src2[2][2], src2[3][2], src2[0][3], src2[1][3],
            src2[2][3], src2[3][3], src2[0][4], src2[1][4], src2[2][4],
            src2[3][4], src2[0][5], src2[1][5], src2[2][5], src2[3][5],
            src2[0][6], src2[1][6], src2[2][6], src2[3][6], src2[0][7],
            src2[1][7], src2[2][7], src2[3][7], 1024);
  TRANSPOSE_8x32(src2, tmp);
  IDCT32(tmp[0], tmp[1], tmp[2], tmp[3], src2);
  TRANSPOSE_8x32(tmp, src2);

  LOAD_8x32(load_tran_low, src3[0][0], src3[1][0], src3[2][0], src3[3][0],
            src3[0][1], src3[1][1], src3[2][1], src3[3][1], src3[0][2],
            src3[1][2], src3[2][2], src3[3][2], src3[0][3], src3[1][3],
            src3[2][3], src3[3][3], src3[0][4], src3[1][4], src3[2][4],
            src3[3][4], src3[0][5], src3[1][5], src3[2][5], src3[3][5],
            src3[0][6], src3[1][6], src3[2][6], src3[3][6], src3[0][7],
            src3[1][7], src3[2][7], src3[3][7], 1536);
  TRANSPOSE_8x32(src3, tmp);
  IDCT32(tmp[0], tmp[1], tmp[2], tmp[3], src3);
  TRANSPOSE_8x32(tmp, src3);

  // Columns
  IDCT32(src0[0], src1[0], src2[0], src3[0], tmp);
  IDCT32(src0[1], src1[1], src2[1], src3[1], tmp);
  IDCT32(src0[2], src1[2], src2[2], src3[2], tmp);
  IDCT32(src0[3], src1[3], src2[3], src3[3], tmp);

  ADD_STORE_BLOCK(src0, 0);
  ADD_STORE_BLOCK(src1, 8);
  ADD_STORE_BLOCK(src2, 16);
  ADD_STORE_BLOCK(src3, 24);
}

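// Inverse 4x4 Walsh-Hadamard helpers for vpx_iwht4x4_16_add(). TRANSFORM_COLS
// is the butterfly from iwht4x4_16_add_c() on 32-bit lanes:
//   a1 += c1; d1 -= b1; e1 = (a1 - d1) >> 1;
//   b1 = e1 - b1; c1 = e1 - c1; a1 -= b1; d1 += c1;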
#define TRANSFORM_COLS           \
  v32_a = vec_add(v32_a, v32_c); \
  v32_d = vec_sub(v32_d, v32_b); \
  v32_e = vec_sub(v32_a, v32_d); \
  v32_e = vec_sra(v32_e, one);   \
  v32_b = vec_sub(v32_e, v32_b); \
  v32_c = vec_sub(v32_e, v32_c); \
  v32_a = vec_sub(v32_a, v32_b); \
  v32_d = vec_add(v32_d, v32_c); \
  v_a = vec_packs(v32_a, v32_b); \
  v_c = vec_packs(v32_c, v32_d);

#define TRANSPOSE_WHT             \
  tmp_a = vec_mergeh(v_a, v_c);   \
  tmp_c = vec_mergel(v_a, v_c);   \
  v_a = vec_mergeh(tmp_a, tmp_c); \
  v_c = vec_mergel(tmp_a, tmp_c);

void vpx_iwht4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  int16x8_t v_a = load_tran_low(0, input);
  int16x8_t v_c = load_tran_low(8 * sizeof(*input), input);
  int16x8_t tmp_a, tmp_c;
  uint16x8_t two = vec_splat_u16(2);
  uint32x4_t one = vec_splat_u32(1);
  int16x8_t tmp16_0, tmp16_1;
  int32x4_t v32_a, v32_c, v32_d, v32_b, v32_e;
  uint8x16_t dest0 = vec_vsx_ld(0, dest);
  uint8x16_t dest1 = vec_vsx_ld(stride, dest);
  uint8x16_t dest2 = vec_vsx_ld(2 * stride, dest);
  uint8x16_t dest3 = vec_vsx_ld(3 * stride, dest);
  int16x8_t d_u0 = (int16x8_t)unpack_to_u16_h(dest0);
  int16x8_t d_u1 = (int16x8_t)unpack_to_u16_h(dest1);
  int16x8_t d_u2 = (int16x8_t)unpack_to_u16_h(dest2);
  int16x8_t d_u3 = (int16x8_t)unpack_to_u16_h(dest3);
  uint8x16_t output_v;
  uint8_t tmp_dest[16];
  int i, j;

  v_a = vec_sra(v_a, two);
  v_c = vec_sra(v_c, two);

  TRANSPOSE_WHT;

  v32_a = vec_unpackh(v_a);
  v32_c = vec_unpackl(v_a);

  v32_d = vec_unpackh(v_c);
  v32_b = vec_unpackl(v_c);

  TRANSFORM_COLS;

  TRANSPOSE_WHT;

  v32_a = vec_unpackh(v_a);
  v32_c = vec_unpackl(v_a);
  v32_d = vec_unpackh(v_c);
  v32_b = vec_unpackl(v_c);

  TRANSFORM_COLS;

  PACK_STORE(v_a, v_c);
}

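// 4-point ADST (vp9's iadst4_c): the sinpi_*_9 constants are interleaved into
// pairs so that each vec_msum accumulates two terms of the rotation per
// 32-bit lane before the shared DCT rounding shift.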
void vp9_iadst4_vsx(int16x8_t *in, int16x8_t *out) {
  int16x8_t sinpi_1_3_v, sinpi_4_2_v, sinpi_2_3_v, sinpi_1_4_v, sinpi_12_n3_v;
  int32x4_t v_v[5], u_v[4];
  int32x4_t zerov = vec_splat_s32(0);
  int16x8_t tmp0, tmp1;
  int16x8_t zero16v = vec_splat_s16(0);
  uint32x4_t shift16 = vec_sl(vec_splat_u32(8), vec_splat_u32(1));
  ROUND_SHIFT_INIT;

  sinpi_1_3_v = vec_mergel(sinpi_1_9_v, sinpi_3_9_v);
  sinpi_4_2_v = vec_mergel(sinpi_4_9_v, sinpi_2_9_v);
  sinpi_2_3_v = vec_mergel(sinpi_2_9_v, sinpi_3_9_v);
  sinpi_1_4_v = vec_mergel(sinpi_1_9_v, sinpi_4_9_v);
  sinpi_12_n3_v = vec_mergel(vec_add(sinpi_1_9_v, sinpi_2_9_v),
                             vec_sub(zero16v, sinpi_3_9_v));

  tmp0 = (int16x8_t)vec_mergeh((int32x4_t)in[0], (int32x4_t)in[1]);
  tmp1 = (int16x8_t)vec_mergel((int32x4_t)in[0], (int32x4_t)in[1]);
  in[0] = (int16x8_t)vec_mergeh((int32x4_t)tmp0, (int32x4_t)tmp1);
  in[1] = (int16x8_t)vec_mergel((int32x4_t)tmp0, (int32x4_t)tmp1);

  v_v[0] = vec_msum(in[0], sinpi_1_3_v, zerov);
  v_v[1] = vec_msum(in[1], sinpi_4_2_v, zerov);
  v_v[2] = vec_msum(in[0], sinpi_2_3_v, zerov);
  v_v[3] = vec_msum(in[1], sinpi_1_4_v, zerov);
  v_v[4] = vec_msum(in[0], sinpi_12_n3_v, zerov);

  in[0] = vec_sub(in[0], in[1]);
  in[1] = (int16x8_t)vec_sra((int32x4_t)in[1], shift16);
  in[0] = vec_add(in[0], in[1]);
  in[0] = (int16x8_t)vec_sl((int32x4_t)in[0], shift16);

  u_v[0] = vec_add(v_v[0], v_v[1]);
  u_v[1] = vec_sub(v_v[2], v_v[3]);
  u_v[2] = vec_msum(in[0], sinpi_1_3_v, zerov);
  u_v[3] = vec_sub(v_v[1], v_v[3]);
  u_v[3] = vec_add(u_v[3], v_v[4]);

  DCT_CONST_ROUND_SHIFT(u_v[0]);
  DCT_CONST_ROUND_SHIFT(u_v[1]);
  DCT_CONST_ROUND_SHIFT(u_v[2]);
  DCT_CONST_ROUND_SHIFT(u_v[3]);

  out[0] = vec_packs(u_v[0], u_v[1]);
  out[1] = vec_packs(u_v[2], u_v[3]);
}

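// vec_msums against an interleaved (c0, c1) constant pair followed by the DCT
// rounding shift: a vector form of dct_const_round_shift(x0 * c0 + x1 * c1).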
1270 #define MSUM_ROUND_SHIFT(a, b, cospi) \
1271 b = vec_msums(a, cospi, zerov); \
1272 DCT_CONST_ROUND_SHIFT(b);
1273
1274 #define IADST_WRAPLOW(in0, in1, tmp0, tmp1, out, cospi) \
1275 MSUM_ROUND_SHIFT(in0, tmp0, cospi); \
1276 MSUM_ROUND_SHIFT(in1, tmp1, cospi); \
1277 out = vec_packs(tmp0, tmp1);
1278
vp9_iadst8_vsx(int16x8_t * in,int16x8_t * out)1279 void vp9_iadst8_vsx(int16x8_t *in, int16x8_t *out) {
1280 int32x4_t tmp0[16], tmp1[16];
1281
1282 int32x4_t zerov = vec_splat_s32(0);
1283 int16x8_t zero16v = vec_splat_s16(0);
1284 int16x8_t cospi_p02_p30_v = vec_mergel(cospi2_v, cospi30_v);
1285 int16x8_t cospi_p30_m02_v = vec_mergel(cospi30_v, cospi2m_v);
1286 int16x8_t cospi_p10_p22_v = vec_mergel(cospi10_v, cospi22_v);
1287 int16x8_t cospi_p22_m10_v = vec_mergel(cospi22_v, cospi10m_v);
1288 int16x8_t cospi_p18_p14_v = vec_mergel(cospi18_v, cospi14_v);
1289 int16x8_t cospi_p14_m18_v = vec_mergel(cospi14_v, cospi18m_v);
1290 int16x8_t cospi_p26_p06_v = vec_mergel(cospi26_v, cospi6_v);
1291 int16x8_t cospi_p06_m26_v = vec_mergel(cospi6_v, cospi26m_v);
1292 int16x8_t cospi_p08_p24_v = vec_mergel(cospi8_v, cospi24_v);
1293 int16x8_t cospi_p24_m08_v = vec_mergel(cospi24_v, cospi8m_v);
1294 int16x8_t cospi_m24_p08_v = vec_mergel(cospi24m_v, cospi8_v);
1295 int16x8_t cospi_p16_m16_v = vec_mergel(cospi16_v, cospi16m_v);
1296 ROUND_SHIFT_INIT;
1297
1298 TRANSPOSE8x8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], out[0],
1299 out[1], out[2], out[3], out[4], out[5], out[6], out[7]);
1300
1301 // stage 1
1302 // interleave and multiply/add into 32-bit integer
1303 in[0] = vec_mergeh(out[7], out[0]);
1304 in[1] = vec_mergel(out[7], out[0]);
1305 in[2] = vec_mergeh(out[5], out[2]);
1306 in[3] = vec_mergel(out[5], out[2]);
1307 in[4] = vec_mergeh(out[3], out[4]);
1308 in[5] = vec_mergel(out[3], out[4]);
1309 in[6] = vec_mergeh(out[1], out[6]);
1310 in[7] = vec_mergel(out[1], out[6]);
1311
  tmp1[0] = vec_msum(in[0], cospi_p02_p30_v, zerov);
  tmp1[1] = vec_msum(in[1], cospi_p02_p30_v, zerov);
  tmp1[2] = vec_msum(in[0], cospi_p30_m02_v, zerov);
  tmp1[3] = vec_msum(in[1], cospi_p30_m02_v, zerov);
  tmp1[4] = vec_msum(in[2], cospi_p10_p22_v, zerov);
  tmp1[5] = vec_msum(in[3], cospi_p10_p22_v, zerov);
  tmp1[6] = vec_msum(in[2], cospi_p22_m10_v, zerov);
  tmp1[7] = vec_msum(in[3], cospi_p22_m10_v, zerov);
  tmp1[8] = vec_msum(in[4], cospi_p18_p14_v, zerov);
  tmp1[9] = vec_msum(in[5], cospi_p18_p14_v, zerov);
  tmp1[10] = vec_msum(in[4], cospi_p14_m18_v, zerov);
  tmp1[11] = vec_msum(in[5], cospi_p14_m18_v, zerov);
  tmp1[12] = vec_msum(in[6], cospi_p26_p06_v, zerov);
  tmp1[13] = vec_msum(in[7], cospi_p26_p06_v, zerov);
  tmp1[14] = vec_msum(in[6], cospi_p06_m26_v, zerov);
  tmp1[15] = vec_msum(in[7], cospi_p06_m26_v, zerov);

  tmp0[0] = vec_add(tmp1[0], tmp1[8]);
  tmp0[1] = vec_add(tmp1[1], tmp1[9]);
  tmp0[2] = vec_add(tmp1[2], tmp1[10]);
  tmp0[3] = vec_add(tmp1[3], tmp1[11]);
  tmp0[4] = vec_add(tmp1[4], tmp1[12]);
  tmp0[5] = vec_add(tmp1[5], tmp1[13]);
  tmp0[6] = vec_add(tmp1[6], tmp1[14]);
  tmp0[7] = vec_add(tmp1[7], tmp1[15]);
  tmp0[8] = vec_sub(tmp1[0], tmp1[8]);
  tmp0[9] = vec_sub(tmp1[1], tmp1[9]);
  tmp0[10] = vec_sub(tmp1[2], tmp1[10]);
  tmp0[11] = vec_sub(tmp1[3], tmp1[11]);
  tmp0[12] = vec_sub(tmp1[4], tmp1[12]);
  tmp0[13] = vec_sub(tmp1[5], tmp1[13]);
  tmp0[14] = vec_sub(tmp1[6], tmp1[14]);
  tmp0[15] = vec_sub(tmp1[7], tmp1[15]);

  // shift and rounding
  DCT_CONST_ROUND_SHIFT(tmp0[0]);
  DCT_CONST_ROUND_SHIFT(tmp0[1]);
  DCT_CONST_ROUND_SHIFT(tmp0[2]);
  DCT_CONST_ROUND_SHIFT(tmp0[3]);
  DCT_CONST_ROUND_SHIFT(tmp0[4]);
  DCT_CONST_ROUND_SHIFT(tmp0[5]);
  DCT_CONST_ROUND_SHIFT(tmp0[6]);
  DCT_CONST_ROUND_SHIFT(tmp0[7]);
  DCT_CONST_ROUND_SHIFT(tmp0[8]);
  DCT_CONST_ROUND_SHIFT(tmp0[9]);
  DCT_CONST_ROUND_SHIFT(tmp0[10]);
  DCT_CONST_ROUND_SHIFT(tmp0[11]);
  DCT_CONST_ROUND_SHIFT(tmp0[12]);
  DCT_CONST_ROUND_SHIFT(tmp0[13]);
  DCT_CONST_ROUND_SHIFT(tmp0[14]);
  DCT_CONST_ROUND_SHIFT(tmp0[15]);

  // back to 16-bit
  out[0] = vec_packs(tmp0[0], tmp0[1]);
  out[1] = vec_packs(tmp0[2], tmp0[3]);
  out[2] = vec_packs(tmp0[4], tmp0[5]);
  out[3] = vec_packs(tmp0[6], tmp0[7]);
  out[4] = vec_packs(tmp0[8], tmp0[9]);
  out[5] = vec_packs(tmp0[10], tmp0[11]);
  out[6] = vec_packs(tmp0[12], tmp0[13]);
  out[7] = vec_packs(tmp0[14], tmp0[15]);

  // stage 2
  in[0] = vec_add(out[0], out[2]);
  in[1] = vec_add(out[1], out[3]);
  in[2] = vec_sub(out[0], out[2]);
  in[3] = vec_sub(out[1], out[3]);
  in[4] = vec_mergeh(out[4], out[5]);
  in[5] = vec_mergel(out[4], out[5]);
  in[6] = vec_mergeh(out[6], out[7]);
  in[7] = vec_mergel(out[6], out[7]);

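  // The lower half (in[0]..in[3]) needs no multiply in this stage; the upper
  // half goes through a cospi8/cospi24 rotation, again via interleaved msums.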
  tmp1[0] = vec_msum(in[4], cospi_p08_p24_v, zerov);
  tmp1[1] = vec_msum(in[5], cospi_p08_p24_v, zerov);
  tmp1[2] = vec_msum(in[4], cospi_p24_m08_v, zerov);
  tmp1[3] = vec_msum(in[5], cospi_p24_m08_v, zerov);
  tmp1[4] = vec_msum(in[6], cospi_m24_p08_v, zerov);
  tmp1[5] = vec_msum(in[7], cospi_m24_p08_v, zerov);
  tmp1[6] = vec_msum(in[6], cospi_p08_p24_v, zerov);
  tmp1[7] = vec_msum(in[7], cospi_p08_p24_v, zerov);

  tmp0[0] = vec_add(tmp1[0], tmp1[4]);
  tmp0[1] = vec_add(tmp1[1], tmp1[5]);
  tmp0[2] = vec_add(tmp1[2], tmp1[6]);
  tmp0[3] = vec_add(tmp1[3], tmp1[7]);
  tmp0[4] = vec_sub(tmp1[0], tmp1[4]);
  tmp0[5] = vec_sub(tmp1[1], tmp1[5]);
  tmp0[6] = vec_sub(tmp1[2], tmp1[6]);
  tmp0[7] = vec_sub(tmp1[3], tmp1[7]);

  DCT_CONST_ROUND_SHIFT(tmp0[0]);
  DCT_CONST_ROUND_SHIFT(tmp0[1]);
  DCT_CONST_ROUND_SHIFT(tmp0[2]);
  DCT_CONST_ROUND_SHIFT(tmp0[3]);
  DCT_CONST_ROUND_SHIFT(tmp0[4]);
  DCT_CONST_ROUND_SHIFT(tmp0[5]);
  DCT_CONST_ROUND_SHIFT(tmp0[6]);
  DCT_CONST_ROUND_SHIFT(tmp0[7]);

  in[4] = vec_packs(tmp0[0], tmp0[1]);
  in[5] = vec_packs(tmp0[2], tmp0[3]);
  in[6] = vec_packs(tmp0[4], tmp0[5]);
  in[7] = vec_packs(tmp0[6], tmp0[7]);

  // stage 3
  out[0] = vec_mergeh(in[2], in[3]);
  out[1] = vec_mergel(in[2], in[3]);
  out[2] = vec_mergeh(in[6], in[7]);
  out[3] = vec_mergel(in[6], in[7]);

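  // IADST_WRAPLOW (defined earlier in this file) wraps the same
  // msum / round-shift / saturating-pack sequence used above for a single
  // rotation; here it applies the final cospi16 rotations of the iadst8.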
  IADST_WRAPLOW(out[0], out[1], tmp0[0], tmp0[1], in[2], cospi16_v);
  IADST_WRAPLOW(out[0], out[1], tmp0[0], tmp0[1], in[3], cospi_p16_m16_v);
  IADST_WRAPLOW(out[2], out[3], tmp0[0], tmp0[1], in[6], cospi16_v);
  IADST_WRAPLOW(out[2], out[3], tmp0[0], tmp0[1], in[7], cospi_p16_m16_v);

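  // Final reordering: even outputs are straight copies, odd outputs are
  // negated (0 - x), matching the sign pattern of the scalar iadst8.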
  out[0] = in[0];
  out[2] = in[6];
  out[4] = in[3];
  out[6] = in[5];

  out[1] = vec_sub(zero16v, in[4]);
  out[3] = vec_sub(zero16v, in[2]);
  out[5] = vec_sub(zero16v, in[7]);
  out[7] = vec_sub(zero16v, in[1]);
}

static void iadst16x8_vsx(int16x8_t *in, int16x8_t *out) {
  int32x4_t tmp0[32], tmp1[32];
  int16x8_t tmp16_0[16];  // indices 0..15 are written below
  int16x8_t cospi_p01_p31 = vec_mergel(cospi1_v, cospi31_v);
  int16x8_t cospi_p31_m01 = vec_mergel(cospi31_v, cospi1m_v);
  int16x8_t cospi_p05_p27 = vec_mergel(cospi5_v, cospi27_v);
  int16x8_t cospi_p27_m05 = vec_mergel(cospi27_v, cospi5m_v);
  int16x8_t cospi_p09_p23 = vec_mergel(cospi9_v, cospi23_v);
  int16x8_t cospi_p23_m09 = vec_mergel(cospi23_v, cospi9m_v);
  int16x8_t cospi_p13_p19 = vec_mergel(cospi13_v, cospi19_v);
  int16x8_t cospi_p19_m13 = vec_mergel(cospi19_v, cospi13m_v);
  int16x8_t cospi_p17_p15 = vec_mergel(cospi17_v, cospi15_v);
  int16x8_t cospi_p15_m17 = vec_mergel(cospi15_v, cospi17m_v);
  int16x8_t cospi_p21_p11 = vec_mergel(cospi21_v, cospi11_v);
  int16x8_t cospi_p11_m21 = vec_mergel(cospi11_v, cospi21m_v);
  int16x8_t cospi_p25_p07 = vec_mergel(cospi25_v, cospi7_v);
  int16x8_t cospi_p07_m25 = vec_mergel(cospi7_v, cospi25m_v);
  int16x8_t cospi_p29_p03 = vec_mergel(cospi29_v, cospi3_v);
  int16x8_t cospi_p03_m29 = vec_mergel(cospi3_v, cospi29m_v);
  int16x8_t cospi_p04_p28 = vec_mergel(cospi4_v, cospi28_v);
  int16x8_t cospi_p28_m04 = vec_mergel(cospi28_v, cospi4m_v);
  int16x8_t cospi_p20_p12 = vec_mergel(cospi20_v, cospi12_v);
  int16x8_t cospi_p12_m20 = vec_mergel(cospi12_v, cospi20m_v);
  int16x8_t cospi_m28_p04 = vec_mergel(cospi28m_v, cospi4_v);
  int16x8_t cospi_m12_p20 = vec_mergel(cospi12m_v, cospi20_v);
  int16x8_t cospi_p08_p24 = vec_mergel(cospi8_v, cospi24_v);
  int16x8_t cospi_p24_m08 = vec_mergel(cospi24_v, cospi8m_v);
  int16x8_t cospi_m24_p08 = vec_mergel(cospi24m_v, cospi8_v);
  int32x4_t zerov = vec_splat_s32(0);
  ROUND_SHIFT_INIT;

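  // stage 1
  // Interleave the inputs in the (15,0), (13,2), ..., (1,14) pairing used by
  // the 16-point iadst so each vec_msum below evaluates one odd-cosine
  // rotation.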
  tmp16_0[0] = vec_mergeh(in[15], in[0]);
  tmp16_0[1] = vec_mergel(in[15], in[0]);
  tmp16_0[2] = vec_mergeh(in[13], in[2]);
  tmp16_0[3] = vec_mergel(in[13], in[2]);
  tmp16_0[4] = vec_mergeh(in[11], in[4]);
  tmp16_0[5] = vec_mergel(in[11], in[4]);
  tmp16_0[6] = vec_mergeh(in[9], in[6]);
  tmp16_0[7] = vec_mergel(in[9], in[6]);
  tmp16_0[8] = vec_mergeh(in[7], in[8]);
  tmp16_0[9] = vec_mergel(in[7], in[8]);
  tmp16_0[10] = vec_mergeh(in[5], in[10]);
  tmp16_0[11] = vec_mergel(in[5], in[10]);
  tmp16_0[12] = vec_mergeh(in[3], in[12]);
  tmp16_0[13] = vec_mergel(in[3], in[12]);
  tmp16_0[14] = vec_mergeh(in[1], in[14]);
  tmp16_0[15] = vec_mergel(in[1], in[14]);

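  // 32 products: each input pair is rotated by (cospi_k, cospi_(32-k)) for
  // odd k = 1, 5, 9, 13, 17, 21, 25, 29.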
  tmp0[0] = vec_msum(tmp16_0[0], cospi_p01_p31, zerov);
  tmp0[1] = vec_msum(tmp16_0[1], cospi_p01_p31, zerov);
  tmp0[2] = vec_msum(tmp16_0[0], cospi_p31_m01, zerov);
  tmp0[3] = vec_msum(tmp16_0[1], cospi_p31_m01, zerov);
  tmp0[4] = vec_msum(tmp16_0[2], cospi_p05_p27, zerov);
  tmp0[5] = vec_msum(tmp16_0[3], cospi_p05_p27, zerov);
  tmp0[6] = vec_msum(tmp16_0[2], cospi_p27_m05, zerov);
  tmp0[7] = vec_msum(tmp16_0[3], cospi_p27_m05, zerov);
  tmp0[8] = vec_msum(tmp16_0[4], cospi_p09_p23, zerov);
  tmp0[9] = vec_msum(tmp16_0[5], cospi_p09_p23, zerov);
  tmp0[10] = vec_msum(tmp16_0[4], cospi_p23_m09, zerov);
  tmp0[11] = vec_msum(tmp16_0[5], cospi_p23_m09, zerov);
  tmp0[12] = vec_msum(tmp16_0[6], cospi_p13_p19, zerov);
  tmp0[13] = vec_msum(tmp16_0[7], cospi_p13_p19, zerov);
  tmp0[14] = vec_msum(tmp16_0[6], cospi_p19_m13, zerov);
  tmp0[15] = vec_msum(tmp16_0[7], cospi_p19_m13, zerov);
  tmp0[16] = vec_msum(tmp16_0[8], cospi_p17_p15, zerov);
  tmp0[17] = vec_msum(tmp16_0[9], cospi_p17_p15, zerov);
  tmp0[18] = vec_msum(tmp16_0[8], cospi_p15_m17, zerov);
  tmp0[19] = vec_msum(tmp16_0[9], cospi_p15_m17, zerov);
  tmp0[20] = vec_msum(tmp16_0[10], cospi_p21_p11, zerov);
  tmp0[21] = vec_msum(tmp16_0[11], cospi_p21_p11, zerov);
  tmp0[22] = vec_msum(tmp16_0[10], cospi_p11_m21, zerov);
  tmp0[23] = vec_msum(tmp16_0[11], cospi_p11_m21, zerov);
  tmp0[24] = vec_msum(tmp16_0[12], cospi_p25_p07, zerov);
  tmp0[25] = vec_msum(tmp16_0[13], cospi_p25_p07, zerov);
  tmp0[26] = vec_msum(tmp16_0[12], cospi_p07_m25, zerov);
  tmp0[27] = vec_msum(tmp16_0[13], cospi_p07_m25, zerov);
  tmp0[28] = vec_msum(tmp16_0[14], cospi_p29_p03, zerov);
  tmp0[29] = vec_msum(tmp16_0[15], cospi_p29_p03, zerov);
  tmp0[30] = vec_msum(tmp16_0[14], cospi_p03_m29, zerov);
  tmp0[31] = vec_msum(tmp16_0[15], cospi_p03_m29, zerov);

  tmp1[0] = vec_add(tmp0[0], tmp0[16]);
  tmp1[1] = vec_add(tmp0[1], tmp0[17]);
  tmp1[2] = vec_add(tmp0[2], tmp0[18]);
  tmp1[3] = vec_add(tmp0[3], tmp0[19]);
  tmp1[4] = vec_add(tmp0[4], tmp0[20]);
  tmp1[5] = vec_add(tmp0[5], tmp0[21]);
  tmp1[6] = vec_add(tmp0[6], tmp0[22]);
  tmp1[7] = vec_add(tmp0[7], tmp0[23]);
  tmp1[8] = vec_add(tmp0[8], tmp0[24]);
  tmp1[9] = vec_add(tmp0[9], tmp0[25]);
  tmp1[10] = vec_add(tmp0[10], tmp0[26]);
  tmp1[11] = vec_add(tmp0[11], tmp0[27]);
  tmp1[12] = vec_add(tmp0[12], tmp0[28]);
  tmp1[13] = vec_add(tmp0[13], tmp0[29]);
  tmp1[14] = vec_add(tmp0[14], tmp0[30]);
  tmp1[15] = vec_add(tmp0[15], tmp0[31]);
  tmp1[16] = vec_sub(tmp0[0], tmp0[16]);
  tmp1[17] = vec_sub(tmp0[1], tmp0[17]);
  tmp1[18] = vec_sub(tmp0[2], tmp0[18]);
  tmp1[19] = vec_sub(tmp0[3], tmp0[19]);
  tmp1[20] = vec_sub(tmp0[4], tmp0[20]);
  tmp1[21] = vec_sub(tmp0[5], tmp0[21]);
  tmp1[22] = vec_sub(tmp0[6], tmp0[22]);
  tmp1[23] = vec_sub(tmp0[7], tmp0[23]);
  tmp1[24] = vec_sub(tmp0[8], tmp0[24]);
  tmp1[25] = vec_sub(tmp0[9], tmp0[25]);
  tmp1[26] = vec_sub(tmp0[10], tmp0[26]);
  tmp1[27] = vec_sub(tmp0[11], tmp0[27]);
  tmp1[28] = vec_sub(tmp0[12], tmp0[28]);
  tmp1[29] = vec_sub(tmp0[13], tmp0[29]);
  tmp1[30] = vec_sub(tmp0[14], tmp0[30]);
  tmp1[31] = vec_sub(tmp0[15], tmp0[31]);

  DCT_CONST_ROUND_SHIFT(tmp1[0]);
  DCT_CONST_ROUND_SHIFT(tmp1[1]);
  DCT_CONST_ROUND_SHIFT(tmp1[2]);
  DCT_CONST_ROUND_SHIFT(tmp1[3]);
  DCT_CONST_ROUND_SHIFT(tmp1[4]);
  DCT_CONST_ROUND_SHIFT(tmp1[5]);
  DCT_CONST_ROUND_SHIFT(tmp1[6]);
  DCT_CONST_ROUND_SHIFT(tmp1[7]);
  DCT_CONST_ROUND_SHIFT(tmp1[8]);
  DCT_CONST_ROUND_SHIFT(tmp1[9]);
  DCT_CONST_ROUND_SHIFT(tmp1[10]);
  DCT_CONST_ROUND_SHIFT(tmp1[11]);
  DCT_CONST_ROUND_SHIFT(tmp1[12]);
  DCT_CONST_ROUND_SHIFT(tmp1[13]);
  DCT_CONST_ROUND_SHIFT(tmp1[14]);
  DCT_CONST_ROUND_SHIFT(tmp1[15]);
  DCT_CONST_ROUND_SHIFT(tmp1[16]);
  DCT_CONST_ROUND_SHIFT(tmp1[17]);
  DCT_CONST_ROUND_SHIFT(tmp1[18]);
  DCT_CONST_ROUND_SHIFT(tmp1[19]);
  DCT_CONST_ROUND_SHIFT(tmp1[20]);
  DCT_CONST_ROUND_SHIFT(tmp1[21]);
  DCT_CONST_ROUND_SHIFT(tmp1[22]);
  DCT_CONST_ROUND_SHIFT(tmp1[23]);
  DCT_CONST_ROUND_SHIFT(tmp1[24]);
  DCT_CONST_ROUND_SHIFT(tmp1[25]);
  DCT_CONST_ROUND_SHIFT(tmp1[26]);
  DCT_CONST_ROUND_SHIFT(tmp1[27]);
  DCT_CONST_ROUND_SHIFT(tmp1[28]);
  DCT_CONST_ROUND_SHIFT(tmp1[29]);
  DCT_CONST_ROUND_SHIFT(tmp1[30]);
  DCT_CONST_ROUND_SHIFT(tmp1[31]);

  in[0] = vec_packs(tmp1[0], tmp1[1]);
  in[1] = vec_packs(tmp1[2], tmp1[3]);
  in[2] = vec_packs(tmp1[4], tmp1[5]);
  in[3] = vec_packs(tmp1[6], tmp1[7]);
  in[4] = vec_packs(tmp1[8], tmp1[9]);
  in[5] = vec_packs(tmp1[10], tmp1[11]);
  in[6] = vec_packs(tmp1[12], tmp1[13]);
  in[7] = vec_packs(tmp1[14], tmp1[15]);
  in[8] = vec_packs(tmp1[16], tmp1[17]);
  in[9] = vec_packs(tmp1[18], tmp1[19]);
  in[10] = vec_packs(tmp1[20], tmp1[21]);
  in[11] = vec_packs(tmp1[22], tmp1[23]);
  in[12] = vec_packs(tmp1[24], tmp1[25]);
  in[13] = vec_packs(tmp1[26], tmp1[27]);
  in[14] = vec_packs(tmp1[28], tmp1[29]);
  in[15] = vec_packs(tmp1[30], tmp1[31]);

  // stage 2
  tmp16_0[0] = vec_mergeh(in[8], in[9]);
  tmp16_0[1] = vec_mergel(in[8], in[9]);
  tmp16_0[2] = vec_mergeh(in[10], in[11]);
  tmp16_0[3] = vec_mergel(in[10], in[11]);
  tmp16_0[4] = vec_mergeh(in[12], in[13]);
  tmp16_0[5] = vec_mergel(in[12], in[13]);
  tmp16_0[6] = vec_mergeh(in[14], in[15]);
  tmp16_0[7] = vec_mergel(in[14], in[15]);

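  // in[8]..in[15] are rotated by the cospi4/cospi28 and cospi20/cospi12
  // pairs, while in[0]..in[7] only need the add/sub butterflies further down.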
  tmp0[0] = vec_msum(tmp16_0[0], cospi_p04_p28, zerov);
  tmp0[1] = vec_msum(tmp16_0[1], cospi_p04_p28, zerov);
  tmp0[2] = vec_msum(tmp16_0[0], cospi_p28_m04, zerov);
  tmp0[3] = vec_msum(tmp16_0[1], cospi_p28_m04, zerov);
  tmp0[4] = vec_msum(tmp16_0[2], cospi_p20_p12, zerov);
  tmp0[5] = vec_msum(tmp16_0[3], cospi_p20_p12, zerov);
  tmp0[6] = vec_msum(tmp16_0[2], cospi_p12_m20, zerov);
  tmp0[7] = vec_msum(tmp16_0[3], cospi_p12_m20, zerov);
  tmp0[8] = vec_msum(tmp16_0[4], cospi_m28_p04, zerov);
  tmp0[9] = vec_msum(tmp16_0[5], cospi_m28_p04, zerov);
  tmp0[10] = vec_msum(tmp16_0[4], cospi_p04_p28, zerov);
  tmp0[11] = vec_msum(tmp16_0[5], cospi_p04_p28, zerov);
  tmp0[12] = vec_msum(tmp16_0[6], cospi_m12_p20, zerov);
  tmp0[13] = vec_msum(tmp16_0[7], cospi_m12_p20, zerov);
  tmp0[14] = vec_msum(tmp16_0[6], cospi_p20_p12, zerov);
  tmp0[15] = vec_msum(tmp16_0[7], cospi_p20_p12, zerov);

  tmp1[0] = vec_add(tmp0[0], tmp0[8]);
  tmp1[1] = vec_add(tmp0[1], tmp0[9]);
  tmp1[2] = vec_add(tmp0[2], tmp0[10]);
  tmp1[3] = vec_add(tmp0[3], tmp0[11]);
  tmp1[4] = vec_add(tmp0[4], tmp0[12]);
  tmp1[5] = vec_add(tmp0[5], tmp0[13]);
  tmp1[6] = vec_add(tmp0[6], tmp0[14]);
  tmp1[7] = vec_add(tmp0[7], tmp0[15]);
  tmp1[8] = vec_sub(tmp0[0], tmp0[8]);
  tmp1[9] = vec_sub(tmp0[1], tmp0[9]);
  tmp1[10] = vec_sub(tmp0[2], tmp0[10]);
  tmp1[11] = vec_sub(tmp0[3], tmp0[11]);
  tmp1[12] = vec_sub(tmp0[4], tmp0[12]);
  tmp1[13] = vec_sub(tmp0[5], tmp0[13]);
  tmp1[14] = vec_sub(tmp0[6], tmp0[14]);
  tmp1[15] = vec_sub(tmp0[7], tmp0[15]);

  DCT_CONST_ROUND_SHIFT(tmp1[0]);
  DCT_CONST_ROUND_SHIFT(tmp1[1]);
  DCT_CONST_ROUND_SHIFT(tmp1[2]);
  DCT_CONST_ROUND_SHIFT(tmp1[3]);
  DCT_CONST_ROUND_SHIFT(tmp1[4]);
  DCT_CONST_ROUND_SHIFT(tmp1[5]);
  DCT_CONST_ROUND_SHIFT(tmp1[6]);
  DCT_CONST_ROUND_SHIFT(tmp1[7]);
  DCT_CONST_ROUND_SHIFT(tmp1[8]);
  DCT_CONST_ROUND_SHIFT(tmp1[9]);
  DCT_CONST_ROUND_SHIFT(tmp1[10]);
  DCT_CONST_ROUND_SHIFT(tmp1[11]);
  DCT_CONST_ROUND_SHIFT(tmp1[12]);
  DCT_CONST_ROUND_SHIFT(tmp1[13]);
  DCT_CONST_ROUND_SHIFT(tmp1[14]);
  DCT_CONST_ROUND_SHIFT(tmp1[15]);

  tmp16_0[0] = vec_add(in[0], in[4]);
  tmp16_0[1] = vec_add(in[1], in[5]);
  tmp16_0[2] = vec_add(in[2], in[6]);
  tmp16_0[3] = vec_add(in[3], in[7]);
  tmp16_0[4] = vec_sub(in[0], in[4]);
  tmp16_0[5] = vec_sub(in[1], in[5]);
  tmp16_0[6] = vec_sub(in[2], in[6]);
  tmp16_0[7] = vec_sub(in[3], in[7]);
  tmp16_0[8] = vec_packs(tmp1[0], tmp1[1]);
  tmp16_0[9] = vec_packs(tmp1[2], tmp1[3]);
  tmp16_0[10] = vec_packs(tmp1[4], tmp1[5]);
  tmp16_0[11] = vec_packs(tmp1[6], tmp1[7]);
  tmp16_0[12] = vec_packs(tmp1[8], tmp1[9]);
  tmp16_0[13] = vec_packs(tmp1[10], tmp1[11]);
  tmp16_0[14] = vec_packs(tmp1[12], tmp1[13]);
  tmp16_0[15] = vec_packs(tmp1[14], tmp1[15]);

  // stage 3
  in[0] = vec_mergeh(tmp16_0[4], tmp16_0[5]);
  in[1] = vec_mergel(tmp16_0[4], tmp16_0[5]);
  in[2] = vec_mergeh(tmp16_0[6], tmp16_0[7]);
  in[3] = vec_mergel(tmp16_0[6], tmp16_0[7]);
  in[4] = vec_mergeh(tmp16_0[12], tmp16_0[13]);
  in[5] = vec_mergel(tmp16_0[12], tmp16_0[13]);
  in[6] = vec_mergeh(tmp16_0[14], tmp16_0[15]);
  in[7] = vec_mergel(tmp16_0[14], tmp16_0[15]);

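  // Both halves share the same cospi8/cospi24 rotation here; only the
  // difference terms (tmp16_0[4..7] and tmp16_0[12..15]) need the multiplies,
  // the sum terms are combined directly after the round shift.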
  tmp0[0] = vec_msum(in[0], cospi_p08_p24, zerov);
  tmp0[1] = vec_msum(in[1], cospi_p08_p24, zerov);
  tmp0[2] = vec_msum(in[0], cospi_p24_m08, zerov);
  tmp0[3] = vec_msum(in[1], cospi_p24_m08, zerov);
  tmp0[4] = vec_msum(in[2], cospi_m24_p08, zerov);
  tmp0[5] = vec_msum(in[3], cospi_m24_p08, zerov);
  tmp0[6] = vec_msum(in[2], cospi_p08_p24, zerov);
  tmp0[7] = vec_msum(in[3], cospi_p08_p24, zerov);
  tmp0[8] = vec_msum(in[4], cospi_p08_p24, zerov);
  tmp0[9] = vec_msum(in[5], cospi_p08_p24, zerov);
  tmp0[10] = vec_msum(in[4], cospi_p24_m08, zerov);
  tmp0[11] = vec_msum(in[5], cospi_p24_m08, zerov);
  tmp0[12] = vec_msum(in[6], cospi_m24_p08, zerov);
  tmp0[13] = vec_msum(in[7], cospi_m24_p08, zerov);
  tmp0[14] = vec_msum(in[6], cospi_p08_p24, zerov);
  tmp0[15] = vec_msum(in[7], cospi_p08_p24, zerov);

  tmp1[0] = vec_add(tmp0[0], tmp0[4]);
  tmp1[1] = vec_add(tmp0[1], tmp0[5]);
  tmp1[2] = vec_add(tmp0[2], tmp0[6]);
  tmp1[3] = vec_add(tmp0[3], tmp0[7]);
  tmp1[4] = vec_sub(tmp0[0], tmp0[4]);
  tmp1[5] = vec_sub(tmp0[1], tmp0[5]);
  tmp1[6] = vec_sub(tmp0[2], tmp0[6]);
  tmp1[7] = vec_sub(tmp0[3], tmp0[7]);
  tmp1[8] = vec_add(tmp0[8], tmp0[12]);
  tmp1[9] = vec_add(tmp0[9], tmp0[13]);
  tmp1[10] = vec_add(tmp0[10], tmp0[14]);
  tmp1[11] = vec_add(tmp0[11], tmp0[15]);
  tmp1[12] = vec_sub(tmp0[8], tmp0[12]);
  tmp1[13] = vec_sub(tmp0[9], tmp0[13]);
  tmp1[14] = vec_sub(tmp0[10], tmp0[14]);
  tmp1[15] = vec_sub(tmp0[11], tmp0[15]);

  DCT_CONST_ROUND_SHIFT(tmp1[0]);
  DCT_CONST_ROUND_SHIFT(tmp1[1]);
  DCT_CONST_ROUND_SHIFT(tmp1[2]);
  DCT_CONST_ROUND_SHIFT(tmp1[3]);
  DCT_CONST_ROUND_SHIFT(tmp1[4]);
  DCT_CONST_ROUND_SHIFT(tmp1[5]);
  DCT_CONST_ROUND_SHIFT(tmp1[6]);
  DCT_CONST_ROUND_SHIFT(tmp1[7]);
  DCT_CONST_ROUND_SHIFT(tmp1[8]);
  DCT_CONST_ROUND_SHIFT(tmp1[9]);
  DCT_CONST_ROUND_SHIFT(tmp1[10]);
  DCT_CONST_ROUND_SHIFT(tmp1[11]);
  DCT_CONST_ROUND_SHIFT(tmp1[12]);
  DCT_CONST_ROUND_SHIFT(tmp1[13]);
  DCT_CONST_ROUND_SHIFT(tmp1[14]);
  DCT_CONST_ROUND_SHIFT(tmp1[15]);

  in[0] = vec_add(tmp16_0[0], tmp16_0[2]);
  in[1] = vec_add(tmp16_0[1], tmp16_0[3]);
  in[2] = vec_sub(tmp16_0[0], tmp16_0[2]);
  in[3] = vec_sub(tmp16_0[1], tmp16_0[3]);
  in[4] = vec_packs(tmp1[0], tmp1[1]);
  in[5] = vec_packs(tmp1[2], tmp1[3]);
  in[6] = vec_packs(tmp1[4], tmp1[5]);
  in[7] = vec_packs(tmp1[6], tmp1[7]);
  in[8] = vec_add(tmp16_0[8], tmp16_0[10]);
  in[9] = vec_add(tmp16_0[9], tmp16_0[11]);
  in[10] = vec_sub(tmp16_0[8], tmp16_0[10]);
  in[11] = vec_sub(tmp16_0[9], tmp16_0[11]);
  in[12] = vec_packs(tmp1[8], tmp1[9]);
  in[13] = vec_packs(tmp1[10], tmp1[11]);
  in[14] = vec_packs(tmp1[12], tmp1[13]);
  in[15] = vec_packs(tmp1[14], tmp1[15]);

  // stage 4
  out[0] = vec_mergeh(in[2], in[3]);
  out[1] = vec_mergel(in[2], in[3]);
  out[2] = vec_mergeh(in[6], in[7]);
  out[3] = vec_mergel(in[6], in[7]);
  out[4] = vec_mergeh(in[10], in[11]);
  out[5] = vec_mergel(in[10], in[11]);
  out[6] = vec_mergeh(in[14], in[15]);
  out[7] = vec_mergel(in[14], in[15]);
}

void vpx_iadst16_vsx(int16x8_t *src0, int16x8_t *src1) {
  int16x8_t tmp0[16], tmp1[16], tmp2[8];
  int32x4_t tmp3, tmp4;
  int16x8_t zero16v = vec_splat_s16(0);
  int32x4_t zerov = vec_splat_s32(0);
  int16x8_t cospi_p16_m16 = vec_mergel(cospi16_v, cospi16m_v);
  int16x8_t cospi_m16_p16 = vec_mergel(cospi16m_v, cospi16_v);
  ROUND_SHIFT_INIT;

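  // src0 and src1 each hold sixteen vectors; the four 8x8 transposes gather
  // their even-indexed entries into tmp0[0..7]/tmp1[0..7] and the odd-indexed
  // entries into tmp0[8..15]/tmp1[8..15].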
  TRANSPOSE8x8(src0[0], src0[2], src0[4], src0[6], src0[8], src0[10], src0[12],
               src0[14], tmp0[0], tmp0[1], tmp0[2], tmp0[3], tmp0[4], tmp0[5],
               tmp0[6], tmp0[7]);
  TRANSPOSE8x8(src1[0], src1[2], src1[4], src1[6], src1[8], src1[10], src1[12],
               src1[14], tmp1[0], tmp1[1], tmp1[2], tmp1[3], tmp1[4], tmp1[5],
               tmp1[6], tmp1[7]);
  TRANSPOSE8x8(src0[1], src0[3], src0[5], src0[7], src0[9], src0[11], src0[13],
               src0[15], tmp0[8], tmp0[9], tmp0[10], tmp0[11], tmp0[12],
               tmp0[13], tmp0[14], tmp0[15]);
  TRANSPOSE8x8(src1[1], src1[3], src1[5], src1[7], src1[9], src1[11], src1[13],
               src1[15], tmp1[8], tmp1[9], tmp1[10], tmp1[11], tmp1[12],
               tmp1[13], tmp1[14], tmp1[15]);

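  // Run the shared 16-point iadst core on the first set, then finish its last
  // stage here: tmp2 holds the terms that still need a cospi16 rotation, and
  // the remaining outputs are copies or negations of the core's results.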
  iadst16x8_vsx(tmp0, tmp2);
  IADST_WRAPLOW(tmp2[0], tmp2[1], tmp3, tmp4, src0[14], cospi16m_v);
  IADST_WRAPLOW(tmp2[0], tmp2[1], tmp3, tmp4, src1[0], cospi_p16_m16);
  IADST_WRAPLOW(tmp2[2], tmp2[3], tmp3, tmp4, src0[8], cospi16_v);
  IADST_WRAPLOW(tmp2[2], tmp2[3], tmp3, tmp4, src1[6], cospi_m16_p16);
  IADST_WRAPLOW(tmp2[4], tmp2[5], tmp3, tmp4, src0[12], cospi16_v);
  IADST_WRAPLOW(tmp2[4], tmp2[5], tmp3, tmp4, src1[2], cospi_m16_p16);
  IADST_WRAPLOW(tmp2[6], tmp2[7], tmp3, tmp4, src0[10], cospi16m_v);
  IADST_WRAPLOW(tmp2[6], tmp2[7], tmp3, tmp4, src1[4], cospi_p16_m16);

  src0[0] = tmp0[0];
  src0[2] = vec_sub(zero16v, tmp0[8]);
  src0[4] = tmp0[12];
  src0[6] = vec_sub(zero16v, tmp0[4]);
  src1[8] = tmp0[5];
  src1[10] = vec_sub(zero16v, tmp0[13]);
  src1[12] = tmp0[9];
  src1[14] = vec_sub(zero16v, tmp0[1]);

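  // Repeat the same final stage for the second set (tmp1).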
  iadst16x8_vsx(tmp1, tmp2);
  IADST_WRAPLOW(tmp2[0], tmp2[1], tmp3, tmp4, src0[15], cospi16m_v);
  IADST_WRAPLOW(tmp2[0], tmp2[1], tmp3, tmp4, src1[1], cospi_p16_m16);
  IADST_WRAPLOW(tmp2[2], tmp2[3], tmp3, tmp4, src0[9], cospi16_v);
  IADST_WRAPLOW(tmp2[2], tmp2[3], tmp3, tmp4, src1[7], cospi_m16_p16);
  IADST_WRAPLOW(tmp2[4], tmp2[5], tmp3, tmp4, src0[13], cospi16_v);
  IADST_WRAPLOW(tmp2[4], tmp2[5], tmp3, tmp4, src1[3], cospi_m16_p16);
  IADST_WRAPLOW(tmp2[6], tmp2[7], tmp3, tmp4, src0[11], cospi16m_v);
  IADST_WRAPLOW(tmp2[6], tmp2[7], tmp3, tmp4, src1[5], cospi_p16_m16);

  src0[1] = tmp1[0];
  src0[3] = vec_sub(zero16v, tmp1[8]);
  src0[5] = tmp1[12];
  src0[7] = vec_sub(zero16v, tmp1[4]);
  src1[9] = tmp1[5];
  src1[11] = vec_sub(zero16v, tmp1[13]);
  src1[13] = tmp1[9];
  src1[15] = vec_sub(zero16v, tmp1[1]);
}
