/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/ppc/types_vsx.h"

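// V prediction: copy the row of pixels directly above the block into every
// row of the block.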
void vpx_v_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d = vec_vsx_ld(0, above);
  int i;
  (void)left;

  for (i = 0; i < 16; i++, dst += stride) {
    vec_vsx_st(d, 0, dst);
  }
}

void vpx_v_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vec_vsx_ld(0, above);
  const uint8x16_t d1 = vec_vsx_ld(16, above);
  int i;
  (void)left;

  for (i = 0; i < 32; i++, dst += stride) {
    vec_vsx_st(d0, 0, dst);
    vec_vsx_st(d1, 16, dst);
  }
}

// TODO(crbug.com/webm/1522): Fix test failures.
#if 0
static const uint32x4_t mask4 = { 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };

void vpx_h_predictor_4x4_vsx(uint8_t *dst, ptrdiff_t stride,
                             const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d = vec_vsx_ld(0, left);
  const uint8x16_t v0 = vec_splat(d, 0);
  const uint8x16_t v1 = vec_splat(d, 1);
  const uint8x16_t v2 = vec_splat(d, 2);
  const uint8x16_t v3 = vec_splat(d, 3);

  (void)above;

  vec_vsx_st(vec_sel(v0, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst);
  dst += stride;
  vec_vsx_st(vec_sel(v1, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst);
  dst += stride;
  vec_vsx_st(vec_sel(v2, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst);
  dst += stride;
  vec_vsx_st(vec_sel(v3, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst);
}

void vpx_h_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride,
                             const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d = vec_vsx_ld(0, left);
  const uint8x16_t v0 = vec_splat(d, 0);
  const uint8x16_t v1 = vec_splat(d, 1);
  const uint8x16_t v2 = vec_splat(d, 2);
  const uint8x16_t v3 = vec_splat(d, 3);

  const uint8x16_t v4 = vec_splat(d, 4);
  const uint8x16_t v5 = vec_splat(d, 5);
  const uint8x16_t v6 = vec_splat(d, 6);
  const uint8x16_t v7 = vec_splat(d, 7);

  (void)above;

  vec_vsx_st(xxpermdi(v0, vec_vsx_ld(0, dst), 1), 0, dst);
  dst += stride;
  vec_vsx_st(xxpermdi(v1, vec_vsx_ld(0, dst), 1), 0, dst);
  dst += stride;
  vec_vsx_st(xxpermdi(v2, vec_vsx_ld(0, dst), 1), 0, dst);
  dst += stride;
  vec_vsx_st(xxpermdi(v3, vec_vsx_ld(0, dst), 1), 0, dst);
  dst += stride;
  vec_vsx_st(xxpermdi(v4, vec_vsx_ld(0, dst), 1), 0, dst);
  dst += stride;
  vec_vsx_st(xxpermdi(v5, vec_vsx_ld(0, dst), 1), 0, dst);
  dst += stride;
  vec_vsx_st(xxpermdi(v6, vec_vsx_ld(0, dst), 1), 0, dst);
  dst += stride;
  vec_vsx_st(xxpermdi(v7, vec_vsx_ld(0, dst), 1), 0, dst);
}
#endif

void vpx_h_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d = vec_vsx_ld(0, left);
  const uint8x16_t v0 = vec_splat(d, 0);
  const uint8x16_t v1 = vec_splat(d, 1);
  const uint8x16_t v2 = vec_splat(d, 2);
  const uint8x16_t v3 = vec_splat(d, 3);

  const uint8x16_t v4 = vec_splat(d, 4);
  const uint8x16_t v5 = vec_splat(d, 5);
  const uint8x16_t v6 = vec_splat(d, 6);
  const uint8x16_t v7 = vec_splat(d, 7);

  const uint8x16_t v8 = vec_splat(d, 8);
  const uint8x16_t v9 = vec_splat(d, 9);
  const uint8x16_t v10 = vec_splat(d, 10);
  const uint8x16_t v11 = vec_splat(d, 11);

  const uint8x16_t v12 = vec_splat(d, 12);
  const uint8x16_t v13 = vec_splat(d, 13);
  const uint8x16_t v14 = vec_splat(d, 14);
  const uint8x16_t v15 = vec_splat(d, 15);

  (void)above;

  vec_vsx_st(v0, 0, dst);
  dst += stride;
  vec_vsx_st(v1, 0, dst);
  dst += stride;
  vec_vsx_st(v2, 0, dst);
  dst += stride;
  vec_vsx_st(v3, 0, dst);
  dst += stride;
  vec_vsx_st(v4, 0, dst);
  dst += stride;
  vec_vsx_st(v5, 0, dst);
  dst += stride;
  vec_vsx_st(v6, 0, dst);
  dst += stride;
  vec_vsx_st(v7, 0, dst);
  dst += stride;
  vec_vsx_st(v8, 0, dst);
  dst += stride;
  vec_vsx_st(v9, 0, dst);
  dst += stride;
  vec_vsx_st(v10, 0, dst);
  dst += stride;
  vec_vsx_st(v11, 0, dst);
  dst += stride;
  vec_vsx_st(v12, 0, dst);
  dst += stride;
  vec_vsx_st(v13, 0, dst);
  dst += stride;
  vec_vsx_st(v14, 0, dst);
  dst += stride;
  vec_vsx_st(v15, 0, dst);
}

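// Store one 32-pixel row of the splatted left pixel, then advance to the
// next row.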
#define H_PREDICTOR_32(v) \
  vec_vsx_st(v, 0, dst);  \
  vec_vsx_st(v, 16, dst); \
  dst += stride

void vpx_h_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vec_vsx_ld(0, left);
  const uint8x16_t d1 = vec_vsx_ld(16, left);

  const uint8x16_t v0_0 = vec_splat(d0, 0);
  const uint8x16_t v1_0 = vec_splat(d0, 1);
  const uint8x16_t v2_0 = vec_splat(d0, 2);
  const uint8x16_t v3_0 = vec_splat(d0, 3);
  const uint8x16_t v4_0 = vec_splat(d0, 4);
  const uint8x16_t v5_0 = vec_splat(d0, 5);
  const uint8x16_t v6_0 = vec_splat(d0, 6);
  const uint8x16_t v7_0 = vec_splat(d0, 7);
  const uint8x16_t v8_0 = vec_splat(d0, 8);
  const uint8x16_t v9_0 = vec_splat(d0, 9);
  const uint8x16_t v10_0 = vec_splat(d0, 10);
  const uint8x16_t v11_0 = vec_splat(d0, 11);
  const uint8x16_t v12_0 = vec_splat(d0, 12);
  const uint8x16_t v13_0 = vec_splat(d0, 13);
  const uint8x16_t v14_0 = vec_splat(d0, 14);
  const uint8x16_t v15_0 = vec_splat(d0, 15);

  const uint8x16_t v0_1 = vec_splat(d1, 0);
  const uint8x16_t v1_1 = vec_splat(d1, 1);
  const uint8x16_t v2_1 = vec_splat(d1, 2);
  const uint8x16_t v3_1 = vec_splat(d1, 3);
  const uint8x16_t v4_1 = vec_splat(d1, 4);
  const uint8x16_t v5_1 = vec_splat(d1, 5);
  const uint8x16_t v6_1 = vec_splat(d1, 6);
  const uint8x16_t v7_1 = vec_splat(d1, 7);
  const uint8x16_t v8_1 = vec_splat(d1, 8);
  const uint8x16_t v9_1 = vec_splat(d1, 9);
  const uint8x16_t v10_1 = vec_splat(d1, 10);
  const uint8x16_t v11_1 = vec_splat(d1, 11);
  const uint8x16_t v12_1 = vec_splat(d1, 12);
  const uint8x16_t v13_1 = vec_splat(d1, 13);
  const uint8x16_t v14_1 = vec_splat(d1, 14);
  const uint8x16_t v15_1 = vec_splat(d1, 15);

  (void)above;

  H_PREDICTOR_32(v0_0);
  H_PREDICTOR_32(v1_0);
  H_PREDICTOR_32(v2_0);
  H_PREDICTOR_32(v3_0);

  H_PREDICTOR_32(v4_0);
  H_PREDICTOR_32(v5_0);
  H_PREDICTOR_32(v6_0);
  H_PREDICTOR_32(v7_0);

  H_PREDICTOR_32(v8_0);
  H_PREDICTOR_32(v9_0);
  H_PREDICTOR_32(v10_0);
  H_PREDICTOR_32(v11_0);

  H_PREDICTOR_32(v12_0);
  H_PREDICTOR_32(v13_0);
  H_PREDICTOR_32(v14_0);
  H_PREDICTOR_32(v15_0);

  H_PREDICTOR_32(v0_1);
  H_PREDICTOR_32(v1_1);
  H_PREDICTOR_32(v2_1);
  H_PREDICTOR_32(v3_1);

  H_PREDICTOR_32(v4_1);
  H_PREDICTOR_32(v5_1);
  H_PREDICTOR_32(v6_1);
  H_PREDICTOR_32(v7_1);

  H_PREDICTOR_32(v8_1);
  H_PREDICTOR_32(v9_1);
  H_PREDICTOR_32(v10_1);
  H_PREDICTOR_32(v11_1);

  H_PREDICTOR_32(v12_1);
  H_PREDICTOR_32(v13_1);
  H_PREDICTOR_32(v14_1);
  H_PREDICTOR_32(v15_1);
}

// TODO(crbug.com/webm/1522): Fix test failures.
#if 0
void vpx_tm_predictor_4x4_vsx(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0));
  const int16x8_t l = unpack_to_s16_h(vec_vsx_ld(0, left));
  const int16x8_t a = unpack_to_s16_h(vec_vsx_ld(0, above));
  int16x8_t tmp, val;
  uint8x16_t d;

  d = vec_vsx_ld(0, dst);
  tmp = unpack_to_s16_l(d);
  val = vec_sub(vec_add(vec_splat(l, 0), a), tl);
  vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst);
  dst += stride;

  d = vec_vsx_ld(0, dst);
  tmp = unpack_to_s16_l(d);
  val = vec_sub(vec_add(vec_splat(l, 1), a), tl);
  vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst);
  dst += stride;

  d = vec_vsx_ld(0, dst);
  tmp = unpack_to_s16_l(d);
  val = vec_sub(vec_add(vec_splat(l, 2), a), tl);
  vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst);
  dst += stride;

  d = vec_vsx_ld(0, dst);
  tmp = unpack_to_s16_l(d);
  val = vec_sub(vec_add(vec_splat(l, 3), a), tl);
  vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst);
}

void vpx_tm_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0));
  const int16x8_t l = unpack_to_s16_h(vec_vsx_ld(0, left));
  const int16x8_t a = unpack_to_s16_h(vec_vsx_ld(0, above));
  int16x8_t tmp, val;

  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
  val = vec_sub(vec_add(vec_splat(l, 0), a), tl);
  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
  dst += stride;

  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
  val = vec_sub(vec_add(vec_splat(l, 1), a), tl);
  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
  dst += stride;

  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
  val = vec_sub(vec_add(vec_splat(l, 2), a), tl);
  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
  dst += stride;

  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
  val = vec_sub(vec_add(vec_splat(l, 3), a), tl);
  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
  dst += stride;

  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
  val = vec_sub(vec_add(vec_splat(l, 4), a), tl);
  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
  dst += stride;

  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
  val = vec_sub(vec_add(vec_splat(l, 5), a), tl);
  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
  dst += stride;

  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
  val = vec_sub(vec_add(vec_splat(l, 6), a), tl);
  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
  dst += stride;

  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
  val = vec_sub(vec_add(vec_splat(l, 7), a), tl);
  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
}
#endif

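// TM (true motion) prediction for eight 16-wide rows: every output pixel is
// left + above - top_left, saturated to 8 bits by vec_packsu.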
static void tm_predictor_16x8(uint8_t *dst, const ptrdiff_t stride, int16x8_t l,
                              int16x8_t ah, int16x8_t al, int16x8_t tl) {
  int16x8_t vh, vl, ls;

  ls = vec_splat(l, 0);
  vh = vec_sub(vec_add(ls, ah), tl);
  vl = vec_sub(vec_add(ls, al), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
  dst += stride;

  ls = vec_splat(l, 1);
  vh = vec_sub(vec_add(ls, ah), tl);
  vl = vec_sub(vec_add(ls, al), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
  dst += stride;

  ls = vec_splat(l, 2);
  vh = vec_sub(vec_add(ls, ah), tl);
  vl = vec_sub(vec_add(ls, al), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
  dst += stride;

  ls = vec_splat(l, 3);
  vh = vec_sub(vec_add(ls, ah), tl);
  vl = vec_sub(vec_add(ls, al), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
  dst += stride;

  ls = vec_splat(l, 4);
  vh = vec_sub(vec_add(ls, ah), tl);
  vl = vec_sub(vec_add(ls, al), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
  dst += stride;

  ls = vec_splat(l, 5);
  vh = vec_sub(vec_add(ls, ah), tl);
  vl = vec_sub(vec_add(ls, al), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
  dst += stride;

  ls = vec_splat(l, 6);
  vh = vec_sub(vec_add(ls, ah), tl);
  vl = vec_sub(vec_add(ls, al), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
  dst += stride;

  ls = vec_splat(l, 7);
  vh = vec_sub(vec_add(ls, ah), tl);
  vl = vec_sub(vec_add(ls, al), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
}

void vpx_tm_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0));
  const uint8x16_t l = vec_vsx_ld(0, left);
  const int16x8_t lh = unpack_to_s16_h(l);
  const int16x8_t ll = unpack_to_s16_l(l);
  const uint8x16_t a = vec_vsx_ld(0, above);
  const int16x8_t ah = unpack_to_s16_h(a);
  const int16x8_t al = unpack_to_s16_l(a);

  tm_predictor_16x8(dst, stride, lh, ah, al, tl);

  dst += stride * 8;

  tm_predictor_16x8(dst, stride, ll, ah, al, tl);
}

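// One 32-wide TM row: ls holds the splatted left pixel, a0h/a0l/a1h/a1l the
// above row widened to 16 bits, tl the splatted top-left pixel.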
static INLINE void tm_predictor_32x1(uint8_t *dst, const int16x8_t ls,
                                     const int16x8_t a0h, const int16x8_t a0l,
                                     const int16x8_t a1h, const int16x8_t a1l,
                                     const int16x8_t tl) {
  int16x8_t vh, vl;

  vh = vec_sub(vec_add(ls, a0h), tl);
  vl = vec_sub(vec_add(ls, a0l), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
  vh = vec_sub(vec_add(ls, a1h), tl);
  vl = vec_sub(vec_add(ls, a1l), tl);
  vec_vsx_st(vec_packsu(vh, vl), 16, dst);
}

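// Eight 32-wide TM rows, one per element of l.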
static void tm_predictor_32x8(uint8_t *dst, const ptrdiff_t stride,
                              const int16x8_t l, const uint8x16_t a0,
                              const uint8x16_t a1, const int16x8_t tl) {
  const int16x8_t a0h = unpack_to_s16_h(a0);
  const int16x8_t a0l = unpack_to_s16_l(a0);
  const int16x8_t a1h = unpack_to_s16_h(a1);
  const int16x8_t a1l = unpack_to_s16_l(a1);

  tm_predictor_32x1(dst, vec_splat(l, 0), a0h, a0l, a1h, a1l, tl);
  dst += stride;

  tm_predictor_32x1(dst, vec_splat(l, 1), a0h, a0l, a1h, a1l, tl);
  dst += stride;

  tm_predictor_32x1(dst, vec_splat(l, 2), a0h, a0l, a1h, a1l, tl);
  dst += stride;

  tm_predictor_32x1(dst, vec_splat(l, 3), a0h, a0l, a1h, a1l, tl);
  dst += stride;

  tm_predictor_32x1(dst, vec_splat(l, 4), a0h, a0l, a1h, a1l, tl);
  dst += stride;

  tm_predictor_32x1(dst, vec_splat(l, 5), a0h, a0l, a1h, a1l, tl);
  dst += stride;

  tm_predictor_32x1(dst, vec_splat(l, 6), a0h, a0l, a1h, a1l, tl);
  dst += stride;

  tm_predictor_32x1(dst, vec_splat(l, 7), a0h, a0l, a1h, a1l, tl);
}

void vpx_tm_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0));
  const uint8x16_t l0 = vec_vsx_ld(0, left);
  const uint8x16_t l1 = vec_vsx_ld(16, left);
  const uint8x16_t a0 = vec_vsx_ld(0, above);
  const uint8x16_t a1 = vec_vsx_ld(16, above);

  tm_predictor_32x8(dst, stride, unpack_to_s16_h(l0), a0, a1, tl);
  dst += stride * 8;

  tm_predictor_32x8(dst, stride, unpack_to_s16_l(l0), a0, a1, tl);
  dst += stride * 8;

  tm_predictor_32x8(dst, stride, unpack_to_s16_h(l1), a0, a1, tl);
  dst += stride * 8;

  tm_predictor_32x8(dst, stride, unpack_to_s16_l(l1), a0, a1, tl);
}

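// Fill an 8x8 block with the splatted DC value; the merge with the existing
// destination keeps the 8 bytes beyond the block untouched.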
static INLINE void dc_fill_predictor_8x8(uint8_t *dst, const ptrdiff_t stride,
                                         const uint8x16_t val) {
  int i;

  for (i = 0; i < 8; i++, dst += stride) {
    const uint8x16_t d = vec_vsx_ld(0, dst);
    vec_vsx_st(xxpermdi(val, d, 1), 0, dst);
  }
}

static INLINE void dc_fill_predictor_16x16(uint8_t *dst, const ptrdiff_t stride,
                                           const uint8x16_t val) {
  int i;

  for (i = 0; i < 16; i++, dst += stride) {
    vec_vsx_st(val, 0, dst);
  }
}

void vpx_dc_128_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const uint8x16_t v128 = vec_sl(vec_splat_u8(1), vec_splat_u8(7));
  (void)above;
  (void)left;

  dc_fill_predictor_16x16(dst, stride, v128);
}

static INLINE void dc_fill_predictor_32x32(uint8_t *dst, const ptrdiff_t stride,
                                           const uint8x16_t val) {
  int i;

  for (i = 0; i < 32; i++, dst += stride) {
    vec_vsx_st(val, 0, dst);
    vec_vsx_st(val, 16, dst);
  }
}

void vpx_dc_128_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const uint8x16_t v128 = vec_sl(vec_splat_u8(1), vec_splat_u8(7));
  (void)above;
  (void)left;

  dc_fill_predictor_32x32(dst, stride, v128);
}

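// Sum 16 pixels, add the rounding bias (8), divide by 16 and splat the
// resulting 8-bit average across all lanes.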
static uint8x16_t avg16(const uint8_t *values) {
  const int32x4_t sum4s =
      (int32x4_t)vec_sum4s(vec_vsx_ld(0, values), vec_splat_u32(0));
  const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, vec_splat_s32(8));
  const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(4));

  return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
                   3);
}

void vpx_dc_left_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;

  dc_fill_predictor_16x16(dst, stride, avg16(left));
}

void vpx_dc_top_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)left;

  dc_fill_predictor_16x16(dst, stride, avg16(above));
}

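// Sum 32 pixels, round (bias of 16), divide by 32 and splat the average.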
static uint8x16_t avg32(const uint8_t *values) {
  const uint8x16_t v0 = vec_vsx_ld(0, values);
  const uint8x16_t v1 = vec_vsx_ld(16, values);
  const int32x4_t v16 = vec_sl(vec_splat_s32(1), vec_splat_u32(4));
  const int32x4_t sum4s =
      (int32x4_t)vec_sum4s(v0, vec_sum4s(v1, vec_splat_u32(0)));
  const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, v16);
  const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(5));

  return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
                   3);
}

void vpx_dc_left_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;

  dc_fill_predictor_32x32(dst, stride, avg32(left));
}

void vpx_dc_top_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)left;

  dc_fill_predictor_32x32(dst, stride, avg32(above));
}

// TODO(crbug.com/webm/1522): Fix test failures.
#if 0
static uint8x16_t dc_avg8(const uint8_t *above, const uint8_t *left) {
  const uint8x16_t a0 = vec_vsx_ld(0, above);
  const uint8x16_t l0 = vec_vsx_ld(0, left);
  const int32x4_t sum4s =
      (int32x4_t)vec_sum4s(l0, vec_sum4s(a0, vec_splat_u32(0)));
  const int32x4_t sum4s8 = xxpermdi(sum4s, vec_splat_s32(0), 1);
  const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s8, vec_splat_s32(8));
  const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(4));

  return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
                   3);
}
#endif

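// DC value for a 16x16 block: rounded average of the 16 above and 16 left
// pixels, splatted across the vector.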
static uint8x16_t dc_avg16(const uint8_t *above, const uint8_t *left) {
  const uint8x16_t a0 = vec_vsx_ld(0, above);
  const uint8x16_t l0 = vec_vsx_ld(0, left);
  const int32x4_t v16 = vec_sl(vec_splat_s32(1), vec_splat_u32(4));
  const int32x4_t sum4s =
      (int32x4_t)vec_sum4s(l0, vec_sum4s(a0, vec_splat_u32(0)));
  const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, v16);
  const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(5));

  return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
                   3);
}

// TODO(crbug.com/webm/1522): Fix test failures.
#if 0
void vpx_dc_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  dc_fill_predictor_8x8(dst, stride, dc_avg8(above, left));
}
#endif

void vpx_dc_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  dc_fill_predictor_16x16(dst, stride, dc_avg16(above, left));
}

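// DC value for a 32x32 block: rounded average of the 32 above and 32 left
// pixels, splatted across the vector.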
static uint8x16_t dc_avg32(const uint8_t *above, const uint8_t *left) {
  const uint8x16_t a0 = vec_vsx_ld(0, above);
  const uint8x16_t a1 = vec_vsx_ld(16, above);
  const uint8x16_t l0 = vec_vsx_ld(0, left);
  const uint8x16_t l1 = vec_vsx_ld(16, left);
  const int32x4_t v32 = vec_sl(vec_splat_s32(1), vec_splat_u32(5));
  const uint32x4_t a_sum = vec_sum4s(a0, vec_sum4s(a1, vec_splat_u32(0)));
  const int32x4_t sum4s = (int32x4_t)vec_sum4s(l0, vec_sum4s(l1, a_sum));
  const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, v32);
  const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(6));

  return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
                   3);
}

void vpx_dc_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  dc_fill_predictor_32x32(dst, stride, dc_avg32(above, left));
}

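// Three-tap average, roughly (a + 2 * b + c + 2) >> 2, computed without
// overflowing the 8-bit lanes: (a & c) + ((a ^ c) >> 1) is floor((a + c) / 2).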
static uint8x16_t avg3(const uint8x16_t a, const uint8x16_t b,
                       const uint8x16_t c) {
  const uint8x16_t ac =
      vec_adds(vec_and(a, c), vec_sr(vec_xor(a, c), vec_splat_u8(1)));

  return vec_avg(ac, b);
}

// Workaround vec_sld/vec_xxsldi/vec_lsdoi being missing or broken.
static const uint8x16_t sl1 = { 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8,
                                0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0x10 };

// TODO(crbug.com/webm/1522): Fix test failures.
#if 0
void vpx_d45_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x16_t af = vec_vsx_ld(0, above);
  const uint8x16_t above_right = vec_splat(af, 7);
  const uint8x16_t a = xxpermdi(af, above_right, 1);
  const uint8x16_t b = vec_perm(a, above_right, sl1);
  const uint8x16_t c = vec_perm(b, above_right, sl1);
  uint8x16_t row = avg3(a, b, c);
  int i;
  (void)left;

  for (i = 0; i < 8; i++) {
    const uint8x16_t d = vec_vsx_ld(0, dst);
    vec_vsx_st(xxpermdi(row, d, 1), 0, dst);
    dst += stride;
    row = vec_perm(row, above_right, sl1);
  }
}
#endif

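// D45 prediction: the first row is the 3-tap average of the above pixels;
// each subsequent row shifts one pixel to the left, padding with above_right.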
void vpx_d45_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint8x16_t a = vec_vsx_ld(0, above);
  const uint8x16_t above_right = vec_splat(a, 15);
  const uint8x16_t b = vec_perm(a, above_right, sl1);
  const uint8x16_t c = vec_perm(b, above_right, sl1);
  uint8x16_t row = avg3(a, b, c);
  int i;
  (void)left;

  for (i = 0; i < 16; i++) {
    vec_vsx_st(row, 0, dst);
    dst += stride;
    row = vec_perm(row, above_right, sl1);
  }
}

void vpx_d45_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint8x16_t a0 = vec_vsx_ld(0, above);
  const uint8x16_t a1 = vec_vsx_ld(16, above);
  const uint8x16_t above_right = vec_splat(a1, 15);
  const uint8x16_t b0 = vec_perm(a0, a1, sl1);
  const uint8x16_t b1 = vec_perm(a1, above_right, sl1);
  const uint8x16_t c0 = vec_perm(b0, b1, sl1);
  const uint8x16_t c1 = vec_perm(b1, above_right, sl1);
  uint8x16_t row0 = avg3(a0, b0, c0);
  uint8x16_t row1 = avg3(a1, b1, c1);
  int i;
  (void)left;

  for (i = 0; i < 32; i++) {
    vec_vsx_st(row0, 0, dst);
    vec_vsx_st(row1, 16, dst);
    dst += stride;
    row0 = vec_perm(row0, row1, sl1);
    row1 = vec_perm(row1, above_right, sl1);
  }
}

// TODO(crbug.com/webm/1522): Fix test failures.
#if 0
void vpx_d63_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x16_t af = vec_vsx_ld(0, above);
  const uint8x16_t above_right = vec_splat(af, 9);
  const uint8x16_t a = xxpermdi(af, above_right, 1);
  const uint8x16_t b = vec_perm(a, above_right, sl1);
  const uint8x16_t c = vec_perm(b, above_right, sl1);
  uint8x16_t row0 = vec_avg(a, b);
  uint8x16_t row1 = avg3(a, b, c);
  int i;
  (void)left;

  for (i = 0; i < 4; i++) {
    const uint8x16_t d0 = vec_vsx_ld(0, dst);
    const uint8x16_t d1 = vec_vsx_ld(0, dst + stride);
    vec_vsx_st(xxpermdi(row0, d0, 1), 0, dst);
    vec_vsx_st(xxpermdi(row1, d1, 1), 0, dst + stride);
    dst += stride * 2;
    row0 = vec_perm(row0, above_right, sl1);
    row1 = vec_perm(row1, above_right, sl1);
  }
}
#endif

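// D63 prediction: even rows are the 2-tap average of the above pixels, odd
// rows the 3-tap average; each row pair shifts one pixel to the left.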
void vpx_d63_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint8x16_t a0 = vec_vsx_ld(0, above);
  const uint8x16_t a1 = vec_vsx_ld(16, above);
  const uint8x16_t above_right = vec_splat(a1, 0);
  const uint8x16_t b = vec_perm(a0, above_right, sl1);
  const uint8x16_t c = vec_perm(b, above_right, sl1);
  uint8x16_t row0 = vec_avg(a0, b);
  uint8x16_t row1 = avg3(a0, b, c);
  int i;
  (void)left;

  for (i = 0; i < 8; i++) {
    vec_vsx_st(row0, 0, dst);
    vec_vsx_st(row1, 0, dst + stride);
    dst += stride * 2;
    row0 = vec_perm(row0, above_right, sl1);
    row1 = vec_perm(row1, above_right, sl1);
  }
}

void vpx_d63_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint8x16_t a0 = vec_vsx_ld(0, above);
  const uint8x16_t a1 = vec_vsx_ld(16, above);
  const uint8x16_t a2 = vec_vsx_ld(32, above);
  const uint8x16_t above_right = vec_splat(a2, 0);
  const uint8x16_t b0 = vec_perm(a0, a1, sl1);
  const uint8x16_t b1 = vec_perm(a1, above_right, sl1);
  const uint8x16_t c0 = vec_perm(b0, b1, sl1);
  const uint8x16_t c1 = vec_perm(b1, above_right, sl1);
  uint8x16_t row0_0 = vec_avg(a0, b0);
  uint8x16_t row0_1 = vec_avg(a1, b1);
  uint8x16_t row1_0 = avg3(a0, b0, c0);
  uint8x16_t row1_1 = avg3(a1, b1, c1);
  int i;
  (void)left;

  for (i = 0; i < 16; i++) {
    vec_vsx_st(row0_0, 0, dst);
    vec_vsx_st(row0_1, 16, dst);
    vec_vsx_st(row1_0, 0, dst + stride);
    vec_vsx_st(row1_1, 16, dst + stride);
    dst += stride * 2;
    row0_0 = vec_perm(row0_0, row0_1, sl1);
    row0_1 = vec_perm(row0_1, above_right, sl1);
    row1_0 = vec_perm(row1_0, row1_1, sl1);
    row1_1 = vec_perm(row1_1, above_right, sl1);
  }
}