/*
 * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */
#include <assert.h>
#include <string.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/vpx_filter.h"
#include "vpx_dsp/ppc/types_vsx.h"

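// The copy_wN helpers copy one row of N bytes per iteration using unaligned
// VSX loads and stores.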
// TODO(lu_zero): unroll
static inline void copy_w16(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride, int32_t h) {
  int i;

  for (i = h; i--;) {
    vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
    src += src_stride;
    dst += dst_stride;
  }
}

static inline void copy_w32(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride, int32_t h) {
  int i;

  for (i = h; i--;) {
    vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
    vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
    src += src_stride;
    dst += dst_stride;
  }
}

static inline void copy_w64(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride, int32_t h) {
  int i;

  for (i = h; i--;) {
    vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
    vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
    vec_vsx_st(vec_vsx_ld(32, src), 32, dst);
    vec_vsx_st(vec_vsx_ld(48, src), 48, dst);
    src += src_stride;
    dst += dst_stride;
  }
}

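// Pure block copy. Widths of 16, 32 and 64 use the VSX helpers above; any
// other width falls back to a per-row memcpy. The filter and step arguments
// are only part of the common convolve prototype and are unused here.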
void vpx_convolve_copy_vsx(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const InterpKernel *filter, int x0_q4, int x_step_q4,
                           int y0_q4, int32_t y_step_q4, int32_t w, int32_t h) {
  (void)filter;
  (void)x0_q4;
  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;

  switch (w) {
    case 16: {
      copy_w16(src, src_stride, dst, dst_stride, h);
      break;
    }
    case 32: {
      copy_w32(src, src_stride, dst, dst_stride, h);
      break;
    }
    case 64: {
      copy_w64(src, src_stride, dst, dst_stride, h);
      break;
    }
    default: {
      int i;
      for (i = h; i--;) {
        memcpy(dst, src, w);
        src += src_stride;
        dst += dst_stride;
      }
      break;
    }
  }
}

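// The avg_wN helpers average each source row into the destination row.
// vec_avg computes the rounded average (a + b + 1) >> 1, which matches the
// ROUND_POWER_OF_TWO(a + b, 1) used by the C reference.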
static inline void avg_w16(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride, int32_t h) {
  int i;

  for (i = h; i--;) {
    const uint8x16_t v = vec_avg(vec_vsx_ld(0, src), vec_vsx_ld(0, dst));
    vec_vsx_st(v, 0, dst);
    src += src_stride;
    dst += dst_stride;
  }
}

static inline void avg_w32(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride, int32_t h) {
  int i;

  for (i = h; i--;) {
    const uint8x16_t v0 = vec_avg(vec_vsx_ld(0, src), vec_vsx_ld(0, dst));
    const uint8x16_t v1 = vec_avg(vec_vsx_ld(16, src), vec_vsx_ld(16, dst));
    vec_vsx_st(v0, 0, dst);
    vec_vsx_st(v1, 16, dst);
    src += src_stride;
    dst += dst_stride;
  }
}

static inline void avg_w64(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride, int32_t h) {
  int i;

  for (i = h; i--;) {
    const uint8x16_t v0 = vec_avg(vec_vsx_ld(0, src), vec_vsx_ld(0, dst));
    const uint8x16_t v1 = vec_avg(vec_vsx_ld(16, src), vec_vsx_ld(16, dst));
    const uint8x16_t v2 = vec_avg(vec_vsx_ld(32, src), vec_vsx_ld(32, dst));
    const uint8x16_t v3 = vec_avg(vec_vsx_ld(48, src), vec_vsx_ld(48, dst));
    vec_vsx_st(v0, 0, dst);
    vec_vsx_st(v1, 16, dst);
    vec_vsx_st(v2, 32, dst);
    vec_vsx_st(v3, 48, dst);
    src += src_stride;
    dst += dst_stride;
  }
}

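// Averaging block copy. Widths other than 16, 32 and 64 are handled by the
// generic C implementation.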
void vpx_convolve_avg_vsx(const uint8_t *src, ptrdiff_t src_stride,
                          uint8_t *dst, ptrdiff_t dst_stride,
                          const InterpKernel *filter, int x0_q4, int x_step_q4,
                          int y0_q4, int32_t y_step_q4, int32_t w, int32_t h) {
  switch (w) {
    case 16: {
      avg_w16(src, src_stride, dst, dst_stride, h);
      break;
    }
    case 32: {
      avg_w32(src, src_stride, dst, dst_stride, h);
      break;
    }
    case 64: {
      avg_w64(src, src_stride, dst, dst_stride, h);
      break;
    }
    default: {
      vpx_convolve_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,
                         x_step_q4, y0_q4, y_step_q4, w, h);
      break;
    }
  }
}

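// Applies one 8-tap filter to eight 16-bit samples: the products are reduced
// to a single sum, a rounding bias of 1 << (FILTER_BITS - 1) is added, the
// result is shifted down by FILTER_BITS, clamped to [0, 255] and stored as a
// single output byte.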
static inline void convolve_line(uint8_t *dst, const int16x8_t s,
                                 const int16x8_t f) {
  const int32x4_t sum = vec_msum(s, f, vec_splat_s32(0));
  const int32x4_t bias =
      vec_sl(vec_splat_s32(1), vec_splat_u32(FILTER_BITS - 1));
  const int32x4_t avg = vec_sr(vec_sums(sum, bias), vec_splat_u32(FILTER_BITS));
  const uint8x16_t v = vec_splat(
      vec_packsu(vec_pack(avg, vec_splat_s32(0)), vec_splat_s16(0)), 3);
  vec_ste(v, 0, dst);
}

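// Horizontal variant: widens the first eight source bytes to 16 bits and
// feeds them to convolve_line together with the x filter.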
static inline void convolve_line_h(uint8_t *dst, const uint8_t *const src_x,
                                   const int16_t *const x_filter) {
  const int16x8_t s = unpack_to_s16_h(vec_vsx_ld(0, src_x));
  const int16x8_t f = vec_vsx_ld(0, x_filter);

  convolve_line(dst, s, f);
}

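// Generic per-pixel horizontal filter: src is rewound by SUBPEL_TAPS / 2 - 1
// so that the 8-tap kernel is centered on the source pixel, then every output
// pixel is produced by one convolve_line_h call.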
// TODO(lu_zero): Implement 8x8 and bigger block special cases
static inline void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const InterpKernel *x_filters, int x0_q4,
                                  int x_step_q4, int w, int h) {
  int x, y;
  src -= SUBPEL_TAPS / 2 - 1;

  for (y = 0; y < h; ++y) {
    int x_q4 = x0_q4;
    for (x = 0; x < w; ++x) {
      convolve_line_h(dst + x, &src[x_q4 >> SUBPEL_BITS],
                      x_filters[x_q4 & SUBPEL_MASK]);
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

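// Same traversal as convolve_horiz, but the filtered value is averaged into
// the existing destination pixel.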
static inline void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride,
                                      uint8_t *dst, ptrdiff_t dst_stride,
                                      const InterpKernel *x_filters, int x0_q4,
                                      int x_step_q4, int w, int h) {
  int x, y;
  src -= SUBPEL_TAPS / 2 - 1;

  for (y = 0; y < h; ++y) {
    int x_q4 = x0_q4;
    for (x = 0; x < w; ++x) {
      uint8_t v;
      convolve_line_h(&v, &src[x_q4 >> SUBPEL_BITS],
                      x_filters[x_q4 & SUBPEL_MASK]);
      dst[x] = ROUND_POWER_OF_TWO(dst[x] + v, 1);
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

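// Collects the leading byte of each of the eight input rows into adjacent
// lanes of the result, i.e. one column of eight vertical taps.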
static uint8x16_t transpose_line_u8_8x8(uint8x16_t a, uint8x16_t b,
                                        uint8x16_t c, uint8x16_t d,
                                        uint8x16_t e, uint8x16_t f,
                                        uint8x16_t g, uint8x16_t h) {
  uint16x8_t ab = (uint16x8_t)vec_mergeh(a, b);
  uint16x8_t cd = (uint16x8_t)vec_mergeh(c, d);
  uint16x8_t ef = (uint16x8_t)vec_mergeh(e, f);
  uint16x8_t gh = (uint16x8_t)vec_mergeh(g, h);

  uint32x4_t abcd = (uint32x4_t)vec_mergeh(ab, cd);
  uint32x4_t efgh = (uint32x4_t)vec_mergeh(ef, gh);

  return (uint8x16_t)vec_mergeh(abcd, efgh);
}

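// Vertical variant: loads eight rows at the current column, transposes them
// so the eight vertical taps are contiguous, then reuses convolve_line.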
static inline void convolve_line_v(uint8_t *dst, const uint8_t *const src_y,
                                   ptrdiff_t src_stride,
                                   const int16_t *const y_filter) {
  uint8x16_t s0 = vec_vsx_ld(0, src_y + 0 * src_stride);
  uint8x16_t s1 = vec_vsx_ld(0, src_y + 1 * src_stride);
  uint8x16_t s2 = vec_vsx_ld(0, src_y + 2 * src_stride);
  uint8x16_t s3 = vec_vsx_ld(0, src_y + 3 * src_stride);
  uint8x16_t s4 = vec_vsx_ld(0, src_y + 4 * src_stride);
  uint8x16_t s5 = vec_vsx_ld(0, src_y + 5 * src_stride);
  uint8x16_t s6 = vec_vsx_ld(0, src_y + 6 * src_stride);
  uint8x16_t s7 = vec_vsx_ld(0, src_y + 7 * src_stride);
  const int16x8_t f = vec_vsx_ld(0, y_filter);
  uint8_t buf[16];
  const uint8x16_t s = transpose_line_u8_8x8(s0, s1, s2, s3, s4, s5, s6, s7);

  // Note: buf is written here but never read back.
  vec_vsx_st(s, 0, buf);

  convolve_line(dst, unpack_to_s16_h(s), f);
}

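// Generic per-pixel vertical filter, walking the block column by column; src
// is rewound by SUBPEL_TAPS / 2 - 1 rows to center the kernel.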
static inline void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const InterpKernel *y_filters, int y0_q4,
                                 int y_step_q4, int w, int h) {
  int x, y;
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);

  for (x = 0; x < w; ++x) {
    int y_q4 = y0_q4;
    for (y = 0; y < h; ++y) {
      convolve_line_v(dst + y * dst_stride,
                      &src[(y_q4 >> SUBPEL_BITS) * src_stride], src_stride,
                      y_filters[y_q4 & SUBPEL_MASK]);
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}

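// As convolve_vert, but the filtered value is averaged into the existing
// destination pixel.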
static inline void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
                                     uint8_t *dst, ptrdiff_t dst_stride,
                                     const InterpKernel *y_filters, int y0_q4,
                                     int y_step_q4, int w, int h) {
  int x, y;
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);

  for (x = 0; x < w; ++x) {
    int y_q4 = y0_q4;
    for (y = 0; y < h; ++y) {
      uint8_t v;
      convolve_line_v(&v, &src[(y_q4 >> SUBPEL_BITS) * src_stride], src_stride,
                      y_filters[y_q4 & SUBPEL_MASK]);
      dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] + v, 1);
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}

static inline void convolve(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const InterpKernel *const filter, int x0_q4,
                            int x_step_q4, int y0_q4, int y_step_q4, int w,
                            int h) {
  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
  // 2d filtering proceeds in 2 steps:
  //   (1) Interpolate horizontally into an intermediate buffer, temp.
  //   (2) Interpolate temp vertically to derive the sub-pixel result.
  // Deriving the maximum number of rows in the temp buffer (135):
  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
  // --Largest block size is 64x64 pixels.
  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
  //   original frame (in 1/16th pixel units).
  // --Must round-up because block may be located at sub-pixel position.
  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
  DECLARE_ALIGNED(16, uint8_t, temp[64 * 135]);
  const int intermediate_height =
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;

  assert(w <= 64);
  assert(h <= 64);
  assert(y_step_q4 <= 32);
  assert(x_step_q4 <= 32);

  convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64,
                 filter, x0_q4, x_step_q4, w, intermediate_height);
  convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, filter,
                y0_q4, y_step_q4, w, h);
}

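// The vpx_convolve8_*_vsx entry points below are thin wrappers over the
// helpers above; parameters that do not apply to a given direction are
// silenced with (void) casts.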
void vpx_convolve8_horiz_vsx(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const InterpKernel *filter, int x0_q4,
                             int x_step_q4, int y0_q4, int y_step_q4, int w,
                             int h) {
  (void)y0_q4;
  (void)y_step_q4;

  convolve_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, w,
                 h);
}

void vpx_convolve8_avg_horiz_vsx(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const InterpKernel *filter, int x0_q4,
                                 int x_step_q4, int y0_q4, int y_step_q4, int w,
                                 int h) {
  (void)y0_q4;
  (void)y_step_q4;

  convolve_avg_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
                     w, h);
}

void vpx_convolve8_vert_vsx(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const InterpKernel *filter, int x0_q4,
                            int x_step_q4, int y0_q4, int y_step_q4, int w,
                            int h) {
  (void)x0_q4;
  (void)x_step_q4;

  convolve_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, w,
                h);
}

void vpx_convolve8_avg_vert_vsx(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
                                const InterpKernel *filter, int x0_q4,
                                int x_step_q4, int y0_q4, int y_step_q4, int w,
                                int h) {
  (void)x0_q4;
  (void)x_step_q4;

  convolve_avg_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4,
                    w, h);
}

void vpx_convolve8_vsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                       ptrdiff_t dst_stride, const InterpKernel *filter,
                       int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
                       int w, int h) {
  convolve(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4,
           y_step_q4, w, h);
}

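// Full 2-D filter into a temporary 64x64 block, then averaged into dst via
// vpx_convolve_avg_vsx.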
void vpx_convolve8_avg_vsx(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const InterpKernel *filter, int x0_q4, int x_step_q4,
                           int y0_q4, int y_step_q4, int w, int h) {
  // Fixed size intermediate buffer places limits on parameters.
  DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]);
  assert(w <= 64);
  assert(h <= 64);

  vpx_convolve8_vsx(src, src_stride, temp, 64, filter, x0_q4, x_step_q4, y0_q4,
                    y_step_q4, w, h);
  vpx_convolve_avg_vsx(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h);
}