1 /*
2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <assert.h>
12 #include <string.h>
13
14 #include "./vpx_config.h"
15 #include "./vpx_dsp_rtcd.h"
16 #include "vpx/vpx_integer.h"
17 #include "vpx_dsp/vpx_convolve.h"
18 #include "vpx_dsp/vpx_dsp_common.h"
19 #include "vpx_dsp/vpx_filter.h"
20 #include "vpx_ports/mem.h"
21
convolve_horiz(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * x_filters,int x0_q4,int x_step_q4,int w,int h)22 static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
23 uint8_t *dst, ptrdiff_t dst_stride,
24 const InterpKernel *x_filters, int x0_q4,
25 int x_step_q4, int w, int h) {
26 int x, y;
27 src -= SUBPEL_TAPS / 2 - 1;
28
29 for (y = 0; y < h; ++y) {
30 int x_q4 = x0_q4;
31 for (x = 0; x < w; ++x) {
32 const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
33 const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
34 int k, sum = 0;
35 for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
36 dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
37 x_q4 += x_step_q4;
38 }
39 src += src_stride;
40 dst += dst_stride;
41 }
42 }
43
convolve_avg_horiz(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * x_filters,int x0_q4,int x_step_q4,int w,int h)44 static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride,
45 uint8_t *dst, ptrdiff_t dst_stride,
46 const InterpKernel *x_filters, int x0_q4,
47 int x_step_q4, int w, int h) {
48 int x, y;
49 src -= SUBPEL_TAPS / 2 - 1;
50
51 for (y = 0; y < h; ++y) {
52 int x_q4 = x0_q4;
53 for (x = 0; x < w; ++x) {
54 const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
55 const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
56 int k, sum = 0;
57 for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
58 dst[x] = ROUND_POWER_OF_TWO(
59 dst[x] + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
60 x_q4 += x_step_q4;
61 }
62 src += src_stride;
63 dst += dst_stride;
64 }
65 }
66
convolve_vert(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * y_filters,int y0_q4,int y_step_q4,int w,int h)67 static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
68 uint8_t *dst, ptrdiff_t dst_stride,
69 const InterpKernel *y_filters, int y0_q4,
70 int y_step_q4, int w, int h) {
71 int x, y;
72 src -= src_stride * (SUBPEL_TAPS / 2 - 1);
73
74 for (x = 0; x < w; ++x) {
75 int y_q4 = y0_q4;
76 for (y = 0; y < h; ++y) {
77 const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
78 const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
79 int k, sum = 0;
80 for (k = 0; k < SUBPEL_TAPS; ++k)
81 sum += src_y[k * src_stride] * y_filter[k];
82 dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
83 y_q4 += y_step_q4;
84 }
85 ++src;
86 ++dst;
87 }
88 }
89
convolve_avg_vert(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * y_filters,int y0_q4,int y_step_q4,int w,int h)90 static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
91 uint8_t *dst, ptrdiff_t dst_stride,
92 const InterpKernel *y_filters, int y0_q4,
93 int y_step_q4, int w, int h) {
94 int x, y;
95 src -= src_stride * (SUBPEL_TAPS / 2 - 1);
96
97 for (x = 0; x < w; ++x) {
98 int y_q4 = y0_q4;
99 for (y = 0; y < h; ++y) {
100 const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
101 const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
102 int k, sum = 0;
103 for (k = 0; k < SUBPEL_TAPS; ++k)
104 sum += src_y[k * src_stride] * y_filter[k];
105 dst[y * dst_stride] = ROUND_POWER_OF_TWO(
106 dst[y * dst_stride] +
107 clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)),
108 1);
109 y_q4 += y_step_q4;
110 }
111 ++src;
112 ++dst;
113 }
114 }
115
convolve(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * const x_filters,int x0_q4,int x_step_q4,const InterpKernel * const y_filters,int y0_q4,int y_step_q4,int w,int h)116 static void convolve(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
117 ptrdiff_t dst_stride, const InterpKernel *const x_filters,
118 int x0_q4, int x_step_q4,
119 const InterpKernel *const y_filters, int y0_q4,
120 int y_step_q4, int w, int h) {
121 // Note: Fixed size intermediate buffer, temp, places limits on parameters.
122 // 2d filtering proceeds in 2 steps:
123 // (1) Interpolate horizontally into an intermediate buffer, temp.
124 // (2) Interpolate temp vertically to derive the sub-pixel result.
125 // Deriving the maximum number of rows in the temp buffer (135):
126 // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
127 // --Largest block size is 64x64 pixels.
128 // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
129 // original frame (in 1/16th pixel units).
130 // --Must round-up because block may be located at sub-pixel position.
131 // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
132 // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
133 uint8_t temp[64 * 135];
134 const int intermediate_height =
135 (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
136
137 assert(w <= 64);
138 assert(h <= 64);
139 assert(y_step_q4 <= 32);
140 assert(x_step_q4 <= 32);
141
142 convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64,
143 x_filters, x0_q4, x_step_q4, w, intermediate_height);
144 convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride,
145 y_filters, y0_q4, y_step_q4, w, h);
146 }
147
vpx_convolve8_horiz_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h)148 void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
149 uint8_t *dst, ptrdiff_t dst_stride,
150 const int16_t *filter_x, int x_step_q4,
151 const int16_t *filter_y, int y_step_q4, int w,
152 int h) {
153 const InterpKernel *const filters_x = get_filter_base(filter_x);
154 const int x0_q4 = get_filter_offset(filter_x, filters_x);
155
156 (void)filter_y;
157 (void)y_step_q4;
158
159 convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
160 w, h);
161 }
162
vpx_convolve8_avg_horiz_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h)163 void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
164 uint8_t *dst, ptrdiff_t dst_stride,
165 const int16_t *filter_x, int x_step_q4,
166 const int16_t *filter_y, int y_step_q4, int w,
167 int h) {
168 const InterpKernel *const filters_x = get_filter_base(filter_x);
169 const int x0_q4 = get_filter_offset(filter_x, filters_x);
170
171 (void)filter_y;
172 (void)y_step_q4;
173
174 convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
175 x_step_q4, w, h);
176 }
177
vpx_convolve8_vert_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h)178 void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
179 uint8_t *dst, ptrdiff_t dst_stride,
180 const int16_t *filter_x, int x_step_q4,
181 const int16_t *filter_y, int y_step_q4, int w,
182 int h) {
183 const InterpKernel *const filters_y = get_filter_base(filter_y);
184 const int y0_q4 = get_filter_offset(filter_y, filters_y);
185
186 (void)filter_x;
187 (void)x_step_q4;
188
189 convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, y_step_q4,
190 w, h);
191 }
192
vpx_convolve8_avg_vert_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h)193 void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
194 uint8_t *dst, ptrdiff_t dst_stride,
195 const int16_t *filter_x, int x_step_q4,
196 const int16_t *filter_y, int y_step_q4, int w,
197 int h) {
198 const InterpKernel *const filters_y = get_filter_base(filter_y);
199 const int y0_q4 = get_filter_offset(filter_y, filters_y);
200
201 (void)filter_x;
202 (void)x_step_q4;
203
204 convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
205 y_step_q4, w, h);
206 }
207
vpx_convolve8_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h)208 void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
209 ptrdiff_t dst_stride, const int16_t *filter_x,
210 int x_step_q4, const int16_t *filter_y, int y_step_q4,
211 int w, int h) {
212 const InterpKernel *const filters_x = get_filter_base(filter_x);
213 const int x0_q4 = get_filter_offset(filter_x, filters_x);
214 const InterpKernel *const filters_y = get_filter_base(filter_y);
215 const int y0_q4 = get_filter_offset(filter_y, filters_y);
216
217 convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
218 filters_y, y0_q4, y_step_q4, w, h);
219 }
220
vpx_convolve8_avg_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h)221 void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
222 ptrdiff_t dst_stride, const int16_t *filter_x,
223 int x_step_q4, const int16_t *filter_y, int y_step_q4,
224 int w, int h) {
225 // Fixed size intermediate buffer places limits on parameters.
226 DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]);
227 assert(w <= 64);
228 assert(h <= 64);
229
230 vpx_convolve8_c(src, src_stride, temp, 64, filter_x, x_step_q4, filter_y,
231 y_step_q4, w, h);
232 vpx_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h);
233 }
234
// Copies a w x h block of 8-bit samples from src to dst without filtering.
//
// Each row is copied with memcpy(), so the source and destination rows must
// not overlap. The four filter arguments exist only to match the common
// convolve signature and are ignored.
void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                         ptrdiff_t dst_stride, const int16_t *filter_x,
                         int filter_x_stride, const int16_t *filter_y,
                         int filter_y_stride, int w, int h) {
  int row;

  (void)filter_y_stride;
  (void)filter_y;
  (void)filter_x_stride;
  (void)filter_x;

  for (row = 0; row < h; ++row) {
    memcpy(dst + row * dst_stride, src + row * src_stride, w);
  }
}
252
// Averages a w x h block of 8-bit samples from src into dst.
//
// Every destination pixel is replaced by
// ROUND_POWER_OF_TWO(dst + src, 1) of the co-located pair. The four filter
// arguments exist only to match the common convolve signature and are
// ignored.
void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                        ptrdiff_t dst_stride, const int16_t *filter_x,
                        int filter_x_stride, const int16_t *filter_y,
                        int filter_y_stride, int w, int h) {
  int row;

  (void)filter_y_stride;
  (void)filter_y;
  (void)filter_x_stride;
  (void)filter_x;

  for (row = 0; row < h; ++row) {
    const uint8_t *const in = src + row * src_stride;
    uint8_t *const out = dst + row * dst_stride;
    int col;

    for (col = 0; col < w; ++col) {
      out[col] = ROUND_POWER_OF_TWO(out[col] + in[col], 1);
    }
  }
}
270
// Scaled horizontal filter, C reference: forwards all arguments unchanged to
// vpx_convolve8_horiz_c().
void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                        ptrdiff_t dst_stride, const int16_t *filter_x,
                        int x_step_q4, const int16_t *filter_y, int y_step_q4,
                        int w, int h) {
  vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride,
                        filter_x, x_step_q4,
                        filter_y, y_step_q4,
                        w, h);
}
278
// Scaled vertical filter, C reference: forwards all arguments unchanged to
// vpx_convolve8_vert_c().
void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                       ptrdiff_t dst_stride, const int16_t *filter_x,
                       int x_step_q4, const int16_t *filter_y, int y_step_q4,
                       int w, int h) {
  vpx_convolve8_vert_c(src, src_stride, dst, dst_stride,
                       filter_x, x_step_q4,
                       filter_y, y_step_q4,
                       w, h);
}
286
// Scaled 2-D filter, C reference: forwards all arguments unchanged to
// vpx_convolve8_c().
void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                     ptrdiff_t dst_stride, const int16_t *filter_x,
                     int x_step_q4, const int16_t *filter_y, int y_step_q4,
                     int w, int h) {
  vpx_convolve8_c(src, src_stride, dst, dst_stride,
                  filter_x, x_step_q4,
                  filter_y, y_step_q4,
                  w, h);
}
294
// Scaled, averaging horizontal filter, C reference: forwards all arguments
// unchanged to vpx_convolve8_avg_horiz_c().
void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const int16_t *filter_x, int x_step_q4,
                            const int16_t *filter_y, int y_step_q4, int w,
                            int h) {
  vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
                            filter_x, x_step_q4,
                            filter_y, y_step_q4,
                            w, h);
}
303
// Scaled, averaging vertical filter, C reference: forwards all arguments
// unchanged to vpx_convolve8_avg_vert_c().
void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const int16_t *filter_x, int x_step_q4,
                           const int16_t *filter_y, int y_step_q4, int w,
                           int h) {
  vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
                           filter_x, x_step_q4,
                           filter_y, y_step_q4,
                           w, h);
}
312
// Scaled, averaging 2-D filter, C reference: forwards all arguments
// unchanged to vpx_convolve8_avg_c().
void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                         ptrdiff_t dst_stride, const int16_t *filter_x,
                         int x_step_q4, const int16_t *filter_y, int y_step_q4,
                         int w, int h) {
  vpx_convolve8_avg_c(src, src_stride, dst, dst_stride,
                      filter_x, x_step_q4,
                      filter_y, y_step_q4,
                      w, h);
}
320
321 #if CONFIG_VP9_HIGHBITDEPTH
highbd_convolve_horiz(const uint16_t * src,ptrdiff_t src_stride,uint16_t * dst,ptrdiff_t dst_stride,const InterpKernel * x_filters,int x0_q4,int x_step_q4,int w,int h,int bd)322 static void highbd_convolve_horiz(const uint16_t *src, ptrdiff_t src_stride,
323 uint16_t *dst, ptrdiff_t dst_stride,
324 const InterpKernel *x_filters, int x0_q4,
325 int x_step_q4, int w, int h, int bd) {
326 int x, y;
327 src -= SUBPEL_TAPS / 2 - 1;
328
329 for (y = 0; y < h; ++y) {
330 int x_q4 = x0_q4;
331 for (x = 0; x < w; ++x) {
332 const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
333 const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
334 int k, sum = 0;
335 for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
336 dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
337 x_q4 += x_step_q4;
338 }
339 src += src_stride;
340 dst += dst_stride;
341 }
342 }
343
highbd_convolve_avg_horiz(const uint16_t * src,ptrdiff_t src_stride,uint16_t * dst,ptrdiff_t dst_stride,const InterpKernel * x_filters,int x0_q4,int x_step_q4,int w,int h,int bd)344 static void highbd_convolve_avg_horiz(const uint16_t *src, ptrdiff_t src_stride,
345 uint16_t *dst, ptrdiff_t dst_stride,
346 const InterpKernel *x_filters, int x0_q4,
347 int x_step_q4, int w, int h, int bd) {
348 int x, y;
349 src -= SUBPEL_TAPS / 2 - 1;
350
351 for (y = 0; y < h; ++y) {
352 int x_q4 = x0_q4;
353 for (x = 0; x < w; ++x) {
354 const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
355 const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
356 int k, sum = 0;
357 for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
358 dst[x] = ROUND_POWER_OF_TWO(
359 dst[x] + clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
360 1);
361 x_q4 += x_step_q4;
362 }
363 src += src_stride;
364 dst += dst_stride;
365 }
366 }
367
highbd_convolve_vert(const uint16_t * src,ptrdiff_t src_stride,uint16_t * dst,ptrdiff_t dst_stride,const InterpKernel * y_filters,int y0_q4,int y_step_q4,int w,int h,int bd)368 static void highbd_convolve_vert(const uint16_t *src, ptrdiff_t src_stride,
369 uint16_t *dst, ptrdiff_t dst_stride,
370 const InterpKernel *y_filters, int y0_q4,
371 int y_step_q4, int w, int h, int bd) {
372 int x, y;
373 src -= src_stride * (SUBPEL_TAPS / 2 - 1);
374
375 for (x = 0; x < w; ++x) {
376 int y_q4 = y0_q4;
377 for (y = 0; y < h; ++y) {
378 const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
379 const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
380 int k, sum = 0;
381 for (k = 0; k < SUBPEL_TAPS; ++k)
382 sum += src_y[k * src_stride] * y_filter[k];
383 dst[y * dst_stride] =
384 clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
385 y_q4 += y_step_q4;
386 }
387 ++src;
388 ++dst;
389 }
390 }
391
highbd_convolve_avg_vert(const uint16_t * src,ptrdiff_t src_stride,uint16_t * dst,ptrdiff_t dst_stride,const InterpKernel * y_filters,int y0_q4,int y_step_q4,int w,int h,int bd)392 static void highbd_convolve_avg_vert(const uint16_t *src, ptrdiff_t src_stride,
393 uint16_t *dst, ptrdiff_t dst_stride,
394 const InterpKernel *y_filters, int y0_q4,
395 int y_step_q4, int w, int h, int bd) {
396 int x, y;
397 src -= src_stride * (SUBPEL_TAPS / 2 - 1);
398
399 for (x = 0; x < w; ++x) {
400 int y_q4 = y0_q4;
401 for (y = 0; y < h; ++y) {
402 const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
403 const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
404 int k, sum = 0;
405 for (k = 0; k < SUBPEL_TAPS; ++k)
406 sum += src_y[k * src_stride] * y_filter[k];
407 dst[y * dst_stride] = ROUND_POWER_OF_TWO(
408 dst[y * dst_stride] +
409 clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
410 1);
411 y_q4 += y_step_q4;
412 }
413 ++src;
414 ++dst;
415 }
416 }
417
highbd_convolve(const uint16_t * src,ptrdiff_t src_stride,uint16_t * dst,ptrdiff_t dst_stride,const InterpKernel * const x_filters,int x0_q4,int x_step_q4,const InterpKernel * const y_filters,int y0_q4,int y_step_q4,int w,int h,int bd)418 static void highbd_convolve(const uint16_t *src, ptrdiff_t src_stride,
419 uint16_t *dst, ptrdiff_t dst_stride,
420 const InterpKernel *const x_filters, int x0_q4,
421 int x_step_q4, const InterpKernel *const y_filters,
422 int y0_q4, int y_step_q4, int w, int h, int bd) {
423 // Note: Fixed size intermediate buffer, temp, places limits on parameters.
424 // 2d filtering proceeds in 2 steps:
425 // (1) Interpolate horizontally into an intermediate buffer, temp.
426 // (2) Interpolate temp vertically to derive the sub-pixel result.
427 // Deriving the maximum number of rows in the temp buffer (135):
428 // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
429 // --Largest block size is 64x64 pixels.
430 // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
431 // original frame (in 1/16th pixel units).
432 // --Must round-up because block may be located at sub-pixel position.
433 // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
434 // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
435 uint16_t temp[64 * 135];
436 const int intermediate_height =
437 (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
438
439 assert(w <= 64);
440 assert(h <= 64);
441 assert(y_step_q4 <= 32);
442 assert(x_step_q4 <= 32);
443
444 highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
445 temp, 64, x_filters, x0_q4, x_step_q4, w,
446 intermediate_height, bd);
447 highbd_convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride,
448 y_filters, y0_q4, y_step_q4, w, h, bd);
449 }
450
vpx_highbd_convolve8_horiz_c(const uint16_t * src,ptrdiff_t src_stride,uint16_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h,int bd)451 void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride,
452 uint16_t *dst, ptrdiff_t dst_stride,
453 const int16_t *filter_x, int x_step_q4,
454 const int16_t *filter_y, int y_step_q4, int w,
455 int h, int bd) {
456 const InterpKernel *const filters_x = get_filter_base(filter_x);
457 const int x0_q4 = get_filter_offset(filter_x, filters_x);
458
459 (void)filter_y;
460 (void)y_step_q4;
461
462 highbd_convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
463 x_step_q4, w, h, bd);
464 }
465
vpx_highbd_convolve8_avg_horiz_c(const uint16_t * src,ptrdiff_t src_stride,uint16_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h,int bd)466 void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride,
467 uint16_t *dst, ptrdiff_t dst_stride,
468 const int16_t *filter_x, int x_step_q4,
469 const int16_t *filter_y, int y_step_q4,
470 int w, int h, int bd) {
471 const InterpKernel *const filters_x = get_filter_base(filter_x);
472 const int x0_q4 = get_filter_offset(filter_x, filters_x);
473
474 (void)filter_y;
475 (void)y_step_q4;
476
477 highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
478 x_step_q4, w, h, bd);
479 }
480
vpx_highbd_convolve8_vert_c(const uint16_t * src,ptrdiff_t src_stride,uint16_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h,int bd)481 void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride,
482 uint16_t *dst, ptrdiff_t dst_stride,
483 const int16_t *filter_x, int x_step_q4,
484 const int16_t *filter_y, int y_step_q4, int w,
485 int h, int bd) {
486 const InterpKernel *const filters_y = get_filter_base(filter_y);
487 const int y0_q4 = get_filter_offset(filter_y, filters_y);
488
489 (void)filter_x;
490 (void)x_step_q4;
491
492 highbd_convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
493 y_step_q4, w, h, bd);
494 }
495
vpx_highbd_convolve8_avg_vert_c(const uint16_t * src,ptrdiff_t src_stride,uint16_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h,int bd)496 void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride,
497 uint16_t *dst, ptrdiff_t dst_stride,
498 const int16_t *filter_x, int x_step_q4,
499 const int16_t *filter_y, int y_step_q4,
500 int w, int h, int bd) {
501 const InterpKernel *const filters_y = get_filter_base(filter_y);
502 const int y0_q4 = get_filter_offset(filter_y, filters_y);
503
504 (void)filter_x;
505 (void)x_step_q4;
506
507 highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
508 y_step_q4, w, h, bd);
509 }
510
vpx_highbd_convolve8_c(const uint16_t * src,ptrdiff_t src_stride,uint16_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h,int bd)511 void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride,
512 uint16_t *dst, ptrdiff_t dst_stride,
513 const int16_t *filter_x, int x_step_q4,
514 const int16_t *filter_y, int y_step_q4, int w,
515 int h, int bd) {
516 const InterpKernel *const filters_x = get_filter_base(filter_x);
517 const int x0_q4 = get_filter_offset(filter_x, filters_x);
518 const InterpKernel *const filters_y = get_filter_base(filter_y);
519 const int y0_q4 = get_filter_offset(filter_y, filters_y);
520
521 highbd_convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
522 filters_y, y0_q4, y_step_q4, w, h, bd);
523 }
524
vpx_highbd_convolve8_avg_c(const uint16_t * src,ptrdiff_t src_stride,uint16_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h,int bd)525 void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride,
526 uint16_t *dst, ptrdiff_t dst_stride,
527 const int16_t *filter_x, int x_step_q4,
528 const int16_t *filter_y, int y_step_q4, int w,
529 int h, int bd) {
530 // Fixed size intermediate buffer places limits on parameters.
531 DECLARE_ALIGNED(16, uint16_t, temp[64 * 64]);
532 assert(w <= 64);
533 assert(h <= 64);
534
535 vpx_highbd_convolve8_c(src, src_stride, temp, 64, filter_x, x_step_q4,
536 filter_y, y_step_q4, w, h, bd);
537 vpx_highbd_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h,
538 bd);
539 }
540
// Copies a w x h block of 16-bit samples from src to dst without filtering.
//
// Each row is copied with memcpy() (w * sizeof(uint16_t) bytes), so the
// source and destination rows must not overlap. The filter arguments and bd
// exist only to match the common high-bit-depth convolve signature and are
// ignored.
void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride,
                                uint16_t *dst, ptrdiff_t dst_stride,
                                const int16_t *filter_x, int filter_x_stride,
                                const int16_t *filter_y, int filter_y_stride,
                                int w, int h, int bd) {
  const size_t row_bytes = w * sizeof(uint16_t);
  int row;

  (void)bd;
  (void)filter_y_stride;
  (void)filter_y;
  (void)filter_x_stride;
  (void)filter_x;

  for (row = 0; row < h; ++row) {
    memcpy(dst + row * dst_stride, src + row * src_stride, row_bytes);
  }
}
560
// Averages a w x h block of 16-bit samples from src into dst.
//
// Every destination pixel is replaced by
// ROUND_POWER_OF_TWO(dst + src, 1) of the co-located pair. The filter
// arguments and bd exist only to match the common high-bit-depth convolve
// signature and are ignored.
void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride,
                               uint16_t *dst, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int filter_x_stride,
                               const int16_t *filter_y, int filter_y_stride,
                               int w, int h, int bd) {
  int row;

  (void)bd;
  (void)filter_y_stride;
  (void)filter_y;
  (void)filter_x_stride;
  (void)filter_x;

  for (row = 0; row < h; ++row) {
    const uint16_t *const in = src + row * src_stride;
    uint16_t *const out = dst + row * dst_stride;
    int col;

    for (col = 0; col < w; ++col) {
      out[col] = ROUND_POWER_OF_TWO(out[col] + in[col], 1);
    }
  }
}
580 #endif
581