/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <string.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/vpx_convolve.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_dsp/vpx_filter.h"
#include "vpx_ports/mem.h"
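
// Sub-pixel positions are tracked in Q4 fixed point throughout this file:
// for a position x_q4, x_q4 >> SUBPEL_BITS is the integer source column and
// x_q4 & SUBPEL_MASK selects one of the 16 interpolation phases in the
// kernel table. As a worked example with assumed values (not taken from any
// particular call site): x_q4 = 24 reads SUBPEL_TAPS (8) taps starting at
// src[24 >> 4] = src[1] and applies the half-pel kernel
// x_filters[24 & 15] = x_filters[8].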
static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const InterpKernel *x_filters, int x0_q4,
                           int x_step_q4, int w, int h) {
  int x, y;
  src -= SUBPEL_TAPS / 2 - 1;

  for (y = 0; y < h; ++y) {
    int x_q4 = x0_q4;
    for (x = 0; x < w; ++x) {
      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
      int k, sum = 0;
      for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
      dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

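// Identical to convolve_horiz() above, except that each filtered pixel is
// rounded and averaged with the value already in dst, i.e.
// dst[x] = (dst[x] + filtered + 1) >> 1, as needed for compound (averaging)
// prediction.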
static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const InterpKernel *x_filters, int x0_q4,
                               int x_step_q4, int w, int h) {
  int x, y;
  src -= SUBPEL_TAPS / 2 - 1;

  for (y = 0; y < h; ++y) {
    int x_q4 = x0_q4;
    for (x = 0; x < w; ++x) {
      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
      int k, sum = 0;
      for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
      dst[x] = ROUND_POWER_OF_TWO(
          dst[x] + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

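// Vertical counterpart of convolve_horiz(): the outer loop walks columns,
// the inner loop steps y_q4 down each column, and the 8 taps are gathered at
// a spacing of src_stride instead of 1.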
static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
                          uint8_t *dst, ptrdiff_t dst_stride,
                          const InterpKernel *y_filters, int y0_q4,
                          int y_step_q4, int w, int h) {
  int x, y;
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);

  for (x = 0; x < w; ++x) {
    int y_q4 = y0_q4;
    for (y = 0; y < h; ++y) {
      const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
      int k, sum = 0;
      for (k = 0; k < SUBPEL_TAPS; ++k)
        sum += src_y[k * src_stride] * y_filter[k];
      dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}

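// Vertical filtering with the same round-and-average output step as
// convolve_avg_horiz().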
static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const InterpKernel *y_filters, int y0_q4,
                              int y_step_q4, int w, int h) {
  int x, y;
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);

  for (x = 0; x < w; ++x) {
    int y_q4 = y0_q4;
    for (y = 0; y < h; ++y) {
      const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
      int k, sum = 0;
      for (k = 0; k < SUBPEL_TAPS; ++k)
        sum += src_y[k * src_stride] * y_filter[k];
      dst[y * dst_stride] = ROUND_POWER_OF_TWO(
          dst[y * dst_stride] +
              clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)),
          1);
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}

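// The vpx_convolve8_*_c() functions below are the C reference
// implementations behind the RTCD dispatch (see vpx_dsp_rtcd.h). A step of
// 16 in Q4 (x_step_q4 == 16 or y_step_q4 == 16) advances exactly one full
// pixel per output sample, i.e. sub-pixel interpolation without scaling;
// other step values scale the block.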
void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const InterpKernel *filter, int x0_q4, int x_step_q4,
                           int y0_q4, int y_step_q4, int w, int h) {
  (void)y0_q4;
  (void)y_step_q4;
  convolve_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, w,
                 h);
}

void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const InterpKernel *filter, int x0_q4,
                               int x_step_q4, int y0_q4, int y_step_q4, int w,
                               int h) {
  (void)y0_q4;
  (void)y_step_q4;
  convolve_avg_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
                     w, h);
}

void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                          uint8_t *dst, ptrdiff_t dst_stride,
                          const InterpKernel *filter, int x0_q4, int x_step_q4,
                          int y0_q4, int y_step_q4, int w, int h) {
  (void)x0_q4;
  (void)x_step_q4;
  convolve_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, w,
                h);
}

void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const InterpKernel *filter, int x0_q4,
                              int x_step_q4, int y0_q4, int y_step_q4, int w,
                              int h) {
  (void)x0_q4;
  (void)x_step_q4;
  convolve_avg_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4,
                    w, h);
}

void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                     ptrdiff_t dst_stride, const InterpKernel *filter,
                     int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w,
                     int h) {
  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
  // 2d filtering proceeds in 2 steps:
  //   (1) Interpolate horizontally into an intermediate buffer, temp.
  //   (2) Interpolate temp vertically to derive the sub-pixel result.
  // Deriving the maximum number of rows in the temp buffer (135):
  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
  // --Largest block size is 64x64 pixels.
  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
  //   original frame (in 1/16th pixel units).
  // --Must round up because the block may be located at a sub-pixel position.
  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
  // When called from the frame scaling function, the smallest scaling factor
  // is x1/4 ==> y_step_q4 = 64. Since w and h are at most 16 in that case,
  // the temp buffer is still big enough.
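  // As a worked check of the bound, taking h = 64, y_step_q4 = 32 and the
  // largest sub-pixel phase for y0_q4, 15 (assumed worst case):
  // intermediate_height = ((63 * 32 + 15) >> 4) + 8 = 126 + 8 = 134, which
  // fits in the 135 rows reserved below.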
  uint8_t temp[64 * 135];
  const int intermediate_height =
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;

  assert(w <= 64);
  assert(h <= 64);
  assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
  assert(x_step_q4 <= 64);

  convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64,
                 filter, x0_q4, x_step_q4, w, intermediate_height);
  convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, filter,
                y0_q4, y_step_q4, w, h);
}

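// Filter-then-average: run the full 2D filter into a 64x64 scratch block,
// then fold the result into dst with the same (a + b + 1) >> 1 rounding as
// vpx_convolve_avg_c().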
void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                         ptrdiff_t dst_stride, const InterpKernel *filter,
                         int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
                         int w, int h) {
  // Fixed size intermediate buffer places limits on parameters.
  DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]);
  assert(w <= 64);
  assert(h <= 64);

  vpx_convolve8_c(src, src_stride, temp, 64, filter, x0_q4, x_step_q4, y0_q4,
                  y_step_q4, w, h);
  vpx_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h);
}

void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                         ptrdiff_t dst_stride, const InterpKernel *filter,
                         int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
                         int w, int h) {
  int r;

  (void)filter;
  (void)x0_q4;
  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;

  for (r = h; r > 0; --r) {
    memcpy(dst, src, w);
    src += src_stride;
    dst += dst_stride;
  }
}

void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                        ptrdiff_t dst_stride, const InterpKernel *filter,
                        int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
                        int w, int h) {
  int x, y;

  (void)filter;
  (void)x0_q4;
  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;

  for (y = 0; y < h; ++y) {
    for (x = 0; x < w; ++x) dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1);
    src += src_stride;
    dst += dst_stride;
  }
}

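// The vpx_scaled_* entry points are used on frame-scaling paths, where the
// step sizes differ from 16; their C versions simply forward to the
// corresponding vpx_convolve8_* functions above, which already handle
// arbitrary steps.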
void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                        ptrdiff_t dst_stride, const InterpKernel *filter,
                        int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
                        int w, int h) {
  vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,
                        x_step_q4, y0_q4, y_step_q4, w, h);
}

void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                       ptrdiff_t dst_stride, const InterpKernel *filter,
                       int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
                       int w, int h) {
  vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
                       x_step_q4, y0_q4, y_step_q4, w, h);
}

void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                     ptrdiff_t dst_stride, const InterpKernel *filter,
                     int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w,
                     int h) {
  vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
                  y0_q4, y_step_q4, w, h);
}

void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const InterpKernel *filter, int x0_q4,
                            int x_step_q4, int y0_q4, int y_step_q4, int w,
                            int h) {
  vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,
                            x_step_q4, y0_q4, y_step_q4, w, h);
}

void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const InterpKernel *filter, int x0_q4, int x_step_q4,
                           int y0_q4, int y_step_q4, int w, int h) {
  vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
                           x_step_q4, y0_q4, y_step_q4, w, h);
}

void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                         ptrdiff_t dst_stride, const InterpKernel *filter,
                         int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
                         int w, int h) {
  vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,
                      x_step_q4, y0_q4, y_step_q4, w, h);
}

#if CONFIG_VP9_HIGHBITDEPTH
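// High bit-depth variants. These mirror the 8-bit paths above, but operate
// on uint16_t samples and clamp with clip_pixel_highbd() against the bit
// depth bd instead of clip_pixel().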
static void highbd_convolve_horiz(const uint16_t *src, ptrdiff_t src_stride,
                                  uint16_t *dst, ptrdiff_t dst_stride,
                                  const InterpKernel *x_filters, int x0_q4,
                                  int x_step_q4, int w, int h, int bd) {
  int x, y;
  src -= SUBPEL_TAPS / 2 - 1;

  for (y = 0; y < h; ++y) {
    int x_q4 = x0_q4;
    for (x = 0; x < w; ++x) {
      const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
      int k, sum = 0;
      for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
      dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

static void highbd_convolve_avg_horiz(const uint16_t *src,
                                      ptrdiff_t src_stride, uint16_t *dst,
                                      ptrdiff_t dst_stride,
                                      const InterpKernel *x_filters, int x0_q4,
                                      int x_step_q4, int w, int h, int bd) {
  int x, y;
  src -= SUBPEL_TAPS / 2 - 1;

  for (y = 0; y < h; ++y) {
    int x_q4 = x0_q4;
    for (x = 0; x < w; ++x) {
      const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
      int k, sum = 0;
      for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
      dst[x] = ROUND_POWER_OF_TWO(
          dst[x] + clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
          1);
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

static void highbd_convolve_vert(const uint16_t *src, ptrdiff_t src_stride,
                                 uint16_t *dst, ptrdiff_t dst_stride,
                                 const InterpKernel *y_filters, int y0_q4,
                                 int y_step_q4, int w, int h, int bd) {
  int x, y;
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);

  for (x = 0; x < w; ++x) {
    int y_q4 = y0_q4;
    for (y = 0; y < h; ++y) {
      const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
      int k, sum = 0;
      for (k = 0; k < SUBPEL_TAPS; ++k)
        sum += src_y[k * src_stride] * y_filter[k];
      dst[y * dst_stride] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}

static void highbd_convolve_avg_vert(const uint16_t *src, ptrdiff_t src_stride,
                                     uint16_t *dst, ptrdiff_t dst_stride,
                                     const InterpKernel *y_filters, int y0_q4,
                                     int y_step_q4, int w, int h, int bd) {
  int x, y;
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);

  for (x = 0; x < w; ++x) {
    int y_q4 = y0_q4;
    for (y = 0; y < h; ++y) {
      const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
      int k, sum = 0;
      for (k = 0; k < SUBPEL_TAPS; ++k)
        sum += src_y[k * src_stride] * y_filter[k];
      dst[y * dst_stride] = ROUND_POWER_OF_TWO(
          dst[y * dst_stride] +
              clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
          1);
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}

static void highbd_convolve(const uint16_t *src, ptrdiff_t src_stride,
                            uint16_t *dst, ptrdiff_t dst_stride,
                            const InterpKernel *filter, int x0_q4,
                            int x_step_q4, int y0_q4, int y_step_q4, int w,
                            int h, int bd) {
  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
  // 2d filtering proceeds in 2 steps:
  //   (1) Interpolate horizontally into an intermediate buffer, temp.
  //   (2) Interpolate temp vertically to derive the sub-pixel result.
  // Deriving the maximum number of rows in the temp buffer (135):
  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
  // --Largest block size is 64x64 pixels.
  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
  //   original frame (in 1/16th pixel units).
  // --Must round up because the block may be located at a sub-pixel position.
  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
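  // Note the stricter bounds relative to vpx_convolve8_c(): there is no x1/4
  // frame-scaling case here, so both steps are asserted to be at most 32.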
  uint16_t temp[64 * 135];
  const int intermediate_height =
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;

  assert(w <= 64);
  assert(h <= 64);
  assert(y_step_q4 <= 32);
  assert(x_step_q4 <= 32);

  highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
                        temp, 64, filter, x0_q4, x_step_q4, w,
                        intermediate_height, bd);
  highbd_convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride,
                       filter, y0_q4, y_step_q4, w, h, bd);
}

void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride,
                                  uint16_t *dst, ptrdiff_t dst_stride,
                                  const InterpKernel *filter, int x0_q4,
                                  int x_step_q4, int y0_q4, int y_step_q4,
                                  int w, int h, int bd) {
  (void)y0_q4;
  (void)y_step_q4;

  highbd_convolve_horiz(src, src_stride, dst, dst_stride, filter, x0_q4,
                        x_step_q4, w, h, bd);
}

void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src,
                                      ptrdiff_t src_stride, uint16_t *dst,
                                      ptrdiff_t dst_stride,
                                      const InterpKernel *filter, int x0_q4,
                                      int x_step_q4, int y0_q4, int y_step_q4,
                                      int w, int h, int bd) {
  (void)y0_q4;
  (void)y_step_q4;

  highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filter, x0_q4,
                            x_step_q4, w, h, bd);
}

void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride,
                                 uint16_t *dst, ptrdiff_t dst_stride,
                                 const InterpKernel *filter, int x0_q4,
                                 int x_step_q4, int y0_q4, int y_step_q4, int w,
                                 int h, int bd) {
  (void)x0_q4;
  (void)x_step_q4;

  highbd_convolve_vert(src, src_stride, dst, dst_stride, filter, y0_q4,
                       y_step_q4, w, h, bd);
}

void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride,
                                     uint16_t *dst, ptrdiff_t dst_stride,
                                     const InterpKernel *filter, int x0_q4,
                                     int x_step_q4, int y0_q4, int y_step_q4,
                                     int w, int h, int bd) {
  (void)x0_q4;
  (void)x_step_q4;

  highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filter, y0_q4,
                           y_step_q4, w, h, bd);
}

void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride,
                            uint16_t *dst, ptrdiff_t dst_stride,
                            const InterpKernel *filter, int x0_q4,
                            int x_step_q4, int y0_q4, int y_step_q4, int w,
                            int h, int bd) {
  highbd_convolve(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
                  y0_q4, y_step_q4, w, h, bd);
}

void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride,
                                uint16_t *dst, ptrdiff_t dst_stride,
                                const InterpKernel *filter, int x0_q4,
                                int x_step_q4, int y0_q4, int y_step_q4, int w,
                                int h, int bd) {
  // Fixed size intermediate buffer places limits on parameters.
  DECLARE_ALIGNED(16, uint16_t, temp[64 * 64]);
  assert(w <= 64);
  assert(h <= 64);

  vpx_highbd_convolve8_c(src, src_stride, temp, 64, filter, x0_q4, x_step_q4,
                         y0_q4, y_step_q4, w, h, bd);
  vpx_highbd_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h,
                            bd);
}

void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride,
                                uint16_t *dst, ptrdiff_t dst_stride,
                                const InterpKernel *filter, int x0_q4,
                                int x_step_q4, int y0_q4, int y_step_q4, int w,
                                int h, int bd) {
  int r;

  (void)filter;
  (void)x0_q4;
  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;
  (void)bd;

  for (r = h; r > 0; --r) {
    memcpy(dst, src, w * sizeof(uint16_t));
    src += src_stride;
    dst += dst_stride;
  }
}

void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride,
                               uint16_t *dst, ptrdiff_t dst_stride,
                               const InterpKernel *filter, int x0_q4,
                               int x_step_q4, int y0_q4, int y_step_q4, int w,
                               int h, int bd) {
  int x, y;

  (void)filter;
  (void)x0_q4;
  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;
  (void)bd;

  for (y = 0; y < h; ++y) {
    for (x = 0; x < w; ++x) dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1);
    src += src_stride;
    dst += dst_stride;
  }
}
#endif  // CONFIG_VP9_HIGHBITDEPTH