1 /*
2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "./vpx_config.h"
12 #include "./vp9_rtcd.h"
13 #include "./vpx_dsp_rtcd.h"
14 #include "./vpx_scale_rtcd.h"
15
16 #include "vp9/common/vp9_onyxc_int.h"
17 #include "vp9/common/vp9_postproc.h"
18
19 // TODO(jackychen): Replace this function with SSE2 code. There is
20 // one SSE2 implementation in vp8, so will consider how to share it
21 // between vp8 and vp9.
filter_by_weight(const uint8_t * src,int src_stride,uint8_t * dst,int dst_stride,int block_size,int src_weight)22 static void filter_by_weight(const uint8_t *src, int src_stride, uint8_t *dst,
23 int dst_stride, int block_size, int src_weight) {
24 const int dst_weight = (1 << MFQE_PRECISION) - src_weight;
25 const int rounding_bit = 1 << (MFQE_PRECISION - 1);
26 int r, c;
27
28 for (r = 0; r < block_size; r++) {
29 for (c = 0; c < block_size; c++) {
30 dst[c] = (src[c] * src_weight + dst[c] * dst_weight + rounding_bit) >>
31 MFQE_PRECISION;
32 }
33 src += src_stride;
34 dst += dst_stride;
35 }
36 }
37
// 8x8 weighted blend written in plain C: forwards to filter_by_weight() with
// a fixed block size of 8. src_weight is the weight given to src; dst is
// updated in place.
void vp9_filter_by_weight8x8_c(const uint8_t *src, int src_stride, uint8_t *dst,
                               int dst_stride, int src_weight) {
  const int block_size = 8;
  filter_by_weight(src, src_stride, dst, dst_stride, block_size, src_weight);
}
42
// 16x16 weighted blend written in plain C: forwards to filter_by_weight()
// with a fixed block size of 16. src_weight is the weight given to src; dst
// is updated in place.
void vp9_filter_by_weight16x16_c(const uint8_t *src, int src_stride,
                                 uint8_t *dst, int dst_stride, int src_weight) {
  const int block_size = 16;
  filter_by_weight(src, src_stride, dst, dst_stride, block_size, src_weight);
}
47
// Weighted blend of a 32x32 block, done as four 16x16 blends visited in
// raster order (top-left, top-right, bottom-left, bottom-right).
static void filter_by_weight32x32(const uint8_t *src, int src_stride,
                                  uint8_t *dst, int dst_stride, int weight) {
  int row, col;

  for (row = 0; row < 32; row += 16) {
    for (col = 0; col < 32; col += 16) {
      vp9_filter_by_weight16x16(src + src_stride * row + col, src_stride,
                                dst + dst_stride * row + col, dst_stride,
                                weight);
    }
  }
}
57
// Weighted blend of a 64x64 block, done as four 32x32 blends visited in
// raster order (top-left, top-right, bottom-left, bottom-right).
static void filter_by_weight64x64(const uint8_t *src, int src_stride,
                                  uint8_t *dst, int dst_stride, int weight) {
  int row, col;

  for (row = 0; row < 64; row += 32) {
    for (col = 0; col < 64; col += 32) {
      filter_by_weight32x32(src + src_stride * row + col, src_stride,
                            dst + dst_stride * row + col, dst_stride, weight);
    }
  }
}
67
// Blends one block of all three planes into the destination using `weight`
// as the source weight. The chroma planes are blended at half the luma
// width and height (8x8 for a 16x16 luma block, and so on). Block sizes other
// than 16x16, 32x32 and 64x64 are left untouched.
//
//   y, y_stride / yd, yd_stride:  source / destination luma plane.
//   u, v, uv_stride:              source chroma planes and their row step.
//   ud, vd, uvd_stride:           destination chroma planes and row step.
static void apply_ifactor(const uint8_t *y, int y_stride, uint8_t *yd,
                          int yd_stride, const uint8_t *u, const uint8_t *v,
                          int uv_stride, uint8_t *ud, uint8_t *vd,
                          int uvd_stride, BLOCK_SIZE block_size, int weight) {
  switch (block_size) {
    case BLOCK_16X16:
      vp9_filter_by_weight16x16(y, y_stride, yd, yd_stride, weight);
      vp9_filter_by_weight8x8(u, uv_stride, ud, uvd_stride, weight);
      vp9_filter_by_weight8x8(v, uv_stride, vd, uvd_stride, weight);
      break;
    case BLOCK_32X32:
      filter_by_weight32x32(y, y_stride, yd, yd_stride, weight);
      vp9_filter_by_weight16x16(u, uv_stride, ud, uvd_stride, weight);
      vp9_filter_by_weight16x16(v, uv_stride, vd, uvd_stride, weight);
      break;
    case BLOCK_64X64:
      filter_by_weight64x64(y, y_stride, yd, yd_stride, weight);
      filter_by_weight32x32(u, uv_stride, ud, uvd_stride, weight);
      filter_by_weight32x32(v, uv_stride, vd, uvd_stride, weight);
      break;
    default:
      // Nothing to do for any other block size.
      break;
  }
}
86
// TODO(jackychen): Determine whether to replace this with assembly code.
//
// Copies an 8x8 block of bytes from src to dst, one 8-byte row at a time.
// src_stride and dst_stride are the distances between consecutive rows.
static void copy_mem8x8(const uint8_t *src, int src_stride, uint8_t *dst,
                        int dst_stride) {
  int rows_left = 8;

  while (rows_left-- > 0) {
    memcpy(dst, src, 8);
    src += src_stride;
    dst += dst_stride;
  }
}
97
// Copies a 16x16 block of bytes from src to dst, one 16-byte row at a time.
// src_stride and dst_stride are the distances between consecutive rows.
static void copy_mem16x16(const uint8_t *src, int src_stride, uint8_t *dst,
                          int dst_stride) {
  int rows_left = 16;

  while (rows_left-- > 0) {
    memcpy(dst, src, 16);
    src += src_stride;
    dst += dst_stride;
  }
}
107
// Copies a 32x32 block of bytes from src to dst as four 16x16 copies,
// visited in raster order (top-left, top-right, bottom-left, bottom-right).
static void copy_mem32x32(const uint8_t *src, int src_stride, uint8_t *dst,
                          int dst_stride) {
  int row, col;

  for (row = 0; row < 32; row += 16) {
    for (col = 0; col < 32; col += 16) {
      copy_mem16x16(src + src_stride * row + col, src_stride,
                    dst + dst_stride * row + col, dst_stride);
    }
  }
}
117
// Copies a 64x64 block of bytes from src to dst, one 64-byte row at a time.
//
//   src, src_stride: source block and the distance between source rows.
//   dst, dst_stride: destination block and the distance between its rows.
//
// Each pointer advances by its own stride. The previous version offset the
// lower half of dst by src_stride * 32, which put the bottom 32 rows in the
// wrong place whenever the two strides differed. src and dst are assumed not
// to overlap, as memcpy requires.
static void copy_mem64x64(const uint8_t *src, int src_stride, uint8_t *dst,
                          int dst_stride) {
  int r;
  for (r = 0; r < 64; r++) {
    memcpy(dst, src, 64);
    src += src_stride;
    dst += dst_stride;
  }
}
127
// Copies one block of all three planes from the source (y/u/v) to the
// destination (yd/ud/vd) without any filtering. The chroma planes are copied
// at half the luma width and height. Any bs other than BLOCK_16X16 or
// BLOCK_32X32 is handled as a 64x64 block.
static void copy_block(const uint8_t *y, const uint8_t *u, const uint8_t *v,
                       int y_stride, int uv_stride, uint8_t *yd, uint8_t *ud,
                       uint8_t *vd, int yd_stride, int uvd_stride,
                       BLOCK_SIZE bs) {
  switch (bs) {
    case BLOCK_16X16:
      copy_mem16x16(y, y_stride, yd, yd_stride);
      copy_mem8x8(u, uv_stride, ud, uvd_stride);
      copy_mem8x8(v, uv_stride, vd, uvd_stride);
      break;
    case BLOCK_32X32:
      copy_mem32x32(y, y_stride, yd, yd_stride);
      copy_mem16x16(u, uv_stride, ud, uvd_stride);
      copy_mem16x16(v, uv_stride, vd, uvd_stride);
      break;
    default:  // Treated as BLOCK_64X64.
      copy_mem64x64(y, y_stride, yd, yd_stride);
      copy_mem32x32(u, uv_stride, ud, uvd_stride);
      copy_mem32x32(v, uv_stride, vd, uvd_stride);
      break;
  }
}
146
// Computes the two thresholds used by mfqe_block() for a block of size bs.
//
//   *sad_thr   = base + (qdiff >> MFQE_PRECISION), where base is 7 for
//                16x16, 6 for 32x32 and 5 otherwise (64x64).
//   *vdiff_thr = 125 + qdiff.
static void get_thr(BLOCK_SIZE bs, int qdiff, int *sad_thr, int *vdiff_thr) {
  int sad_base;

  switch (bs) {
    case BLOCK_16X16: sad_base = 7; break;
    case BLOCK_32X32: sad_base = 6; break;
    default: sad_base = 5; break;  // BLOCK_64X64
  }
  *sad_thr = sad_base + (qdiff >> MFQE_PRECISION);
  *vdiff_thr = 125 + qdiff;
}
158
// Runs MFQE on one square block: either blends the source block (y/u/v) into
// the destination block (yd/ud/vd), or plainly copies it.
//
// The luma variance and SAD between source and destination are each divided,
// with rounding, by the number of pixels in the block (256, 1024 or 4096).
// When the block qualifies, the source weight is
//   weight * sad * vdiff / (sad_thr * vdiff_thr)
// clamped to weight = 1 << MFQE_PRECISION, with the thresholds taken from
// get_thr(bs, qdiff). Any bs other than BLOCK_16X16 or BLOCK_32X32 is handled
// as a 64x64 block.
static void mfqe_block(BLOCK_SIZE bs, const uint8_t *y, const uint8_t *u,
                       const uint8_t *v, int y_stride, int uv_stride,
                       uint8_t *yd, uint8_t *ud, uint8_t *vd, int yd_stride,
                       int uvd_stride, int qdiff) {
  int sad_thr, vdiff_thr;
  int sad, vdiff;
  uint32_t sse;

  get_thr(bs, qdiff, &sad_thr, &vdiff_thr);

  switch (bs) {
    case BLOCK_16X16:
      vdiff = (vpx_variance16x16(y, y_stride, yd, yd_stride, &sse) + 128) >> 8;
      sad = (vpx_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8;
      break;
    case BLOCK_32X32:
      vdiff = (vpx_variance32x32(y, y_stride, yd, yd_stride, &sse) + 512) >> 10;
      sad = (vpx_sad32x32(y, y_stride, yd, yd_stride) + 512) >> 10;
      break;
    default:  // BLOCK_64X64
      vdiff =
          (vpx_variance64x64(y, y_stride, yd, yd_stride, &sse) + 2048) >> 12;
      sad = (vpx_sad64x64(y, y_stride, yd, yd_stride) + 2048) >> 12;
      break;
  }

  // Blending requires sad > 1 and vdiff > sad * 3. The second condition
  // means vdiff should not be too small; otherwise it might be a lighting
  // change in a smooth area, where doing MFQE is dangerous.
  if (sad <= 1 || vdiff <= sad * 3) {
    // Copy the block from current frame (i.e., no mfqe is done).
    copy_block(y, u, v, y_stride, uv_stride, yd, ud, vd, yd_stride, uvd_stride,
               bs);
  } else {
    const int full_weight = 1 << MFQE_PRECISION;
    const int raw_factor =
        full_weight * sad * vdiff / (sad_thr * vdiff_thr);
    // When the factor equals full_weight, no MFQE is done.
    const int ifactor = raw_factor > full_weight ? full_weight : raw_factor;
    apply_ifactor(y, y_stride, yd, yd_stride, u, v, uv_stride, ud, vd,
                  uvd_stride, bs, ifactor);
  }
}
197
// Decides whether MFQE should be attempted on a block. Returns 1 when all of
// the following hold, and 0 otherwise:
//   - the block is not intra coded (mi->mode >= NEARESTMV),
//   - cur_bs is at least BLOCK_16X16,
//   - the squared length of the block's first motion vector is at most 100.
// The motion checked is that of the current block (for inter frames) or of
// the correlated block in the last frame (for keyframes).
static int mfqe_decision(MODE_INFO *mi, BLOCK_SIZE cur_bs) {
  const int mv_threshold = 100;
  const int mv_row = mi->mv[0].as_mv.row;
  const int mv_col = mi->mv[0].as_mv.col;
  const int mv_len_square = mv_row * mv_row + mv_col * mv_col;

  if (mi->mode < NEARESTMV) {
    return 0;  // Intra block.
  }
  if (cur_bs < BLOCK_16X16) {
    return 0;
  }
  return mv_len_square <= mv_threshold;
}
207
208 // Process each partiton in a super block, recursively.
mfqe_partition(VP9_COMMON * cm,MODE_INFO * mi,BLOCK_SIZE bs,const uint8_t * y,const uint8_t * u,const uint8_t * v,int y_stride,int uv_stride,uint8_t * yd,uint8_t * ud,uint8_t * vd,int yd_stride,int uvd_stride)209 static void mfqe_partition(VP9_COMMON *cm, MODE_INFO *mi, BLOCK_SIZE bs,
210 const uint8_t *y, const uint8_t *u, const uint8_t *v,
211 int y_stride, int uv_stride, uint8_t *yd,
212 uint8_t *ud, uint8_t *vd, int yd_stride,
213 int uvd_stride) {
214 int mi_offset, y_offset, uv_offset;
215 const BLOCK_SIZE cur_bs = mi->sb_type;
216 const int qdiff = cm->base_qindex - cm->postproc_state.last_base_qindex;
217 const int bsl = b_width_log2_lookup[bs];
218 PARTITION_TYPE partition = partition_lookup[bsl][cur_bs];
219 const BLOCK_SIZE subsize = get_subsize(bs, partition);
220
221 if (cur_bs < BLOCK_8X8) {
222 // If there are blocks smaller than 8x8, it must be on the boundary.
223 return;
224 }
225 // No MFQE on blocks smaller than 16x16
226 if (bs == BLOCK_16X16) {
227 partition = PARTITION_NONE;
228 }
229 if (bs == BLOCK_64X64) {
230 mi_offset = 4;
231 y_offset = 32;
232 uv_offset = 16;
233 } else {
234 mi_offset = 2;
235 y_offset = 16;
236 uv_offset = 8;
237 }
238 switch (partition) {
239 BLOCK_SIZE mfqe_bs, bs_tmp;
240 case PARTITION_HORZ:
241 if (bs == BLOCK_64X64) {
242 mfqe_bs = BLOCK_64X32;
243 bs_tmp = BLOCK_32X32;
244 } else {
245 mfqe_bs = BLOCK_32X16;
246 bs_tmp = BLOCK_16X16;
247 }
248 if (mfqe_decision(mi, mfqe_bs)) {
249 // Do mfqe on the first square partition.
250 mfqe_block(bs_tmp, y, u, v, y_stride, uv_stride, yd, ud, vd, yd_stride,
251 uvd_stride, qdiff);
252 // Do mfqe on the second square partition.
253 mfqe_block(bs_tmp, y + y_offset, u + uv_offset, v + uv_offset, y_stride,
254 uv_stride, yd + y_offset, ud + uv_offset, vd + uv_offset,
255 yd_stride, uvd_stride, qdiff);
256 }
257 if (mfqe_decision(mi + mi_offset * cm->mi_stride, mfqe_bs)) {
258 // Do mfqe on the first square partition.
259 mfqe_block(bs_tmp, y + y_offset * y_stride, u + uv_offset * uv_stride,
260 v + uv_offset * uv_stride, y_stride, uv_stride,
261 yd + y_offset * yd_stride, ud + uv_offset * uvd_stride,
262 vd + uv_offset * uvd_stride, yd_stride, uvd_stride, qdiff);
263 // Do mfqe on the second square partition.
264 mfqe_block(bs_tmp, y + y_offset * y_stride + y_offset,
265 u + uv_offset * uv_stride + uv_offset,
266 v + uv_offset * uv_stride + uv_offset, y_stride, uv_stride,
267 yd + y_offset * yd_stride + y_offset,
268 ud + uv_offset * uvd_stride + uv_offset,
269 vd + uv_offset * uvd_stride + uv_offset, yd_stride,
270 uvd_stride, qdiff);
271 }
272 break;
273 case PARTITION_VERT:
274 if (bs == BLOCK_64X64) {
275 mfqe_bs = BLOCK_32X64;
276 bs_tmp = BLOCK_32X32;
277 } else {
278 mfqe_bs = BLOCK_16X32;
279 bs_tmp = BLOCK_16X16;
280 }
281 if (mfqe_decision(mi, mfqe_bs)) {
282 // Do mfqe on the first square partition.
283 mfqe_block(bs_tmp, y, u, v, y_stride, uv_stride, yd, ud, vd, yd_stride,
284 uvd_stride, qdiff);
285 // Do mfqe on the second square partition.
286 mfqe_block(bs_tmp, y + y_offset * y_stride, u + uv_offset * uv_stride,
287 v + uv_offset * uv_stride, y_stride, uv_stride,
288 yd + y_offset * yd_stride, ud + uv_offset * uvd_stride,
289 vd + uv_offset * uvd_stride, yd_stride, uvd_stride, qdiff);
290 }
291 if (mfqe_decision(mi + mi_offset, mfqe_bs)) {
292 // Do mfqe on the first square partition.
293 mfqe_block(bs_tmp, y + y_offset, u + uv_offset, v + uv_offset, y_stride,
294 uv_stride, yd + y_offset, ud + uv_offset, vd + uv_offset,
295 yd_stride, uvd_stride, qdiff);
296 // Do mfqe on the second square partition.
297 mfqe_block(bs_tmp, y + y_offset * y_stride + y_offset,
298 u + uv_offset * uv_stride + uv_offset,
299 v + uv_offset * uv_stride + uv_offset, y_stride, uv_stride,
300 yd + y_offset * yd_stride + y_offset,
301 ud + uv_offset * uvd_stride + uv_offset,
302 vd + uv_offset * uvd_stride + uv_offset, yd_stride,
303 uvd_stride, qdiff);
304 }
305 break;
306 case PARTITION_NONE:
307 if (mfqe_decision(mi, cur_bs)) {
308 // Do mfqe on this partition.
309 mfqe_block(cur_bs, y, u, v, y_stride, uv_stride, yd, ud, vd, yd_stride,
310 uvd_stride, qdiff);
311 } else {
312 // Copy the block from current frame(i.e., no mfqe is done).
313 copy_block(y, u, v, y_stride, uv_stride, yd, ud, vd, yd_stride,
314 uvd_stride, bs);
315 }
316 break;
317 case PARTITION_SPLIT:
318 // Recursion on four square partitions, e.g. if bs is 64X64,
319 // then look into four 32X32 blocks in it.
320 mfqe_partition(cm, mi, subsize, y, u, v, y_stride, uv_stride, yd, ud, vd,
321 yd_stride, uvd_stride);
322 mfqe_partition(cm, mi + mi_offset, subsize, y + y_offset, u + uv_offset,
323 v + uv_offset, y_stride, uv_stride, yd + y_offset,
324 ud + uv_offset, vd + uv_offset, yd_stride, uvd_stride);
325 mfqe_partition(cm, mi + mi_offset * cm->mi_stride, subsize,
326 y + y_offset * y_stride, u + uv_offset * uv_stride,
327 v + uv_offset * uv_stride, y_stride, uv_stride,
328 yd + y_offset * yd_stride, ud + uv_offset * uvd_stride,
329 vd + uv_offset * uvd_stride, yd_stride, uvd_stride);
330 mfqe_partition(cm, mi + mi_offset * cm->mi_stride + mi_offset, subsize,
331 y + y_offset * y_stride + y_offset,
332 u + uv_offset * uv_stride + uv_offset,
333 v + uv_offset * uv_stride + uv_offset, y_stride, uv_stride,
334 yd + y_offset * yd_stride + y_offset,
335 ud + uv_offset * uvd_stride + uv_offset,
336 vd + uv_offset * uvd_stride + uv_offset, yd_stride,
337 uvd_stride);
338 break;
339 default: assert(0);
340 }
341 }
342
// Runs MFQE over the whole frame, one super block at a time.
//
// The source is cm->frame_to_show (the current decoded frame). The
// destination is cm->post_proc_buffer, which holds the last decoded frame and
// receives the MFQE result. For intra-only frames the mode info of the
// previous frame (cm->postproc_state.prev_mi) drives the decisions; otherwise
// the current frame's mode info (cm->mi) is used.
void vp9_mfqe(VP9_COMMON *cm) {
  // Current decoded frame.
  const YV12_BUFFER_CONFIG *show = cm->frame_to_show;
  // Last decoded frame and will store the MFQE result.
  YV12_BUFFER_CONFIG *dest = &cm->post_proc_buffer;
  const uint32_t y_stride = show->y_stride;
  const uint32_t uv_stride = show->uv_stride;
  const uint32_t yd_stride = dest->y_stride;
  const uint32_t uvd_stride = dest->uv_stride;
  int mi_row, mi_col;

  // Loop through each super block.
  for (mi_row = 0; mi_row < cm->mi_rows; mi_row += MI_BLOCK_SIZE) {
    // One mode-info unit maps to 8 luma samples and 4 chroma samples.
    const uint32_t row_offset_y = mi_row << 3;
    const uint32_t row_offset_uv = mi_row << 2;

    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {
      const int mi_index = mi_row * cm->mi_stride + mi_col;
      const uint32_t col_offset_y = mi_col << 3;
      const uint32_t col_offset_uv = mi_col << 2;
      const uint8_t *y =
          show->y_buffer + row_offset_y * y_stride + col_offset_y;
      const uint8_t *u =
          show->u_buffer + row_offset_uv * uv_stride + col_offset_uv;
      const uint8_t *v =
          show->v_buffer + row_offset_uv * uv_stride + col_offset_uv;
      uint8_t *yd = dest->y_buffer + row_offset_y * yd_stride + col_offset_y;
      uint8_t *ud = dest->u_buffer + row_offset_uv * uvd_stride + col_offset_uv;
      uint8_t *vd = dest->v_buffer + row_offset_uv * uvd_stride + col_offset_uv;
      // Intra-only frames use the motion info of the last frame.
      MODE_INFO *mi = frame_is_intra_only(cm)
                          ? cm->postproc_state.prev_mi + mi_index
                          : cm->mi + mi_index;

      mfqe_partition(cm, mi, BLOCK_64X64, y, u, v, y_stride, uv_stride, yd, ud,
                     vd, yd_stride, uvd_stride);
    }
  }
}
384