• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
3  * Copyright (C) 2005 Nikolaj Poroshin <porosh3@psu.ru>
4  * Copyright (c) 2014 Arwa Arif <arwaarif1994@gmail.com>
5  *
6  * This file is part of FFmpeg.
7  *
8  * FFmpeg is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License as published by
10  * the Free Software Foundation; either version 2 of the License, or
11  * (at your option) any later version.
12  *
13  * FFmpeg is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License along
19  * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
20  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
21  */
22 
23 /**
24  * @file
25  * Fast Simple Post-processing filter
26  * This implementation is based on an algorithm described in
27  * "Aria Nosratinia Embedded Post-Processing for
28  * Enhancement of Compressed Images (1999)"
29  * (http://www.utdallas.edu/~aria/papers/vlsisp99.pdf)
30  * Further, with splitting (I)DCT into horizontal/vertical passes, one of
31  * them can be performed once per block, not per pixel. This allows for much
32  * higher speed.
33  *
34  * Originally written by Michael Niedermayer and Nikolaj for the MPlayer
35  * project, and ported by Arwa Arif for FFmpeg.
36  */
37 
38 #include "libavutil/imgutils.h"
39 #include "libavutil/mem_internal.h"
40 #include "libavutil/opt.h"
41 #include "libavutil/pixdesc.h"
42 #include "internal.h"
43 #include "qp_table.h"
44 #include "vf_fspp.h"
45 
46 #define OFFSET(x) offsetof(FSPPContext, x)
47 #define FLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM
48 static const AVOption fspp_options[] = {
49     { "quality",       "set quality",                          OFFSET(log2_count),    AV_OPT_TYPE_INT, {.i64 = 4},   4, MAX_LEVEL, FLAGS },
50     { "qp",            "force a constant quantizer parameter", OFFSET(qp),            AV_OPT_TYPE_INT, {.i64 = 0},   0, 64,        FLAGS },
51     { "strength",      "set filter strength",                  OFFSET(strength),      AV_OPT_TYPE_INT, {.i64 = 0}, -15, 32,        FLAGS },
52     { "use_bframe_qp", "use B-frames' QP",                     OFFSET(use_bframe_qp), AV_OPT_TYPE_BOOL,{.i64 = 0},   0, 1,         FLAGS },
53     { NULL }
54 };
55 
56 AVFILTER_DEFINE_CLASS(fspp);
57 
58 DECLARE_ALIGNED(32, static const uint8_t, dither)[8][8] = {
59     {  0,  48,  12,  60,   3,  51,  15,  63, },
60     { 32,  16,  44,  28,  35,  19,  47,  31, },
61     {  8,  56,   4,  52,  11,  59,   7,  55, },
62     { 40,  24,  36,  20,  43,  27,  39,  23, },
63     {  2,  50,  14,  62,   1,  49,  13,  61, },
64     { 34,  18,  46,  30,  33,  17,  45,  29, },
65     { 10,  58,   6,  54,   9,  57,   5,  53, },
66     { 42,  26,  38,  22,  41,  25,  37,  21, },
67 };
68 
/*
 * Base threshold table (64 entries, written as 8 rows of 8).
 *
 * The entries (the largest is 296) must not be too high: large values make
 * the result depend too strongly on the quantizer, or possibly overflow
 * (still to be checked), which shows up as flashing in the output.
 */
static const short custom_threshold[64] = {
     71, 296, 295, 237,  71,  40,  38,  19,
    245, 193, 185, 121, 102,  73,  53,  27,
    158, 129, 141, 107,  97,  73,  50,  26,
    102, 116, 109,  98,  82,  66,  45,  23,
     71,  94,  95,  81,  70,  56,  38,  20,
     56,  77,  74,  66,  56,  44,  30,  15,
     38,  53,  50,  45,  38,  30,  21,  11,
     20,  27,  26,  23,  20,  15,  11,   5
};
82 
83 //This func reads from 1 slice, 1 and clears 0 & 1
store_slice_c(uint8_t * dst,int16_t * src,ptrdiff_t dst_stride,ptrdiff_t src_stride,ptrdiff_t width,ptrdiff_t height,ptrdiff_t log2_scale)84 static void store_slice_c(uint8_t *dst, int16_t *src,
85                           ptrdiff_t dst_stride, ptrdiff_t src_stride,
86                           ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
87 {
88     int y, x;
89 #define STORE(pos)                                                             \
90     temp = (src[x + pos] + (d[pos] >> log2_scale)) >> (6 - log2_scale);        \
91     src[x + pos] = src[x + pos - 8 * src_stride] = 0;                          \
92     if (temp & 0x100) temp = ~(temp >> 31);                                    \
93     dst[x + pos] = temp;
94 
95     for (y = 0; y < height; y++) {
96         const uint8_t *d = dither[y];
97         for (x = 0; x < width; x += 8) {
98             int temp;
99             STORE(0);
100             STORE(1);
101             STORE(2);
102             STORE(3);
103             STORE(4);
104             STORE(5);
105             STORE(6);
106             STORE(7);
107         }
108         src += src_stride;
109         dst += dst_stride;
110     }
111 }
112 
113 //This func reads from 2 slices, 0 & 2  and clears 2-nd
store_slice2_c(uint8_t * dst,int16_t * src,ptrdiff_t dst_stride,ptrdiff_t src_stride,ptrdiff_t width,ptrdiff_t height,ptrdiff_t log2_scale)114 static void store_slice2_c(uint8_t *dst, int16_t *src,
115                            ptrdiff_t dst_stride, ptrdiff_t src_stride,
116                            ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
117 {
118     int y, x;
119 #define STORE2(pos)                                                                                       \
120     temp = (src[x + pos] + src[x + pos + 16 * src_stride] + (d[pos] >> log2_scale)) >> (6 - log2_scale);  \
121     src[x + pos + 16 * src_stride] = 0;                                                                   \
122     if (temp & 0x100) temp = ~(temp >> 31);                                                               \
123     dst[x + pos] = temp;
124 
125     for (y = 0; y < height; y++) {
126         const uint8_t *d = dither[y];
127         for (x = 0; x < width; x += 8) {
128             int temp;
129             STORE2(0);
130             STORE2(1);
131             STORE2(2);
132             STORE2(3);
133             STORE2(4);
134             STORE2(5);
135             STORE2(6);
136             STORE2(7);
137         }
138         src += src_stride;
139         dst += dst_stride;
140     }
141 }
142 
/**
 * Scale a 64-entry threshold matrix by a quantizer.
 *
 * @param thr_adr_noq unscaled thresholds (64 entries, read only)
 * @param thr_adr     receives q * thr_adr_noq[i] for each of the 64 entries
 * @param q           quantizer to multiply with
 */
static void mul_thrmat_c(int16_t *thr_adr_noq, int16_t *thr_adr, int q)
{
    const int16_t *in  = thr_adr_noq;
    const int16_t *end = thr_adr_noq + 64;
    int16_t *out = thr_adr;

    while (in < end)
        *out++ = q * *in++;
}
149 
/**
 * Run the post-processing filter over one 8-bit plane.
 *
 * The plane is first copied into p->src with an 8-pixel mirrored border on
 * every side. It is then processed line by line (every "step" lines): a row
 * forward DCT, the combined column forward/inverse DCT with thresholding,
 * and a row inverse DCT that accumulates into p->temp. Every 8 finished
 * lines are written to dst through store_slice()/store_slice2().
 *
 * Does nothing when src or dst is NULL.
 *
 * @param p          filter context (work buffers, DSP function pointers, options)
 * @param dst        destination plane
 * @param src        source plane
 * @param dst_stride destination line size
 * @param src_stride source line size
 * @param width      plane width in pixels
 * @param height     plane height in pixels
 * @param qp_store   per-block quantizer table, read only when p->qp is 0
 * @param qp_stride  line size of qp_store
 * @param is_luma    non-zero for the first plane; selects the work stride and
 *                   disables the chroma subsampling correction of the QP lookup
 */
static void filter(FSPPContext *p, uint8_t *dst, uint8_t *src,
                   int dst_stride, int src_stride,
                   int width, int height,
                   uint8_t *qp_store, int qp_stride, int is_luma)
{
    int bx, col0, row, tail, qp_row, q;

    const int wstride    = is_luma ? p->temp_stride : (width + 16);
    const int ystep      = 6 - p->log2_count;
    const int qp_shift_x = 4 - p->hsub * !is_luma;
    const int qp_shift_y = 4 - p->vsub * !is_luma;
    const int out_scale  = 5 - p->log2_count;

    int16_t *const thr     = (int16_t *)(&p->threshold_mtx[0]);
    int16_t *const thr_noq = (int16_t *)(&p->threshold_mtx_noq[0]);

    DECLARE_ALIGNED(32, int32_t, block_align)[4 * 8 * BLOCKSZ + 4 * 8 * BLOCKSZ];
    int16_t *fwd = (int16_t *)block_align;                     // row-FDCT output
    int16_t *acc = (int16_t *)(block_align + 4 * 8 * BLOCKSZ); // column pass accumulator

    if (!src || !dst)
        return;

    memset(acc, 0, 4 * 8 * BLOCKSZ);

    // copy the plane into the work buffer and mirror 8 pixels to the left and right
    for (row = 0; row < height; row++) {
        uint8_t *line = p->src + (8 + 8 * wstride + row * wstride);

        memcpy(line, src + row * src_stride, width);
        for (bx = 0; bx < 8; bx++) {
            line[-bx - 1]    = line[bx];
            line[width + bx] = line[width - bx - 1];
        }
    }

    // mirror 8 lines above and below
    for (row = 0; row < 8; row++) {
        memcpy(p->src + (7 - row) * wstride,
               p->src + (row + 8) * wstride, wstride);
        memcpy(p->src + (height + 8 + row) * wstride,
               p->src + (height - row + 7) * wstride, wstride);
    }
    //FIXME (try edge emu)

    // clear lines 8..23 of the accumulation buffer
    for (row = 8; row < 24; row++)
        memset(p->temp + 8 + row * wstride, 0, width * sizeof(int16_t));

    for (row = ystep; row < height + 8; row += ystep) {    // ystep = 1 or 2
        const int out_row = row - 8 + ystep;
        const int odd     = row & 1;
        uint8_t *src_line = p->src  + row * wstride + 2 - odd;
        int16_t *tmp_line = p->temp + (row & 15) * wstride;

        // line of the QP table that belongs to this row, clamped to the plane
        qp_row = row - 4;
        if (qp_row > height - 1)
            qp_row = height - 1;
        if (qp_row < 0)
            qp_row = 0;
        qp_row = (qp_row >> qp_shift_y) * qp_stride;

        p->row_fdct(fwd, src_line, wstride, 2);

        for (col0 = 0; col0 < width + 8 - 8 * (BLOCKSZ - 1); col0 += 8 * (BLOCKSZ - 1)) {
            p->row_fdct(fwd + 8 * 8, src_line + 8 + col0, wstride, 2 * (BLOCKSZ - 1));

            if (p->qp) {
                // constant quantizer: the threshold matrix is already scaled
                p->column_fidct(thr, fwd + 0 * 8, acc + 0 * 8, 8 * (BLOCKSZ - 1)); // hotspot
            } else {
                for (bx = 0; bx < 8 * (BLOCKSZ - 1); bx += 8) {
                    // exact position would be bx + col0 - 2 - (row & 1), but the result is the same
                    q = bx + col0 - 2;
                    if (q < 0)
                        q = 0;                       // q is always below width - 2

                    q = qp_store[qp_row + (q >> qp_shift_x)];
                    q = ff_norm_qscale(q, p->qscale_type);

                    // rescale the threshold matrix only when the quantizer changes
                    if (q != p->prev_q) {
                        p->prev_q = q;
                        p->mul_thrmat(thr_noq, thr, q);
                    }
                    p->column_fidct(thr, fwd + bx * 8, acc + bx * 8, 8); // hotspot
                }
            }

            p->row_idct(acc + 0 * 8, tmp_line + col0 + 2 - odd, wstride, 2 * (BLOCKSZ - 1));

            // cycle: move the trailing part of both buffers to the front
            memmove(fwd, fwd + (BLOCKSZ - 1) * 64, 8 * 8 * sizeof(int16_t));
            memmove(acc, acc + (BLOCKSZ - 1) * 64, 6 * 8 * sizeof(int16_t));
        }

        // remaining columns of this line: 8, ...
        tail = width + 8 - col0;
        if (tail > 8)
            p->row_fdct(fwd + 8 * 8, src_line + 8 + col0, wstride, (tail - 4) >> 2);

        p->column_fidct(thr, fwd, acc, tail & (~1));
        if (tail > 3)
            p->row_idct(acc + 0 * 8, tmp_line + col0 + 2 - odd, wstride, tail >> 2);

        // a group of 8 output lines is complete
        if (!(out_row & 7) && out_row) {
            uint8_t *out = dst + (out_row - 8) * dst_stride;

            if (out_row & 8)
                p->store_slice(out, p->temp + 8 + 8 * wstride,
                               dst_stride, wstride, width, 8, out_scale);
            else
                p->store_slice2(out, p->temp + 8 + 0 * wstride,
                                dst_stride, wstride, width, 8, out_scale);
        }
    }

    // height % 8 != 0: store the lines left over after the last full group
    if (row & 7) {
        uint8_t *out = dst + ((row - 8) & ~7) * dst_stride;

        if (row & 8)
            p->store_slice(out, p->temp + 8 + 8 * wstride,
                           dst_stride, wstride, width, row & 7, out_scale);
        else
            p->store_slice2(out, p->temp + 8 + 0 * wstride,
                            dst_stride, wstride, width, row & 7, out_scale);
    }
}
247 
column_fidct_c(int16_t * thr_adr,int16_t * data,int16_t * output,int cnt)248 static void column_fidct_c(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt)
249 {
250     int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
251     int_simd16_t tmp10, tmp11, tmp12, tmp13;
252     int_simd16_t z1,z2,z3,z4,z5, z10, z11, z12, z13;
253     int_simd16_t d0, d1, d2, d3, d4, d5, d6, d7;
254 
255     int16_t *dataptr;
256     int16_t *wsptr;
257     int16_t *threshold;
258     int ctr;
259 
260     dataptr = data;
261     wsptr = output;
262 
263     for (; cnt > 0; cnt -= 2) { //start positions
264         threshold = (int16_t *)thr_adr;//threshold_mtx
265         for (ctr = DCTSIZE; ctr > 0; ctr--) {
266             // Process columns from input, add to output.
267             tmp0 = dataptr[DCTSIZE * 0] + dataptr[DCTSIZE * 7];
268             tmp7 = dataptr[DCTSIZE * 0] - dataptr[DCTSIZE * 7];
269 
270             tmp1 = dataptr[DCTSIZE * 1] + dataptr[DCTSIZE * 6];
271             tmp6 = dataptr[DCTSIZE * 1] - dataptr[DCTSIZE * 6];
272 
273             tmp2 = dataptr[DCTSIZE * 2] + dataptr[DCTSIZE * 5];
274             tmp5 = dataptr[DCTSIZE * 2] - dataptr[DCTSIZE * 5];
275 
276             tmp3 = dataptr[DCTSIZE * 3] + dataptr[DCTSIZE * 4];
277             tmp4 = dataptr[DCTSIZE * 3] - dataptr[DCTSIZE * 4];
278 
279             // Even part of FDCT
280 
281             tmp10 = tmp0 + tmp3;
282             tmp13 = tmp0 - tmp3;
283             tmp11 = tmp1 + tmp2;
284             tmp12 = tmp1 - tmp2;
285 
286             d0 = tmp10 + tmp11;
287             d4 = tmp10 - tmp11;
288 
289             z1 = MULTIPLY16H((tmp12 + tmp13) << 2, FIX_0_707106781);
290             d2 = tmp13 + z1;
291             d6 = tmp13 - z1;
292 
293             // Even part of IDCT
294 
295             THRESHOLD(tmp0, d0, threshold[0 * 8]);
296             THRESHOLD(tmp1, d2, threshold[2 * 8]);
297             THRESHOLD(tmp2, d4, threshold[4 * 8]);
298             THRESHOLD(tmp3, d6, threshold[6 * 8]);
299             tmp0 += 2;
300             tmp10 = (tmp0 + tmp2) >> 2;
301             tmp11 = (tmp0 - tmp2) >> 2;
302 
303             tmp13 = (tmp1 + tmp3) >>2; //+2 !  (psnr decides)
304             tmp12 = MULTIPLY16H((tmp1 - tmp3), FIX_1_414213562_A) - tmp13; //<<2
305 
306             tmp0 = tmp10 + tmp13; //->temps
307             tmp3 = tmp10 - tmp13; //->temps
308             tmp1 = tmp11 + tmp12; //->temps
309             tmp2 = tmp11 - tmp12; //->temps
310 
311             // Odd part of FDCT
312 
313             tmp10 = tmp4 + tmp5;
314             tmp11 = tmp5 + tmp6;
315             tmp12 = tmp6 + tmp7;
316 
317             z5 = MULTIPLY16H((tmp10 - tmp12) << 2, FIX_0_382683433);
318             z2 = MULTIPLY16H(tmp10 << 2, FIX_0_541196100) + z5;
319             z4 = MULTIPLY16H(tmp12 << 2, FIX_1_306562965) + z5;
320             z3 = MULTIPLY16H(tmp11 << 2, FIX_0_707106781);
321 
322             z11 = tmp7 + z3;
323             z13 = tmp7 - z3;
324 
325             d5 = z13 + z2;
326             d3 = z13 - z2;
327             d1 = z11 + z4;
328             d7 = z11 - z4;
329 
330             // Odd part of IDCT
331 
332             THRESHOLD(tmp4, d1, threshold[1 * 8]);
333             THRESHOLD(tmp5, d3, threshold[3 * 8]);
334             THRESHOLD(tmp6, d5, threshold[5 * 8]);
335             THRESHOLD(tmp7, d7, threshold[7 * 8]);
336 
337             //Simd version uses here a shortcut for the tmp5,tmp6,tmp7 == 0
338             z13 = tmp6 + tmp5;
339             z10 = (tmp6 - tmp5) << 1;
340             z11 = tmp4 + tmp7;
341             z12 = (tmp4 - tmp7) << 1;
342 
343             tmp7  = (z11 + z13) >> 2; //+2 !
344             tmp11 = MULTIPLY16H((z11 - z13) << 1, FIX_1_414213562);
345             z5    = MULTIPLY16H(z10 + z12,        FIX_1_847759065);
346             tmp10 = MULTIPLY16H(z12,              FIX_1_082392200) - z5;
347             tmp12 = MULTIPLY16H(z10,              FIX_2_613125930) + z5; // - !!
348 
349             tmp6 = tmp12 - tmp7;
350             tmp5 = tmp11 - tmp6;
351             tmp4 = tmp10 + tmp5;
352 
353             wsptr[DCTSIZE * 0] +=  (tmp0 + tmp7);
354             wsptr[DCTSIZE * 1] +=  (tmp1 + tmp6);
355             wsptr[DCTSIZE * 2] +=  (tmp2 + tmp5);
356             wsptr[DCTSIZE * 3] +=  (tmp3 - tmp4);
357             wsptr[DCTSIZE * 4] +=  (tmp3 + tmp4);
358             wsptr[DCTSIZE * 5] +=  (tmp2 - tmp5);
359             wsptr[DCTSIZE * 6]  =  (tmp1 - tmp6);
360             wsptr[DCTSIZE * 7]  =  (tmp0 - tmp7);
361             //
362             dataptr++; //next column
363             wsptr++;
364             threshold++;
365         }
366         dataptr += 8; //skip each second start pos
367         wsptr   += 8;
368     }
369 }
370 
row_idct_c(int16_t * workspace,int16_t * output_adr,ptrdiff_t output_stride,int cnt)371 static void row_idct_c(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt)
372 {
373     int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
374     int_simd16_t tmp10, tmp11, tmp12, tmp13;
375     int_simd16_t z5, z10, z11, z12, z13;
376     int16_t *outptr;
377     int16_t *wsptr;
378 
379     cnt *= 4;
380     wsptr = workspace;
381     outptr = output_adr;
382     for (; cnt > 0; cnt--) {
383         // Even part
384         //Simd version reads 4x4 block and transposes it
385         tmp10 = wsptr[2] +  wsptr[3];
386         tmp11 = wsptr[2] -  wsptr[3];
387 
388         tmp13 = wsptr[0] +  wsptr[1];
389         tmp12 = (MULTIPLY16H(wsptr[0] - wsptr[1], FIX_1_414213562_A) << 2) - tmp13;//this shift order to avoid overflow
390 
391         tmp0 = tmp10 + tmp13; //->temps
392         tmp3 = tmp10 - tmp13; //->temps
393         tmp1 = tmp11 + tmp12;
394         tmp2 = tmp11 - tmp12;
395 
396         // Odd part
397         //Also transpose, with previous:
398         // ---- ----      ||||
399         // ---- ---- idct ||||
400         // ---- ---- ---> ||||
401         // ---- ----      ||||
402         z13 = wsptr[4] + wsptr[5];
403         z10 = wsptr[4] - wsptr[5];
404         z11 = wsptr[6] + wsptr[7];
405         z12 = wsptr[6] - wsptr[7];
406 
407         tmp7 = z11 + z13;
408         tmp11 = MULTIPLY16H(z11 - z13, FIX_1_414213562);
409 
410         z5 =    MULTIPLY16H(z10 + z12, FIX_1_847759065);
411         tmp10 = MULTIPLY16H(z12,       FIX_1_082392200) - z5;
412         tmp12 = MULTIPLY16H(z10,       FIX_2_613125930) + z5; // - FIX_
413 
414         tmp6 = (tmp12 << 3) - tmp7;
415         tmp5 = (tmp11 << 3) - tmp6;
416         tmp4 = (tmp10 << 3) + tmp5;
417 
418         // Final output stage: descale and write column
419         outptr[0 * output_stride] += DESCALE(tmp0 + tmp7, 3);
420         outptr[1 * output_stride] += DESCALE(tmp1 + tmp6, 3);
421         outptr[2 * output_stride] += DESCALE(tmp2 + tmp5, 3);
422         outptr[3 * output_stride] += DESCALE(tmp3 - tmp4, 3);
423         outptr[4 * output_stride] += DESCALE(tmp3 + tmp4, 3);
424         outptr[5 * output_stride] += DESCALE(tmp2 - tmp5, 3);
425         outptr[6 * output_stride] += DESCALE(tmp1 - tmp6, 3); //no += ?
426         outptr[7 * output_stride] += DESCALE(tmp0 - tmp7, 3); //no += ?
427         outptr++;
428 
429         wsptr += DCTSIZE;       // advance pointer to next row
430     }
431 }
432 
row_fdct_c(int16_t * data,const uint8_t * pixels,ptrdiff_t line_size,int cnt)433 static void row_fdct_c(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt)
434 {
435     int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
436     int_simd16_t tmp10, tmp11, tmp12, tmp13;
437     int_simd16_t z1, z2, z3, z4, z5, z11, z13;
438     int16_t *dataptr;
439 
440     cnt *= 4;
441     // Pass 1: process rows.
442 
443     dataptr = data;
444     for (; cnt > 0; cnt--) {
445         tmp0 = pixels[line_size * 0] + pixels[line_size * 7];
446         tmp7 = pixels[line_size * 0] - pixels[line_size * 7];
447         tmp1 = pixels[line_size * 1] + pixels[line_size * 6];
448         tmp6 = pixels[line_size * 1] - pixels[line_size * 6];
449         tmp2 = pixels[line_size * 2] + pixels[line_size * 5];
450         tmp5 = pixels[line_size * 2] - pixels[line_size * 5];
451         tmp3 = pixels[line_size * 3] + pixels[line_size * 4];
452         tmp4 = pixels[line_size * 3] - pixels[line_size * 4];
453 
454         // Even part
455 
456         tmp10 = tmp0 + tmp3;
457         tmp13 = tmp0 - tmp3;
458         tmp11 = tmp1 + tmp2;
459         tmp12 = tmp1 - tmp2;
460         //Even columns are written first, this leads to different order of columns
461         //in column_fidct(), but they are processed independently, so all ok.
462         //Later in the row_idct() columns readed at the same order.
463         dataptr[2] = tmp10 + tmp11;
464         dataptr[3] = tmp10 - tmp11;
465 
466         z1 = MULTIPLY16H((tmp12 + tmp13) << 2, FIX_0_707106781);
467         dataptr[0] = tmp13 + z1;
468         dataptr[1] = tmp13 - z1;
469 
470         // Odd part
471 
472         tmp10 = (tmp4 + tmp5) << 2;
473         tmp11 = (tmp5 + tmp6) << 2;
474         tmp12 = (tmp6 + tmp7) << 2;
475 
476         z5 = MULTIPLY16H(tmp10 - tmp12, FIX_0_382683433);
477         z2 = MULTIPLY16H(tmp10,         FIX_0_541196100) + z5;
478         z4 = MULTIPLY16H(tmp12,         FIX_1_306562965) + z5;
479         z3 = MULTIPLY16H(tmp11,         FIX_0_707106781);
480 
481         z11 = tmp7 + z3;
482         z13 = tmp7 - z3;
483 
484         dataptr[4] = z13 + z2;
485         dataptr[5] = z13 - z2;
486         dataptr[6] = z11 + z4;
487         dataptr[7] = z11 - z4;
488 
489         pixels++;               // advance pointer to next column
490         dataptr += DCTSIZE;
491     }
492 }
493 
494 static const enum AVPixelFormat pix_fmts[] = {
495     AV_PIX_FMT_YUV444P,  AV_PIX_FMT_YUV422P,
496     AV_PIX_FMT_YUV420P,  AV_PIX_FMT_YUV411P,
497     AV_PIX_FMT_YUV410P,  AV_PIX_FMT_YUV440P,
498     AV_PIX_FMT_YUVJ444P, AV_PIX_FMT_YUVJ422P,
499     AV_PIX_FMT_YUVJ420P, AV_PIX_FMT_YUVJ440P,
500     AV_PIX_FMT_GBRP, AV_PIX_FMT_GRAY8,
501     AV_PIX_FMT_NONE
502 };
503 
/**
 * Configure the filter for the negotiated input link.
 *
 * Stores the chroma subsampling shifts of the input format, allocates the
 * two work buffers (sized for the frame plus 16 extra pixels in each
 * direction, rounded up to a multiple of 16) and installs the C
 * implementations of the DSP functions, which ff_fspp_init_x86() may
 * replace on x86 builds.
 *
 * @return 0 on success, AVERROR(ENOMEM) if a buffer could not be allocated
 */
static int config_input(AVFilterLink *inlink)
{
    AVFilterContext *ctx = inlink->dst;
    FSPPContext *s = ctx->priv;
    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(inlink->format);
    const int padded_h = FFALIGN(inlink->h + 16, 16);

    s->hsub = desc->log2_chroma_w;
    s->vsub = desc->log2_chroma_h;

    s->temp_stride = FFALIGN(inlink->w + 16, 16);

    // both allocations are attempted before either result is checked
    s->temp = av_malloc_array(s->temp_stride, padded_h * sizeof(*s->temp));
    s->src  = av_malloc_array(s->temp_stride, padded_h * sizeof(*s->src));
    if (!(s->temp && s->src))
        return AVERROR(ENOMEM);

    s->store_slice  = store_slice_c;
    s->store_slice2 = store_slice2_c;
    s->mul_thrmat   = mul_thrmat_c;
    s->column_fidct = column_fidct_c;
    s->row_idct     = row_idct_c;
    s->row_fdct     = row_fdct_c;

#if ARCH_X86
    ff_fspp_init_x86(s);
#endif

    return 0;
}
534 
/**
 * Filter one input frame and pass the result to the output link.
 *
 * Steps:
 *  - rebuild the unscaled threshold matrix from custom_threshold[] and the
 *    "strength" option, and scale it right away when a constant QP is forced;
 *  - otherwise obtain the frame's QP table, optionally remembering the table
 *    of the last non-B frame for later reuse;
 *  - filter the first three planes, in place when the frame is writable and
 *    its dimensions are multiples of 8, otherwise into a newly allocated frame;
 *  - if a new frame was allocated, copy plane 3 when the input has one and
 *    free the input frame.
 *
 * @return the result of ff_filter_frame(), or a negative error code; the
 *         input frame is freed on the error paths
 */
static int filter_frame(AVFilterLink *inlink, AVFrame *in)
{
    AVFilterContext *ctx = inlink->dst;
    FSPPContext *s = ctx->priv;
    AVFilterLink *outlink = ctx->outputs[0];
    AVFrame *out = in;

    int8_t *qp_table = NULL;
    int qp_stride = 0;
    int ret = 0;
    int i, plane;
    int custom_threshold_m[64];

    const int bias = (1 << 4) + s->strength;
    const double scale = bias / 71.0;
    const int non_b = in->pict_type != AV_PICTURE_TYPE_B;

    for (i = 0; i < 64; i++) //FIXME: tune custom_threshold[] and remove this !
        custom_threshold_m[i] = (int)(custom_threshold[i] * scale + 0.5);

    // pack each row of 8 thresholds into two 64-bit words of four 16-bit lanes,
    // using the column order 2, 6, 0, 4 and 5, 3, 1, 7
    for (i = 0; i < 8; i++) {
        const int *row = &custom_threshold_m[i * 8];

        s->threshold_mtx_noq[2 * i]     =  (uint64_t)row[2]
                                        | ((uint64_t)row[6] << 16)
                                        | ((uint64_t)row[0] << 32)
                                        | ((uint64_t)row[4] << 48);

        s->threshold_mtx_noq[2 * i + 1] =  (uint64_t)row[5]
                                        | ((uint64_t)row[3] << 16)
                                        | ((uint64_t)row[1] << 32)
                                        | ((uint64_t)row[7] << 48);
    }

    if (s->qp) {
        s->prev_q = s->qp;
        s->mul_thrmat((int16_t *)(&s->threshold_mtx_noq[0]),
                      (int16_t *)(&s->threshold_mtx[0]), s->qp);
    }

    /* When no constant quantizer is forced and the quantizers of B-frames
     * are not to be used (B-frames often have a higher QP), the QP table of
     * the last non-B frame has to be remembered; the block below does that. */
    if (!s->qp && (s->use_bframe_qp || non_b)) {
        ret = ff_qp_table_extract(in, &qp_table, &qp_stride, NULL, &s->qscale_type);
        if (ret < 0) {
            av_frame_free(&in);
            return ret;
        }

        if (!s->use_bframe_qp && non_b) {
            av_freep(&s->non_b_qp_table);
            s->non_b_qp_table  = qp_table;
            s->non_b_qp_stride = qp_stride;
        }
    }

    if (s->log2_count && !ctx->is_disabled) {
        if (!s->use_bframe_qp && s->non_b_qp_table) {
            qp_table  = s->non_b_qp_table;
            qp_stride = s->non_b_qp_stride;
        }

        if (qp_table || s->qp) {
            const int cw = AV_CEIL_RSHIFT(inlink->w, s->hsub);
            const int ch = AV_CEIL_RSHIFT(inlink->h, s->vsub);

            /* A new frame is needed when in-place filtering is impossible or
             * when the dimensions are not multiples of 8. */
            if (!av_frame_is_writable(in) || (inlink->w & 7) || (inlink->h & 7)) {
                const int aligned_w = FFALIGN(inlink->w, 8);
                const int aligned_h = FFALIGN(inlink->h, 8);

                out = ff_get_video_buffer(outlink, aligned_w, aligned_h);
                if (!out) {
                    av_frame_free(&in);
                    ret = AVERROR(ENOMEM);
                    goto finish;
                }
                av_frame_copy_props(out, in);
                out->width  = in->width;
                out->height = in->height;
            }

            // plane 0 uses the full frame size, planes 1 and 2 the chroma size
            for (plane = 0; plane < 3; plane++) {
                const int is_luma = !plane;

                filter(s, out->data[plane], in->data[plane],
                       out->linesize[plane], in->linesize[plane],
                       is_luma ? inlink->w : cw,
                       is_luma ? inlink->h : ch,
                       qp_table, qp_stride, is_luma);
            }
            emms_c();
        }
    }

    if (in != out) {
        if (in->data[3])
            av_image_copy_plane(out->data[3], out->linesize[3],
                                in ->data[3], in ->linesize[3],
                                inlink->w, inlink->h);
        av_frame_free(&in);
    }
    ret = ff_filter_frame(outlink, out);

finish:
    // a table stored in the context stays owned by the context
    if (qp_table != s->non_b_qp_table)
        av_freep(&qp_table);
    return ret;
}
636 
uninit(AVFilterContext * ctx)637 static av_cold void uninit(AVFilterContext *ctx)
638 {
639     FSPPContext *fspp = ctx->priv;
640     av_freep(&fspp->temp);
641     av_freep(&fspp->src);
642     av_freep(&fspp->non_b_qp_table);
643 }
644 
645 static const AVFilterPad fspp_inputs[] = {
646     {
647         .name         = "default",
648         .type         = AVMEDIA_TYPE_VIDEO,
649         .config_props = config_input,
650         .filter_frame = filter_frame,
651     },
652 };
653 
654 static const AVFilterPad fspp_outputs[] = {
655     {
656         .name = "default",
657         .type = AVMEDIA_TYPE_VIDEO,
658     },
659 };
660 
661 const AVFilter ff_vf_fspp = {
662     .name            = "fspp",
663     .description     = NULL_IF_CONFIG_SMALL("Apply Fast Simple Post-processing filter."),
664     .priv_size       = sizeof(FSPPContext),
665     .uninit          = uninit,
666     FILTER_INPUTS(fspp_inputs),
667     FILTER_OUTPUTS(fspp_outputs),
668     FILTER_PIXFMTS_ARRAY(pix_fmts),
669     .priv_class      = &fspp_class,
670     .flags           = AVFILTER_FLAG_SUPPORT_TIMELINE_INTERNAL,
671 };
672