/*
 * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
 * Copyright (C) 2005 Nikolaj Poroshin <porosh3@psu.ru>
 * Copyright (c) 2014 Arwa Arif <arwaarif1994@gmail.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */

/**
 * @file
 * Fast Simple Post-processing filter
 * This implementation is based on the algorithm described in
 * Aria Nosratinia, "Embedded Post-Processing for Enhancement of
 * Compressed Images" (1999)
 * (http://www.utdallas.edu/~aria/papers/vlsisp99.pdf).
 * Furthermore, by splitting the (I)DCT into horizontal and vertical passes,
 * one of them can be performed once per block instead of once per pixel,
 * which allows for much higher speed.
 *
 * Originally written by Michael Niedermayer and Nikolaj Poroshin for the
 * MPlayer project, and ported by Arwa Arif for FFmpeg.
 */

#include <stddef.h>
#include <string.h>

#include "libavutil/avassert.h"
#include "libavutil/imgutils.h"
#include "libavutil/mem_internal.h"
#include "libavutil/opt.h"
#include "libavutil/pixdesc.h"
#include "internal.h"
#include "qp_table.h"
#include "vf_fspp.h"

47 #define OFFSET(x) offsetof(FSPPContext, x)
48 #define FLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM
49 static const AVOption fspp_options[] = {
50     { "quality",       "set quality",                          OFFSET(log2_count),    AV_OPT_TYPE_INT, {.i64 = 4},   4, MAX_LEVEL, FLAGS },
51     { "qp",            "force a constant quantizer parameter", OFFSET(qp),            AV_OPT_TYPE_INT, {.i64 = 0},   0, 64,        FLAGS },
52     { "strength",      "set filter strength",                  OFFSET(strength),      AV_OPT_TYPE_INT, {.i64 = 0}, -15, 32,        FLAGS },
53     { "use_bframe_qp", "use B-frames' QP",                     OFFSET(use_bframe_qp), AV_OPT_TYPE_BOOL,{.i64 = 0},   0, 1,         FLAGS },
54     { NULL }
55 };
56 
57 AVFILTER_DEFINE_CLASS(fspp);
58 
59 DECLARE_ALIGNED(32, static const uint8_t, dither)[8][8] = {
60     {  0,  48,  12,  60,   3,  51,  15,  63, },
61     { 32,  16,  44,  28,  35,  19,  47,  31, },
62     {  8,  56,   4,  52,  11,  59,   7,  55, },
63     { 40,  24,  36,  20,  43,  27,  39,  23, },
64     {  2,  50,  14,  62,   1,  49,  13,  61, },
65     { 34,  18,  46,  30,  33,  17,  45,  29, },
66     { 10,  58,   6,  54,   9,  57,   5,  53, },
67     { 42,  26,  38,  22,  41,  25,  37,  21, },
68 };
69 
/*
 * Base threshold for each of the 64 coefficient positions (8 rows of 8).
 * filter_frame() rescales these by the strength-dependent bias before use.
 *
 * The values (the largest is 296) must not be too high: that makes the
 * result depend too strongly on the quantizer, or may cause an overflow
 * (to be checked), which shows up as flashing.
 */
static const short custom_threshold[64] = {
     71, 296, 295, 237,  71,  40,  38,  19,
    245, 193, 185, 121, 102,  73,  53,  27,
    158, 129, 141, 107,  97,  73,  50,  26,
    102, 116, 109,  98,  82,  66,  45,  23,
     71,  94,  95,  81,  70,  56,  38,  20,
     56,  77,  74,  66,  56,  44,  30,  15,
     38,  53,  50,  45,  38,  30,  21,  11,
     20,  27,  26,  23,  20,  15,  11,   5
};

84 //This func reads from 1 slice, 1 and clears 0 & 1
store_slice_c(uint8_t * dst,int16_t * src,ptrdiff_t dst_stride,ptrdiff_t src_stride,ptrdiff_t width,ptrdiff_t height,ptrdiff_t log2_scale)85 static void store_slice_c(uint8_t *dst, int16_t *src,
86                           ptrdiff_t dst_stride, ptrdiff_t src_stride,
87                           ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
88 {
89     int y, x;
90 #define STORE(pos)                                                             \
91     temp = (src[x + pos] + (d[pos] >> log2_scale)) >> (6 - log2_scale);        \
92     src[x + pos] = src[x + pos - 8 * src_stride] = 0;                          \
93     if (temp & 0x100) temp = ~(temp >> 31);                                    \
94     dst[x + pos] = temp;
95 
96     for (y = 0; y < height; y++) {
97         const uint8_t *d = dither[y];
98         for (x = 0; x < width; x += 8) {
99             int temp;
100             STORE(0);
101             STORE(1);
102             STORE(2);
103             STORE(3);
104             STORE(4);
105             STORE(5);
106             STORE(6);
107             STORE(7);
108         }
109         src += src_stride;
110         dst += dst_stride;
111     }
112 }
113 
114 //This func reads from 2 slices, 0 & 2  and clears 2-nd
store_slice2_c(uint8_t * dst,int16_t * src,ptrdiff_t dst_stride,ptrdiff_t src_stride,ptrdiff_t width,ptrdiff_t height,ptrdiff_t log2_scale)115 static void store_slice2_c(uint8_t *dst, int16_t *src,
116                            ptrdiff_t dst_stride, ptrdiff_t src_stride,
117                            ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
118 {
119     int y, x;
120 #define STORE2(pos)                                                                                       \
121     temp = (src[x + pos] + src[x + pos + 16 * src_stride] + (d[pos] >> log2_scale)) >> (6 - log2_scale);  \
122     src[x + pos + 16 * src_stride] = 0;                                                                   \
123     if (temp & 0x100) temp = ~(temp >> 31);                                                               \
124     dst[x + pos] = temp;
125 
126     for (y = 0; y < height; y++) {
127         const uint8_t *d = dither[y];
128         for (x = 0; x < width; x += 8) {
129             int temp;
130             STORE2(0);
131             STORE2(1);
132             STORE2(2);
133             STORE2(3);
134             STORE2(4);
135             STORE2(5);
136             STORE2(6);
137             STORE2(7);
138         }
139         src += src_stride;
140         dst += dst_stride;
141     }
142 }
143 
/**
 * Scale a 64-entry threshold matrix by a quantizer value.
 *
 * @param thr_adr_noq input matrix of 64 unscaled thresholds (not modified)
 * @param thr_adr     output matrix; receives q * thr_adr_noq[i] for each entry
 * @param q           quantizer to multiply by
 */
static void mul_thrmat_c(int16_t *thr_adr_noq, int16_t *thr_adr, int q)
{
    int a;

    for (a = 0; a < 64; a++)
        thr_adr[a] = q * thr_adr_noq[a];
}

/**
 * Filter one plane.
 *
 * The plane is copied into p->src with an 8-sample mirrored border on all
 * four sides, transformed through p->row_fdct / p->column_fidct /
 * p->row_idct with the results accumulated in p->temp, and written to dst
 * in slices of up to 8 rows by p->store_slice / p->store_slice2.
 *
 * @param p          filter context (scratch buffers, DSP callbacks, options)
 * @param dst        destination plane; nothing is done if NULL
 * @param src        source plane; nothing is done if NULL
 * @param dst_stride destination line size
 * @param src_stride source line size
 * @param width      plane width
 * @param height     plane height
 * @param qp_store   per-block quantizer table; only read when p->qp is 0
 * @param qp_stride  line size of qp_store
 * @param is_luma    non-zero for the first plane; selects the scratch stride
 *                   and disables the chroma shift of the QP coordinates
 */
static void filter(FSPPContext *p, uint8_t *dst, uint8_t *src,
                   int dst_stride, int src_stride,
                   int width, int height,
                   uint8_t *qp_store, int qp_stride, int is_luma)
{
    int x, x0, y, es, qy, t;

    const int stride = is_luma ? p->temp_stride : (width + 16);
    const int step = 6 - p->log2_count;
    const int qpsh = 4 - p->hsub * !is_luma;
    const int qpsv = 4 - p->vsub * !is_luma;

    DECLARE_ALIGNED(32, int32_t, block_align)[4 * 8 * BLOCKSZ + 4 * 8 * BLOCKSZ];
    int16_t *block  = (int16_t *)block_align;
    int16_t *block3 = (int16_t *)(block_align + 4 * 8 * BLOCKSZ);

    // Planes that do not exist (e.g. chroma of a gray frame) are skipped
    // before any scratch memory is touched.
    if (!src || !dst)
        return;

    // NOTE(review): this clears 4 * 8 * BLOCKSZ bytes, which is less than the
    // byte size of the block3 half of block_align; confirm the partial clear
    // is intended.
    memset(block3, 0, 4 * 8 * BLOCKSZ);

    // Copy the plane into p->src, leaving 8 rows/columns of margin, and
    // mirror 8 samples to the left and right of every row.
    for (y = 0; y < height; y++) {
        int index = 8 + 8 * stride + y * stride;
        memcpy(p->src + index, src + y * src_stride, width);
        for (x = 0; x < 8; x++) {
            p->src[index         - x - 1] = p->src[index +         x    ];
            p->src[index + width + x    ] = p->src[index + width - x - 1];
        }
    }

    // Mirror 8 rows above and below the plane.
    for (y = 0; y < 8; y++) {
        memcpy(p->src + (     7 - y    ) * stride, p->src + (     y + 8    ) * stride, stride);
        memcpy(p->src + (height + 8 + y) * stride, p->src + (height - y + 7) * stride, stride);
    }
    //FIXME (try edge emu)

    // Clear rows 8..23 of the accumulation buffer.
    for (y = 8; y < 24; y++)
        memset(p->temp + 8 + y * stride, 0, width * sizeof(int16_t));

    for (y = step; y < height + 8; y += step) {    //step= 1,2
        const int y1 = y - 8 + step;                 //l5-7  l4-6;
        qy = y - 4;

        // Clamp the QP row to the plane before scaling it to table units.
        if (qy > height - 1) qy = height - 1;
        if (qy < 0) qy = 0;

        qy = (qy >> qpsv) * qp_stride;
        p->row_fdct(block, p->src + y * stride + 2 - (y&1), stride, 2);

        for (x0 = 0; x0 < width + 8 - 8 * (BLOCKSZ - 1); x0 += 8 * (BLOCKSZ - 1)) {
            p->row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 - (y&1), stride, 2 * (BLOCKSZ - 1));

            if (p->qp)
                // Constant quantizer: one threshold matrix for the whole run.
                p->column_fidct((int16_t *)(&p->threshold_mtx[0]), block + 0 * 8, block3 + 0 * 8, 8 * (BLOCKSZ - 1)); //yes, this is a HOTSPOT
            else
                for (x = 0; x < 8 * (BLOCKSZ - 1); x += 8) {
                    t = x + x0 - 2;                    //correct t=x+x0-2-(y&1), but its the same

                    if (t < 0) t = 0;                   //t always < width-2

                    t = qp_store[qy + (t >> qpsh)];
                    t = ff_norm_qscale(t, p->qscale_type);

                    // Rebuild the scaled threshold matrix only when the
                    // quantizer differs from the previous one.
                    if (t != p->prev_q) {
                        p->prev_q = t;
                        p->mul_thrmat((int16_t *)(&p->threshold_mtx_noq[0]), (int16_t *)(&p->threshold_mtx[0]), t);
                    }
                    p->column_fidct((int16_t *)(&p->threshold_mtx[0]), block + x * 8, block3 + x * 8, 8); //yes, this is a HOTSPOT
                }
            p->row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 2 - (y & 1), stride, 2 * (BLOCKSZ - 1));
            memmove(block,  block  + (BLOCKSZ - 1) * 64, 8 * 8 * sizeof(int16_t)); //cycling
            memmove(block3, block3 + (BLOCKSZ - 1) * 64, 6 * 8 * sizeof(int16_t));
        }

        // Columns left over after the last full run.
        es = width + 8 - x0; //  8, ...
        if (es > 8)
            p->row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 - (y & 1), stride, (es - 4) >> 2);

        p->column_fidct((int16_t *)(&p->threshold_mtx[0]), block, block3, es&(~1));
        if (es > 3)
            p->row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 2 - (y & 1), stride, es >> 2);

        // Every time y1 reaches a non-zero multiple of 8, write out one
        // 8-row slice, alternating between the two store functions.
        if (!(y1 & 7) && y1) {
            if (y1 & 8)
                p->store_slice(dst + (y1 - 8) * dst_stride, p->temp + 8 + 8 * stride,
                               dst_stride, stride, width, 8, 5 - p->log2_count);
            else
                p->store_slice2(dst + (y1 - 8) * dst_stride, p->temp + 8 + 0 * stride,
                                dst_stride, stride, width, 8, 5 - p->log2_count);
        }
    }

    // Write the remaining rows of the last, partial slice.
    if (y & 7) {  // height % 8 != 0
        if (y & 8)
            p->store_slice(dst + ((y - 8) & ~7) * dst_stride, p->temp + 8 + 8 * stride,
                           dst_stride, stride, width, y&7, 5 - p->log2_count);
        else
            p->store_slice2(dst + ((y - 8) & ~7) * dst_stride, p->temp + 8 + 0 * stride,
                            dst_stride, stride, width, y&7, 5 - p->log2_count);
    }
}

column_fidct_c(int16_t * thr_adr,int16_t * data,int16_t * output,int cnt)249 static void column_fidct_c(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt)
250 {
251     int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
252     int_simd16_t tmp10, tmp11, tmp12, tmp13;
253     int_simd16_t z1,z2,z3,z4,z5, z10, z11, z12, z13;
254     int_simd16_t d0, d1, d2, d3, d4, d5, d6, d7;
255 
256     int16_t *dataptr;
257     int16_t *wsptr;
258     int16_t *threshold;
259     int ctr;
260 
261     dataptr = data;
262     wsptr = output;
263 
264     for (; cnt > 0; cnt -= 2) { //start positions
265         threshold = (int16_t *)thr_adr;//threshold_mtx
266         for (ctr = DCTSIZE; ctr > 0; ctr--) {
267             // Process columns from input, add to output.
268             tmp0 = dataptr[DCTSIZE * 0] + dataptr[DCTSIZE * 7];
269             tmp7 = dataptr[DCTSIZE * 0] - dataptr[DCTSIZE * 7];
270 
271             tmp1 = dataptr[DCTSIZE * 1] + dataptr[DCTSIZE * 6];
272             tmp6 = dataptr[DCTSIZE * 1] - dataptr[DCTSIZE * 6];
273 
274             tmp2 = dataptr[DCTSIZE * 2] + dataptr[DCTSIZE * 5];
275             tmp5 = dataptr[DCTSIZE * 2] - dataptr[DCTSIZE * 5];
276 
277             tmp3 = dataptr[DCTSIZE * 3] + dataptr[DCTSIZE * 4];
278             tmp4 = dataptr[DCTSIZE * 3] - dataptr[DCTSIZE * 4];
279 
280             // Even part of FDCT
281 
282             tmp10 = tmp0 + tmp3;
283             tmp13 = tmp0 - tmp3;
284             tmp11 = tmp1 + tmp2;
285             tmp12 = tmp1 - tmp2;
286 
287             d0 = tmp10 + tmp11;
288             d4 = tmp10 - tmp11;
289 
290             z1 = MULTIPLY16H((tmp12 + tmp13) << 2, FIX_0_707106781);
291             d2 = tmp13 + z1;
292             d6 = tmp13 - z1;
293 
294             // Even part of IDCT
295 
296             THRESHOLD(tmp0, d0, threshold[0 * 8]);
297             THRESHOLD(tmp1, d2, threshold[2 * 8]);
298             THRESHOLD(tmp2, d4, threshold[4 * 8]);
299             THRESHOLD(tmp3, d6, threshold[6 * 8]);
300             tmp0 += 2;
301             tmp10 = (tmp0 + tmp2) >> 2;
302             tmp11 = (tmp0 - tmp2) >> 2;
303 
304             tmp13 = (tmp1 + tmp3) >>2; //+2 !  (psnr decides)
305             tmp12 = MULTIPLY16H((tmp1 - tmp3), FIX_1_414213562_A) - tmp13; //<<2
306 
307             tmp0 = tmp10 + tmp13; //->temps
308             tmp3 = tmp10 - tmp13; //->temps
309             tmp1 = tmp11 + tmp12; //->temps
310             tmp2 = tmp11 - tmp12; //->temps
311 
312             // Odd part of FDCT
313 
314             tmp10 = tmp4 + tmp5;
315             tmp11 = tmp5 + tmp6;
316             tmp12 = tmp6 + tmp7;
317 
318             z5 = MULTIPLY16H((tmp10 - tmp12) << 2, FIX_0_382683433);
319             z2 = MULTIPLY16H(tmp10 << 2, FIX_0_541196100) + z5;
320             z4 = MULTIPLY16H(tmp12 << 2, FIX_1_306562965) + z5;
321             z3 = MULTIPLY16H(tmp11 << 2, FIX_0_707106781);
322 
323             z11 = tmp7 + z3;
324             z13 = tmp7 - z3;
325 
326             d5 = z13 + z2;
327             d3 = z13 - z2;
328             d1 = z11 + z4;
329             d7 = z11 - z4;
330 
331             // Odd part of IDCT
332 
333             THRESHOLD(tmp4, d1, threshold[1 * 8]);
334             THRESHOLD(tmp5, d3, threshold[3 * 8]);
335             THRESHOLD(tmp6, d5, threshold[5 * 8]);
336             THRESHOLD(tmp7, d7, threshold[7 * 8]);
337 
338             //Simd version uses here a shortcut for the tmp5,tmp6,tmp7 == 0
339             z13 = tmp6 + tmp5;
340             z10 = (tmp6 - tmp5) << 1;
341             z11 = tmp4 + tmp7;
342             z12 = (tmp4 - tmp7) << 1;
343 
344             tmp7  = (z11 + z13) >> 2; //+2 !
345             tmp11 = MULTIPLY16H((z11 - z13) << 1, FIX_1_414213562);
346             z5    = MULTIPLY16H(z10 + z12,        FIX_1_847759065);
347             tmp10 = MULTIPLY16H(z12,              FIX_1_082392200) - z5;
348             tmp12 = MULTIPLY16H(z10,              FIX_2_613125930) + z5; // - !!
349 
350             tmp6 = tmp12 - tmp7;
351             tmp5 = tmp11 - tmp6;
352             tmp4 = tmp10 + tmp5;
353 
354             wsptr[DCTSIZE * 0] +=  (tmp0 + tmp7);
355             wsptr[DCTSIZE * 1] +=  (tmp1 + tmp6);
356             wsptr[DCTSIZE * 2] +=  (tmp2 + tmp5);
357             wsptr[DCTSIZE * 3] +=  (tmp3 - tmp4);
358             wsptr[DCTSIZE * 4] +=  (tmp3 + tmp4);
359             wsptr[DCTSIZE * 5] +=  (tmp2 - tmp5);
360             wsptr[DCTSIZE * 6]  =  (tmp1 - tmp6);
361             wsptr[DCTSIZE * 7]  =  (tmp0 - tmp7);
362             //
363             dataptr++; //next column
364             wsptr++;
365             threshold++;
366         }
367         dataptr += 8; //skip each second start pos
368         wsptr   += 8;
369     }
370 }
371 
row_idct_c(int16_t * workspace,int16_t * output_adr,ptrdiff_t output_stride,int cnt)372 static void row_idct_c(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt)
373 {
374     int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
375     int_simd16_t tmp10, tmp11, tmp12, tmp13;
376     int_simd16_t z5, z10, z11, z12, z13;
377     int16_t *outptr;
378     int16_t *wsptr;
379 
380     cnt *= 4;
381     wsptr = workspace;
382     outptr = output_adr;
383     for (; cnt > 0; cnt--) {
384         // Even part
385         //Simd version reads 4x4 block and transposes it
386         tmp10 = wsptr[2] +  wsptr[3];
387         tmp11 = wsptr[2] -  wsptr[3];
388 
389         tmp13 = wsptr[0] +  wsptr[1];
390         tmp12 = (MULTIPLY16H(wsptr[0] - wsptr[1], FIX_1_414213562_A) << 2) - tmp13;//this shift order to avoid overflow
391 
392         tmp0 = tmp10 + tmp13; //->temps
393         tmp3 = tmp10 - tmp13; //->temps
394         tmp1 = tmp11 + tmp12;
395         tmp2 = tmp11 - tmp12;
396 
397         // Odd part
398         //Also transpose, with previous:
399         // ---- ----      ||||
400         // ---- ---- idct ||||
401         // ---- ---- ---> ||||
402         // ---- ----      ||||
403         z13 = wsptr[4] + wsptr[5];
404         z10 = wsptr[4] - wsptr[5];
405         z11 = wsptr[6] + wsptr[7];
406         z12 = wsptr[6] - wsptr[7];
407 
408         tmp7 = z11 + z13;
409         tmp11 = MULTIPLY16H(z11 - z13, FIX_1_414213562);
410 
411         z5 =    MULTIPLY16H(z10 + z12, FIX_1_847759065);
412         tmp10 = MULTIPLY16H(z12,       FIX_1_082392200) - z5;
413         tmp12 = MULTIPLY16H(z10,       FIX_2_613125930) + z5; // - FIX_
414 
415         tmp6 = (tmp12 << 3) - tmp7;
416         tmp5 = (tmp11 << 3) - tmp6;
417         tmp4 = (tmp10 << 3) + tmp5;
418 
419         // Final output stage: descale and write column
420         outptr[0 * output_stride] += DESCALE(tmp0 + tmp7, 3);
421         outptr[1 * output_stride] += DESCALE(tmp1 + tmp6, 3);
422         outptr[2 * output_stride] += DESCALE(tmp2 + tmp5, 3);
423         outptr[3 * output_stride] += DESCALE(tmp3 - tmp4, 3);
424         outptr[4 * output_stride] += DESCALE(tmp3 + tmp4, 3);
425         outptr[5 * output_stride] += DESCALE(tmp2 - tmp5, 3);
426         outptr[6 * output_stride] += DESCALE(tmp1 - tmp6, 3); //no += ?
427         outptr[7 * output_stride] += DESCALE(tmp0 - tmp7, 3); //no += ?
428         outptr++;
429 
430         wsptr += DCTSIZE;       // advance pointer to next row
431     }
432 }
433 
row_fdct_c(int16_t * data,const uint8_t * pixels,ptrdiff_t line_size,int cnt)434 static void row_fdct_c(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt)
435 {
436     int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
437     int_simd16_t tmp10, tmp11, tmp12, tmp13;
438     int_simd16_t z1, z2, z3, z4, z5, z11, z13;
439     int16_t *dataptr;
440 
441     cnt *= 4;
442     // Pass 1: process rows.
443 
444     dataptr = data;
445     for (; cnt > 0; cnt--) {
446         tmp0 = pixels[line_size * 0] + pixels[line_size * 7];
447         tmp7 = pixels[line_size * 0] - pixels[line_size * 7];
448         tmp1 = pixels[line_size * 1] + pixels[line_size * 6];
449         tmp6 = pixels[line_size * 1] - pixels[line_size * 6];
450         tmp2 = pixels[line_size * 2] + pixels[line_size * 5];
451         tmp5 = pixels[line_size * 2] - pixels[line_size * 5];
452         tmp3 = pixels[line_size * 3] + pixels[line_size * 4];
453         tmp4 = pixels[line_size * 3] - pixels[line_size * 4];
454 
455         // Even part
456 
457         tmp10 = tmp0 + tmp3;
458         tmp13 = tmp0 - tmp3;
459         tmp11 = tmp1 + tmp2;
460         tmp12 = tmp1 - tmp2;
461         //Even columns are written first, this leads to different order of columns
462         //in column_fidct(), but they are processed independently, so all ok.
463         //Later in the row_idct() columns readed at the same order.
464         dataptr[2] = tmp10 + tmp11;
465         dataptr[3] = tmp10 - tmp11;
466 
467         z1 = MULTIPLY16H((tmp12 + tmp13) << 2, FIX_0_707106781);
468         dataptr[0] = tmp13 + z1;
469         dataptr[1] = tmp13 - z1;
470 
471         // Odd part
472 
473         tmp10 = (tmp4 + tmp5) << 2;
474         tmp11 = (tmp5 + tmp6) << 2;
475         tmp12 = (tmp6 + tmp7) << 2;
476 
477         z5 = MULTIPLY16H(tmp10 - tmp12, FIX_0_382683433);
478         z2 = MULTIPLY16H(tmp10,         FIX_0_541196100) + z5;
479         z4 = MULTIPLY16H(tmp12,         FIX_1_306562965) + z5;
480         z3 = MULTIPLY16H(tmp11,         FIX_0_707106781);
481 
482         z11 = tmp7 + z3;
483         z13 = tmp7 - z3;
484 
485         dataptr[4] = z13 + z2;
486         dataptr[5] = z13 - z2;
487         dataptr[6] = z11 + z4;
488         dataptr[7] = z11 - z4;
489 
490         pixels++;               // advance pointer to next column
491         dataptr += DCTSIZE;
492     }
493 }
494 
/**
 * Advertise the pixel formats this filter accepts.
 *
 * @param ctx filter context
 * @return AVERROR(ENOMEM) if the format list cannot be created, otherwise
 *         the result of ff_set_common_formats()
 */
static int query_formats(AVFilterContext *ctx)
{
    static const enum AVPixelFormat pix_fmts[] = {
        AV_PIX_FMT_YUV444P,  AV_PIX_FMT_YUV422P,
        AV_PIX_FMT_YUV420P,  AV_PIX_FMT_YUV411P,
        AV_PIX_FMT_YUV410P,  AV_PIX_FMT_YUV440P,
        AV_PIX_FMT_YUVJ444P, AV_PIX_FMT_YUVJ422P,
        AV_PIX_FMT_YUVJ420P, AV_PIX_FMT_YUVJ440P,
        AV_PIX_FMT_GBRP, AV_PIX_FMT_GRAY8,
        AV_PIX_FMT_NONE
    };

    AVFilterFormats *fmts_list = ff_make_format_list(pix_fmts);
    if (!fmts_list)
        return AVERROR(ENOMEM);
    return ff_set_common_formats(ctx, fmts_list);
}

/**
 * Configure the filter for the input link's format and dimensions.
 *
 * Records the chroma subsampling shifts, allocates the two scratch buffers
 * (each temp_stride x h elements, where both dimensions are the frame size
 * plus 16, rounded up to a multiple of 16) and installs the C
 * implementations of the DSP callbacks, optionally overridden on x86.
 *
 * @param inlink input link being configured
 * @return 0 on success, AVERROR(ENOMEM) if a buffer cannot be allocated
 */
static int config_input(AVFilterLink *inlink)
{
    AVFilterContext *ctx = inlink->dst;
    FSPPContext *fspp = ctx->priv;
    const int h = FFALIGN(inlink->h + 16, 16);
    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(inlink->format);

    fspp->hsub = desc->log2_chroma_w;
    fspp->vsub = desc->log2_chroma_h;

    // Release buffers from an earlier configuration, if any, so that
    // configuring the link again does not leak them.
    av_freep(&fspp->temp);
    av_freep(&fspp->src);

    fspp->temp_stride = FFALIGN(inlink->w + 16, 16);
    fspp->temp = av_malloc_array(fspp->temp_stride, h * sizeof(*fspp->temp));
    fspp->src  = av_malloc_array(fspp->temp_stride, h * sizeof(*fspp->src));

    // A buffer that was allocated successfully is released by uninit().
    if (!fspp->temp || !fspp->src)
        return AVERROR(ENOMEM);

    fspp->store_slice  = store_slice_c;
    fspp->store_slice2 = store_slice2_c;
    fspp->mul_thrmat   = mul_thrmat_c;
    fspp->column_fidct = column_fidct_c;
    fspp->row_idct     = row_idct_c;
    fspp->row_fdct     = row_fdct_c;

    if (ARCH_X86)
        ff_fspp_init_x86(fspp);

    return 0;
}

/**
 * Filter one input frame and send the result downstream.
 *
 * Rebuilds the threshold matrix from the strength option, selects the QP
 * table to use, filters the three planes (in place when the frame is
 * writable and its dimensions are multiples of 8, otherwise into a new
 * frame) and forwards the output. Takes ownership of @p in.
 *
 * @param inlink input link the frame arrived on
 * @param in     input frame
 * @return the result of ff_filter_frame(), or a negative error code
 */
static int filter_frame(AVFilterLink *inlink, AVFrame *in)
{
    AVFilterContext *ctx = inlink->dst;
    FSPPContext *fspp = ctx->priv;
    AVFilterLink *outlink = ctx->outputs[0];
    AVFrame *out = in;

    int qp_stride = 0;
    int8_t *qp_table = NULL;
    int i, bias;
    int ret = 0;
    int custom_threshold_m[64];

    bias = (1 << 4) + fspp->strength;

    for (i = 0; i < 64; i++) //FIXME: tune custom_threshold[] and remove this !
        custom_threshold_m[i] = (int)(custom_threshold[i] * (bias / 71.0) + 0.5);

    // Pack each row of 8 thresholds into two 64-bit words, 16 bits per
    // entry, in column order 2,6,0,4 and 5,3,1,7.
    for (i = 0; i < 8; i++) {
        fspp->threshold_mtx_noq[2 * i] = (uint64_t)custom_threshold_m[i * 8 + 2]
                                      |(((uint64_t)custom_threshold_m[i * 8 + 6]) << 16)
                                      |(((uint64_t)custom_threshold_m[i * 8 + 0]) << 32)
                                      |(((uint64_t)custom_threshold_m[i * 8 + 4]) << 48);

        fspp->threshold_mtx_noq[2 * i + 1] = (uint64_t)custom_threshold_m[i * 8 + 5]
                                          |(((uint64_t)custom_threshold_m[i * 8 + 3]) << 16)
                                          |(((uint64_t)custom_threshold_m[i * 8 + 1]) << 32)
                                          |(((uint64_t)custom_threshold_m[i * 8 + 7]) << 48);
    }

    // With a forced quantizer the scaled matrix is the same for every block.
    if (fspp->qp) {
        fspp->prev_q = fspp->qp;
        fspp->mul_thrmat((int16_t *)(&fspp->threshold_mtx_noq[0]), (int16_t *)(&fspp->threshold_mtx[0]), fspp->qp);
    }

    /* if we are not in a constant user quantizer mode and we don't want to use
     * the quantizers from the B-frames (B-frames often have a higher QP), we
     * need to save the qp table from the last non B-frame; this is what the
     * following code block does */
    if (!fspp->qp && (fspp->use_bframe_qp || in->pict_type != AV_PICTURE_TYPE_B)) {
        ret = ff_qp_table_extract(in, &qp_table, &qp_stride, NULL, &fspp->qscale_type);
        if (ret < 0) {
            av_frame_free(&in);
            return ret;
        }

        if (!fspp->use_bframe_qp && in->pict_type != AV_PICTURE_TYPE_B) {
            // The context now owns the table; it must not be freed at "finish".
            av_freep(&fspp->non_b_qp_table);
            fspp->non_b_qp_table  = qp_table;
            fspp->non_b_qp_stride = qp_stride;
        }
    }

    if (fspp->log2_count && !ctx->is_disabled) {
        if (!fspp->use_bframe_qp && fspp->non_b_qp_table) {
            qp_table = fspp->non_b_qp_table;
            qp_stride = fspp->non_b_qp_stride;
        }

        if (qp_table || fspp->qp) {
            const int cw = AV_CEIL_RSHIFT(inlink->w, fspp->hsub);
            const int ch = AV_CEIL_RSHIFT(inlink->h, fspp->vsub);

            /* get a new frame if in-place is not possible or if the dimensions
             * are not multiple of 8 */
            if (!av_frame_is_writable(in) || (inlink->w & 7) || (inlink->h & 7)) {
                const int aligned_w = FFALIGN(inlink->w, 8);
                const int aligned_h = FFALIGN(inlink->h, 8);

                out = ff_get_video_buffer(outlink, aligned_w, aligned_h);
                if (!out) {
                    av_frame_free(&in);
                    ret = AVERROR(ENOMEM);
                    goto finish;
                }
                // Copying the frame properties can fail; do not forward a
                // frame with incomplete metadata.
                ret = av_frame_copy_props(out, in);
                if (ret < 0) {
                    av_frame_free(&out);
                    av_frame_free(&in);
                    goto finish;
                }
                out->width = in->width;
                out->height = in->height;
            }

            filter(fspp, out->data[0], in->data[0], out->linesize[0], in->linesize[0],
                   inlink->w, inlink->h, qp_table, qp_stride, 1);
            filter(fspp, out->data[1], in->data[1], out->linesize[1], in->linesize[1],
                   cw,        ch,        qp_table, qp_stride, 0);
            filter(fspp, out->data[2], in->data[2], out->linesize[2], in->linesize[2],
                   cw,        ch,        qp_table, qp_stride, 0);
            emms_c();
        }
    }

    if (in != out) {
        // A fourth plane, if present, is copied unfiltered.
        if (in->data[3])
            av_image_copy_plane(out->data[3], out->linesize[3],
                                in ->data[3], in ->linesize[3],
                                inlink->w, inlink->h);
        av_frame_free(&in);
    }
    ret = ff_filter_frame(outlink, out);
finish:
    // Free the table only when it was not handed over to the context.
    if (qp_table != fspp->non_b_qp_table)
        av_freep(&qp_table);
    return ret;
}

uninit(AVFilterContext * ctx)645 static av_cold void uninit(AVFilterContext *ctx)
646 {
647     FSPPContext *fspp = ctx->priv;
648     av_freep(&fspp->temp);
649     av_freep(&fspp->src);
650     av_freep(&fspp->non_b_qp_table);
651 }
652 
653 static const AVFilterPad fspp_inputs[] = {
654     {
655         .name         = "default",
656         .type         = AVMEDIA_TYPE_VIDEO,
657         .config_props = config_input,
658         .filter_frame = filter_frame,
659     },
660     { NULL }
661 };
662 
663 static const AVFilterPad fspp_outputs[] = {
664     {
665         .name = "default",
666         .type = AVMEDIA_TYPE_VIDEO,
667     },
668     { NULL }
669 };
670 
671 AVFilter ff_vf_fspp = {
672     .name            = "fspp",
673     .description     = NULL_IF_CONFIG_SMALL("Apply Fast Simple Post-processing filter."),
674     .priv_size       = sizeof(FSPPContext),
675     .uninit          = uninit,
676     .query_formats   = query_formats,
677     .inputs          = fspp_inputs,
678     .outputs         = fspp_outputs,
679     .priv_class      = &fspp_class,
680     .flags           = AVFILTER_FLAG_SUPPORT_TIMELINE_INTERNAL,
681 };
682