• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/*
 * Copyright (c) 2022 Ben Avison
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */
20 
21 #include <string.h>
22 
23 #include "checkasm.h"
24 
25 #include "libavcodec/vc1dsp.h"
26 
27 #include "libavutil/common.h"
28 #include "libavutil/internal.h"
29 #include "libavutil/intreadwrite.h"
30 #include "libavutil/mem_internal.h"
31 
32 #define VC1DSP_TEST(func) { #func, offsetof(VC1DSPContext, func) },
33 #define VC1DSP_SIZED_TEST(func, width, height) { #func, offsetof(VC1DSPContext, func), width, height },
34 
35 typedef struct {
36     const char *name;
37     size_t offset;
38     int width;
39     int height;
40 } test;
41 
42 typedef struct matrix {
43     size_t width;
44     size_t height;
45     float d[];
46 } matrix;
47 
48 static const matrix T8 = { 8, 8, {
49         12,  12,  12,  12,  12,  12,  12,  12,
50         16,  15,   9,   4,  -4,  -9, -15, -16,
51         16,   6,  -6, -16, -16,  -6,   6,  16,
52         15,  -4, -16,  -9,   9,  16,   4, -15,
53         12, -12, -12,  12,  12, -12, -12,  12,
54          9, -16,   4,  15, -15,  -4,  16,  -9,
55          6, -16,  16,  -6,  -6,  16, -16,   6,
56          4,  -9,  15, -16,  16, -15,   9,  -4
57 } };
58 
59 static const matrix T4 = { 4, 4, {
60         17,  17,  17,  17,
61         22,  10, -10, -22,
62         17, -17, -17,  17,
63         10, -22,  22, -10
64 } };
65 
66 static const matrix T8t = { 8, 8, {
67         12,  16,  16,  15,  12,   9,   6,   4,
68         12,  15,   6,  -4, -12, -16, -16,  -9,
69         12,   9,  -6, -16, -12,   4,  16,  15,
70         12,   4, -16,  -9,  12,  15,  -6, -16,
71         12,  -4, -16,   9,  12, -15,  -6,  16,
72         12,  -9,  -6,  16, -12,  -4,  16, -15,
73         12, -15,   6,   4, -12,  16, -16,   9,
74         12, -16,  16, -15,  12,  -9,   6,  -4
75 } };
76 
77 static const matrix T4t = { 4, 4, {
78         17,  22,  17,  10,
79         17,  10, -17, -22,
80         17, -10, -17,  22,
81         17, -22,  17, -10
82 } };
83 
new_matrix(size_t width,size_t height)84 static matrix *new_matrix(size_t width, size_t height)
85 {
86     matrix *out = av_mallocz(sizeof (matrix) + height * width * sizeof (float));
87     if (out == NULL) {
88         fprintf(stderr, "Memory allocation failure\n");
89         exit(EXIT_FAILURE);
90     }
91     out->width = width;
92     out->height = height;
93     return out;
94 }
95 
multiply(const matrix * a,const matrix * b)96 static matrix *multiply(const matrix *a, const matrix *b)
97 {
98     matrix *out;
99     if (a->width != b->height) {
100         fprintf(stderr, "Incompatible multiplication\n");
101         exit(EXIT_FAILURE);
102     }
103     out = new_matrix(b->width, a->height);
104     for (int j = 0; j < out->height; ++j)
105         for (int i = 0; i < out->width; ++i) {
106             float sum = 0;
107             for (int k = 0; k < a->width; ++k)
108                 sum += a->d[j * a->width + k] * b->d[k * b->width + i];
109             out->d[j * out->width + i] = sum;
110         }
111     return out;
112 }
113 
normalise(matrix * a)114 static void normalise(matrix *a)
115 {
116     for (int j = 0; j < a->height; ++j)
117         for (int i = 0; i < a->width; ++i) {
118             float *p = a->d + j * a->width + i;
119             *p *= 64;
120             if (a->height == 4)
121                 *p /= (const unsigned[]) { 289, 292, 289, 292 } [j];
122             else
123                 *p /= (const unsigned[]) { 288, 289, 292, 289, 288, 289, 292, 289 } [j];
124             if (a->width == 4)
125                 *p /= (const unsigned[]) { 289, 292, 289, 292 } [i];
126             else
127                 *p /= (const unsigned[]) { 288, 289, 292, 289, 288, 289, 292, 289 } [i];
128         }
129 }
130 
divide_and_round_nearest(matrix * a,float by)131 static void divide_and_round_nearest(matrix *a, float by)
132 {
133     for (int j = 0; j < a->height; ++j)
134         for (int i = 0; i < a->width; ++i) {
135             float *p = a->d + j * a->width + i;
136             *p = rintf(*p / by);
137         }
138 }
139 
tweak(matrix * a)140 static void tweak(matrix *a)
141 {
142     for (int j = 4; j < a->height; ++j)
143         for (int i = 0; i < a->width; ++i) {
144             float *p = a->d + j * a->width + i;
145             *p += 1;
146         }
147 }
148 
149 /* The VC-1 spec places restrictions on the values permitted at three
150  * different stages:
151  * - D: the input coefficients in frequency domain
152  * - E: the intermediate coefficients, inverse-transformed only horizontally
153  * - R: the fully inverse-transformed coefficients
154  *
155  * To fully cater for the ranges specified requires various intermediate
156  * values to be held to 17-bit precision; yet these conditions do not appear
157  * to be utilised in real-world streams. At least some assembly
158  * implementations have chosen to restrict these values to 16-bit precision,
159  * to accelerate the decoding of real-world streams at the cost of strict
160  * adherence to the spec. To avoid our test marking these as failures,
161  * reduce our random inputs.
162  */
163 #define ATTENUATION 4
164 
generate_inverse_quantized_transform_coefficients(size_t width,size_t height)165 static matrix *generate_inverse_quantized_transform_coefficients(size_t width, size_t height)
166 {
167     matrix *raw, *tmp, *D, *E, *R;
168     raw = new_matrix(width, height);
169     for (int i = 0; i < width * height; ++i)
170         raw->d[i] = (int) (rnd() % (1024/ATTENUATION)) - 512/ATTENUATION;
171     tmp = multiply(height == 8 ? &T8 : &T4, raw);
172     D = multiply(tmp, width == 8 ? &T8t : &T4t);
173     normalise(D);
174     divide_and_round_nearest(D, 1);
175     for (int i = 0; i < width * height; ++i) {
176         if (D->d[i] < -2048/ATTENUATION || D->d[i] > 2048/ATTENUATION-1) {
177             /* Rare, so simply try again */
178             av_free(raw);
179             av_free(tmp);
180             av_free(D);
181             return generate_inverse_quantized_transform_coefficients(width, height);
182         }
183     }
184     E = multiply(D, width == 8 ? &T8 : &T4);
185     divide_and_round_nearest(E, 8);
186     for (int i = 0; i < width * height; ++i)
187         if (E->d[i] < -4096/ATTENUATION || E->d[i] > 4096/ATTENUATION-1) {
188             /* Rare, so simply try again */
189             av_free(raw);
190             av_free(tmp);
191             av_free(D);
192             av_free(E);
193             return generate_inverse_quantized_transform_coefficients(width, height);
194         }
195     R = multiply(height == 8 ? &T8t : &T4t, E);
196     tweak(R);
197     divide_and_round_nearest(R, 128);
198     for (int i = 0; i < width * height; ++i)
199         if (R->d[i] < -512/ATTENUATION || R->d[i] > 512/ATTENUATION-1) {
200             /* Rare, so simply try again */
201             av_free(raw);
202             av_free(tmp);
203             av_free(D);
204             av_free(E);
205             av_free(R);
206             return generate_inverse_quantized_transform_coefficients(width, height);
207         }
208     av_free(raw);
209     av_free(tmp);
210     av_free(E);
211     av_free(R);
212     return D;
213 }
214 
215 #define RANDOMIZE_BUFFER16(name, size)        \
216     do {                                      \
217         int i;                                \
218         for (i = 0; i < size; ++i) {          \
219             uint16_t r = rnd();               \
220             AV_WN16A(name##0 + i, r);         \
221             AV_WN16A(name##1 + i, r);         \
222         }                                     \
223     } while (0)
224 
225 #define RANDOMIZE_BUFFER8(name, size)         \
226     do {                                      \
227         int i;                                \
228         for (i = 0; i < size; ++i) {          \
229             uint8_t r = rnd();                \
230             name##0[i] = r;                   \
231             name##1[i] = r;                   \
232         }                                     \
233     } while (0)
234 
235 #define RANDOMIZE_BUFFER8_MID_WEIGHTED(name, size)  \
236     do {                                            \
237         uint8_t *p##0 = name##0, *p##1 = name##1;   \
238         int i = (size);                             \
239         while (i-- > 0) {                           \
240             int x = 0x80 | (rnd() & 0x7F);          \
241             x >>= rnd() % 9;                        \
242             if (rnd() & 1)                          \
243                 x = -x;                             \
244             *p##1++ = *p##0++ = 0x80 + x;           \
245         }                                           \
246     } while (0)
247 
check_inv_trans_inplace(void)248 static void check_inv_trans_inplace(void)
249 {
250     /* Inverse transform input coefficients are stored in a 16-bit buffer
251      * with row stride of 8 coefficients irrespective of transform size.
252      * vc1_inv_trans_8x8 differs from the others in two ways: coefficients
253      * are stored in column-major order, and the outputs are written back
254      * to the input buffer, so we oversize it slightly to catch overruns. */
255     LOCAL_ALIGNED_16(int16_t, inv_trans_in0, [10 * 8]);
256     LOCAL_ALIGNED_16(int16_t, inv_trans_in1, [10 * 8]);
257 
258     VC1DSPContext h;
259 
260     ff_vc1dsp_init(&h);
261 
262     if (check_func(h.vc1_inv_trans_8x8, "vc1dsp.vc1_inv_trans_8x8")) {
263         matrix *coeffs;
264         declare_func_emms(AV_CPU_FLAG_MMX, void, int16_t *);
265         RANDOMIZE_BUFFER16(inv_trans_in, 10 * 8);
266         coeffs = generate_inverse_quantized_transform_coefficients(8, 8);
267         for (int j = 0; j < 8; ++j)
268             for (int i = 0; i < 8; ++i) {
269                 int idx = 8 + i * 8 + j;
270                 inv_trans_in1[idx] = inv_trans_in0[idx] = coeffs->d[j * 8 + i];
271             }
272         call_ref(inv_trans_in0 + 8);
273         call_new(inv_trans_in1 + 8);
274         if (memcmp(inv_trans_in0,  inv_trans_in1,  10 * 8 * sizeof (int16_t)))
275             fail();
276         bench_new(inv_trans_in1 + 8);
277         av_free(coeffs);
278     }
279 }
280 
check_inv_trans_adding(void)281 static void check_inv_trans_adding(void)
282 {
283     /* Inverse transform input coefficients are stored in a 16-bit buffer
284      * with row stride of 8 coefficients irrespective of transform size. */
285     LOCAL_ALIGNED_16(int16_t, inv_trans_in0, [8 * 8]);
286     LOCAL_ALIGNED_16(int16_t, inv_trans_in1, [8 * 8]);
287 
288     /* For all but vc1_inv_trans_8x8, the inverse transform is narrowed and
289      * added with saturation to an array of unsigned 8-bit values. Oversize
290      * this by 8 samples left and right and one row above and below. */
291     LOCAL_ALIGNED_8(uint8_t, inv_trans_out0, [10 * 24]);
292     LOCAL_ALIGNED_8(uint8_t, inv_trans_out1, [10 * 24]);
293 
294     VC1DSPContext h;
295 
296     const test tests[] = {
297         VC1DSP_SIZED_TEST(vc1_inv_trans_8x4, 8, 4)
298         VC1DSP_SIZED_TEST(vc1_inv_trans_4x8, 4, 8)
299         VC1DSP_SIZED_TEST(vc1_inv_trans_4x4, 4, 4)
300         VC1DSP_SIZED_TEST(vc1_inv_trans_8x8_dc, 8, 8)
301         VC1DSP_SIZED_TEST(vc1_inv_trans_8x4_dc, 8, 4)
302         VC1DSP_SIZED_TEST(vc1_inv_trans_4x8_dc, 4, 8)
303         VC1DSP_SIZED_TEST(vc1_inv_trans_4x4_dc, 4, 4)
304     };
305 
306     ff_vc1dsp_init(&h);
307 
308     for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) {
309         void (*func)(uint8_t *, ptrdiff_t, int16_t *) = *(void **)((intptr_t) &h + tests[t].offset);
310         if (check_func(func, "vc1dsp.%s", tests[t].name)) {
311             matrix *coeffs;
312             declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, ptrdiff_t, int16_t *);
313             RANDOMIZE_BUFFER16(inv_trans_in, 8 * 8);
314             RANDOMIZE_BUFFER8(inv_trans_out, 10 * 24);
315             coeffs = generate_inverse_quantized_transform_coefficients(tests[t].width, tests[t].height);
316             for (int j = 0; j < tests[t].height; ++j)
317                 for (int i = 0; i < tests[t].width; ++i) {
318                     int idx = j * 8 + i;
319                     inv_trans_in1[idx] = inv_trans_in0[idx] = coeffs->d[j * tests[t].width + i];
320                 }
321             call_ref(inv_trans_out0 + 24 + 8, 24, inv_trans_in0);
322             call_new(inv_trans_out1 + 24 + 8, 24, inv_trans_in1);
323             if (memcmp(inv_trans_out0, inv_trans_out1, 10 * 24))
324                 fail();
325             bench_new(inv_trans_out1 + 24 + 8, 24, inv_trans_in1 + 8);
326             av_free(coeffs);
327         }
328     }
329 }
330 
check_loop_filter(void)331 static void check_loop_filter(void)
332 {
333     /* Deblocking filter buffers are big enough to hold a 16x16 block,
334      * plus 16 columns left and 4 rows above to hold filter inputs
335      * (depending on whether v or h neighbouring block edge, oversized
336      * horizontally to maintain 16-byte alignment) plus 16 columns and
337      * 4 rows below to catch write overflows */
338     LOCAL_ALIGNED_16(uint8_t, filter_buf0, [24 * 48]);
339     LOCAL_ALIGNED_16(uint8_t, filter_buf1, [24 * 48]);
340 
341     VC1DSPContext h;
342 
343     const test tests[] = {
344         VC1DSP_TEST(vc1_v_loop_filter4)
345         VC1DSP_TEST(vc1_h_loop_filter4)
346         VC1DSP_TEST(vc1_v_loop_filter8)
347         VC1DSP_TEST(vc1_h_loop_filter8)
348         VC1DSP_TEST(vc1_v_loop_filter16)
349         VC1DSP_TEST(vc1_h_loop_filter16)
350     };
351 
352     ff_vc1dsp_init(&h);
353 
354     for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) {
355         void (*func)(uint8_t *, ptrdiff_t, int) = *(void **)((intptr_t) &h + tests[t].offset);
356         declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, ptrdiff_t, int);
357         if (check_func(func, "vc1dsp.%s", tests[t].name)) {
358             for (int count = 1000; count > 0; --count) {
359                 int pq = rnd() % 31 + 1;
360                 RANDOMIZE_BUFFER8_MID_WEIGHTED(filter_buf, 24 * 48);
361                 call_ref(filter_buf0 + 4 * 48 + 16, 48, pq);
362                 call_new(filter_buf1 + 4 * 48 + 16, 48, pq);
363                 if (memcmp(filter_buf0, filter_buf1, 24 * 48))
364                     fail();
365             }
366         }
367         for (int j = 0; j < 24; ++j)
368             for (int i = 0; i < 48; ++i)
369                 filter_buf1[j * 48 + i] = 0x60 + 0x40 * (i >= 16 && j >= 4);
370         if (check_func(func, "vc1dsp.%s_bestcase", tests[t].name))
371             bench_new(filter_buf1 + 4 * 48 + 16, 48, 1);
372         if (check_func(func, "vc1dsp.%s_worstcase", tests[t].name))
373             bench_new(filter_buf1 + 4 * 48 + 16, 48, 31);
374     }
375 }
376 
377 #define TEST_UNESCAPE                                                                               \
378     do {                                                                                            \
379         for (int count = 100; count > 0; --count) {                                                 \
380             escaped_offset = rnd() & 7;                                                             \
381             unescaped_offset = rnd() & 7;                                                           \
382             escaped_len = (1u << (rnd() % 8) + 3) - (rnd() & 7);                                    \
383             RANDOMIZE_BUFFER8(unescaped, UNESCAPE_BUF_SIZE);                                        \
384             len0 = call_ref(escaped0 + escaped_offset, escaped_len, unescaped0 + unescaped_offset); \
385             len1 = call_new(escaped1 + escaped_offset, escaped_len, unescaped1 + unescaped_offset); \
386             if (len0 != len1 || memcmp(unescaped0, unescaped1, UNESCAPE_BUF_SIZE))                  \
387                 fail();                                                                             \
388         }                                                                                           \
389     } while (0)
390 
check_unescape(void)391 static void check_unescape(void)
392 {
393     /* This appears to be a typical length of buffer in use */
394 #define LOG2_UNESCAPE_BUF_SIZE 17
395 #define UNESCAPE_BUF_SIZE (1u<<LOG2_UNESCAPE_BUF_SIZE)
396     LOCAL_ALIGNED_8(uint8_t, escaped0, [UNESCAPE_BUF_SIZE]);
397     LOCAL_ALIGNED_8(uint8_t, escaped1, [UNESCAPE_BUF_SIZE]);
398     LOCAL_ALIGNED_8(uint8_t, unescaped0, [UNESCAPE_BUF_SIZE]);
399     LOCAL_ALIGNED_8(uint8_t, unescaped1, [UNESCAPE_BUF_SIZE]);
400 
401     VC1DSPContext h;
402 
403     ff_vc1dsp_init(&h);
404 
405     if (check_func(h.vc1_unescape_buffer, "vc1dsp.vc1_unescape_buffer")) {
406         int len0, len1, escaped_offset, unescaped_offset, escaped_len;
407         declare_func_emms(AV_CPU_FLAG_MMX, int, const uint8_t *, int, uint8_t *);
408 
409         /* Test data which consists of escapes sequences packed as tightly as possible */
410         for (int x = 0; x < UNESCAPE_BUF_SIZE; ++x)
411             escaped1[x] = escaped0[x] = 3 * (x % 3 == 0);
412         TEST_UNESCAPE;
413 
414         /* Test random data */
415         RANDOMIZE_BUFFER8(escaped, UNESCAPE_BUF_SIZE);
416         TEST_UNESCAPE;
417 
418         /* Test data with escape sequences at random intervals */
419         for (int x = 0; x <= UNESCAPE_BUF_SIZE - 4;) {
420             int gap, gap_msb;
421             escaped1[x+0] = escaped0[x+0] = 0;
422             escaped1[x+1] = escaped0[x+1] = 0;
423             escaped1[x+2] = escaped0[x+2] = 3;
424             escaped1[x+3] = escaped0[x+3] = rnd() & 3;
425             gap_msb = 2u << (rnd() % 8);
426             gap = (rnd() &~ -gap_msb) | gap_msb;
427             x += gap;
428         }
429         TEST_UNESCAPE;
430 
431         /* Test data which is known to contain no escape sequences */
432         memset(escaped0, 0xFF, UNESCAPE_BUF_SIZE);
433         memset(escaped1, 0xFF, UNESCAPE_BUF_SIZE);
434         TEST_UNESCAPE;
435 
436         /* Benchmark the no-escape-sequences case */
437         bench_new(escaped1, UNESCAPE_BUF_SIZE, unescaped1);
438     }
439 }
440 
checkasm_check_vc1dsp(void)441 void checkasm_check_vc1dsp(void)
442 {
443     check_inv_trans_inplace();
444     check_inv_trans_adding();
445     report("inv_trans");
446 
447     check_loop_filter();
448     report("loop_filter");
449 
450     check_unescape();
451     report("unescape_buffer");
452 }
453