/*
 * Copyright (c) 2022 Ben Avison
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */

#include <string.h>

#include "checkasm.h"

#include "libavcodec/vc1dsp.h"

#include "libavutil/common.h"
#include "libavutil/internal.h"
#include "libavutil/intreadwrite.h"
#include "libavutil/mem_internal.h"

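/* Each table entry names a function-pointer member of VC1DSPContext by its
 * offset; the sized variant also records the width and height of the
 * transform under test. */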
#define VC1DSP_TEST(func) { #func, offsetof(VC1DSPContext, func) },
#define VC1DSP_SIZED_TEST(func, width, height) { #func, offsetof(VC1DSPContext, func), width, height },

typedef struct {
    const char *name;
    size_t offset;
    int width;
    int height;
} test;

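/* Minimal row-major matrix of floats, sized at allocation time via a
 * flexible array member; used for floating-point reference calculations
 * on transform coefficients. */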
typedef struct matrix {
    size_t width;
    size_t height;
    float d[];
} matrix;

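/* The 8x8 and 4x4 VC-1 integer transform matrices (T8, T4) and their
 * transposes (T8t, T4t). */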
static const matrix T8 = { 8, 8, {
    12,  12,  12,  12,  12,  12,  12,  12,
    16,  15,   9,   4,  -4,  -9, -15, -16,
    16,   6,  -6, -16, -16,  -6,   6,  16,
    15,  -4, -16,  -9,   9,  16,   4, -15,
    12, -12, -12,  12,  12, -12, -12,  12,
     9, -16,   4,  15, -15,  -4,  16,  -9,
     6, -16,  16,  -6,  -6,  16, -16,   6,
     4,  -9,  15, -16,  16, -15,   9,  -4
} };

static const matrix T4 = { 4, 4, {
    17,  17,  17,  17,
    22,  10, -10, -22,
    17, -17, -17,  17,
    10, -22,  22, -10
} };

static const matrix T8t = { 8, 8, {
    12,  16,  16,  15,  12,   9,   6,   4,
    12,  15,   6,  -4, -12, -16, -16,  -9,
    12,   9,  -6, -16, -12,   4,  16,  15,
    12,   4, -16,  -9,  12,  15,  -6, -16,
    12,  -4, -16,   9,  12, -15,  -6,  16,
    12,  -9,  -6,  16, -12,  -4,  16, -15,
    12, -15,   6,   4, -12,  16, -16,   9,
    12, -16,  16, -15,  12,  -9,   6,  -4
} };

static const matrix T4t = { 4, 4, {
    17,  22,  17,  10,
    17,  10, -17, -22,
    17, -10, -17,  22,
    17, -22,  17, -10
} };

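/* Allocate a zero-initialised width x height matrix; aborts the test
 * program on allocation failure. */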
static matrix *new_matrix(size_t width, size_t height)
{
    matrix *out = av_mallocz(sizeof (matrix) + height * width * sizeof (float));
    if (out == NULL) {
        fprintf(stderr, "Memory allocation failure\n");
        exit(EXIT_FAILURE);
    }
    out->width = width;
    out->height = height;
    return out;
}

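/* Matrix multiplication a * b; the result is a freshly allocated matrix
 * with b's width and a's height. */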
static matrix *multiply(const matrix *a, const matrix *b)
{
    matrix *out;
    if (a->width != b->height) {
        fprintf(stderr, "Incompatible multiplication\n");
        exit(EXIT_FAILURE);
    }
    out = new_matrix(b->width, a->height);
    for (int j = 0; j < out->height; ++j)
        for (int i = 0; i < out->width; ++i) {
            float sum = 0;
            for (int k = 0; k < a->width; ++k)
                sum += a->d[j * a->width + k] * b->d[k * b->width + i];
            out->d[j * out->width + i] = sum;
        }
    return out;
}

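/* Compensate for the gain of the non-orthonormal transform: scale by 64,
 * then divide by a per-row and per-column constant (one quarter of the
 * squared norm of the corresponding transform basis row). */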
static void normalise(matrix *a)
{
    for (int j = 0; j < a->height; ++j)
        for (int i = 0; i < a->width; ++i) {
            float *p = a->d + j * a->width + i;
            *p *= 64;
            if (a->height == 4)
                *p /= (const unsigned[]) { 289, 292, 289, 292 } [j];
            else
                *p /= (const unsigned[]) { 288, 289, 292, 289, 288, 289, 292, 289 } [j];
            if (a->width == 4)
                *p /= (const unsigned[]) { 289, 292, 289, 292 } [i];
            else
                *p /= (const unsigned[]) { 288, 289, 292, 289, 288, 289, 292, 289 } [i];
        }
}

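/* Divide every element by 'by', rounding to the nearest integer
 * (rintf rounds ties to even under the default rounding mode). */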
static void divide_and_round_nearest(matrix *a, float by)
{
    for (int j = 0; j < a->height; ++j)
        for (int i = 0; i < a->width; ++i) {
            float *p = a->d + j * a->width + i;
            *p = rintf(*p / by);
        }
}

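/* Add 1 to every element in rows 4 and above; this appears to model the
 * row-dependent rounding constant applied in the vertical pass of the
 * inverse transform, ahead of the divide by 128 below. */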
static void tweak(matrix *a)
{
    for (int j = 4; j < a->height; ++j)
        for (int i = 0; i < a->width; ++i) {
            float *p = a->d + j * a->width + i;
            *p += 1;
        }
}

/* The VC-1 spec places restrictions on the values permitted at three
 * different stages:
 * - D: the input coefficients in frequency domain
 * - E: the intermediate coefficients, inverse-transformed only horizontally
 * - R: the fully inverse-transformed coefficients
 *
 * To fully cater for the ranges specified requires various intermediate
 * values to be held to 17-bit precision; yet these conditions do not appear
 * to be utilised in real-world streams. At least some assembly
 * implementations have chosen to restrict these values to 16-bit precision,
 * to accelerate the decoding of real-world streams at the cost of strict
 * adherence to the spec. To avoid our test marking these as failures,
 * reduce our random inputs.
 */
#define ATTENUATION 4

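/* Generate a width x height block of inverse-quantized coefficients whose
 * D, E and R stages all stay within the (attenuated) permitted ranges.
 * Random spatial-domain data is passed through the forward transform to
 * obtain D; if any stage then falls outside its range (rare), the whole
 * process is simply retried. */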
static matrix *generate_inverse_quantized_transform_coefficients(size_t width, size_t height)
{
    matrix *raw, *tmp, *D, *E, *R;
    raw = new_matrix(width, height);
    for (int i = 0; i < width * height; ++i)
        raw->d[i] = (int) (rnd() % (1024/ATTENUATION)) - 512/ATTENUATION;
    tmp = multiply(height == 8 ? &T8 : &T4, raw);
    D = multiply(tmp, width == 8 ? &T8t : &T4t);
    normalise(D);
    divide_and_round_nearest(D, 1);
    for (int i = 0; i < width * height; ++i) {
        if (D->d[i] < -2048/ATTENUATION || D->d[i] > 2048/ATTENUATION-1) {
            /* Rare, so simply try again */
            av_free(raw);
            av_free(tmp);
            av_free(D);
            return generate_inverse_quantized_transform_coefficients(width, height);
        }
    }
    E = multiply(D, width == 8 ? &T8 : &T4);
    divide_and_round_nearest(E, 8);
    for (int i = 0; i < width * height; ++i)
        if (E->d[i] < -4096/ATTENUATION || E->d[i] > 4096/ATTENUATION-1) {
            /* Rare, so simply try again */
            av_free(raw);
            av_free(tmp);
            av_free(D);
            av_free(E);
            return generate_inverse_quantized_transform_coefficients(width, height);
        }
    R = multiply(height == 8 ? &T8t : &T4t, E);
    tweak(R);
    divide_and_round_nearest(R, 128);
    for (int i = 0; i < width * height; ++i)
        if (R->d[i] < -512/ATTENUATION || R->d[i] > 512/ATTENUATION-1) {
            /* Rare, so simply try again */
            av_free(raw);
            av_free(tmp);
            av_free(D);
            av_free(E);
            av_free(R);
            return generate_inverse_quantized_transform_coefficients(width, height);
        }
    av_free(raw);
    av_free(tmp);
    av_free(E);
    av_free(R);
    return D;
}

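/* Fill a pair of buffers with identical random 16-bit (or 8-bit) values,
 * so that the reference and optimised implementations start from the same
 * state. */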
#define RANDOMIZE_BUFFER16(name, size)     \
    do {                                   \
        int i;                             \
        for (i = 0; i < size; ++i) {       \
            uint16_t r = rnd();            \
            AV_WN16A(name##0 + i, r);      \
            AV_WN16A(name##1 + i, r);      \
        }                                  \
    } while (0)

#define RANDOMIZE_BUFFER8(name, size)      \
    do {                                   \
        int i;                             \
        for (i = 0; i < size; ++i) {       \
            uint8_t r = rnd();             \
            name##0[i] = r;                \
            name##1[i] = r;                \
        }                                  \
    } while (0)

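/* Fill a pair of byte buffers with identical values clustered around 0x80,
 * so that neighbouring samples are usually close in value and the loop
 * filter's strength thresholds are actually exercised. */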
#define RANDOMIZE_BUFFER8_MID_WEIGHTED(name, size)  \
    do {                                            \
        uint8_t *p##0 = name##0, *p##1 = name##1;   \
        int i = (size);                             \
        while (i-- > 0) {                           \
            int x = 0x80 | (rnd() & 0x7F);          \
            x >>= rnd() % 9;                        \
            if (rnd() & 1)                          \
                x = -x;                             \
            *p##1++ = *p##0++ = 0x80 + x;           \
        }                                           \
    } while (0)

static void check_inv_trans_inplace(void)
{
    /* Inverse transform input coefficients are stored in a 16-bit buffer
     * with row stride of 8 coefficients irrespective of transform size.
     * vc1_inv_trans_8x8 differs from the others in two ways: coefficients
     * are stored in column-major order, and the outputs are written back
     * to the input buffer, so we oversize it slightly to catch overruns. */
    LOCAL_ALIGNED_16(int16_t, inv_trans_in0, [10 * 8]);
    LOCAL_ALIGNED_16(int16_t, inv_trans_in1, [10 * 8]);

    VC1DSPContext h;

    ff_vc1dsp_init(&h);

    if (check_func(h.vc1_inv_trans_8x8, "vc1dsp.vc1_inv_trans_8x8")) {
        matrix *coeffs;
        declare_func_emms(AV_CPU_FLAG_MMX, void, int16_t *);
        RANDOMIZE_BUFFER16(inv_trans_in, 10 * 8);
        coeffs = generate_inverse_quantized_transform_coefficients(8, 8);
        for (int j = 0; j < 8; ++j)
            for (int i = 0; i < 8; ++i) {
                int idx = 8 + i * 8 + j;
                inv_trans_in1[idx] = inv_trans_in0[idx] = coeffs->d[j * 8 + i];
            }
        call_ref(inv_trans_in0 + 8);
        call_new(inv_trans_in1 + 8);
        if (memcmp(inv_trans_in0, inv_trans_in1, 10 * 8 * sizeof (int16_t)))
            fail();
        bench_new(inv_trans_in1 + 8);
        av_free(coeffs);
    }
}

static void check_inv_trans_adding(void)
{
    /* Inverse transform input coefficients are stored in a 16-bit buffer
     * with row stride of 8 coefficients irrespective of transform size. */
    LOCAL_ALIGNED_16(int16_t, inv_trans_in0, [8 * 8]);
    LOCAL_ALIGNED_16(int16_t, inv_trans_in1, [8 * 8]);

    /* For all but vc1_inv_trans_8x8, the inverse transform is narrowed and
     * added with saturation to an array of unsigned 8-bit values. Oversize
     * this by 8 samples left and right and one row above and below. */
    LOCAL_ALIGNED_8(uint8_t, inv_trans_out0, [10 * 24]);
    LOCAL_ALIGNED_8(uint8_t, inv_trans_out1, [10 * 24]);

    VC1DSPContext h;

    const test tests[] = {
        VC1DSP_SIZED_TEST(vc1_inv_trans_8x4, 8, 4)
        VC1DSP_SIZED_TEST(vc1_inv_trans_4x8, 4, 8)
        VC1DSP_SIZED_TEST(vc1_inv_trans_4x4, 4, 4)
        VC1DSP_SIZED_TEST(vc1_inv_trans_8x8_dc, 8, 8)
        VC1DSP_SIZED_TEST(vc1_inv_trans_8x4_dc, 8, 4)
        VC1DSP_SIZED_TEST(vc1_inv_trans_4x8_dc, 4, 8)
        VC1DSP_SIZED_TEST(vc1_inv_trans_4x4_dc, 4, 4)
    };

    ff_vc1dsp_init(&h);

    for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) {
        void (*func)(uint8_t *, ptrdiff_t, int16_t *) = *(void **)((intptr_t) &h + tests[t].offset);
        if (check_func(func, "vc1dsp.%s", tests[t].name)) {
            matrix *coeffs;
            declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, ptrdiff_t, int16_t *);
            RANDOMIZE_BUFFER16(inv_trans_in, 8 * 8);
            RANDOMIZE_BUFFER8(inv_trans_out, 10 * 24);
            coeffs = generate_inverse_quantized_transform_coefficients(tests[t].width, tests[t].height);
            for (int j = 0; j < tests[t].height; ++j)
                for (int i = 0; i < tests[t].width; ++i) {
                    int idx = j * 8 + i;
                    inv_trans_in1[idx] = inv_trans_in0[idx] = coeffs->d[j * tests[t].width + i];
                }
            call_ref(inv_trans_out0 + 24 + 8, 24, inv_trans_in0);
            call_new(inv_trans_out1 + 24 + 8, 24, inv_trans_in1);
            if (memcmp(inv_trans_out0, inv_trans_out1, 10 * 24))
                fail();
            bench_new(inv_trans_out1 + 24 + 8, 24, inv_trans_in1);
            av_free(coeffs);
        }
    }
}

static void check_loop_filter(void)
{
    /* Deblocking filter buffers are big enough to hold a 16x16 block,
     * plus 16 columns to the left and 4 rows above to hold the filter
     * inputs (for a vertical or horizontal neighbouring block edge
     * respectively; oversized horizontally to maintain 16-byte alignment),
     * plus 16 columns and 4 rows below to catch write overflows */
    LOCAL_ALIGNED_16(uint8_t, filter_buf0, [24 * 48]);
    LOCAL_ALIGNED_16(uint8_t, filter_buf1, [24 * 48]);

    VC1DSPContext h;

    const test tests[] = {
        VC1DSP_TEST(vc1_v_loop_filter4)
        VC1DSP_TEST(vc1_h_loop_filter4)
        VC1DSP_TEST(vc1_v_loop_filter8)
        VC1DSP_TEST(vc1_h_loop_filter8)
        VC1DSP_TEST(vc1_v_loop_filter16)
        VC1DSP_TEST(vc1_h_loop_filter16)
    };

    ff_vc1dsp_init(&h);

    for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) {
        void (*func)(uint8_t *, ptrdiff_t, int) = *(void **)((intptr_t) &h + tests[t].offset);
        declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, ptrdiff_t, int);
        if (check_func(func, "vc1dsp.%s", tests[t].name)) {
            for (int count = 1000; count > 0; --count) {
                int pq = rnd() % 31 + 1;
                RANDOMIZE_BUFFER8_MID_WEIGHTED(filter_buf, 24 * 48);
                call_ref(filter_buf0 + 4 * 48 + 16, 48, pq);
                call_new(filter_buf1 + 4 * 48 + 16, 48, pq);
                if (memcmp(filter_buf0, filter_buf1, 24 * 48))
                    fail();
            }
        }
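        /* For benchmarking, use a fixed pattern: 0x60 in the guard area and
         * 0xA0 inside the block proper, giving a step at the block's top and
         * left edges. Time it at the weakest (pq 1) and strongest (pq 31)
         * filter settings. */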
        for (int j = 0; j < 24; ++j)
            for (int i = 0; i < 48; ++i)
                filter_buf1[j * 48 + i] = 0x60 + 0x40 * (i >= 16 && j >= 4);
        if (check_func(func, "vc1dsp.%s_bestcase", tests[t].name))
            bench_new(filter_buf1 + 4 * 48 + 16, 48, 1);
        if (check_func(func, "vc1dsp.%s_worstcase", tests[t].name))
            bench_new(filter_buf1 + 4 * 48 + 16, 48, 31);
    }
}

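/* Run the reference and optimised unescape over identical input at random
 * offsets and lengths, and require both the returned length and the entire
 * output buffer (including bytes beyond the written length) to match. */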
#define TEST_UNESCAPE                                                                               \
    do {                                                                                            \
        for (int count = 100; count > 0; --count) {                                                 \
            escaped_offset = rnd() & 7;                                                             \
            unescaped_offset = rnd() & 7;                                                           \
            escaped_len = (1u << ((rnd() % 8) + 3)) - (rnd() & 7);                                  \
            RANDOMIZE_BUFFER8(unescaped, UNESCAPE_BUF_SIZE);                                        \
            len0 = call_ref(escaped0 + escaped_offset, escaped_len, unescaped0 + unescaped_offset); \
            len1 = call_new(escaped1 + escaped_offset, escaped_len, unescaped1 + unescaped_offset); \
            if (len0 != len1 || memcmp(unescaped0, unescaped1, UNESCAPE_BUF_SIZE))                  \
                fail();                                                                             \
        }                                                                                           \
    } while (0)

static void check_unescape(void)
{
    /* This appears to be a typical length of buffer in use */
#define LOG2_UNESCAPE_BUF_SIZE 17
#define UNESCAPE_BUF_SIZE (1u << LOG2_UNESCAPE_BUF_SIZE)
    LOCAL_ALIGNED_8(uint8_t, escaped0, [UNESCAPE_BUF_SIZE]);
    LOCAL_ALIGNED_8(uint8_t, escaped1, [UNESCAPE_BUF_SIZE]);
    LOCAL_ALIGNED_8(uint8_t, unescaped0, [UNESCAPE_BUF_SIZE]);
    LOCAL_ALIGNED_8(uint8_t, unescaped1, [UNESCAPE_BUF_SIZE]);

    VC1DSPContext h;

    ff_vc1dsp_init(&h);

    if (check_func(h.vc1_unescape_buffer, "vc1dsp.vc1_unescape_buffer")) {
        int len0, len1, escaped_offset, unescaped_offset, escaped_len;
        declare_func_emms(AV_CPU_FLAG_MMX, int, const uint8_t *, int, uint8_t *);

        /* Test data which consists of escape sequences packed as tightly as possible */
        for (int x = 0; x < UNESCAPE_BUF_SIZE; ++x)
            escaped1[x] = escaped0[x] = 3 * (x % 3 == 0);
        TEST_UNESCAPE;

        /* Test random data */
        RANDOMIZE_BUFFER8(escaped, UNESCAPE_BUF_SIZE);
        TEST_UNESCAPE;

        /* Test data with escape sequences at random intervals */
        for (int x = 0; x <= UNESCAPE_BUF_SIZE - 4;) {
            int gap, gap_msb;
            escaped1[x+0] = escaped0[x+0] = 0;
            escaped1[x+1] = escaped0[x+1] = 0;
            escaped1[x+2] = escaped0[x+2] = 3;
            escaped1[x+3] = escaped0[x+3] = rnd() & 3;
            gap_msb = 2u << (rnd() % 8);
            gap = (rnd() &~ -gap_msb) | gap_msb;
            x += gap;
        }
        TEST_UNESCAPE;

        /* Test data which is known to contain no escape sequences */
        memset(escaped0, 0xFF, UNESCAPE_BUF_SIZE);
        memset(escaped1, 0xFF, UNESCAPE_BUF_SIZE);
        TEST_UNESCAPE;

        /* Benchmark the no-escape-sequences case */
        bench_new(escaped1, UNESCAPE_BUF_SIZE, unescaped1);
    }
}

void checkasm_check_vc1dsp(void)
{
    check_inv_trans_inplace();
    check_inv_trans_adding();
    report("inv_trans");

    check_loop_filter();
    report("loop_filter");

    check_unescape();
    report("unescape_buffer");
}