// SPDX-License-Identifier: GPL-2.0+
/*
 * Copyright 2016 Tom aan de Wiel
 * Copyright 2018 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
 *
 * 8x8 Fast Walsh Hadamard Transform in sequency order based on the paper:
 *
 * A Recursive Algorithm for Sequency-Ordered Fast Walsh Transforms,
 * R.D. Brown, 1977
 */

#include <linux/string.h>
#include "vicodec-codec.h"

#define ALL_ZEROS 15
#define DEADZONE_WIDTH 20

static const uint8_t zigzag[64] = {
	0,
	1, 8,
	2, 9, 16,
	3, 10, 17, 24,
	4, 11, 18, 25, 32,
	5, 12, 19, 26, 33, 40,
	6, 13, 20, 27, 34, 41, 48,
	7, 14, 21, 28, 35, 42, 49, 56,
	15, 22, 29, 36, 43, 50, 57,
	23, 30, 37, 44, 51, 58,
	31, 38, 45, 52, 59,
	39, 46, 53, 60,
	47, 54, 61,
	55, 62,
	63,
};
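
/*
 * zigzag[i] is the row-major index of the i-th coefficient in the
 * diagonal scan of the 8x8 block: zigzag[1] == 1 and zigzag[2] == 8,
 * so after the DC value the scan visits (row 0, col 1), then
 * (row 1, col 0). Scanning in this order front-loads the significant
 * low-sequency coefficients and lengthens the trailing run of zeros
 * that rlc() below can drop.
 */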
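/*
 * Run-length code one 8x8 block of coefficients in zigzag order. The
 * first output word is a header (PFRAME_BIT set for P-blocks); each
 * following word packs a zero run in its low 4 bits and the next
 * coefficient in the upper 12. For example, two zeros followed by the
 * coefficient 5 encode as htons(2 | (5 << 4)). A run value of
 * ALL_ZEROS (15) tells the decoder that all remaining coefficients
 * are zero. Returns the number of 16-bit words written.
 */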
static int rlc(const s16 *in, __be16 *output, int blocktype)
{
	s16 block[8 * 8];
	s16 *wp = block;
	int i = 0;
	int x, y;
	int ret = 0;
	int lastzero_run = 0;
	int to_encode;

	/* read the block in from the frame buffer */
	for (y = 0; y < 8; y++) {
		for (x = 0; x < 8; x++) {
			*wp = in[x + y * 8];
			wp++;
		}
	}

	/* keep track of the number of trailing zeros */
	for (i = 63; i >= 0 && !block[zigzag[i]]; i--)
		lastzero_run++;

	*output++ = (blocktype == PBLOCK ? htons(PFRAME_BIT) : 0);
	ret++;

	to_encode = 8 * 8 - (lastzero_run > 14 ? lastzero_run : 0);

	i = 0;
	while (i < to_encode) {
		int cnt = 0;
		int tmp;

		/* count leading zeros */
		while ((tmp = block[zigzag[i]]) == 0 && cnt < 14) {
			cnt++;
			i++;
			if (i == to_encode) {
				cnt--;
				break;
			}
		}
		/* 4 bits for the run, 12 for the coefficient (quantization by 4) */
		*output++ = htons(cnt | tmp << 4);
		i++;
		ret++;
	}
	if (lastzero_run > 14) {
		*output = htons(ALL_ZEROS | 0);
		ret++;
	}

	return ret;
}

/*
 * This function will worst-case advance *rlc_in by 65 * 2 bytes:
 * one 16-bit word for the header and up to 8 * 8 16-bit coefficient
 * words.
 */
static s16 derlc(const __be16 **rlc_in, s16 *dwht_out)
{
	/* header */
	const __be16 *input = *rlc_in;
	s16 ret = ntohs(*input++);
	int dec_count = 0;
	s16 block[8 * 8 + 16];
	s16 *wp = block;
	int i;

	/*
	 * Now decompress: each 16-bit code word expands to up to 15
	 * coefficients (or fills the remainder of the 64 coefficients
	 * with zeros if it is the last code word).
	 *
	 * So block has to hold 8 * 8 + 16 values; the '+ 16' allows
	 * for overflow if the incoming data was malformed.
	 */
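	/*
	 * For example, the code word 0x0032 carries length 2 and
	 * coefficient 3, so it expands to the three values 0, 0, 3.
	 */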
	while (dec_count < 8 * 8) {
		s16 in = ntohs(*input++);
		int length = in & 0xf;
		int coeff = in >> 4;

		/* fill the remainder with zeros */
		if (length == 15) {
			for (i = 0; i < 64 - dec_count; i++)
				*wp++ = 0;
			break;
		}

		for (i = 0; i < length; i++)
			*wp++ = 0;
		*wp++ = coeff;
		dec_count += length + 1;
	}

	wp = block;

	for (i = 0; i < 64; i++) {
		int pos = zigzag[i];
		int y = pos / 8;
		int x = pos % 8;

		dwht_out[x + y * 8] = *wp++;
	}
	*rlc_in = input;
	return ret;
}

static const int quant_table[] = {
	2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 3,
	2, 2, 2, 2, 2, 2, 3, 6,
	2, 2, 2, 2, 2, 3, 6, 6,
	2, 2, 2, 2, 3, 6, 6, 6,
	2, 2, 2, 3, 6, 6, 6, 6,
	2, 2, 3, 6, 6, 6, 6, 8,
};

static const int quant_table_p[] = {
	3, 3, 3, 3, 3, 3, 3, 3,
	3, 3, 3, 3, 3, 3, 3, 3,
	3, 3, 3, 3, 3, 3, 3, 3,
	3, 3, 3, 3, 3, 3, 3, 6,
	3, 3, 3, 3, 3, 3, 6, 6,
	3, 3, 3, 3, 3, 6, 6, 9,
	3, 3, 3, 3, 6, 6, 9, 9,
	3, 3, 3, 6, 6, 9, 9, 10,
};

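/*
 * Quantize an 8x8 block of coefficients. quant_table (intra) and
 * quant_table_p (inter) hold, in raster order, the number of bits each
 * coefficient is shifted right; the high-sequency positions toward the
 * bottom-right are quantized most coarsely. Coefficients that end up
 * within +/- DEADZONE_WIDTH of zero are dropped, and de_coeff receives
 * the dequantized value used for the reference frame. For example, an
 * intra DC coefficient of 100 is shifted right by 2 to 25, which
 * survives the deadzone; 60 would shrink to 15 and be zeroed.
 */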
static void quantize_intra(s16 *coeff, s16 *de_coeff)
{
	const int *quant = quant_table;
	int i, j;

	for (j = 0; j < 8; j++) {
		for (i = 0; i < 8; i++, quant++, coeff++, de_coeff++) {
			*coeff >>= *quant;
			if (*coeff >= -DEADZONE_WIDTH &&
			    *coeff <= DEADZONE_WIDTH)
				*coeff = *de_coeff = 0;
			else
				*de_coeff = *coeff << *quant;
		}
	}
}

static void dequantize_intra(s16 *coeff)
{
	const int *quant = quant_table;
	int i, j;

	for (j = 0; j < 8; j++)
		for (i = 0; i < 8; i++, quant++, coeff++)
			*coeff <<= *quant;
}

static void quantize_inter(s16 *coeff, s16 *de_coeff)
{
	const int *quant = quant_table_p;
	int i, j;

	for (j = 0; j < 8; j++) {
		for (i = 0; i < 8; i++, quant++, coeff++, de_coeff++) {
			*coeff >>= *quant;
			if (*coeff >= -DEADZONE_WIDTH &&
			    *coeff <= DEADZONE_WIDTH)
				*coeff = *de_coeff = 0;
			else
				*de_coeff = *coeff << *quant;
		}
	}
}

static void dequantize_inter(s16 *coeff)
{
	const int *quant = quant_table_p;
	int i, j;

	for (j = 0; j < 8; j++)
		for (i = 0; i < 8; i++, quant++, coeff++)
			*coeff <<= *quant;
}

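/*
 * Forward 8x8 FWHT on an 8-bit plane. 'stride' is the distance between
 * lines and 'input_step' the distance between horizontally adjacent
 * samples (e.g. 2 for an interleaved chroma plane). For intra blocks,
 * 256 is subtracted from each pairwise sum in stage 1, which is
 * equivalent to centering every input sample around zero by
 * subtracting 128; ifwht() adds the 128 back after the inverse
 * transform.
 */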
static void fwht(const u8 *block, s16 *output_block, unsigned int stride,
		 unsigned int input_step, bool intra)
{
	/* we'll need more than 8 bits for the transformed coefficients */
	s32 workspace1[8], workspace2[8];
	const u8 *tmp = block;
	s16 *out = output_block;
	int add = intra ? 256 : 0;
	unsigned int i;

	/* stage 1 */
	stride *= input_step;

	for (i = 0; i < 8; i++, tmp += stride, out += 8) {
		if (input_step == 1) {
			workspace1[0] = tmp[0] + tmp[1] - add;
			workspace1[1] = tmp[0] - tmp[1];

			workspace1[2] = tmp[2] + tmp[3] - add;
			workspace1[3] = tmp[2] - tmp[3];

			workspace1[4] = tmp[4] + tmp[5] - add;
			workspace1[5] = tmp[4] - tmp[5];

			workspace1[6] = tmp[6] + tmp[7] - add;
			workspace1[7] = tmp[6] - tmp[7];
		} else {
			workspace1[0] = tmp[0] + tmp[2] - add;
			workspace1[1] = tmp[0] - tmp[2];

			workspace1[2] = tmp[4] + tmp[6] - add;
			workspace1[3] = tmp[4] - tmp[6];

			workspace1[4] = tmp[8] + tmp[10] - add;
			workspace1[5] = tmp[8] - tmp[10];

			workspace1[6] = tmp[12] + tmp[14] - add;
			workspace1[7] = tmp[12] - tmp[14];
		}

		/* stage 2 */
		workspace2[0] = workspace1[0] + workspace1[2];
		workspace2[1] = workspace1[0] - workspace1[2];
		workspace2[2] = workspace1[1] - workspace1[3];
		workspace2[3] = workspace1[1] + workspace1[3];

		workspace2[4] = workspace1[4] + workspace1[6];
		workspace2[5] = workspace1[4] - workspace1[6];
		workspace2[6] = workspace1[5] - workspace1[7];
		workspace2[7] = workspace1[5] + workspace1[7];

		/* stage 3 */
		out[0] = workspace2[0] + workspace2[4];
		out[1] = workspace2[0] - workspace2[4];
		out[2] = workspace2[1] - workspace2[5];
		out[3] = workspace2[1] + workspace2[5];
		out[4] = workspace2[2] + workspace2[6];
		out[5] = workspace2[2] - workspace2[6];
		out[6] = workspace2[3] - workspace2[7];
		out[7] = workspace2[3] + workspace2[7];
	}

	out = output_block;

	for (i = 0; i < 8; i++, out++) {
		/* stage 1 */
		workspace1[0] = out[0] + out[1 * 8];
		workspace1[1] = out[0] - out[1 * 8];

		workspace1[2] = out[2 * 8] + out[3 * 8];
		workspace1[3] = out[2 * 8] - out[3 * 8];

		workspace1[4] = out[4 * 8] + out[5 * 8];
		workspace1[5] = out[4 * 8] - out[5 * 8];

		workspace1[6] = out[6 * 8] + out[7 * 8];
		workspace1[7] = out[6 * 8] - out[7 * 8];

		/* stage 2 */
		workspace2[0] = workspace1[0] + workspace1[2];
		workspace2[1] = workspace1[0] - workspace1[2];
		workspace2[2] = workspace1[1] - workspace1[3];
		workspace2[3] = workspace1[1] + workspace1[3];

		workspace2[4] = workspace1[4] + workspace1[6];
		workspace2[5] = workspace1[4] - workspace1[6];
		workspace2[6] = workspace1[5] - workspace1[7];
		workspace2[7] = workspace1[5] + workspace1[7];

		/* stage 3 */
		out[0 * 8] = workspace2[0] + workspace2[4];
		out[1 * 8] = workspace2[0] - workspace2[4];
		out[2 * 8] = workspace2[1] - workspace2[5];
		out[3 * 8] = workspace2[1] + workspace2[5];
		out[4 * 8] = workspace2[2] + workspace2[6];
		out[5 * 8] = workspace2[2] - workspace2[6];
		out[6 * 8] = workspace2[3] - workspace2[7];
		out[7 * 8] = workspace2[3] + workspace2[7];
	}
}

/*
 * Not the nicest way of doing it, but P-blocks get twice the range of
 * the I-blocks. Therefore we need a type bigger than 8 bits. Values
 * can also be negative: this is simply a version of fwht() that
 * operates on signed 16-bit data.
 */
static void fwht16(const s16 *block, s16 *output_block, int stride, int intra)
{
	/* we'll need more than 8 bits for the transformed coefficients */
	s32 workspace1[8], workspace2[8];
	const s16 *tmp = block;
	s16 *out = output_block;
	int i;

	for (i = 0; i < 8; i++, tmp += stride, out += 8) {
		/* stage 1 */
		workspace1[0] = tmp[0] + tmp[1];
		workspace1[1] = tmp[0] - tmp[1];

		workspace1[2] = tmp[2] + tmp[3];
		workspace1[3] = tmp[2] - tmp[3];

		workspace1[4] = tmp[4] + tmp[5];
		workspace1[5] = tmp[4] - tmp[5];

		workspace1[6] = tmp[6] + tmp[7];
		workspace1[7] = tmp[6] - tmp[7];

		/* stage 2 */
		workspace2[0] = workspace1[0] + workspace1[2];
		workspace2[1] = workspace1[0] - workspace1[2];
		workspace2[2] = workspace1[1] - workspace1[3];
		workspace2[3] = workspace1[1] + workspace1[3];

		workspace2[4] = workspace1[4] + workspace1[6];
		workspace2[5] = workspace1[4] - workspace1[6];
		workspace2[6] = workspace1[5] - workspace1[7];
		workspace2[7] = workspace1[5] + workspace1[7];

		/* stage 3 */
		out[0] = workspace2[0] + workspace2[4];
		out[1] = workspace2[0] - workspace2[4];
		out[2] = workspace2[1] - workspace2[5];
		out[3] = workspace2[1] + workspace2[5];
		out[4] = workspace2[2] + workspace2[6];
		out[5] = workspace2[2] - workspace2[6];
		out[6] = workspace2[3] - workspace2[7];
		out[7] = workspace2[3] + workspace2[7];
	}

	out = output_block;

	for (i = 0; i < 8; i++, out++) {
		/* stage 1 */
		workspace1[0] = out[0] + out[1 * 8];
		workspace1[1] = out[0] - out[1 * 8];

		workspace1[2] = out[2 * 8] + out[3 * 8];
		workspace1[3] = out[2 * 8] - out[3 * 8];

		workspace1[4] = out[4 * 8] + out[5 * 8];
		workspace1[5] = out[4 * 8] - out[5 * 8];

		workspace1[6] = out[6 * 8] + out[7 * 8];
		workspace1[7] = out[6 * 8] - out[7 * 8];

		/* stage 2 */
		workspace2[0] = workspace1[0] + workspace1[2];
		workspace2[1] = workspace1[0] - workspace1[2];
		workspace2[2] = workspace1[1] - workspace1[3];
		workspace2[3] = workspace1[1] + workspace1[3];

		workspace2[4] = workspace1[4] + workspace1[6];
		workspace2[5] = workspace1[4] - workspace1[6];
		workspace2[6] = workspace1[5] - workspace1[7];
		workspace2[7] = workspace1[5] + workspace1[7];

		/* stage 3 */
		out[0 * 8] = workspace2[0] + workspace2[4];
		out[1 * 8] = workspace2[0] - workspace2[4];
		out[2 * 8] = workspace2[1] - workspace2[5];
		out[3 * 8] = workspace2[1] + workspace2[5];
		out[4 * 8] = workspace2[2] + workspace2[6];
		out[5 * 8] = workspace2[2] - workspace2[6];
		out[6 * 8] = workspace2[3] - workspace2[7];
		out[7 * 8] = workspace2[3] + workspace2[7];
	}
}

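/*
 * Inverse 8x8 FWHT. The '>> 6' divides by 64, the normalization factor
 * of the unnormalized 8x8 transform (a factor of 8 per dimension), and
 * intra blocks get 128 added back to undo the centering in fwht(). As
 * a sanity check: a flat intra block of 128s forward-transforms to all
 * zeros and comes back as 128s.
 */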
static void ifwht(const s16 *block, s16 *output_block, int intra)
{
	/*
	 * we'll need more than 8 bits for the transformed coefficients,
	 * so use the CPU's native integer type
	 */
	int workspace1[8], workspace2[8];
	int inter = intra ? 0 : 1;
	const s16 *tmp = block;
	s16 *out = output_block;
	int i;

	for (i = 0; i < 8; i++, tmp += 8, out += 8) {
		/* stage 1 */
		workspace1[0] = tmp[0] + tmp[1];
		workspace1[1] = tmp[0] - tmp[1];

		workspace1[2] = tmp[2] + tmp[3];
		workspace1[3] = tmp[2] - tmp[3];

		workspace1[4] = tmp[4] + tmp[5];
		workspace1[5] = tmp[4] - tmp[5];

		workspace1[6] = tmp[6] + tmp[7];
		workspace1[7] = tmp[6] - tmp[7];

		/* stage 2 */
		workspace2[0] = workspace1[0] + workspace1[2];
		workspace2[1] = workspace1[0] - workspace1[2];
		workspace2[2] = workspace1[1] - workspace1[3];
		workspace2[3] = workspace1[1] + workspace1[3];

		workspace2[4] = workspace1[4] + workspace1[6];
		workspace2[5] = workspace1[4] - workspace1[6];
		workspace2[6] = workspace1[5] - workspace1[7];
		workspace2[7] = workspace1[5] + workspace1[7];

		/* stage 3 */
		out[0] = workspace2[0] + workspace2[4];
		out[1] = workspace2[0] - workspace2[4];
		out[2] = workspace2[1] - workspace2[5];
		out[3] = workspace2[1] + workspace2[5];
		out[4] = workspace2[2] + workspace2[6];
		out[5] = workspace2[2] - workspace2[6];
		out[6] = workspace2[3] - workspace2[7];
		out[7] = workspace2[3] + workspace2[7];
	}

	out = output_block;

	for (i = 0; i < 8; i++, out++) {
		/* stage 1 */
		workspace1[0] = out[0] + out[1 * 8];
		workspace1[1] = out[0] - out[1 * 8];

		workspace1[2] = out[2 * 8] + out[3 * 8];
		workspace1[3] = out[2 * 8] - out[3 * 8];

		workspace1[4] = out[4 * 8] + out[5 * 8];
		workspace1[5] = out[4 * 8] - out[5 * 8];

		workspace1[6] = out[6 * 8] + out[7 * 8];
		workspace1[7] = out[6 * 8] - out[7 * 8];

		/* stage 2 */
		workspace2[0] = workspace1[0] + workspace1[2];
		workspace2[1] = workspace1[0] - workspace1[2];
		workspace2[2] = workspace1[1] - workspace1[3];
		workspace2[3] = workspace1[1] + workspace1[3];

		workspace2[4] = workspace1[4] + workspace1[6];
		workspace2[5] = workspace1[4] - workspace1[6];
		workspace2[6] = workspace1[5] - workspace1[7];
		workspace2[7] = workspace1[5] + workspace1[7];

		/* stage 3 */
		if (inter) {
			int d;

			out[0 * 8] = workspace2[0] + workspace2[4];
			out[1 * 8] = workspace2[0] - workspace2[4];
			out[2 * 8] = workspace2[1] - workspace2[5];
			out[3 * 8] = workspace2[1] + workspace2[5];
			out[4 * 8] = workspace2[2] + workspace2[6];
			out[5 * 8] = workspace2[2] - workspace2[6];
			out[6 * 8] = workspace2[3] - workspace2[7];
			out[7 * 8] = workspace2[3] + workspace2[7];

			for (d = 0; d < 8; d++)
				out[8 * d] >>= 6;
		} else {
			int d;

			out[0 * 8] = workspace2[0] + workspace2[4];
			out[1 * 8] = workspace2[0] - workspace2[4];
			out[2 * 8] = workspace2[1] - workspace2[5];
			out[3 * 8] = workspace2[1] + workspace2[5];
			out[4 * 8] = workspace2[2] + workspace2[6];
			out[5 * 8] = workspace2[2] - workspace2[6];
			out[6 * 8] = workspace2[3] - workspace2[7];
			out[7 * 8] = workspace2[3] + workspace2[7];

			for (d = 0; d < 8; d++) {
				out[8 * d] >>= 6;
				out[8 * d] += 128;
			}
		}
	}
}

static void fill_encoder_block(const u8 *input, s16 *dst,
			       unsigned int stride, unsigned int input_step)
{
	int i, j;

	for (i = 0; i < 8; i++) {
		for (j = 0; j < 8; j++, input += input_step)
			*dst++ = *input;
		input += (stride - 8) * input_step;
	}
}

static int var_intra(const s16 *input)
{
	int32_t mean = 0;
	int32_t ret = 0;
	const s16 *tmp = input;
	int i;

	for (i = 0; i < 8 * 8; i++, tmp++)
		mean += *tmp;
	mean /= 64;
	tmp = input;
	for (i = 0; i < 8 * 8; i++, tmp++)
		ret += (*tmp - mean) < 0 ? -(*tmp - mean) : (*tmp - mean);
	return ret;
}

static int var_inter(const s16 *old, const s16 *new)
{
	int32_t ret = 0;
	int i;

	for (i = 0; i < 8 * 8; i++, old++, new++)
		ret += (*old - *new) < 0 ? -(*old - *new) : (*old - *new);
	return ret;
}

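/*
 * Decide whether a block is better coded intra or as a delta against
 * the reference, filling deltablock with the residual as a side effect.
 * var_intra() is the sum of absolute deviations from the block mean,
 * var_inter() the sum of absolute differences against the reference;
 * the smaller of the two should be the cheaper block to code. A static
 * background block, for instance, barely differs from the reference,
 * so its inter "variance" is near zero and it becomes a PBLOCK.
 */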
static int decide_blocktype(const u8 *cur, const u8 *reference,
			    s16 *deltablock, unsigned int stride,
			    unsigned int input_step)
{
	s16 tmp[64];
	s16 old[64];
	s16 *work = tmp;
	unsigned int k, l;
	int vari;
	int vard;

	fill_encoder_block(cur, tmp, stride, input_step);
	fill_encoder_block(reference, old, 8, 1);
	vari = var_intra(tmp);

	for (k = 0; k < 8; k++) {
		for (l = 0; l < 8; l++) {
			*deltablock = *work - *reference;
			deltablock++;
			work++;
			reference++;
		}
	}
	deltablock -= 64;
	vard = var_inter(old, tmp);
	return vari <= vard ? IBLOCK : PBLOCK;
}

static void fill_decoder_block(u8 *dst, const s16 *input, int stride)
{
	int i, j;

	for (i = 0; i < 8; i++) {
		for (j = 0; j < 8; j++, input++, dst++) {
			if (*input < 0)
				*dst = 0;
			else if (*input > 255)
				*dst = 255;
			else
				*dst = *input;
		}
		dst += stride - 8;
	}
}

static void add_deltas(s16 *deltas, const u8 *ref, int stride)
{
	int k, l;

	for (k = 0; k < 8; k++) {
		for (l = 0; l < 8; l++) {
			*deltas += *ref++;
			/*
			 * Due to quantizing, it is possible that the
			 * decoded coefficients are slightly out of range
			 */
			if (*deltas < 0)
				*deltas = 0;
			else if (*deltas > 255)
				*deltas = 255;
			deltas++;
		}
		ref += stride - 8;
	}
}

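/*
 * Encode one plane as run-length coded 8x8 blocks. When a block's code
 * words exactly match those of the previous block, it is not emitted
 * again; the duplicate count stored in the previous header is bumped
 * instead ('hdr + 2', since decode_plane() extracts the count with
 * (stat & DUPS_MASK) >> 1). If the output would overrun rlco_max, the
 * plane is marked FRAME_UNENCODED and stored as raw YUV bytes instead.
 */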
static u32 encode_plane(u8 *input, u8 *refp, __be16 **rlco, __be16 *rlco_max,
			struct cframe *cf, u32 height, u32 width,
			unsigned int input_step,
			bool is_intra, bool next_is_intra)
{
	u8 *input_start = input;
	__be16 *rlco_start = *rlco;
	s16 deltablock[64];
	__be16 pframe_bit = htons(PFRAME_BIT);
	u32 encoding = 0;
	unsigned int last_size = 0;
	unsigned int i, j;

	for (j = 0; j < height / 8; j++) {
		for (i = 0; i < width / 8; i++) {
			/* intra code, first frame is always intra coded. */
			int blocktype = IBLOCK;
			unsigned int size;

			if (!is_intra)
				blocktype = decide_blocktype(input, refp,
						deltablock, width, input_step);
			if (is_intra || blocktype == IBLOCK) {
				fwht(input, cf->coeffs, width, input_step, 1);
				quantize_intra(cf->coeffs, cf->de_coeffs);
				blocktype = IBLOCK;
			} else {
				/* inter code */
				encoding |= FRAME_PCODED;
				fwht16(deltablock, cf->coeffs, 8, 0);
				quantize_inter(cf->coeffs, cf->de_coeffs);
			}
			if (!next_is_intra) {
				ifwht(cf->de_coeffs, cf->de_fwht, blocktype);

				if (blocktype == PBLOCK)
					add_deltas(cf->de_fwht, refp, 8);
				fill_decoder_block(refp, cf->de_fwht, 8);
			}

			input += 8 * input_step;
			refp += 8 * 8;

			if (encoding & FRAME_UNENCODED)
				continue;

			size = rlc(cf->coeffs, *rlco, blocktype);
			if (last_size == size &&
			    !memcmp(*rlco + 1, *rlco - size + 1, 2 * size - 2)) {
				__be16 *last_rlco = *rlco - size;
				s16 hdr = ntohs(*last_rlco);

				if (!((*last_rlco ^ **rlco) & pframe_bit) &&
				    (hdr & DUPS_MASK) < DUPS_MASK)
					*last_rlco = htons(hdr + 2);
				else
					*rlco += size;
			} else {
				*rlco += size;
			}
			if (*rlco >= rlco_max)
				encoding |= FRAME_UNENCODED;
			last_size = size;
		}
		input += width * 7 * input_step;
	}
	if (encoding & FRAME_UNENCODED) {
		u8 *out = (u8 *)rlco_start;

		input = input_start;
		/*
		 * The compressed stream should never contain the magic
		 * header, so when we copy the YUV data we replace 0xff
		 * by 0xfe. Since YUV is limited range such values
		 * shouldn't appear anyway.
		 */
		for (i = 0; i < height * width; i++, input += input_step)
			*out++ = (*input == 0xff) ? 0xfe : *input;
		*rlco = (__be16 *)out;
	}
	return encoding;
}

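/*
 * Encode a full frame: the luma plane followed by the two half-size
 * chroma planes (4:2:0). Each plane gets its own rlco_max bound with a
 * 256-word safety margin below the raw plane size (size / 2 words for
 * luma, size / 8 words per chroma plane), leaving room to fall back to
 * an uncompressed plane. Per-plane FRAME_UNENCODED results are folded
 * into the LUMA/CB/CR_UNENCODED flags.
 */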
u32 encode_frame(struct raw_frame *frm, struct raw_frame *ref_frm,
		 struct cframe *cf, bool is_intra, bool next_is_intra)
{
	unsigned int size = frm->height * frm->width;
	__be16 *rlco = cf->rlc_data;
	__be16 *rlco_max;
	u32 encoding;

	rlco_max = rlco + size / 2 - 256;
	encoding = encode_plane(frm->luma, ref_frm->luma, &rlco, rlco_max, cf,
				frm->height, frm->width,
				1, is_intra, next_is_intra);
	if (encoding & FRAME_UNENCODED)
		encoding |= LUMA_UNENCODED;
	encoding &= ~FRAME_UNENCODED;
	rlco_max = rlco + size / 8 - 256;
	encoding |= encode_plane(frm->cb, ref_frm->cb, &rlco, rlco_max, cf,
				 frm->height / 2, frm->width / 2,
				 frm->chroma_step, is_intra, next_is_intra);
	if (encoding & FRAME_UNENCODED)
		encoding |= CB_UNENCODED;
	encoding &= ~FRAME_UNENCODED;
	rlco_max = rlco + size / 8 - 256;
	encoding |= encode_plane(frm->cr, ref_frm->cr, &rlco, rlco_max, cf,
				 frm->height / 2, frm->width / 2,
				 frm->chroma_step, is_intra, next_is_intra);
	if (encoding & FRAME_UNENCODED)
		encoding |= CR_UNENCODED;
	encoding &= ~FRAME_UNENCODED;
	cf->size = (rlco - cf->rlc_data) * sizeof(*rlco);
	return encoding;
}

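/*
 * Decode one plane into the reference buffer. PFRAME_BIT in a block
 * header selects inter dequantization and adds the decoded deltas onto
 * the reference block; otherwise the block is written out directly.
 * The duplicate count in the header replays the same decoded block for
 * the following macroblocks.
 */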
static void decode_plane(struct cframe *cf, const __be16 **rlco, u8 *ref,
			 u32 height, u32 width, bool uncompressed)
{
	unsigned int copies = 0;
	s16 copy[8 * 8];
	s16 stat;
	unsigned int i, j;

	if (uncompressed) {
		memcpy(ref, *rlco, width * height);
		*rlco += width * height / 2;
		return;
	}

	/*
	 * When decoding each macroblock the rlco pointer will be increased
	 * by 65 * 2 bytes worst-case.
	 * To avoid overflow the buffer has to be 65/64th of the actual raw
	 * image size, just in case someone feeds it malicious data.
	 */
	for (j = 0; j < height / 8; j++) {
		for (i = 0; i < width / 8; i++) {
			u8 *refp = ref + j * 8 * width + i * 8;

			if (copies) {
				memcpy(cf->de_fwht, copy, sizeof(copy));
				if (stat & PFRAME_BIT)
					add_deltas(cf->de_fwht, refp, width);
				fill_decoder_block(refp, cf->de_fwht, width);
				copies--;
				continue;
			}

			stat = derlc(rlco, cf->coeffs);

			if (stat & PFRAME_BIT)
				dequantize_inter(cf->coeffs);
			else
				dequantize_intra(cf->coeffs);

			ifwht(cf->coeffs, cf->de_fwht,
			      (stat & PFRAME_BIT) ? 0 : 1);

			copies = (stat & DUPS_MASK) >> 1;
			if (copies)
				memcpy(copy, cf->de_fwht, sizeof(copy));
			if (stat & PFRAME_BIT)
				add_deltas(cf->de_fwht, refp, width);
			fill_decoder_block(refp, cf->de_fwht, width);
		}
	}
}

void decode_frame(struct cframe *cf, struct raw_frame *ref, u32 hdr_flags)
{
	const __be16 *rlco = cf->rlc_data;

	decode_plane(cf, &rlco, ref->luma, cf->height, cf->width,
		     hdr_flags & VICODEC_FL_LUMA_IS_UNCOMPRESSED);
	decode_plane(cf, &rlco, ref->cb, cf->height / 2, cf->width / 2,
		     hdr_flags & VICODEC_FL_CB_IS_UNCOMPRESSED);
	decode_plane(cf, &rlco, ref->cr, cf->height / 2, cf->width / 2,
		     hdr_flags & VICODEC_FL_CR_IS_UNCOMPRESSED);
}