/*
 * Copyright 2013 Ilia Mirkin
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
22
23 #include "nv50/nv84_video.h"
24
25 #include "util/u_sse.h"
26
/*
 * First H.264 parameter block.
 *
 * nv84_decoder_vp_h264() fills it in and copies it to offset 0 of the
 * dec->vp_params buffer.  The trailing comments give each field's byte
 * offset; the total size must remain 0x218 (that function STATIC_ASSERTs
 * it), so fields must not be reordered, resized or inserted.
 */
struct h264_iparm1 {
   uint8_t scaling_lists_4x4[6][16]; // 00
   uint8_t scaling_lists_8x8[2][64]; // 60
   uint32_t width; // e0
   uint32_t height; // e4
   uint64_t ref1_addrs[16]; // e8  -- one address per reference slot
   uint64_t ref2_addrs[16]; // 168 -- one address per reference slot
   uint32_t unk1e8;
   uint32_t unk1ec;
   uint32_t w1; // 1f0
   uint32_t w2; // 1f4
   uint32_t w3; // 1f8
   uint32_t h1; // 1fc
   uint32_t h2; // 200
   uint32_t h3; // 204
   uint32_t mb_adaptive_frame_field_flag; // 208
   uint32_t field_pic_flag; // 20c
   uint32_t format; // 210
   uint32_t unk214; // 214
};
47
/*
 * Second H.264 parameter block.
 *
 * nv84_decoder_vp_h264() fills it in and copies it to offset 0x400 of the
 * dec->vp_params buffer.  The trailing comments give each field's byte
 * offset; the total size must remain 0x38 (that function STATIC_ASSERTs
 * it), so fields must not be reordered, resized or inserted.
 */
struct h264_iparm2 {
   uint32_t width; // 00
   uint32_t height; // 04
   uint32_t mbs; // 08 -- width * height >> 8
   uint32_t w1; // 0c
   uint32_t w2; // 10
   uint32_t w3; // 14
   uint32_t h1; // 18
   uint32_t h2; // 1c
   uint32_t h3; // 20
   uint32_t unk24;
   uint32_t mb_adaptive_frame_field_flag; // 28
   uint32_t top; // 2c
   uint32_t bottom; // 30
   uint32_t is_reference; // 34
};
64
65 void
nv84_decoder_vp_h264(struct nv84_decoder * dec,struct pipe_h264_picture_desc * desc,struct nv84_video_buffer * dest)66 nv84_decoder_vp_h264(struct nv84_decoder *dec,
67 struct pipe_h264_picture_desc *desc,
68 struct nv84_video_buffer *dest)
69 {
70 struct h264_iparm1 param1;
71 struct h264_iparm2 param2;
72 int i, width = align(dest->base.width, 16),
73 height = align(dest->base.height, 16);
74
75 struct nouveau_pushbuf *push = dec->vp_pushbuf;
76 struct nouveau_pushbuf_refn bo_refs[] = {
77 { dest->interlaced, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
78 { dest->full, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
79 { dec->vpring, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
80 { dec->mbring, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
81 { dec->vp_params, NOUVEAU_BO_RDWR | NOUVEAU_BO_GART },
82 { dec->fence, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
83 };
84 int num_refs = ARRAY_SIZE(bo_refs);
85 bool is_ref = desc->is_reference;
86
87 STATIC_ASSERT(sizeof(struct h264_iparm1) == 0x218);
88 STATIC_ASSERT(sizeof(struct h264_iparm2) == 0x38);
89
90 memset(¶m1, 0, sizeof(param1));
91 memset(¶m2, 0, sizeof(param2));
92
93 memcpy(¶m1.scaling_lists_4x4, desc->pps->ScalingList4x4,
94 sizeof(param1.scaling_lists_4x4));
95 memcpy(¶m1.scaling_lists_8x8, desc->pps->ScalingList8x8,
96 sizeof(param1.scaling_lists_8x8));
97
98 param1.width = width;
99 param1.w1 = param1.w2 = param1.w3 = align(width, 64);
100 param1.height = param1.h2 = height;
101 param1.h1 = param1.h3 = align(height, 32);
102 param1.format = 0x3231564e; /* 'NV12' */
103 param1.mb_adaptive_frame_field_flag = desc->pps->sps->mb_adaptive_frame_field_flag;
104 param1.field_pic_flag = desc->field_pic_flag;
105
106 param2.width = width;
107 param2.w1 = param2.w2 = param2.w3 = param1.w1;
108 if (desc->field_pic_flag)
109 param2.height = align(height, 32) / 2;
110 else
111 param2.height = height;
112 param2.h1 = param2.h2 = align(height, 32);
113 param2.h3 = height;
114 param2.mbs = width * height >> 8;
115 if (desc->field_pic_flag) {
116 param2.top = desc->bottom_field_flag ? 2 : 1;
117 param2.bottom = desc->bottom_field_flag;
118 }
119 param2.mb_adaptive_frame_field_flag = desc->pps->sps->mb_adaptive_frame_field_flag;
120 param2.is_reference = desc->is_reference;
121
122 PUSH_SPACE(push, 5 + 16 + 3 + 2 + 6 + (is_ref ? 2 : 0) + 3 + 2 + 4 + 2);
123
124 struct nouveau_bo *ref2_default = dest->full;
125
126 for (i = 0; i < 16; i++) {
127 struct nv84_video_buffer *buf = (struct nv84_video_buffer *)desc->ref[i];
128 struct nouveau_bo *bo1, *bo2;
129 if (buf) {
130 bo1 = buf->interlaced;
131 bo2 = buf->full;
132 if (i == 0)
133 ref2_default = buf->full;
134 } else {
135 bo1 = dest->interlaced;
136 bo2 = ref2_default;
137 }
138 param1.ref1_addrs[i] = bo1->offset;
139 param1.ref2_addrs[i] = bo2->offset;
140 struct nouveau_pushbuf_refn bo_refs[] = {
141 { bo1, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
142 { bo2, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
143 };
144 nouveau_pushbuf_refn(push, bo_refs, ARRAY_SIZE(bo_refs));
145 }
146
147 memcpy(dec->vp_params->map, ¶m1, sizeof(param1));
148 memcpy(dec->vp_params->map + 0x400, ¶m2, sizeof(param2));
149
150 nouveau_pushbuf_refn(push, bo_refs, num_refs);
151
152 /* Wait for BSP to have completed */
153 BEGIN_NV04(push, SUBC_VP(0x10), 4);
154 PUSH_DATAh(push, dec->fence->offset);
155 PUSH_DATA (push, dec->fence->offset);
156 PUSH_DATA (push, 2);
157 PUSH_DATA (push, 1); /* wait for sem == 2 */
158
159 /* VP step 1 */
160 BEGIN_NV04(push, SUBC_VP(0x400), 15);
161 PUSH_DATA (push, 1);
162 PUSH_DATA (push, param2.mbs);
163 PUSH_DATA (push, 0x3987654); /* each nibble probably a dma index */
164 PUSH_DATA (push, 0x55001); /* constant */
165 PUSH_DATA (push, dec->vp_params->offset >> 8);
166 PUSH_DATA (push, (dec->vpring->offset + dec->vpring_residual) >> 8);
167 PUSH_DATA (push, dec->vpring_ctrl);
168 PUSH_DATA (push, dec->vpring->offset >> 8);
169 PUSH_DATA (push, dec->bitstream->size / 2 - 0x700);
170 PUSH_DATA (push, (dec->mbring->offset + dec->mbring->size - 0x2000) >> 8);
171 PUSH_DATA (push, (dec->vpring->offset + dec->vpring_ctrl +
172 dec->vpring_residual + dec->vpring_deblock) >> 8);
173 PUSH_DATA (push, 0);
174 PUSH_DATA (push, 0x100008);
175 PUSH_DATA (push, dest->interlaced->offset >> 8);
176 PUSH_DATA (push, 0);
177
178 BEGIN_NV04(push, SUBC_VP(0x620), 2);
179 PUSH_DATA (push, 0);
180 PUSH_DATA (push, 0);
181
182 BEGIN_NV04(push, SUBC_VP(0x300), 1);
183 PUSH_DATA (push, 0);
184
185 /* VP step 2 */
186 BEGIN_NV04(push, SUBC_VP(0x400), 5);
187 PUSH_DATA (push, 0x54530201);
188 PUSH_DATA (push, (dec->vp_params->offset >> 8) + 0x4);
189 PUSH_DATA (push, (dec->vpring->offset + dec->vpring_ctrl +
190 dec->vpring_residual) >> 8);
191 PUSH_DATA (push, dest->interlaced->offset >> 8);
192 PUSH_DATA (push, dest->interlaced->offset >> 8);
193
194 if (is_ref) {
195 BEGIN_NV04(push, SUBC_VP(0x414), 1);
196 PUSH_DATA (push, dest->full->offset >> 8);
197 }
198
199 BEGIN_NV04(push, SUBC_VP(0x620), 2);
200 PUSH_DATAh(push, dec->vp_fw2_offset);
201 PUSH_DATA (push, dec->vp_fw2_offset);
202
203 BEGIN_NV04(push, SUBC_VP(0x300), 1);
204 PUSH_DATA (push, 0);
205
206 /* Set the semaphore back to 1 */
207 BEGIN_NV04(push, SUBC_VP(0x610), 3);
208 PUSH_DATAh(push, dec->fence->offset);
209 PUSH_DATA (push, dec->fence->offset);
210 PUSH_DATA (push, 1);
211
212 /* Write to the semaphore location, intr */
213 BEGIN_NV04(push, SUBC_VP(0x304), 1);
214 PUSH_DATA (push, 0x101);
215
216 for (i = 0; i < 2; i++) {
217 struct nv50_miptree *mt = nv50_miptree(dest->resources[i]);
218 mt->base.status |= NOUVEAU_BUFFER_STATUS_GPU_WRITING;
219 }
220
221 PUSH_KICK (push);
222 }
223
/*
 * Scale one coefficient by its quantiser-matrix entry and saturate it.
 *
 * val   - coefficient to scale
 * quant - quantiser-matrix entry for the coefficient's position
 * mpeg1 - non-zero to force non-zero results to be odd (MPEG-1 style):
 *         an even result is moved one step towards zero, an odd result is
 *         left alone
 *
 * Returns val * quant / 16 (truncating division), optionally made odd,
 * saturated to [-2048, 2047].
 */
static inline int16_t inverse_quantize(int16_t val, uint8_t quant, int mpeg1) {
   /*
    * Keep the intermediate in int: the scaled value can exceed the int16_t
    * range, and narrowing before the clamp would wrap it (possibly flipping
    * its sign) instead of saturating it.
    */
   int ret = val * quant / 16;

   /*
    * Only even, non-zero values are adjusted.  The previous bit trick
    * "(ret + 1) | 1" also altered negative odd values (-3 became -1 and
    * -1 became +1).
    */
   if (mpeg1 && ret && ret % 2 == 0)
      ret += (ret > 0) ? -1 : 1;

   if (ret < -2048)
      ret = -2048;
   else if (ret > 2047)
      ret = 2047;
   return (int16_t)ret;
}
238
/*
 * Per-macroblock record appended to dec->mpeg12_mb_info by
 * nv84_decoder_vp_mpeg12_mb().  The record is copied verbatim and must stay
 * exactly 32 bytes (that function STATIC_ASSERTs it).
 */
struct mpeg12_mb_info {
   uint32_t index;               /* macrob->y * mb(width) + macrob->x */
   uint8_t unk4;                 /* motion type bits, | 1 if intra, | 0x20 if dct_type */
   uint8_t unk5;                 /* field select << 4 | low nibble of macroblock_modes */
   uint16_t coded_block_pattern;
   uint8_t block_counts[6];      /* (index, value) pairs emitted per block */
   uint16_t PMV[8];              /* copied from macrob->PMV when motion is set */
   uint16_t skipped;             /* num_skipped_macroblocks - 1 in the follow-up record */
};
248
249 void
nv84_decoder_vp_mpeg12_mb(struct nv84_decoder * dec,struct pipe_mpeg12_picture_desc * desc,const struct pipe_mpeg12_macroblock * macrob)250 nv84_decoder_vp_mpeg12_mb(struct nv84_decoder *dec,
251 struct pipe_mpeg12_picture_desc *desc,
252 const struct pipe_mpeg12_macroblock *macrob)
253 {
254 STATIC_ASSERT(sizeof(struct mpeg12_mb_info) == 32);
255
256 struct mpeg12_mb_info info = {0};
257 int i, sum = 0, mask, block_index, count;
258 const int16_t *blocks;
259 int intra = macrob->macroblock_type & PIPE_MPEG12_MB_TYPE_INTRA;
260 int motion = macrob->macroblock_type &
261 (PIPE_MPEG12_MB_TYPE_MOTION_FORWARD | PIPE_MPEG12_MB_TYPE_MOTION_BACKWARD);
262 const uint8_t *quant_matrix = intra ? dec->mpeg12_intra_matrix :
263 dec->mpeg12_non_intra_matrix;
264 int mpeg1 = dec->base.profile == PIPE_VIDEO_PROFILE_MPEG1;
265
266 info.index = macrob->y * mb(dec->base.width) + macrob->x;
267 info.unk4 = motion;
268 if (intra)
269 info.unk4 |= 1;
270 if (macrob->macroblock_modes.bits.dct_type)
271 info.unk4 |= 0x20;
272 info.unk5 = (macrob->motion_vertical_field_select << 4) |
273 (macrob->macroblock_modes.value & 0xf);
274 info.coded_block_pattern = macrob->coded_block_pattern;
275 if (motion) {
276 memcpy(info.PMV, macrob->PMV, sizeof(info.PMV));
277 }
278 blocks = macrob->blocks;
279 for (mask = 0x20, block_index = 0; mask > 0; mask >>= 1, block_index++) {
280 if ((macrob->coded_block_pattern & mask) == 0)
281 continue;
282
283 count = 0;
284
285 /*
286 * The observation here is that there are a lot of 0's, and things go
287 * a lot faster if one skips over them.
288 */
289
290 #if defined(PIPE_ARCH_SSE) && defined(PIPE_ARCH_X86_64)
291 /* Note that the SSE implementation is much more tuned to X86_64. As it's not
292 * benchmarked on X86_32, disable it there. I suspect that the code needs to
293 * be reorganized in terms of 32-bit wide data in order to be more
294 * efficient. NV84+ were released well into the 64-bit CPU era, so it should
295 * be a minority case.
296 */
297
298 /* This returns a 16-bit bit-mask, each 2 bits are both 1 or both 0, depending
299 * on whether the corresponding (16-bit) word in blocks is zero or non-zero. */
300 #define wordmask(blocks, zero) \
301 (uint64_t)(_mm_movemask_epi8( \
302 _mm_cmpeq_epi16( \
303 zero, _mm_load_si128((__m128i *)(blocks)))))
304
305 __m128i zero = _mm_setzero_si128();
306
307 /* TODO: Look into doing the inverse quantization in terms of SSE
308 * operations unconditionally, when necessary. */
309 uint64_t bmask0 = wordmask(blocks, zero);
310 bmask0 |= wordmask(blocks + 8, zero) << 16;
311 bmask0 |= wordmask(blocks + 16, zero) << 32;
312 bmask0 |= wordmask(blocks + 24, zero) << 48;
313 uint64_t bmask1 = wordmask(blocks + 32, zero);
314 bmask1 |= wordmask(blocks + 40, zero) << 16;
315 bmask1 |= wordmask(blocks + 48, zero) << 32;
316 bmask1 |= wordmask(blocks + 56, zero) << 48;
317
318 /* The wordmask macro returns the inverse of what we want, since it
319 * returns a 1 for equal-to-zero. Invert. */
320 bmask0 = ~bmask0;
321 bmask1 = ~bmask1;
322
323 /* Note that the bitmask is actually sequences of 2 bits for each block
324 * index. This is because there is no movemask_epi16. That means that
325 * (a) ffs will never return 64, since the prev bit will always be set
326 * in that case, and (b) we need to do an extra bit shift. Or'ing the
327 * bitmasks together is faster than having a loop that computes them one
328 * at a time and processes them, on a Core i7-920. Trying to put bmask
329 * into an array and then looping also slows things down.
330 */
331
332 /* shift needs to be the same width as i, and unsigned so that / 2
333 * becomes a rshift operation */
334 uint32_t shift;
335 i = 0;
336
337 if (dec->base.entrypoint == PIPE_VIDEO_ENTRYPOINT_BITSTREAM) {
338 int16_t tmp;
339 while ((shift = __builtin_ffsll(bmask0))) {
340 i += (shift - 1) / 2;
341 bmask0 >>= shift - 1;
342 *dec->mpeg12_data++ = dec->zscan[i] * 2;
343 tmp = inverse_quantize(blocks[i], quant_matrix[i], mpeg1);
344 *dec->mpeg12_data++ = tmp;
345 sum += tmp;
346 count++;
347 i++;
348 bmask0 >>= 2;
349 }
350 i = 32;
351 while ((shift = __builtin_ffsll(bmask1))) {
352 i += (shift - 1) / 2;
353 bmask1 >>= shift - 1;
354 *dec->mpeg12_data++ = dec->zscan[i] * 2;
355 tmp = inverse_quantize(blocks[i], quant_matrix[i], mpeg1);
356 *dec->mpeg12_data++ = tmp;
357 sum += tmp;
358 count++;
359 i++;
360 bmask1 >>= 2;
361 }
362 } else {
363 while ((shift = __builtin_ffsll(bmask0))) {
364 i += (shift - 1) / 2;
365 bmask0 >>= shift - 1;
366 *dec->mpeg12_data++ = i * 2;
367 *dec->mpeg12_data++ = blocks[i];
368 count++;
369 i++;
370 bmask0 >>= 2;
371 }
372 i = 32;
373 while ((shift = __builtin_ffsll(bmask1))) {
374 i += (shift - 1) / 2;
375 bmask1 >>= shift - 1;
376 *dec->mpeg12_data++ = i * 2;
377 *dec->mpeg12_data++ = blocks[i];
378 count++;
379 i++;
380 bmask1 >>= 2;
381 }
382 }
383 #undef wordmask
384 #else
385
386 /*
387 * This loop looks ridiculously written... and it is. I tried a lot of
388 * different ways of achieving this scan, and this was the fastest, at
389 * least on a Core i7-920. Note that it's not necessary to skip the 0's,
390 * the firmware will deal with those just fine. But it's faster to skip
391 * them. Note to people trying benchmarks: make sure to use realistic
392 * mpeg data, which can often be a single data point first followed by
393 * 63 0's, or <data> 7x <0> <data> 7x <0> etc.
394 */
395 i = 0;
396 if (dec->base.entrypoint == PIPE_VIDEO_ENTRYPOINT_BITSTREAM) {
397 while (true) {
398 int16_t tmp;
399 while (likely(i < 64 && !(tmp = blocks[i]))) i++;
400 if (i >= 64) break;
401 *dec->mpeg12_data++ = dec->zscan[i] * 2;
402 tmp = inverse_quantize(tmp, quant_matrix[i], mpeg1);
403 *dec->mpeg12_data++ = tmp;
404 sum += tmp;
405 count++;
406 i++;
407 }
408 } else {
409 while (true) {
410 int16_t tmp;
411 while (likely(i < 64 && !(tmp = blocks[i]))) i++;
412 if (i >= 64) break;
413 *dec->mpeg12_data++ = i * 2;
414 *dec->mpeg12_data++ = tmp;
415 count++;
416 i++;
417 }
418 }
419
420 #endif
421
422 if (dec->base.entrypoint == PIPE_VIDEO_ENTRYPOINT_BITSTREAM) {
423 if (!mpeg1 && (sum & 1) == 0) {
424 if (count && *(dec->mpeg12_data - 2) == 63 * 2) {
425 uint16_t *val = dec->mpeg12_data - 1;
426 if (*val & 1) *val -= 1;
427 else *val += 1;
428 } else {
429 *dec->mpeg12_data++ = 63 * 2;
430 *dec->mpeg12_data++ = 1;
431 count++;
432 }
433 }
434 }
435
436 if (count) {
437 *(dec->mpeg12_data - 2) |= 1;
438 } else {
439 *dec->mpeg12_data++ = 1;
440 *dec->mpeg12_data++ = 0;
441 count = 1;
442 }
443 info.block_counts[block_index] = count;
444 blocks += 64;
445 }
446
447 memcpy(dec->mpeg12_mb_info, &info, sizeof(info));
448 dec->mpeg12_mb_info += sizeof(info);
449
450 if (macrob->num_skipped_macroblocks) {
451 info.index++;
452 info.coded_block_pattern = 0;
453 info.skipped = macrob->num_skipped_macroblocks - 1;
454 memset(info.block_counts, 0, sizeof(info.block_counts));
455 memcpy(dec->mpeg12_mb_info, &info, sizeof(info));
456 dec->mpeg12_mb_info += sizeof(info);
457 }
458 }
459
/*
 * Picture header that nv84_decoder_vp_mpeg12() copies to the start of
 * dec->mpeg12_bo.  The trailing comments give each field's byte offset; the
 * total size must remain 0x100 (that function STATIC_ASSERTs it).
 */
struct mpeg12_header {
   uint32_t luma_top_size; // 00
   uint32_t luma_bottom_size; // 04
   uint32_t chroma_top_size; // 08
   uint32_t mbs; // 0c
   uint32_t mb_info_size; // 10
   uint32_t mb_width_minus1; // 14
   uint32_t mb_height_minus1; // 18
   uint32_t width; // 1c
   uint32_t height; // 20
   uint8_t progressive; // 24
   uint8_t mocomp_only; // 25
   uint8_t frames; // 26
   uint8_t picture_structure; // 27
   uint32_t unk28; // 28 -- 0x50100
   uint32_t unk2c; // 2c
   uint32_t pad[4 * 13]; // 30 -- pads the header out to 0x100 bytes
};
478
479 void
nv84_decoder_vp_mpeg12(struct nv84_decoder * dec,struct pipe_mpeg12_picture_desc * desc,struct nv84_video_buffer * dest)480 nv84_decoder_vp_mpeg12(struct nv84_decoder *dec,
481 struct pipe_mpeg12_picture_desc *desc,
482 struct nv84_video_buffer *dest)
483 {
484 struct nouveau_pushbuf *push = dec->vp_pushbuf;
485 struct nv84_video_buffer *ref1 = (struct nv84_video_buffer *)desc->ref[0];
486 struct nv84_video_buffer *ref2 = (struct nv84_video_buffer *)desc->ref[1];
487 struct nouveau_pushbuf_refn bo_refs[] = {
488 { dest->interlaced, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
489 { NULL, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
490 { NULL, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
491 { dec->mpeg12_bo, NOUVEAU_BO_RDWR | NOUVEAU_BO_GART },
492 };
493 int i, num_refs = ARRAY_SIZE(bo_refs);
494 struct mpeg12_header header = {0};
495 struct nv50_miptree *y = nv50_miptree(dest->resources[0]);
496 struct nv50_miptree *uv = nv50_miptree(dest->resources[1]);
497
498 STATIC_ASSERT(sizeof(struct mpeg12_header) == 0x100);
499
500 if (!ref1)
501 ref1 = dest;
502 if (!ref2)
503 ref2 = dest;
504 bo_refs[1].bo = ref1->interlaced;
505 bo_refs[2].bo = ref2->interlaced;
506
507 header.luma_top_size = y->layer_stride;
508 header.luma_bottom_size = y->layer_stride;
509 header.chroma_top_size = uv->layer_stride;
510 header.mbs = mb(dec->base.width) * mb(dec->base.height);
511 header.mb_info_size = dec->mpeg12_mb_info - dec->mpeg12_bo->map - 0x100;
512 header.mb_width_minus1 = mb(dec->base.width) - 1;
513 header.mb_height_minus1 = mb(dec->base.height) - 1;
514 header.width = align(dec->base.width, 16);
515 header.height = align(dec->base.height, 16);
516 header.progressive = desc->frame_pred_frame_dct;
517 header.frames = 1 + (desc->ref[0] != NULL) + (desc->ref[1] != NULL);
518 header.picture_structure = desc->picture_structure;
519 header.unk28 = 0x50100;
520
521 memcpy(dec->mpeg12_bo->map, &header, sizeof(header));
522
523 PUSH_SPACE(push, 10 + 3 + 2);
524
525 nouveau_pushbuf_refn(push, bo_refs, num_refs);
526
527 BEGIN_NV04(push, SUBC_VP(0x400), 9);
528 PUSH_DATA (push, 0x543210); /* each nibble possibly a dma index */
529 PUSH_DATA (push, 0x555001); /* constant */
530 PUSH_DATA (push, dec->mpeg12_bo->offset >> 8);
531 PUSH_DATA (push, (dec->mpeg12_bo->offset + 0x100) >> 8);
532 PUSH_DATA (push, (dec->mpeg12_bo->offset + 0x100 +
533 align(0x20 * mb(dec->base.width) *
534 mb(dec->base.height), 0x100)) >> 8);
535 PUSH_DATA (push, dest->interlaced->offset >> 8);
536 PUSH_DATA (push, ref1->interlaced->offset >> 8);
537 PUSH_DATA (push, ref2->interlaced->offset >> 8);
538 PUSH_DATA (push, 6 * 64 * 8 * header.mbs);
539
540 BEGIN_NV04(push, SUBC_VP(0x620), 2);
541 PUSH_DATA (push, 0);
542 PUSH_DATA (push, 0);
543
544 BEGIN_NV04(push, SUBC_VP(0x300), 1);
545 PUSH_DATA (push, 0);
546
547 for (i = 0; i < 2; i++) {
548 struct nv50_miptree *mt = nv50_miptree(dest->resources[i]);
549 mt->base.status |= NOUVEAU_BUFFER_STATUS_GPU_WRITING;
550 }
551 PUSH_KICK (push);
552 }
553