/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

#include "vc4_qpu.h"

static void
fail_instr(uint64_t inst, const char *msg)
{
        fprintf(stderr, "vc4_qpu_validate: %s: ", msg);
        vc4_qpu_disasm(&inst, 1);
        fprintf(stderr, "\n");
        abort();
}

static bool
writes_reg(uint64_t inst, uint32_t w)
{
        return (QPU_GET_FIELD(inst, QPU_WADDR_ADD) == w ||
                QPU_GET_FIELD(inst, QPU_WADDR_MUL) == w);
}

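/* Returns whether any of the instruction's four ALU source muxes reads
 * register address r, optionally ignoring regfile A or regfile B reads (a
 * physical register address names a different register in each file).
 */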
static bool
_reads_reg(uint64_t inst, uint32_t r, bool ignore_a, bool ignore_b)
{
        uint32_t src_muxes[] = {
                QPU_GET_FIELD(inst, QPU_ADD_A),
                QPU_GET_FIELD(inst, QPU_ADD_B),
                QPU_GET_FIELD(inst, QPU_MUL_A),
                QPU_GET_FIELD(inst, QPU_MUL_B),
        };

        /* Branches only reference raddr_a (no mux), and we don't use that
         * feature of branching.
         */
        if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_BRANCH)
                return false;

        /* Load immediates don't read any registers. */
        if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_LOAD_IMM)
                return false;

        for (int i = 0; i < ARRAY_SIZE(src_muxes); i++) {
                if (!ignore_a &&
                    src_muxes[i] == QPU_MUX_A &&
                    (QPU_GET_FIELD(inst, QPU_RADDR_A) == r))
                        return true;

                if (!ignore_b &&
                    QPU_GET_FIELD(inst, QPU_SIG) != QPU_SIG_SMALL_IMM &&
                    src_muxes[i] == QPU_MUX_B &&
                    (QPU_GET_FIELD(inst, QPU_RADDR_B) == r))
                        return true;
        }

        return false;
}

static bool
reads_reg(uint64_t inst, uint32_t r)
{
        return _reads_reg(inst, r, false, false);
}

static bool
reads_a_reg(uint64_t inst, uint32_t r)
{
        return _reads_reg(inst, r, false, true);
}

static bool
reads_b_reg(uint64_t inst, uint32_t r)
{
        return _reads_reg(inst, r, true, false);
}

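/* The four SFU write addresses trigger a special-function lookup (reciprocal,
 * reciprocal square root, exp2, log2) whose result lands in accumulator r4
 * two instructions later.
 */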
static bool
writes_sfu(uint64_t inst)
{
        return (writes_reg(inst, QPU_W_SFU_RECIP) ||
                writes_reg(inst, QPU_W_SFU_RECIPSQRT) ||
                writes_reg(inst, QPU_W_SFU_EXP) ||
                writes_reg(inst, QPU_W_SFU_LOG));
}

/**
 * Checks for the instruction restrictions from page 37 ("Summary of
 * Instruction Restrictions").
 */
void
vc4_qpu_validate(uint64_t *insts, uint32_t num_inst)
{
        bool scoreboard_locked = false;
        bool threaded = false;

        /* We don't want to do validation in release builds, but we want to
         * keep compiling the validation code to make sure it doesn't get
         * broken.
         */
#ifndef DEBUG
        return;
#endif

        for (int i = 0; i < num_inst; i++) {
                uint64_t inst = insts[i];
                uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

                if (sig != QPU_SIG_PROG_END) {
                        if (qpu_inst_is_tlb(inst))
                                scoreboard_locked = true;

                        if (sig == QPU_SIG_THREAD_SWITCH ||
                            sig == QPU_SIG_LAST_THREAD_SWITCH) {
                                threaded = true;
                        }

                        continue;
                }

                /* "The Thread End instruction must not write to either physical
                 * regfile A or B."
                 */
                if (QPU_GET_FIELD(inst, QPU_WADDR_ADD) < 32 ||
                    QPU_GET_FIELD(inst, QPU_WADDR_MUL) < 32) {
                        fail_instr(inst, "write to phys reg in thread end");
                }

                /* Can't trigger an implicit wait on the scoreboard in the
                 * program end instruction.
                 */
                if (qpu_inst_is_tlb(inst) && !scoreboard_locked)
                        fail_instr(inst, "implicit sb wait in program end");

                /* Two delay slots will be executed, so both must exist. */
                assert(i + 2 < num_inst);

                for (int j = i; j <= i + 2; j++) {
                        /* "The last three instructions of any program
                         * (Thread End plus the following two delay-slot
                         * instructions) must not do varyings read, uniforms
                         * read or any kind of VPM, VDR, or VDW read or
                         * write."
                         */
                        if (writes_reg(insts[j], QPU_W_VPM) ||
                            reads_reg(insts[j], QPU_R_VARY) ||
                            reads_reg(insts[j], QPU_R_UNIF) ||
                            reads_reg(insts[j], QPU_R_VPM)) {
                                fail_instr(insts[j], "last 3 instructions "
                                           "using fixed functions");
                        }

                        /* "The Thread End instruction and the following two
                         * delay slot instructions must not write or read
                         * address 14 in either regfile A or B."
                         */
                        if (writes_reg(insts[j], 14) ||
                            reads_reg(insts[j], 14)) {
                                fail_instr(insts[j], "last 3 instructions "
                                           "must not use r14");
                        }
                }

                /* "The final program instruction (the second delay slot
                 * instruction) must not do a TLB Z write."
                 */
                if (writes_reg(insts[i + 2], QPU_W_TLB_Z)) {
                        fail_instr(insts[i + 2], "final instruction doing "
                                   "Z write");
                }
        }

        /* "A scoreboard wait must not occur in the first two instructions of
         * a fragment shader. This is either the explicit Wait for Scoreboard
         * signal or an implicit wait with the first tile-buffer read or
         * write instruction."
         */
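        /* Only the implicit wait is detected here: qpu_inst_is_tlb() flags
         * tile-buffer accesses, while an explicit Wait for Scoreboard signal
         * in the first two instructions would go unreported.
         */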
        for (int i = 0; i < 2; i++) {
                uint64_t inst = insts[i];

                if (qpu_inst_is_tlb(inst))
                        fail_instr(inst, "sb wait in first two insts");
        }

        /* "If TMU_NOSWAP is written, the write must be three instructions
         * before the first TMU write instruction. For example, if
         * TMU_NOSWAP is written in the first shader instruction, the first
         * TMU write cannot occur before the 4th shader instruction."
         */
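        /* Start the sentinel far enough back that the distance check can't
         * fire before the first real TMU_NOSWAP write.
         */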
        int last_tmu_noswap = -10;
        for (int i = 0; i < num_inst; i++) {
                uint64_t inst = insts[i];

                if ((i - last_tmu_noswap) <= 3 &&
                    (writes_reg(inst, QPU_W_TMU0_S) ||
                     writes_reg(inst, QPU_W_TMU1_S))) {
                        fail_instr(inst, "TMU write too soon after TMU_NOSWAP");
                }

                if (writes_reg(inst, QPU_W_TMU_NOSWAP))
                        last_tmu_noswap = i;
        }

        /* "An instruction must not read from a location in physical regfile A
         * or B that was written to by the previous instruction."
         */
        for (int i = 0; i < num_inst - 1; i++) {
                uint64_t inst = insts[i];
                uint32_t add_waddr = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
                uint32_t mul_waddr = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
                uint32_t waddr_a, waddr_b;

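                /* The WS bit swaps the ALUs' write targets: with WS set, the
                 * add ALU writes regfile B and the mul ALU writes regfile A.
                 */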
                if (inst & QPU_WS) {
                        waddr_b = add_waddr;
                        waddr_a = mul_waddr;
                } else {
                        waddr_a = add_waddr;
                        waddr_b = mul_waddr;
                }

                if ((waddr_a < 32 && reads_a_reg(insts[i + 1], waddr_a)) ||
                    (waddr_b < 32 && reads_b_reg(insts[i + 1], waddr_b))) {
                        fail_instr(insts[i + 1],
                                   "Reads physical reg too soon after write");
                }
        }

        /* "After an SFU lookup instruction, accumulator r4 must not be read
         * in the following two instructions. Any other instruction that
         * results in r4 being written (that is, TMU read, TLB read, SFU
         * lookup) cannot occur in the two instructions following an SFU
         * lookup."
         */
        int last_sfu_inst = -10;
        for (int i = 0; i < num_inst; i++) {
                uint64_t inst = insts[i];
                uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

                if (i - last_sfu_inst <= 2 &&
                    (writes_sfu(inst) ||
                     sig == QPU_SIG_LOAD_TMU0 ||
                     sig == QPU_SIG_LOAD_TMU1 ||
                     sig == QPU_SIG_COLOR_LOAD)) {
                        fail_instr(inst, "R4 write too soon after SFU write");
                }

                if (writes_sfu(inst))
                        last_sfu_inst = i;
        }

        for (int i = 1; i < num_inst; i++) {
                uint64_t inst = insts[i];

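                /* Small immediates of QPU_SMALL_IMM_MUL_ROT and up encode a
                 * horizontal vector rotate of the mul ALU output (the value
                 * QPU_SMALL_IMM_MUL_ROT itself rotates by r5, larger values
                 * by a fixed distance).  The checks below look back at
                 * insts[i - 1], which is why the loop starts at the second
                 * instruction.
                 */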
                if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_SMALL_IMM &&
                    QPU_GET_FIELD(inst, QPU_SMALL_IMM) >=
                    QPU_SMALL_IMM_MUL_ROT) {
                        uint32_t mux_a = QPU_GET_FIELD(inst, QPU_MUL_A);
                        uint32_t mux_b = QPU_GET_FIELD(inst, QPU_MUL_B);

                        /* "The full horizontal vector rotate is only
                         * available when both of the mul ALU input arguments
                         * are taken from accumulators r0-r3."
                         */
                        if (mux_a > QPU_MUX_R3 || mux_b > QPU_MUX_R3) {
                                fail_instr(inst,
                                           "MUL rotate using non-accumulator "
                                           "input");
                        }

                        if (QPU_GET_FIELD(inst, QPU_SMALL_IMM) ==
                            QPU_SMALL_IMM_MUL_ROT) {
                                /* "An instruction that does a vector rotate
                                 * by r5 must not immediately follow an
                                 * instruction that writes to r5."
                                 */
                                if (writes_reg(insts[i - 1], QPU_W_ACC5)) {
                                        fail_instr(inst,
                                                   "vector rotate by r5 "
                                                   "immediately after r5 write");
                                }
                        }

                        /* "An instruction that does a vector rotate must not
                         * immediately follow an instruction that writes to the
                         * accumulator that is being rotated."
                         */
                        if (writes_reg(insts[i - 1], QPU_W_ACC0 + mux_a) ||
                            writes_reg(insts[i - 1], QPU_W_ACC0 + mux_b)) {
                                fail_instr(inst,
                                           "vector rotate of value "
                                           "written in previous instruction");
                        }
                }
        }
        /* "After an instruction that does a TLB Z write, the multisample mask
         * must not be read as an instruction input argument in the following
         * two instructions. The TLB Z write instruction can, however, be
         * followed immediately by a TLB color write."
         */
        for (int i = 0; i + 2 < num_inst; i++) {
                uint64_t inst = insts[i];
                if (writes_reg(inst, QPU_W_TLB_Z) &&
                    (reads_a_reg(insts[i + 1], QPU_R_MS_REV_FLAGS) ||
                     reads_a_reg(insts[i + 2], QPU_R_MS_REV_FLAGS))) {
                        fail_instr(inst, "TLB Z write followed by MS mask read");
                }
        }

        /*
         * "A single instruction can only perform a maximum of one of the
         * following closely coupled peripheral accesses in a single
         * instruction: TMU write, TMU read, TLB write, TLB read, TLB
         * combined color read and write, SFU write, Mutex read or Semaphore
         * access."
         */
        for (int i = 0; i < num_inst; i++) {
                uint64_t inst = insts[i];

                if (qpu_num_sf_accesses(inst) > 1)
                        fail_instr(inst, "Single instruction does multiple "
                                   "peripheral accesses");
        }

        /* "The uniform base pointer can be written (from SIMD element 0) by
         * the processor to reset the stream; there must be at least two
         * nonuniform-accessing instructions following a pointer change
         * before uniforms can be accessed once more."
         */
        int last_unif_pointer_update = -3;
        for (int i = 0; i < num_inst; i++) {
                uint64_t inst = insts[i];
                uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
                uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);

                if (reads_reg(inst, QPU_R_UNIF) &&
                    i - last_unif_pointer_update <= 2) {
                        fail_instr(inst,
                                   "uniform read too soon after pointer update");
                }

                if (waddr_add == QPU_W_UNIFORMS_ADDRESS ||
                    waddr_mul == QPU_W_UNIFORMS_ADDRESS)
                        last_unif_pointer_update = i;
        }

        if (threaded) {
                bool last_thrsw_found = false;
                bool scoreboard_locked = false;
                int tex_samples_outstanding = 0;
                int last_tex_samples_outstanding = 0;
                int thrsw_ip = -1;

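                /* thrsw_ip tracks the instruction index at which a signaled
                 * thread switch takes effect (-1: no switch pending).
                 */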
                for (int i = 0; i < num_inst; i++) {
                        uint64_t inst = insts[i];
                        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

                        if (i == thrsw_ip) {
                                /* In order to get texture results back in the
                                 * correct order, before a new thrsw we have
                                 * to read all the texture results from before
                                 * the previous thrsw.
                                 *
                                 * FIXME: Is collecting the remaining results
                                 * during the delay slots OK, or should we do
                                 * this at THRSW signal time?
                                 */
                                if (last_tex_samples_outstanding != 0) {
                                        fail_instr(inst, "THRSW with texture "
                                                   "results from the previous "
                                                   "THRSW still in the FIFO.");
                                }

                                last_tex_samples_outstanding =
                                        tex_samples_outstanding;
                                tex_samples_outstanding = 0;
                        }

                        if (qpu_inst_is_tlb(inst))
                                scoreboard_locked = true;

                        switch (sig) {
                        case QPU_SIG_THREAD_SWITCH:
                        case QPU_SIG_LAST_THREAD_SWITCH:
                                /* No thread switching with the scoreboard
                                 * locked.  Doing so means we may deadlock
                                 * when the other thread tries to lock the
                                 * scoreboard.
                                 */
                                if (scoreboard_locked) {
                                        fail_instr(inst, "THRSW with the "
                                                   "scoreboard locked.");
                                }

                                /* No thread switching after lthrsw, since
                                 * lthrsw means that we get delayed until the
                                 * other shader is ready for us to terminate.
                                 */
                                if (last_thrsw_found) {
                                        fail_instr(inst, "THRSW after a "
                                                   "previous LTHRSW");
                                }

                                if (sig == QPU_SIG_LAST_THREAD_SWITCH)
                                        last_thrsw_found = true;

                                /* No THRSW while we already have a THRSW
                                 * queued.
                                 */
                                if (i < thrsw_ip) {
                                        fail_instr(inst,
                                                   "THRSW with a THRSW queued.");
                                }

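                                /* The switch takes effect after the THRSW's
                                 * three delay slots.
                                 */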
                                thrsw_ip = i + 3;
                                break;

                        case QPU_SIG_LOAD_TMU0:
                        case QPU_SIG_LOAD_TMU1:
                                if (last_tex_samples_outstanding == 0) {
                                        fail_instr(inst, "TMU load with nothing "
                                                   "in the results fifo from "
                                                   "the previous THRSW.");
                                }

                                last_tex_samples_outstanding--;
                                break;
                        }

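                        /* Count TMU requests fired in this section of the
                         * thread; their results must all be collected after
                         * the next thread switch.
                         */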
                        uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
                        uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
                        if (waddr_add == QPU_W_TMU0_S ||
                            waddr_add == QPU_W_TMU1_S ||
                            waddr_mul == QPU_W_TMU0_S ||
                            waddr_mul == QPU_W_TMU1_S) {
                                tex_samples_outstanding++;
                        }
                }
        }
}