/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "vc4_qpu.h"

static void
fail_instr(uint64_t inst, const char *msg)
{
        fprintf(stderr, "vc4_qpu_validate: %s: ", msg);
        vc4_qpu_disasm(&inst, 1);
        fprintf(stderr, "\n");
        abort();
}

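/* Helpers for matching register accesses in an instruction's fields.  Each
 * QPU instruction encodes two write addresses, one for the add ALU result
 * and one for the mul ALU result, so a register counts as written if it
 * shows up in either waddr field.
 */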
static bool
writes_reg(uint64_t inst, uint32_t w)
{
        return (QPU_GET_FIELD(inst, QPU_WADDR_ADD) == w ||
                QPU_GET_FIELD(inst, QPU_WADDR_MUL) == w);
}

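/* Each of the four ALU source operands is a mux selection: QPU_MUX_A and
 * QPU_MUX_B route through the regfile read ports addressed by raddr_a and
 * raddr_b, while lower mux values name accumulators.  A small-immediate
 * instruction repurposes raddr_b, so regfile B reads are skipped for it.
 * (The addr field of src_regs is unused; only the muxes are inspected.)
 */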
static bool
_reads_reg(uint64_t inst, uint32_t r, bool ignore_a, bool ignore_b)
{
        struct {
                uint32_t mux, addr;
        } src_regs[] = {
                { QPU_GET_FIELD(inst, QPU_ADD_A) },
                { QPU_GET_FIELD(inst, QPU_ADD_B) },
                { QPU_GET_FIELD(inst, QPU_MUL_A) },
                { QPU_GET_FIELD(inst, QPU_MUL_B) },
        };

        /* Branches only reference raddr_a (no mux), and we don't use that
         * feature of branching.
         */
        if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_BRANCH)
                return false;

        /* Load immediates don't read any registers. */
        if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_LOAD_IMM)
                return false;

        for (int i = 0; i < ARRAY_SIZE(src_regs); i++) {
                if (!ignore_a &&
                    src_regs[i].mux == QPU_MUX_A &&
                    (QPU_GET_FIELD(inst, QPU_RADDR_A) == r))
                        return true;

                if (!ignore_b &&
                    QPU_GET_FIELD(inst, QPU_SIG) != QPU_SIG_SMALL_IMM &&
                    src_regs[i].mux == QPU_MUX_B &&
                    (QPU_GET_FIELD(inst, QPU_RADDR_B) == r))
                        return true;
        }

        return false;
}

static bool
reads_reg(uint64_t inst, uint32_t r)
{
        return _reads_reg(inst, r, false, false);
}

static bool
reads_a_reg(uint64_t inst, uint32_t r)
{
        return _reads_reg(inst, r, false, true);
}

static bool
reads_b_reg(uint64_t inst, uint32_t r)
{
        return _reads_reg(inst, r, true, false);
}

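/* SFU (special function unit) operations are triggered by writing one of
 * four magic write addresses; the result lands in accumulator r4 two
 * instructions later, which is what drives the r4-conflict checks below.
 */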
static bool
writes_sfu(uint64_t inst)
{
        return (writes_reg(inst, QPU_W_SFU_RECIP) ||
                writes_reg(inst, QPU_W_SFU_RECIPSQRT) ||
                writes_reg(inst, QPU_W_SFU_EXP) ||
                writes_reg(inst, QPU_W_SFU_LOG));
}

/**
 * Checks for the instruction restrictions from page 37 ("Summary of
 * Instruction Restrictions").
 */
void
vc4_qpu_validate(uint64_t *insts, uint32_t num_inst)
{
        bool scoreboard_locked = false;
        bool threaded = false;

        /* We don't want to do validation in release builds, but we want to
         * keep compiling the validation code to make sure it doesn't get
         * broken.
         */
#ifndef DEBUG
        return;
#endif

        for (int i = 0; i < num_inst; i++) {
                uint64_t inst = insts[i];
                uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

                if (sig != QPU_SIG_PROG_END) {
                        if (qpu_inst_is_tlb(inst))
                                scoreboard_locked = true;

                        if (sig == QPU_SIG_THREAD_SWITCH ||
                            sig == QPU_SIG_LAST_THREAD_SWITCH) {
                                threaded = true;
                        }

                        continue;
                }

                /* "The Thread End instruction must not write to either physical
                 * regfile A or B."
                 */
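                /* Write addresses 0-31 are the physical regfile locations;
                 * 32 and up are accumulators and fixed-function writes,
                 * which remain legal in the Thread End instruction.
                 */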
                if (QPU_GET_FIELD(inst, QPU_WADDR_ADD) < 32 ||
                    QPU_GET_FIELD(inst, QPU_WADDR_MUL) < 32) {
                        fail_instr(inst, "write to phys reg in thread end");
                }

                /* Can't trigger an implicit wait on scoreboard in the program
                 * end instruction.
                 */
                if (qpu_inst_is_tlb(inst) && !scoreboard_locked)
                        fail_instr(inst, "implicit sb wait in program end");

                /* Two delay slots will be executed, so the stream must
                 * extend at least two instructions past the Thread End.
                 */
                assert(i + 2 < num_inst);

                for (int j = i; j <= i + 2; j++) {
                        /* "The last three instructions of any program
                         * (Thread End plus the following two delay-slot
                         * instructions) must not do varyings read, uniforms
                         * read or any kind of VPM, VDR, or VDW read or
                         * write."
                         */
                        if (writes_reg(insts[j], QPU_W_VPM) ||
                            reads_reg(insts[j], QPU_R_VARY) ||
                            reads_reg(insts[j], QPU_R_UNIF) ||
                            reads_reg(insts[j], QPU_R_VPM)) {
                                fail_instr(insts[j], "last 3 instructions "
                                           "using fixed functions");
                        }

                        /* "The Thread End instruction and the following two
                         * delay slot instructions must not write or read
                         * address 14 in either regfile A or B."
                         */
                        if (writes_reg(insts[j], 14) ||
                            reads_reg(insts[j], 14)) {
                                fail_instr(insts[j], "last 3 instructions "
                                           "must not use r14");
                        }
                }

                /* "The final program instruction (the second delay slot
                 * instruction) must not do a TLB Z write."
                 */
                if (writes_reg(insts[i + 2], QPU_W_TLB_Z)) {
                        fail_instr(insts[i + 2], "final instruction doing "
                                   "Z write");
                }
        }

        /* "A scoreboard wait must not occur in the first two instructions of
         * a fragment shader. This is either the explicit Wait for Scoreboard
         * signal or an implicit wait with the first tile-buffer read or
         * write instruction."
         */
        for (int i = 0; i < 2; i++) {
                uint64_t inst = insts[i];

                if (qpu_inst_is_tlb(inst))
                        fail_instr(inst, "sb wait in first two insts");
        }

        /* "If TMU_NOSWAP is written, the write must be three instructions
         * before the first TMU write instruction. For example, if
         * TMU_NOSWAP is written in the first shader instruction, the first
         * TMU write cannot occur before the 4th shader instruction."
         */
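        /* The sentinel starts far enough in the past that TMU writes at the
         * top of the shader can't falsely trip the check.
         */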
        int last_tmu_noswap = -10;
        for (int i = 0; i < num_inst; i++) {
                uint64_t inst = insts[i];

                if ((i - last_tmu_noswap) <= 3 &&
                    (writes_reg(inst, QPU_W_TMU0_S) ||
                     writes_reg(inst, QPU_W_TMU1_S))) {
                        fail_instr(inst, "TMU write too soon after TMU_NOSWAP");
                }

                if (writes_reg(inst, QPU_W_TMU_NOSWAP))
                        last_tmu_noswap = i;
        }

        /* "An instruction must not read from a location in physical regfile A
         * or B that was written to by the previous instruction."
         */
        for (int i = 0; i < num_inst - 1; i++) {
                uint64_t inst = insts[i];
                uint32_t add_waddr = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
                uint32_t mul_waddr = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
                uint32_t waddr_a, waddr_b;

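                /* The WS bit swaps which regfile each ALU's result lands in:
                 * by default the add ALU writes regfile A and the mul ALU
                 * writes regfile B.
                 */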
                if (inst & QPU_WS) {
                        waddr_b = add_waddr;
                        waddr_a = mul_waddr;
                } else {
                        waddr_a = add_waddr;
                        waddr_b = mul_waddr;
                }

                if ((waddr_a < 32 && reads_a_reg(insts[i + 1], waddr_a)) ||
                    (waddr_b < 32 && reads_b_reg(insts[i + 1], waddr_b))) {
                        fail_instr(insts[i + 1],
                                   "Reads physical reg too soon after write");
                }
        }

        /* "After an SFU lookup instruction, accumulator r4 must not be read
         * in the following two instructions. Any other instruction that
         * results in r4 being written (that is, TMU read, TLB read, SFU
         * lookup) cannot occur in the two instructions following an SFU
         * lookup."
         */
        int last_sfu_inst = -10;
        for (int i = 0; i < num_inst - 1; i++) {
                uint64_t inst = insts[i];
                uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

                if (i - last_sfu_inst <= 2 &&
                    (writes_sfu(inst) ||
                     sig == QPU_SIG_LOAD_TMU0 ||
                     sig == QPU_SIG_LOAD_TMU1 ||
                     sig == QPU_SIG_COLOR_LOAD)) {
                        fail_instr(inst, "R4 write too soon after SFU write");
                }

                if (writes_sfu(inst))
                        last_sfu_inst = i;
        }

        /* The vector rotate checks look back at insts[i - 1], so start the
         * scan at the second instruction.
         */
        for (int i = 1; i < num_inst - 1; i++) {
                uint64_t inst = insts[i];

                if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_SMALL_IMM &&
                    QPU_GET_FIELD(inst, QPU_SMALL_IMM) >=
                    QPU_SMALL_IMM_MUL_ROT) {
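                        /* Small immediates at QPU_SMALL_IMM_MUL_ROT and above
                         * encode a vector rotate of the mul ALU output;
                         * QPU_SMALL_IMM_MUL_ROT itself is the rotate-by-r5
                         * variant, and higher values rotate by a constant.
                         */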
                        uint32_t mux_a = QPU_GET_FIELD(inst, QPU_MUL_A);
                        uint32_t mux_b = QPU_GET_FIELD(inst, QPU_MUL_B);

                        /* "The full horizontal vector rotate is only
                         * available when both of the mul ALU input arguments
                         * are taken from accumulators r0-r3."
                         */
                        if (mux_a > QPU_MUX_R3 || mux_b > QPU_MUX_R3) {
                                fail_instr(inst,
                                           "MUL rotate using non-accumulator "
                                           "input");
                        }

                        if (QPU_GET_FIELD(inst, QPU_SMALL_IMM) ==
                            QPU_SMALL_IMM_MUL_ROT) {
                                /* "An instruction that does a vector rotate
                                 * by r5 must not immediately follow an
                                 * instruction that writes to r5."
                                 */
                                if (writes_reg(insts[i - 1], QPU_W_ACC5)) {
                                        fail_instr(inst,
                                                   "vector rotate by r5 "
                                                   "immediately after r5 write");
                                }
                        }

                        /* "An instruction that does a vector rotate must not
                         * immediately follow an instruction that writes to the
                         * accumulator that is being rotated."
                         */
                        if (writes_reg(insts[i - 1], QPU_W_ACC0 + mux_a) ||
                            writes_reg(insts[i - 1], QPU_W_ACC0 + mux_b)) {
                                fail_instr(inst,
                                           "vector rotate of value "
                                           "written in previous instruction");
                        }
                }
        }

        /* "After an instruction that does a TLB Z write, the multisample mask
         * must not be read as an instruction input argument in the following
         * two instructions. The TLB Z write instruction can, however, be
         * followed immediately by a TLB color write."
         */
        for (int i = 0; i < num_inst - 2; i++) {
                uint64_t inst = insts[i];
                if (writes_reg(inst, QPU_W_TLB_Z) &&
                    (reads_a_reg(insts[i + 1], QPU_R_MS_REV_FLAGS) ||
                     reads_a_reg(insts[i + 2], QPU_R_MS_REV_FLAGS))) {
                        fail_instr(inst, "TLB Z write followed by MS mask read");
                }
        }

        /*
         * "A single instruction can only perform a maximum of one of the
         * following closely coupled peripheral accesses in a single
         * instruction: TMU write, TMU read, TLB write, TLB read, TLB
         * combined color read and write, SFU write, Mutex read or Semaphore
         * access."
         */
        for (int i = 0; i < num_inst - 1; i++) {
                uint64_t inst = insts[i];

                if (qpu_num_sf_accesses(inst) > 1)
                        fail_instr(inst, "Single instruction writes SFU twice");
        }

        /* "The uniform base pointer can be written (from SIMD element 0) by
         * the processor to reset the stream, there must be at least two
         * nonuniform-accessing instructions following a pointer change
         * before uniforms can be accessed once more."
         */
        int last_unif_pointer_update = -3;
        for (int i = 0; i < num_inst; i++) {
                uint64_t inst = insts[i];
                uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
                uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);

                if (reads_reg(inst, QPU_R_UNIF) &&
                    i - last_unif_pointer_update <= 2) {
                        fail_instr(inst,
                                   "uniform read too soon after pointer update");
                }

                if (waddr_add == QPU_W_UNIFORMS_ADDRESS ||
                    waddr_mul == QPU_W_UNIFORMS_ADDRESS)
                        last_unif_pointer_update = i;
        }

        if (threaded) {
                bool last_thrsw_found = false;
                bool scoreboard_locked = false;
                int tex_samples_outstanding = 0;
                int last_tex_samples_outstanding = 0;
                int thrsw_ip = -1;

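                /* Track TMU requests issued versus results consumed on
                 * either side of each thread switch: everything requested
                 * before a switch must be drained before the next one.
                 */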
                for (int i = 0; i < num_inst; i++) {
                        uint64_t inst = insts[i];
                        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

                        if (i == thrsw_ip) {
                                /* In order to get texture results back in the
                                 * correct order, before a new thrsw we have
                                 * to read all the texture results from before
                                 * the previous thrsw.
                                 *
                                 * FIXME: Is collecting the remaining results
                                 * during the delay slots OK, or should we do
                                 * this at THRSW signal time?
                                 */
                                if (last_tex_samples_outstanding != 0) {
                                        fail_instr(inst, "THRSW with texture "
                                                   "results from the previous "
                                                   "THRSW still in the FIFO.");
                                }

                                last_tex_samples_outstanding =
                                        tex_samples_outstanding;
                                tex_samples_outstanding = 0;
                        }

                        if (qpu_inst_is_tlb(inst))
                                scoreboard_locked = true;

                        switch (sig) {
                        case QPU_SIG_THREAD_SWITCH:
                        case QPU_SIG_LAST_THREAD_SWITCH:
                                /* No thread switching with the scoreboard
                                 * locked. Doing so means we may deadlock
                                 * when the other thread tries to lock the
                                 * scoreboard.
                                 */
                                if (scoreboard_locked) {
                                        fail_instr(inst, "THRSW with the "
                                                   "scoreboard locked.");
                                }

                                /* No thread switching after lthrsw, since
                                 * lthrsw means that we get delayed until the
                                 * other shader is ready for us to terminate.
                                 */
                                if (last_thrsw_found) {
                                        fail_instr(inst, "THRSW after a "
                                                   "previous LTHRSW");
                                }

                                if (sig == QPU_SIG_LAST_THREAD_SWITCH)
                                        last_thrsw_found = true;

                                /* No THRSW while we already have a THRSW
                                 * queued.
                                 */
                                if (i < thrsw_ip) {
                                        fail_instr(inst,
                                                   "THRSW with a THRSW queued.");
                                }

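                                /* The switch takes effect after the
                                 * instruction's two delay slots, so the
                                 * queued-THRSW window extends to i + 3.
                                 */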
                                thrsw_ip = i + 3;
                                break;

                        case QPU_SIG_LOAD_TMU0:
                        case QPU_SIG_LOAD_TMU1:
                                if (last_tex_samples_outstanding == 0) {
                                        fail_instr(inst, "TMU load with nothing "
                                                   "in the results fifo from "
                                                   "the previous THRSW.");
                                }

                                last_tex_samples_outstanding--;
                                break;
                        }

                        uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
                        uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
                        if (waddr_add == QPU_W_TMU0_S ||
                            waddr_add == QPU_W_TMU1_S ||
                            waddr_mul == QPU_W_TMU0_S ||
                            waddr_mul == QPU_W_TMU1_S) {
                                tex_samples_outstanding++;
                        }
                }
        }
}