1 /*
2 * Copyright © 2014 Broadcom
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /**
25 * @file
26 *
27 * Validates the QPU instruction sequence after register allocation and
28 * scheduling.
29 */
30
31 #include <assert.h>
32 #include <stdio.h>
33 #include <stdlib.h>
34 #include "v3d_compiler.h"
35 #include "qpu/qpu_disasm.h"
36
/* Per-program validation state, threaded through the checks for every
 * instruction in emission order.
 */
struct v3d_qpu_validate_state {
        struct v3d_compile *c;
        /* Previously validated instruction, or NULL at the first one. */
        const struct v3d_qpu_instr *last;
        /* Instruction pointer (index) of the instruction being validated. */
        int ip;
        /* ip of the most recent SFU write (starts far in the "past" so the
         * distance checks pass at program start).
         */
        int last_sfu_write;
        int last_branch_ip;
        int last_thrsw_ip;
        /* ip of the first TLB Z write seen, or the initial sentinel (INT_MAX
         * at init time) if none has been seen yet.
         */
        int first_tlb_z_write;

        /* Set when we've found the last-THRSW signal, or if we were started
         * in single-segment mode.
         */
        bool last_thrsw_found;

        /* Set when we've found the THRSW after the last THRSW */
        bool thrend_found;

        int thrsw_count;
};
56
57 static void
fail_instr(struct v3d_qpu_validate_state * state,const char * msg)58 fail_instr(struct v3d_qpu_validate_state *state, const char *msg)
59 {
60 struct v3d_compile *c = state->c;
61
62 fprintf(stderr, "v3d_qpu_validate at ip %d: %s:\n", state->ip, msg);
63
64 int dump_ip = 0;
65 vir_for_each_inst_inorder(inst, c) {
66 v3d_qpu_dump(c->devinfo, &inst->qpu);
67
68 if (dump_ip++ == state->ip)
69 fprintf(stderr, " *** ERROR ***");
70
71 fprintf(stderr, "\n");
72 }
73
74 fprintf(stderr, "\n");
75 abort();
76 }
77
78 static bool
in_branch_delay_slots(struct v3d_qpu_validate_state * state)79 in_branch_delay_slots(struct v3d_qpu_validate_state *state)
80 {
81 return (state->ip - state->last_branch_ip) < 3;
82 }
83
84 static bool
in_thrsw_delay_slots(struct v3d_qpu_validate_state * state)85 in_thrsw_delay_slots(struct v3d_qpu_validate_state *state)
86 {
87 return (state->ip - state->last_thrsw_ip) < 3;
88 }
89
90 static bool
qpu_magic_waddr_matches(const struct v3d_qpu_instr * inst,bool (* predicate)(enum v3d_qpu_waddr waddr))91 qpu_magic_waddr_matches(const struct v3d_qpu_instr *inst,
92 bool (*predicate)(enum v3d_qpu_waddr waddr))
93 {
94 if (inst->type == V3D_QPU_INSTR_TYPE_ALU)
95 return false;
96
97 if (inst->alu.add.op != V3D_QPU_A_NOP &&
98 inst->alu.add.magic_write &&
99 predicate(inst->alu.add.waddr))
100 return true;
101
102 if (inst->alu.mul.op != V3D_QPU_M_NOP &&
103 inst->alu.mul.magic_write &&
104 predicate(inst->alu.mul.waddr))
105 return true;
106
107 return false;
108 }
109
110 static void
qpu_validate_inst(struct v3d_qpu_validate_state * state,struct qinst * qinst)111 qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst)
112 {
113 const struct v3d_device_info *devinfo = state->c->devinfo;
114
115 if (qinst->is_tlb_z_write && state->ip < state->first_tlb_z_write)
116 state->first_tlb_z_write = state->ip;
117
118 const struct v3d_qpu_instr *inst = &qinst->qpu;
119
120 if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH &&
121 state->first_tlb_z_write >= 0 &&
122 state->ip > state->first_tlb_z_write &&
123 inst->branch.msfign != V3D_QPU_MSFIGN_NONE &&
124 inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS &&
125 inst->branch.cond != V3D_QPU_BRANCH_COND_A0 &&
126 inst->branch.cond != V3D_QPU_BRANCH_COND_NA0) {
127 fail_instr(state, "Implicit branch MSF read after TLB Z write");
128 }
129
130 if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
131 return;
132
133 if (inst->alu.add.op == V3D_QPU_A_SETMSF &&
134 state->first_tlb_z_write >= 0 &&
135 state->ip > state->first_tlb_z_write) {
136 fail_instr(state, "SETMSF after TLB Z write");
137 }
138
139 if (state->first_tlb_z_write >= 0 &&
140 state->ip > state->first_tlb_z_write &&
141 inst->alu.add.op == V3D_QPU_A_MSF) {
142 fail_instr(state, "MSF read after TLB Z write");
143 }
144
145 if (devinfo->ver < 71) {
146 if (inst->sig.small_imm_a || inst->sig.small_imm_c ||
147 inst->sig.small_imm_d) {
148 fail_instr(state, "small imm a/c/d added after V3D 7.1");
149 }
150 } else {
151 if ((inst->sig.small_imm_a || inst->sig.small_imm_b) &&
152 !vir_is_add(qinst)) {
153 fail_instr(state, "small imm a/b used but no ADD inst");
154 }
155 if ((inst->sig.small_imm_c || inst->sig.small_imm_d) &&
156 !vir_is_mul(qinst)) {
157 fail_instr(state, "small imm c/d used but no MUL inst");
158 }
159 if (inst->sig.small_imm_a + inst->sig.small_imm_b +
160 inst->sig.small_imm_c + inst->sig.small_imm_d > 1) {
161 fail_instr(state, "only one small immediate can be "
162 "enabled per instruction");
163 }
164 }
165
166 /* LDVARY writes r5 two instructions later and LDUNIF writes
167 * r5 one instruction later, which is illegal to have
168 * together.
169 */
170 if (state->last && state->last->sig.ldvary &&
171 (inst->sig.ldunif || inst->sig.ldunifa)) {
172 fail_instr(state, "LDUNIF after a LDVARY");
173 }
174
175 /* GFXH-1633 (fixed since V3D 4.2.14, which is Rpi4)
176 *
177 * FIXME: This would not check correctly for V3D 4.2 versions lower
178 * than V3D 4.2.14, but that is not a real issue because the simulator
179 * will still catch this, and we are not really targeting any such
180 * versions anyway.
181 */
182 if (state->c->devinfo->ver < 42) {
183 bool last_reads_ldunif = (state->last && (state->last->sig.ldunif ||
184 state->last->sig.ldunifrf));
185 bool last_reads_ldunifa = (state->last && (state->last->sig.ldunifa ||
186 state->last->sig.ldunifarf));
187 bool reads_ldunif = inst->sig.ldunif || inst->sig.ldunifrf;
188 bool reads_ldunifa = inst->sig.ldunifa || inst->sig.ldunifarf;
189 if ((last_reads_ldunif && reads_ldunifa) ||
190 (last_reads_ldunifa && reads_ldunif)) {
191 fail_instr(state,
192 "LDUNIF and LDUNIFA can't be next to each other");
193 }
194 }
195
196 int tmu_writes = 0;
197 int sfu_writes = 0;
198 int vpm_writes = 0;
199 int tlb_writes = 0;
200 int tsy_writes = 0;
201
202 if (inst->alu.add.op != V3D_QPU_A_NOP) {
203 if (inst->alu.add.magic_write) {
204 if (v3d_qpu_magic_waddr_is_tmu(state->c->devinfo,
205 inst->alu.add.waddr)) {
206 tmu_writes++;
207 }
208 if (v3d_qpu_magic_waddr_is_sfu(inst->alu.add.waddr))
209 sfu_writes++;
210 if (v3d_qpu_magic_waddr_is_vpm(inst->alu.add.waddr))
211 vpm_writes++;
212 if (v3d_qpu_magic_waddr_is_tlb(inst->alu.add.waddr))
213 tlb_writes++;
214 if (v3d_qpu_magic_waddr_is_tsy(inst->alu.add.waddr))
215 tsy_writes++;
216 }
217 }
218
219 if (inst->alu.mul.op != V3D_QPU_M_NOP) {
220 if (inst->alu.mul.magic_write) {
221 if (v3d_qpu_magic_waddr_is_tmu(state->c->devinfo,
222 inst->alu.mul.waddr)) {
223 tmu_writes++;
224 }
225 if (v3d_qpu_magic_waddr_is_sfu(inst->alu.mul.waddr))
226 sfu_writes++;
227 if (v3d_qpu_magic_waddr_is_vpm(inst->alu.mul.waddr))
228 vpm_writes++;
229 if (v3d_qpu_magic_waddr_is_tlb(inst->alu.mul.waddr))
230 tlb_writes++;
231 if (v3d_qpu_magic_waddr_is_tsy(inst->alu.mul.waddr))
232 tsy_writes++;
233 }
234 }
235
236 if (in_thrsw_delay_slots(state)) {
237 /* There's no way you want to start SFU during the THRSW delay
238 * slots, since the result would land in the other thread.
239 */
240 if (sfu_writes) {
241 fail_instr(state,
242 "SFU write started during THRSW delay slots ");
243 }
244
245 if (inst->sig.ldvary) {
246 if (devinfo->ver == 42)
247 fail_instr(state, "LDVARY during THRSW delay slots");
248 if (devinfo->ver >= 71 &&
249 state->ip - state->last_thrsw_ip == 2) {
250 fail_instr(state, "LDVARY in 2nd THRSW delay slot");
251 }
252 }
253 }
254
255 (void)qpu_magic_waddr_matches; /* XXX */
256
257 /* SFU r4 results come back two instructions later. No doing
258 * r4 read/writes or other SFU lookups until it's done.
259 */
260 if (state->ip - state->last_sfu_write < 2) {
261 if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_R4))
262 fail_instr(state, "R4 read too soon after SFU");
263
264 if (v3d_qpu_writes_r4(devinfo, inst))
265 fail_instr(state, "R4 write too soon after SFU");
266
267 if (sfu_writes)
268 fail_instr(state, "SFU write too soon after SFU");
269 }
270
271 /* XXX: The docs say VPM can happen with the others, but the simulator
272 * disagrees.
273 */
274 if (tmu_writes +
275 sfu_writes +
276 vpm_writes +
277 tlb_writes +
278 tsy_writes +
279 (devinfo->ver == 42 ? inst->sig.ldtmu : 0) +
280 inst->sig.ldtlb +
281 inst->sig.ldvpm +
282 inst->sig.ldtlbu > 1) {
283 fail_instr(state,
284 "Only one of [TMU, SFU, TSY, TLB read, VPM] allowed");
285 }
286
287 if (sfu_writes)
288 state->last_sfu_write = state->ip;
289
290 if (inst->sig.thrsw) {
291 if (in_branch_delay_slots(state))
292 fail_instr(state, "THRSW in a branch delay slot.");
293
294 if (state->last_thrsw_found)
295 state->thrend_found = true;
296
297 if (state->last_thrsw_ip == state->ip - 1) {
298 /* If it's the second THRSW in a row, then it's just a
299 * last-thrsw signal.
300 */
301 if (state->last_thrsw_found)
302 fail_instr(state, "Two last-THRSW signals");
303 state->last_thrsw_found = true;
304 } else {
305 if (in_thrsw_delay_slots(state)) {
306 fail_instr(state,
307 "THRSW too close to another THRSW.");
308 }
309 state->thrsw_count++;
310 state->last_thrsw_ip = state->ip;
311 }
312 }
313
314 if (state->thrend_found &&
315 state->last_thrsw_ip - state->ip <= 2 &&
316 inst->type == V3D_QPU_INSTR_TYPE_ALU) {
317 if ((inst->alu.add.op != V3D_QPU_A_NOP &&
318 !inst->alu.add.magic_write)) {
319 if (devinfo->ver == 42) {
320 fail_instr(state, "RF write after THREND");
321 } else if (devinfo->ver >= 71) {
322 if (state->last_thrsw_ip - state->ip == 0) {
323 fail_instr(state,
324 "ADD RF write at THREND");
325 }
326 if (inst->alu.add.waddr == 2 ||
327 inst->alu.add.waddr == 3) {
328 fail_instr(state,
329 "RF2-3 write after THREND");
330 }
331 }
332 }
333
334 if ((inst->alu.mul.op != V3D_QPU_M_NOP &&
335 !inst->alu.mul.magic_write)) {
336 if (devinfo->ver == 42) {
337 fail_instr(state, "RF write after THREND");
338 } else if (devinfo->ver >= 71) {
339 if (state->last_thrsw_ip - state->ip == 0) {
340 fail_instr(state,
341 "MUL RF write at THREND");
342 }
343
344 if (inst->alu.mul.waddr == 2 ||
345 inst->alu.mul.waddr == 3) {
346 fail_instr(state,
347 "RF2-3 write after THREND");
348 }
349 }
350 }
351
352 if (v3d_qpu_sig_writes_address(devinfo, &inst->sig) &&
353 !inst->sig_magic) {
354 if (devinfo->ver == 42) {
355 fail_instr(state, "RF write after THREND");
356 } else if (devinfo->ver >= 71 &&
357 (inst->sig_addr == 2 ||
358 inst->sig_addr == 3)) {
359 fail_instr(state, "RF2-3 write after THREND");
360 }
361 }
362
363 /* GFXH-1625: No TMUWT in the last instruction */
364 if (state->last_thrsw_ip - state->ip == 2 &&
365 inst->alu.add.op == V3D_QPU_A_TMUWT)
366 fail_instr(state, "TMUWT in last instruction");
367 }
368
369 if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
370 if (in_branch_delay_slots(state))
371 fail_instr(state, "branch in a branch delay slot.");
372 if (in_thrsw_delay_slots(state))
373 fail_instr(state, "branch in a THRSW delay slot.");
374 state->last_branch_ip = state->ip;
375 }
376 }
377
378 static void
qpu_validate_block(struct v3d_qpu_validate_state * state,struct qblock * block)379 qpu_validate_block(struct v3d_qpu_validate_state *state, struct qblock *block)
380 {
381 vir_for_each_inst(qinst, block) {
382 qpu_validate_inst(state, qinst);
383
384 state->last = &qinst->qpu;
385 state->ip++;
386 }
387 }
388
389 /**
390 * Checks for the instruction restrictions from page 37 ("Summary of
391 * Instruction Restrictions").
392 */
393 void
qpu_validate(struct v3d_compile * c)394 qpu_validate(struct v3d_compile *c)
395 {
396 /* We don't want to do validation in release builds, but we want to
397 * keep compiling the validation code to make sure it doesn't get
398 * broken.
399 */
400 #ifndef DEBUG
401 return;
402 #endif
403
404 struct v3d_qpu_validate_state state = {
405 .c = c,
406 .last_sfu_write = -10,
407 .last_thrsw_ip = -10,
408 .last_branch_ip = -10,
409 .first_tlb_z_write = INT_MAX,
410 .ip = 0,
411
412 .last_thrsw_found = !c->last_thrsw,
413 };
414
415 vir_for_each_block(block, c) {
416 qpu_validate_block(&state, block);
417 }
418
419 if (state.thrsw_count > 1 && !state.last_thrsw_found) {
420 fail_instr(&state,
421 "thread switch found without last-THRSW in program");
422 }
423
424 if (!state.thrend_found)
425 fail_instr(&state, "No program-end THRSW found");
426 }
427