/*
 * Copyright © 2022 Google, Inc.
 * Copyright © 2022 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "util/macros.h"
#include "crashdec.h"
#include "cffdec.h"

#define MAX_PREFETCH_IBS 4

/* CP_INDIRECT_BUFFER handling contains an optimization to read ahead and
 * start fetching up to 3 subsequent CP_INDIRECT_BUFFER contents into the ROQ
 * before starting to execute the current IB. This effectively combines them
 * into one CP_INDIRECT_BUFFER. The result is that if the ROQ is fast enough
 * and prefetches some of the extra IBs before the first IB finishes, the ROQ
 * may be in a different IB than the CP is processing. That is, normally we'd
 * have a situation like this:
 *
 *    CP_INDIRECT_BUFFER
 *       ...
 *       CP_FOO <- PFP/SQE is reading from here
 *       ...
 *       CP_BAR <- ROQ has prefetched up to here
 *
 * where CP_IB*_BASE and CP_IB*_REM_SIZE point to CP_BAR and the difference
 * between CP_FOO and CP_BAR is given by CP_ROQ_AVAIL_IBn::REM, but instead we
 * may get a situation like this:
 *
 *    CP_INDIRECT_BUFFER
 *       ...
 *       CP_FOO <- PFP/SQE is reading here
 *       ...
 *    CP_INDIRECT_BUFFER
 *       ...
 *       CP_BAR <- ROQ has prefetched up to here
 *
 * In this case, the "rem" we get with CP_ROQ_AVAIL_IBn::REM added will be
 * larger than the size of the second IB, indicating that we need to back up
 * to the IB before it. This can theoretically even happen recursively with
 * IB2:
 *
 * CP_INDIRECT_BUFFER:
 *    ...
 *    CP_INDIRECT_BUFFER:
 *       ...
 *       CP_FOO <- PFP/SQE IB2 is reading here
 *       ...
 * CP_INDIRECT_BUFFER:
 *    CP_INDIRECT_BUFFER:
 *       ...
 *       CP_BAR <- ROQ IB2 has prefetched up to here
 *       ...
 * CP_BAZ <- PFP/SQE IB1 is reading here
 *
 * Here the ROQ has prefetched the second IB1; then, when processing the IB2
 * at the end of the first IB1, it peeks ahead in the ROQ, sees another IB2
 * right afterward in the second IB1, and starts prefetching that too, so
 * that the ROQ is in a different IB1 *and* IB2 from the CP.
 *
 * To account for this when locating the position that the SQE was at in the
 * cmdstream at the time of the crash, we do a pre-pass scanning the
 * CP_INDIRECT_BUFFER packets, keeping a history of previous IB's so that we
 * can backtrack (because CP_IBn_BASE can be several IB's ahead of SQE).  Once
 * we find the IB1 position that is being read into ROQ, we backtrack until
 * we find the IB1 position that SQE is at, and (roughly) repeat the process
 * in IB2.  This has one complication: we need to start scanning for the
 * CP_INDIRECT_BUFFER to IB2 from before the detected IB1 position.
 */

struct ib {
   uint64_t ibaddr;
   uint32_t ibsize;
};

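/* History of the most recently seen consecutive CP_INDIRECT_BUFFER targets,
 * stored as a small ring buffer: 'next' counts pushes (used modulo the array
 * size as the write position) and 'num' is the number of valid entries,
 * capped at MAX_PREFETCH_IBS.
 */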
struct prefetch_state {
   struct ib history[MAX_PREFETCH_IBS];
   unsigned num, next;
};

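/* Append an IB to the history, overwriting the oldest entry once
 * MAX_PREFETCH_IBS are tracked:
 */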
static void
push_ib(struct prefetch_state *s, struct ib *ib)
{
   s->history[s->next++ % ARRAY_SIZE(s->history)] = *ib;
   s->num = MIN2(s->num + 1, ARRAY_SIZE(s->history));
}

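/* Return the n'th oldest IB still in the history (n == 0 is the oldest),
 * or NULL if n is out of range:
 */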
static struct ib *
get_ib(struct prefetch_state *s, int n)
{
   if ((n >= s->num) || (n < 0))
      return NULL;
   int idx = s->next - (s->num - n);
   return &s->history[idx % ARRAY_SIZE(s->history)];
}

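/* Forget the history; prefetch only spans consecutive CP_INDIRECT_BUFFER
 * packets, so any other packet invalidates it:
 */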
static void
reset_state(struct prefetch_state *s)
{
   s->num = s->next = 0;
}

/**
 * Once we find the ROQ prefetch position, work backwards to find the SQE
 * position.
 */
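/* A hypothetical example of the backtracking arithmetic (sizes chosen only
 * for illustration): suppose the history holds three consecutive IBs of 100,
 * 20 and 30 dwords (oldest first), CP_IBn_BASE matched the 30-dword IB, and
 * the incoming rem (which, per the comment at the top of this file, includes
 * CP_ROQ_AVAIL_IBn::REM) is 70.  Walking backwards: 30 <= 70, so the SQE is
 * not in the newest IB (rem becomes 40); 20 <= 40, so not in the middle one
 * either (rem becomes 20); 100 > 20, so the SQE is in the oldest IB with 20
 * dwords still to execute.
 */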
static struct ib *
reverse_prefetch(struct prefetch_state *s, int lvl)
{
   unsigned rem = options.ibs[lvl].rem;

   for (int n = s->num - 1; n >= 0; n--) {
      struct ib *ib = get_ib(s, n);
      if (ib->ibsize > rem) {
         options.ibs[lvl].crash_found = 1;
         options.ibs[lvl].base = ib->ibaddr;
         options.ibs[lvl].rem = rem;

         return ib;
      }
      rem -= ib->ibsize;
   }

   return NULL;
}

/**
 * Scan cmdstream looking for CP_INDIRECT_BUFFER packets, tracking history
 * of consecutive CP_INDIRECT_BUFFER packets, until we find the one that
 * matches CP_IBn_BASE.
 */
static struct ib *
scan_cmdstream(struct prefetch_state *s, int lvl, uint32_t *dwords, uint32_t sizedwords)
{
   int dwords_left = sizedwords;
   uint32_t count = 0; /* dword count including packet header */
   uint32_t val;

   while (dwords_left > 0) {
      if (pkt_is_opcode(dwords[0], &val, &count)) {
         if (!strcmp(pktname(val), "CP_INDIRECT_BUFFER")) {
            uint64_t ibaddr;
            uint32_t ibsize;

            parse_cp_indirect(&dwords[1], count - 1, &ibaddr, &ibsize);
            push_ib(s, &(struct ib){ ibaddr, ibsize });

            /* If we've found the IB indicated by CP_IBn_BASE, then we can
             * search backwards from here to find the SQE position:
             */
            if (ibaddr == options.ibs[lvl].base)
               return reverse_prefetch(s, lvl);

            goto next_pkt;
         }
      } else if (pkt_is_regwrite(dwords[0], &val, &count)) {
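         /* Nothing to do for register writes; pkt_is_regwrite() provides
          * the packet size in 'count', and we fall through to reset the
          * prefetch history below:
          */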
      } else {
         count = find_next_packet(dwords, dwords_left);
      }

      /* prefetch only happens across consecutive CP_INDIRECT_BUFFER, so
       * any other packet resets the state:
       */
      reset_state(s);

next_pkt:
      dwords += count;
      dwords_left -= count;
   }

   return NULL;
}

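/**
 * Entry point for the prefetch fixup: scan the ringbuffer (RB) cmdstream to
 * find the IB1 position the SQE was actually at (rather than the IB that the
 * ROQ had prefetched), and, if the crash was in IB2, the IB2 position as
 * well, recording the results in options.ibs[] for cffdec.
 */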
void
handle_prefetch(uint32_t *dwords, uint32_t sizedwords)
{
   struct prefetch_state rb_state = {};
   struct ib *ib1 = scan_cmdstream(&rb_state, 1, dwords, sizedwords);

   if (!ib1)
      return;

   /* If the gpu crashed in IB1, we can skip the rest: */
   if (!options.ibs[2].rem)
      return;

   struct prefetch_state ib1_state = {};

   /* Once we've found the actual IB1 position, we need to find the IB2
    * position, which is complicated because IB2 prefetch can span IB1
    * CP_INDIRECT_BUFFER targets.  But there are a limited # of buffers that
    * can be prefetched, and we already have a history of enough RB->IB1
    * IB's, so we can simply scan forward from our oldest history entry
    * until we find the IB2 match.
    */
   for (int n = 0; n < rb_state.num; n++) {
      struct ib *ib = get_ib(&rb_state, n);
      uint32_t *ibaddr = hostptr(ib->ibaddr);
      if (!ibaddr)
         break;
      struct ib *ib2 = scan_cmdstream(&ib1_state, 2, ibaddr, ib->ibsize);

      /* If the crash happens in IB2, but IB1 has a sequence of CP_INDIRECT_BUFFER's,
       * then IB1 could actually be further ahead than IB2, ie:
       *
       *    IB1:CP_INDIRECT_BUFFER
       *        IB2: .. crash somewhere in here ..
       *    IB1:CP_INDIRECT_BUFFER
       *    IB1:CP_INDIRECT_BUFFER  <-- detected IB1 position
       *
       * Our logic for detecting the IB1 position is not wrong; it is just
       * that the SQE has already consumed some additional IB's.  So reset
       * the IB1 crash position back to the oldest RB->IB1 IB that we
       * remember.
       *
       * This isn't *quite* correct, but since we've determined that the
       * crash is in IB2, cffdec will only mark the crash when it finds the
       * location in IB2, and it will only consider the IB2 address once it
       * has seen the IB1 base.
       *
       * The main case we are trying to account for here is a GMEM mode crash
       * in an IB2 which *isn't* the first bin/tile, ie. the crash happens
       * later than the first time we encounter the IB2 crash address.
       *
       * This approach works in practice because there will be some other pkts
       * in IB1 to set up for the next tile, breaking up the prefetch.
       */
      if (ib2) {
         assert(options.ibs[2].crash_found);
         struct ib *first_rb_ib = get_ib(&rb_state, 0);

         options.ibs[1].base = first_rb_ib->ibaddr;
         options.ibs[1].rem = first_rb_ib->ibsize;

         break;
      }

      if (ib == ib1)
         break;
   }
}