/*
 * Copyright (C) 2021 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "ir3_ra.h"

/* The spilling pass leaves out a few details required for ldp/stp to work
 * correctly:
 *
 * 1. ldp/stp can only load/store 4 components at a time, but spilling ignores
 *    that and just spills/restores entire values, including arrays and values
 *    created for texture setup which can be more than 4 components.
 * 2. The immediate offset only has 13 bits and is signed, so if we spill a lot
 *    or have very large arrays before spilling then we could run out of
 *    encodable offsets.
 * 3. The spiller doesn't add the barrier dependencies needed for post-RA
 *    scheduling.
 *
 * The first issue, in particular, is much easier to handle after RA because
 * arrays and normal values can be treated the same way. Therefore this pass
 * runs after RA and handles all three issues (see the example below). This
 * keeps the complexity out of the spiller.
 */
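
/* For example (schematic IR, not exact syntax): an 8-component spill of
 * r4.x through r5.w to byte offset 32
 *
 *    spill.macro [base], r4.x (8 components), #32
 *
 * is lowered to two legal 4-component stores
 *
 *    stp r4.x (4 components), #32
 *    stp r5.x (4 components), #48
 *
 * and, had the offset not fit in the 13-bit immediate, it would have been
 * moved into the base register around the stores instead.
 */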

/* Size in bytes of one component in the spill area: half registers take two
 * bytes per component, full registers four.
 */
static unsigned
component_bytes(struct ir3_register *src)
{
   return (src->flags & IR3_REG_HALF) ? 2 : 4;
}

/* Note: this won't work if the base register is anything other than 0!
 * Dynamic bases, which we'll need for "real" function call support, will
 * probably be a lot harder to handle and may require reserving another
 * register.
 */
static void
set_base_reg(struct ir3_instruction *mem, unsigned val)
{
   struct ir3_instruction *mov = ir3_instr_create(mem->block, OPC_MOV, 1, 1);
   ir3_dst_create(mov, mem->srcs[0]->num, mem->srcs[0]->flags);
   ir3_src_create(mov, INVALID_REG, IR3_REG_IMMED)->uim_val = val;
   mov->cat1.dst_type = mov->cat1.src_type = TYPE_U32;

   ir3_instr_move_before(mov, mem);
}

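/* Restore the base register to 0 after the ldp/stp, so that later code can
 * keep assuming a zero base.
 */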
static void
reset_base_reg(struct ir3_instruction *mem)
{
   /* If the base register is killed, then we don't need to clobber it, and
    * since it may be reused as a destination we can't always safely clobber
    * it after the instruction anyway.
    */
   struct ir3_register *base = mem->srcs[0];
   if (base->flags & IR3_REG_KILL)
      return;

   struct ir3_instruction *mov = ir3_instr_create(mem->block, OPC_MOV, 1, 1);
   ir3_dst_create(mov, base->num, base->flags);
   ir3_src_create(mov, INVALID_REG, IR3_REG_IMMED)->uim_val = 0;
   mov->cat1.dst_type = mov->cat1.src_type = TYPE_U32;

   ir3_instr_move_after(mov, mem);
}

/* The immediate offset field has 13 bits, but 1 << 12 would be sign-extended
 * into a negative offset, so it can't be used directly. Therefore only offsets
 * under 1 << 12 can be used without any adjustment.
 */
#define MAX_CAT6_SIZE (1u << 12)

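/* If the effective offset of a spill/reload can't be encoded in the cat6
 * immediate, materialize it in the base register instead and use an immediate
 * offset of 0. Schematically, a spill to byte offset 5000 becomes:
 *
 *    base = 5000; stp ..., #0; base = 0
 */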
static void
handle_oob_offset_spill(struct ir3_instruction *spill)
{
   unsigned components = spill->srcs[2]->uim_val;

   if (spill->cat6.dst_offset + components * component_bytes(spill->srcs[1]) <
       MAX_CAT6_SIZE)
      return;

   set_base_reg(spill, spill->cat6.dst_offset);
   reset_base_reg(spill);
   spill->cat6.dst_offset = 0;
}

static void
handle_oob_offset_reload(struct ir3_instruction *reload)
{
   unsigned components = reload->srcs[2]->uim_val;
   unsigned offset = reload->srcs[1]->uim_val;
   if (offset + components * component_bytes(reload->dsts[0]) < MAX_CAT6_SIZE)
      return;

   set_base_reg(reload, offset);
   reset_base_reg(reload);
   reload->srcs[1]->uim_val = 0;
}

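/* Split a spill of more than 4 components into multiple stp's of at most 4
 * components each, bumping the byte offset for each piece. For example, a
 * 6-component full-register spill at offset 0 becomes a 4-component store at
 * offset 0 followed by a 2-component store at offset 16. Arrays are rewritten
 * into plain contiguous registers in the process.
 */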
static void
split_spill(struct ir3_instruction *spill)
{
   unsigned orig_components = spill->srcs[2]->uim_val;

   /* We don't handle splitting dependencies. */
   assert(spill->deps_count == 0);

   if (orig_components <= 4) {
      if (spill->srcs[1]->flags & IR3_REG_ARRAY) {
         spill->srcs[1]->wrmask = MASK(orig_components);
         spill->srcs[1]->num = spill->srcs[1]->array.base;
         spill->srcs[1]->flags &= ~IR3_REG_ARRAY;
      }
      return;
   }

   for (unsigned comp = 0; comp < orig_components; comp += 4) {
      unsigned components = MIN2(orig_components - comp, 4);
      struct ir3_instruction *clone = ir3_instr_clone(spill);
      ir3_instr_move_before(clone, spill);

      clone->srcs[1]->wrmask = MASK(components);
      if (clone->srcs[1]->flags & IR3_REG_ARRAY) {
         clone->srcs[1]->num = clone->srcs[1]->array.base + comp;
         clone->srcs[1]->flags &= ~IR3_REG_ARRAY;
      } else {
         /* Non-array values occupy consecutive registers, so advance the
          * source past the components already stored.
          */
         clone->srcs[1]->num += comp;
      }

      clone->srcs[2]->uim_val = components;
      clone->cat6.dst_offset += comp * component_bytes(spill->srcs[1]);
   }

   list_delinit(&spill->node);
}

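/* Same as split_spill, but splitting the destination of a reload into ldp's
 * of at most 4 components.
 */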
static void
split_reload(struct ir3_instruction *reload)
{
   unsigned orig_components = reload->srcs[2]->uim_val;

   /* We don't handle splitting dependencies. */
   assert(reload->deps_count == 0);

   if (orig_components <= 4) {
      if (reload->dsts[0]->flags & IR3_REG_ARRAY) {
         reload->dsts[0]->wrmask = MASK(orig_components);
         reload->dsts[0]->num = reload->dsts[0]->array.base;
         reload->dsts[0]->flags &= ~IR3_REG_ARRAY;
      }
      return;
   }

   for (unsigned comp = 0; comp < orig_components; comp += 4) {
      unsigned components = MIN2(orig_components - comp, 4);
      struct ir3_instruction *clone = ir3_instr_clone(reload);
      ir3_instr_move_before(clone, reload);

      clone->dsts[0]->wrmask = MASK(components);
      if (clone->dsts[0]->flags & IR3_REG_ARRAY) {
         clone->dsts[0]->num = clone->dsts[0]->array.base + comp;
         clone->dsts[0]->flags &= ~IR3_REG_ARRAY;
      } else {
         /* Non-array values occupy consecutive registers, so advance the
          * destination past the components already loaded.
          */
         clone->dsts[0]->num += comp;
      }

      clone->srcs[2]->uim_val = components;
      clone->srcs[1]->uim_val += comp * component_bytes(reload->dsts[0]);
   }

   list_delinit(&reload->node);
}

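/* The post-RA scheduler doesn't know that spills and reloads of (possibly
 * overlapping) spill slots must stay ordered with respect to each other, so
 * add false dependencies: the forward scan keeps every spill/reload after the
 * closest preceding spill, and the backward scan keeps every spill after any
 * preceding spill/reload. Reloads may still reorder freely relative to other
 * reloads, which is safe since they don't write memory.
 */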
static void
add_spill_reload_deps(struct ir3_block *block)
{
   struct ir3_instruction *last_spill = NULL;

   foreach_instr (instr, &block->instr_list) {
      if ((instr->opc == OPC_SPILL_MACRO || instr->opc == OPC_RELOAD_MACRO) &&
          last_spill) {
         ir3_instr_add_dep(instr, last_spill);
      }

      if (instr->opc == OPC_SPILL_MACRO)
         last_spill = instr;
   }

   last_spill = NULL;

   foreach_instr_rev (instr, &block->instr_list) {
      if ((instr->opc == OPC_SPILL_MACRO || instr->opc == OPC_RELOAD_MACRO) &&
          last_spill) {
         ir3_instr_add_dep(last_spill, instr);
      }

      if (instr->opc == OPC_SPILL_MACRO)
         last_spill = instr;
   }
}

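/* Lower everything left over from the spilling pass: fix up offsets that
 * don't fit in the cat6 immediate, split wide spills/reloads into legal
 * 4-component pieces, add scheduling dependencies, and finally rewrite the
 * macros into real stp/ldp instructions.
 */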
bool
ir3_lower_spill(struct ir3 *ir)
{
   foreach_block (block, &ir->block_list) {
      foreach_instr_safe (instr, &block->instr_list) {
         if (instr->opc == OPC_SPILL_MACRO) {
            handle_oob_offset_spill(instr);
            split_spill(instr);
         } else if (instr->opc == OPC_RELOAD_MACRO) {
            handle_oob_offset_reload(instr);
            split_reload(instr);
         }
      }

      add_spill_reload_deps(block);

      foreach_instr (instr, &block->instr_list) {
         if (instr->opc == OPC_SPILL_MACRO)
            instr->opc = OPC_STP;
         else if (instr->opc == OPC_RELOAD_MACRO)
            instr->opc = OPC_LDP;
      }
   }

   return true;
}