• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* -*- c++ -*- */
2 /*
3  * Copyright © 2010-2015 Intel Corporation
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice (including the next
13  * paragraph) shall be included in all copies or substantial portions of the
14  * Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22  * IN THE SOFTWARE.
23  */
24 
25 #pragma once
26 
27 #include "elk_shader.h"
28 
29 class elk_fs_inst;
30 
31 class elk_fs_reg : public elk_backend_reg {
32 public:
33    DECLARE_RALLOC_CXX_OPERATORS(elk_fs_reg)
34 
35    void init();
36 
37    elk_fs_reg();
38    elk_fs_reg(struct ::elk_reg reg);
39    elk_fs_reg(enum elk_reg_file file, unsigned nr);
40    elk_fs_reg(enum elk_reg_file file, unsigned nr, enum elk_reg_type type);
41 
42    bool equals(const elk_fs_reg &r) const;
43    bool negative_equals(const elk_fs_reg &r) const;
44    bool is_contiguous() const;
45 
46    /**
47     * Return the size in bytes of a single logical component of the
48     * register assuming the given execution width.
49     */
50    unsigned component_size(unsigned width) const;
51 
52    /** Register region horizontal stride */
53    uint8_t stride;
54 };
55 
56 static inline elk_fs_reg
negate(elk_fs_reg reg)57 negate(elk_fs_reg reg)
58 {
59    assert(reg.file != IMM);
60    reg.negate = !reg.negate;
61    return reg;
62 }
63 
64 static inline elk_fs_reg
retype(elk_fs_reg reg,enum elk_reg_type type)65 retype(elk_fs_reg reg, enum elk_reg_type type)
66 {
67    reg.type = type;
68    return reg;
69 }
70 
71 static inline elk_fs_reg
byte_offset(elk_fs_reg reg,unsigned delta)72 byte_offset(elk_fs_reg reg, unsigned delta)
73 {
74    switch (reg.file) {
75    case BAD_FILE:
76       break;
77    case VGRF:
78    case ATTR:
79    case UNIFORM:
80       reg.offset += delta;
81       break;
82    case MRF: {
83       const unsigned suboffset = reg.offset + delta;
84       reg.nr += suboffset / REG_SIZE;
85       reg.offset = suboffset % REG_SIZE;
86       break;
87    }
88    case ARF:
89    case FIXED_GRF: {
90       const unsigned suboffset = reg.subnr + delta;
91       reg.nr += suboffset / REG_SIZE;
92       reg.subnr = suboffset % REG_SIZE;
93       break;
94    }
95    case IMM:
96    default:
97       assert(delta == 0);
98    }
99    return reg;
100 }
101 
102 static inline elk_fs_reg
horiz_offset(const elk_fs_reg & reg,unsigned delta)103 horiz_offset(const elk_fs_reg &reg, unsigned delta)
104 {
105    switch (reg.file) {
106    case BAD_FILE:
107    case UNIFORM:
108    case IMM:
109       /* These only have a single component that is implicitly splatted.  A
110        * horizontal offset should be a harmless no-op.
111        * XXX - Handle vector immediates correctly.
112        */
113       return reg;
114    case VGRF:
115    case MRF:
116    case ATTR:
117       return byte_offset(reg, delta * reg.stride * type_sz(reg.type));
118    case ARF:
119    case FIXED_GRF:
120       if (reg.is_null()) {
121          return reg;
122       } else {
123          const unsigned hstride = reg.hstride ? 1 << (reg.hstride - 1) : 0;
124          const unsigned vstride = reg.vstride ? 1 << (reg.vstride - 1) : 0;
125          const unsigned width = 1 << reg.width;
126 
127          if (delta % width == 0) {
128             return byte_offset(reg, delta / width * vstride * type_sz(reg.type));
129          } else {
130             assert(vstride == hstride * width);
131             return byte_offset(reg, delta * hstride * type_sz(reg.type));
132          }
133       }
134    }
135    unreachable("Invalid register file");
136 }
137 
138 static inline elk_fs_reg
offset(elk_fs_reg reg,unsigned width,unsigned delta)139 offset(elk_fs_reg reg, unsigned width, unsigned delta)
140 {
141    switch (reg.file) {
142    case BAD_FILE:
143       break;
144    case ARF:
145    case FIXED_GRF:
146    case MRF:
147    case VGRF:
148    case ATTR:
149    case UNIFORM:
150       return byte_offset(reg, delta * reg.component_size(width));
151    case IMM:
152       assert(delta == 0);
153    }
154    return reg;
155 }
156 
157 /**
158  * Get the scalar channel of \p reg given by \p idx and replicate it to all
159  * channels of the result.
160  */
161 static inline elk_fs_reg
component(elk_fs_reg reg,unsigned idx)162 component(elk_fs_reg reg, unsigned idx)
163 {
164    reg = horiz_offset(reg, idx);
165    reg.stride = 0;
166    if (reg.file == ARF || reg.file == FIXED_GRF) {
167       reg.vstride = ELK_VERTICAL_STRIDE_0;
168       reg.width = ELK_WIDTH_1;
169       reg.hstride = ELK_HORIZONTAL_STRIDE_0;
170    }
171    return reg;
172 }
173 
174 /**
175  * Return an integer identifying the discrete address space a register is
176  * contained in.  A register is by definition fully contained in the single
177  * reg_space it belongs to, so two registers with different reg_space ids are
178  * guaranteed not to overlap.  Most register files are a single reg_space of
179  * its own, only the VGRF and ATTR files are composed of multiple discrete
180  * address spaces, one for each allocation and input attribute respectively.
181  */
182 static inline uint32_t
reg_space(const elk_fs_reg & r)183 reg_space(const elk_fs_reg &r)
184 {
185    return r.file << 16 | (r.file == VGRF || r.file == ATTR ? r.nr : 0);
186 }
187 
188 /**
189  * Return the base offset in bytes of a register relative to the start of its
190  * reg_space().
191  */
192 static inline unsigned
reg_offset(const elk_fs_reg & r)193 reg_offset(const elk_fs_reg &r)
194 {
195    return (r.file == VGRF || r.file == IMM || r.file == ATTR ? 0 : r.nr) *
196           (r.file == UNIFORM ? 4 : REG_SIZE) + r.offset +
197           (r.file == ARF || r.file == FIXED_GRF ? r.subnr : 0);
198 }
199 
200 /**
201  * Return the amount of padding in bytes left unused between individual
202  * components of register \p r due to a (horizontal) stride value greater than
203  * one, or zero if components are tightly packed in the register file.
204  */
205 static inline unsigned
reg_padding(const elk_fs_reg & r)206 reg_padding(const elk_fs_reg &r)
207 {
208    const unsigned stride = ((r.file != ARF && r.file != FIXED_GRF) ? r.stride :
209                             r.hstride == 0 ? 0 :
210                             1 << (r.hstride - 1));
211    return (MAX2(1, stride) - 1) * type_sz(r.type);
212 }
213 
214 /* Do not call this directly. Call regions_overlap() instead. */
215 static inline bool
regions_overlap_MRF(const elk_fs_reg & r,unsigned dr,const elk_fs_reg & s,unsigned ds)216 regions_overlap_MRF(const elk_fs_reg &r, unsigned dr, const elk_fs_reg &s, unsigned ds)
217 {
218    if (r.nr & ELK_MRF_COMPR4) {
219       elk_fs_reg t = r;
220       t.nr &= ~ELK_MRF_COMPR4;
221       /* COMPR4 regions are translated by the hardware during decompression
222        * into two separate half-regions 4 MRFs apart from each other.
223        *
224        * Note: swapping s and t in this parameter list eliminates one possible
225        * level of recursion (since the s in the called versions of
226        * regions_overlap_MRF can't be COMPR4), and that makes the compiled
227        * code a lot smaller.
228        */
229       return regions_overlap_MRF(s, ds, t, dr / 2) ||
230              regions_overlap_MRF(s, ds, byte_offset(t, 4 * REG_SIZE), dr / 2);
231    } else if (s.nr & ELK_MRF_COMPR4) {
232       return regions_overlap_MRF(s, ds, r, dr);
233    }
234 
235    return !((r.nr * REG_SIZE + r.offset + dr) <= (s.nr * REG_SIZE + s.offset) ||
236             (s.nr * REG_SIZE + s.offset + ds) <= (r.nr * REG_SIZE + r.offset));
237 }
238 
239 /**
240  * Return whether the register region starting at \p r and spanning \p dr
241  * bytes could potentially overlap the register region starting at \p s and
242  * spanning \p ds bytes.
243  */
244 static inline bool
regions_overlap(const elk_fs_reg & r,unsigned dr,const elk_fs_reg & s,unsigned ds)245 regions_overlap(const elk_fs_reg &r, unsigned dr, const elk_fs_reg &s, unsigned ds)
246 {
247    if (r.file != s.file)
248       return false;
249 
250    if (r.file == VGRF) {
251       return r.nr == s.nr &&
252              !(r.offset + dr <= s.offset || s.offset + ds <= r.offset);
253    } else if (r.file != MRF) {
254       return !(reg_offset(r) + dr <= reg_offset(s) ||
255                reg_offset(s) + ds <= reg_offset(r));
256    } else {
257       return regions_overlap_MRF(r, dr, s, ds);
258    }
259 }
260 
261 /**
262  * Check that the register region given by r [r.offset, r.offset + dr[
263  * is fully contained inside the register region given by s
264  * [s.offset, s.offset + ds[.
265  */
266 static inline bool
region_contained_in(const elk_fs_reg & r,unsigned dr,const elk_fs_reg & s,unsigned ds)267 region_contained_in(const elk_fs_reg &r, unsigned dr, const elk_fs_reg &s, unsigned ds)
268 {
269    return reg_space(r) == reg_space(s) &&
270           reg_offset(r) >= reg_offset(s) &&
271           reg_offset(r) + dr <= reg_offset(s) + ds;
272 }
273 
274 /**
275  * Return whether the given register region is n-periodic, i.e. whether the
276  * original region remains invariant after shifting it by \p n scalar
277  * channels.
278  */
279 static inline bool
is_periodic(const elk_fs_reg & reg,unsigned n)280 is_periodic(const elk_fs_reg &reg, unsigned n)
281 {
282    if (reg.file == BAD_FILE || reg.is_null()) {
283       return true;
284 
285    } else if (reg.file == IMM) {
286       const unsigned period = (reg.type == ELK_REGISTER_TYPE_UV ||
287                                reg.type == ELK_REGISTER_TYPE_V ? 8 :
288                                reg.type == ELK_REGISTER_TYPE_VF ? 4 :
289                                1);
290       return n % period == 0;
291 
292    } else if (reg.file == ARF || reg.file == FIXED_GRF) {
293       const unsigned period = (reg.hstride == 0 && reg.vstride == 0 ? 1 :
294                                reg.vstride == 0 ? 1 << reg.width :
295                                ~0);
296       return n % period == 0;
297 
298    } else {
299       return reg.stride == 0;
300    }
301 }
302 
303 static inline bool
is_uniform(const elk_fs_reg & reg)304 is_uniform(const elk_fs_reg &reg)
305 {
306    return is_periodic(reg, 1);
307 }
308 
309 /**
310  * Get the specified 8-component quarter of a register.
311  */
312 static inline elk_fs_reg
quarter(const elk_fs_reg & reg,unsigned idx)313 quarter(const elk_fs_reg &reg, unsigned idx)
314 {
315    assert(idx < 4);
316    return horiz_offset(reg, 8 * idx);
317 }
318 
319 /**
320  * Reinterpret each channel of register \p reg as a vector of values of the
321  * given smaller type and take the i-th subcomponent from each.
322  */
323 static inline elk_fs_reg
subscript(elk_fs_reg reg,elk_reg_type type,unsigned i)324 subscript(elk_fs_reg reg, elk_reg_type type, unsigned i)
325 {
326    assert((i + 1) * type_sz(type) <= type_sz(reg.type));
327 
328    if (reg.file == ARF || reg.file == FIXED_GRF) {
329       /* The stride is encoded inconsistently for fixed GRF and ARF registers
330        * as the log2 of the actual vertical and horizontal strides.
331        */
332       const int delta = util_logbase2(type_sz(reg.type)) -
333                         util_logbase2(type_sz(type));
334       reg.hstride += (reg.hstride ? delta : 0);
335       reg.vstride += (reg.vstride ? delta : 0);
336 
337    } else if (reg.file == IMM) {
338       unsigned bit_size = type_sz(type) * 8;
339       reg.u64 >>= i * bit_size;
340       reg.u64 &= BITFIELD64_MASK(bit_size);
341       if (bit_size <= 16)
342          reg.u64 |= reg.u64 << 16;
343       return retype(reg, type);
344    } else {
345       reg.stride *= type_sz(reg.type) / type_sz(type);
346    }
347 
348    return byte_offset(retype(reg, type), i * type_sz(type));
349 }
350 
351 static inline elk_fs_reg
horiz_stride(elk_fs_reg reg,unsigned s)352 horiz_stride(elk_fs_reg reg, unsigned s)
353 {
354    reg.stride *= s;
355    return reg;
356 }
357 
/* Default-constructed register, one copy per translation unit.
 * NOTE(review): presumably stands for an undefined/unused operand; confirm
 * against the elk_fs_reg default constructor.
 */
static const elk_fs_reg reg_undef;
359 
360 class elk_fs_inst : public elk_backend_instruction {
361    elk_fs_inst &operator=(const elk_fs_inst &);
362 
363    void init(enum elk_opcode opcode, uint8_t exec_width, const elk_fs_reg &dst,
364              const elk_fs_reg *src, unsigned sources);
365 
366 public:
367    DECLARE_RALLOC_CXX_OPERATORS(elk_fs_inst)
368 
369    elk_fs_inst();
370    elk_fs_inst(enum elk_opcode opcode, uint8_t exec_size);
371    elk_fs_inst(enum elk_opcode opcode, uint8_t exec_size, const elk_fs_reg &dst);
372    elk_fs_inst(enum elk_opcode opcode, uint8_t exec_size, const elk_fs_reg &dst,
373            const elk_fs_reg &src0);
374    elk_fs_inst(enum elk_opcode opcode, uint8_t exec_size, const elk_fs_reg &dst,
375            const elk_fs_reg &src0, const elk_fs_reg &src1);
376    elk_fs_inst(enum elk_opcode opcode, uint8_t exec_size, const elk_fs_reg &dst,
377            const elk_fs_reg &src0, const elk_fs_reg &src1, const elk_fs_reg &src2);
378    elk_fs_inst(enum elk_opcode opcode, uint8_t exec_size, const elk_fs_reg &dst,
379            const elk_fs_reg src[], unsigned sources);
380    elk_fs_inst(const elk_fs_inst &that);
381    ~elk_fs_inst();
382 
383    void resize_sources(uint8_t num_sources);
384 
385    bool is_send_from_grf() const;
386    bool is_payload(unsigned arg) const;
387    bool is_partial_write() const;
388    unsigned components_read(unsigned i) const;
389    unsigned size_read(int arg) const;
390    bool can_do_source_mods(const struct intel_device_info *devinfo) const;
391    bool can_do_cmod();
392    bool can_change_types() const;
393    bool has_source_and_destination_hazard() const;
394    unsigned implied_mrf_writes() const;
395 
396    /**
397     * Return whether \p arg is a control source of a virtual instruction which
398     * shouldn't contribute to the execution type and usual regioning
399     * restriction calculations of arithmetic instructions.
400     */
401    bool is_control_source(unsigned arg) const;
402 
403    /**
404     * Return the subset of flag registers read by the instruction as a bitset
405     * with byte granularity.
406     */
407    unsigned flags_read(const intel_device_info *devinfo) const;
408 
409    /**
410     * Return the subset of flag registers updated by the instruction (either
411     * partially or fully) as a bitset with byte granularity.
412     */
413    unsigned flags_written(const intel_device_info *devinfo) const;
414 
415    /**
416     * Return true if this instruction is a sampler message gathering residency
417     * data.
418     */
419    bool has_sampler_residency() const;
420 
421    elk_fs_reg dst;
422    elk_fs_reg *src;
423 
424    uint8_t sources; /**< Number of elk_fs_reg sources. */
425 
426    bool last_rt:1;
427    bool pi_noperspective:1;   /**< Pixel interpolator noperspective flag */
428    bool keep_payload_trailing_zeros;
429 };
430 
431 /**
432  * Make the execution of \p inst dependent on the evaluation of a possibly
433  * inverted predicate.
434  */
435 static inline elk_fs_inst *
set_predicate_inv(enum elk_predicate pred,bool inverse,elk_fs_inst * inst)436 set_predicate_inv(enum elk_predicate pred, bool inverse,
437                   elk_fs_inst *inst)
438 {
439    inst->predicate = pred;
440    inst->predicate_inverse = inverse;
441    return inst;
442 }
443 
444 /**
445  * Make the execution of \p inst dependent on the evaluation of a predicate.
446  */
447 static inline elk_fs_inst *
set_predicate(enum elk_predicate pred,elk_fs_inst * inst)448 set_predicate(enum elk_predicate pred, elk_fs_inst *inst)
449 {
450    return set_predicate_inv(pred, false, inst);
451 }
452 
453 /**
454  * Write the result of evaluating the condition given by \p mod to a flag
455  * register.
456  */
457 static inline elk_fs_inst *
set_condmod(enum elk_conditional_mod mod,elk_fs_inst * inst)458 set_condmod(enum elk_conditional_mod mod, elk_fs_inst *inst)
459 {
460    inst->conditional_mod = mod;
461    return inst;
462 }
463 
464 /**
465  * Clamp the result of \p inst to the saturation range of its destination
466  * datatype.
467  */
468 static inline elk_fs_inst *
set_saturate(bool saturate,elk_fs_inst * inst)469 set_saturate(bool saturate, elk_fs_inst *inst)
470 {
471    inst->saturate = saturate;
472    return inst;
473 }
474 
475 /**
476  * Return the number of dataflow registers written by the instruction (either
477  * fully or partially) counted from 'floor(reg_offset(inst->dst) /
478  * register_size)'.  The somewhat arbitrary register size unit is 4B for the
479  * UNIFORM and IMM files and 32B for all other files.
480  */
481 inline unsigned
regs_written(const elk_fs_inst * inst)482 regs_written(const elk_fs_inst *inst)
483 {
484    assert(inst->dst.file != UNIFORM && inst->dst.file != IMM);
485    return DIV_ROUND_UP(reg_offset(inst->dst) % REG_SIZE +
486                        inst->size_written -
487                        MIN2(inst->size_written, reg_padding(inst->dst)),
488                        REG_SIZE);
489 }
490 
491 /**
492  * Return the number of dataflow registers read by the instruction (either
493  * fully or partially) counted from 'floor(reg_offset(inst->src[i]) /
494  * register_size)'.  The somewhat arbitrary register size unit is 4B for the
495  * UNIFORM files and 32B for all other files.
496  */
497 inline unsigned
regs_read(const elk_fs_inst * inst,unsigned i)498 regs_read(const elk_fs_inst *inst, unsigned i)
499 {
500    if (inst->src[i].file == IMM)
501       return 1;
502 
503    const unsigned reg_size = inst->src[i].file == UNIFORM ? 4 : REG_SIZE;
504    return DIV_ROUND_UP(reg_offset(inst->src[i]) % reg_size +
505                        inst->size_read(i) -
506                        MIN2(inst->size_read(i), reg_padding(inst->src[i])),
507                        reg_size);
508 }
509 
510 static inline enum elk_reg_type
get_exec_type(const elk_fs_inst * inst)511 get_exec_type(const elk_fs_inst *inst)
512 {
513    elk_reg_type exec_type = ELK_REGISTER_TYPE_B;
514 
515    for (int i = 0; i < inst->sources; i++) {
516       if (inst->src[i].file != BAD_FILE &&
517           !inst->is_control_source(i)) {
518          const elk_reg_type t = get_exec_type(inst->src[i].type);
519          if (type_sz(t) > type_sz(exec_type))
520             exec_type = t;
521          else if (type_sz(t) == type_sz(exec_type) &&
522                   elk_reg_type_is_floating_point(t))
523             exec_type = t;
524       }
525    }
526 
527    if (exec_type == ELK_REGISTER_TYPE_B)
528       exec_type = inst->dst.type;
529 
530    assert(exec_type != ELK_REGISTER_TYPE_B);
531 
532    /* Promotion of the execution type to 32-bit for conversions from or to
533     * half-float seems to be consistent with the following text from the
534     * Cherryview PRM Vol. 7, "Execution Data Type":
535     *
536     * "When single precision and half precision floats are mixed between
537     *  source operands or between source and destination operand [..] single
538     *  precision float is the execution datatype."
539     *
540     * and from "Register Region Restrictions":
541     *
542     * "Conversion between Integer and HF (Half Float) must be DWord aligned
543     *  and strided by a DWord on the destination."
544     */
545    if (type_sz(exec_type) == 2 &&
546        inst->dst.type != exec_type) {
547       if (exec_type == ELK_REGISTER_TYPE_HF)
548          exec_type = ELK_REGISTER_TYPE_F;
549       else if (inst->dst.type == ELK_REGISTER_TYPE_HF)
550          exec_type = ELK_REGISTER_TYPE_D;
551    }
552 
553    return exec_type;
554 }
555 
556 static inline unsigned
get_exec_type_size(const elk_fs_inst * inst)557 get_exec_type_size(const elk_fs_inst *inst)
558 {
559    return type_sz(get_exec_type(inst));
560 }
561 
562 static inline bool
is_send(const elk_fs_inst * inst)563 is_send(const elk_fs_inst *inst)
564 {
565    return inst->mlen || inst->is_send_from_grf();
566 }
567 
568 /**
569  * Return whether the instruction isn't an ALU instruction and cannot be
570  * assumed to complete in-order.
571  */
572 static inline bool
is_unordered(const intel_device_info * devinfo,const elk_fs_inst * inst)573 is_unordered(const intel_device_info *devinfo, const elk_fs_inst *inst)
574 {
575    return is_send(inst) || inst->is_math() ||
576           (devinfo->has_64bit_float_via_math_pipe &&
577            (get_exec_type(inst) == ELK_REGISTER_TYPE_DF ||
578             inst->dst.type == ELK_REGISTER_TYPE_DF));
579 }
580 
581 /**
582  * Return whether the following regioning restriction applies to the specified
583  * instruction.  From the Cherryview PRM Vol 7. "Register Region
584  * Restrictions":
585  *
586  * "When source or destination datatype is 64b or operation is integer DWord
587  *  multiply, regioning in Align1 must follow these rules:
588  *
589  *  1. Source and Destination horizontal stride must be aligned to the same qword.
590  *  2. Regioning must ensure Src.Vstride = Src.Width * Src.Hstride.
591  *  3. Source and Destination offset must be the same, except the case of
592  *     scalar source."
593  */
594 static inline bool
has_dst_aligned_region_restriction(const intel_device_info * devinfo,const elk_fs_inst * inst,elk_reg_type dst_type)595 has_dst_aligned_region_restriction(const intel_device_info *devinfo,
596                                    const elk_fs_inst *inst,
597                                    elk_reg_type dst_type)
598 {
599    const elk_reg_type exec_type = get_exec_type(inst);
600    /* Even though the hardware spec claims that "integer DWord multiply"
601     * operations are restricted, empirical evidence and the behavior of the
602     * simulator suggest that only 32x32-bit integer multiplication is
603     * restricted.
604     */
605    const bool is_dword_multiply = !elk_reg_type_is_floating_point(exec_type) &&
606       ((inst->opcode == ELK_OPCODE_MUL &&
607         MIN2(type_sz(inst->src[0].type), type_sz(inst->src[1].type)) >= 4) ||
608        (inst->opcode == ELK_OPCODE_MAD &&
609         MIN2(type_sz(inst->src[1].type), type_sz(inst->src[2].type)) >= 4));
610 
611    if (type_sz(dst_type) > 4 || type_sz(exec_type) > 4 ||
612        (type_sz(exec_type) == 4 && is_dword_multiply))
613       return devinfo->platform == INTEL_PLATFORM_CHV;
614 
615    else
616       return false;
617 }
618 
619 static inline bool
has_dst_aligned_region_restriction(const intel_device_info * devinfo,const elk_fs_inst * inst)620 has_dst_aligned_region_restriction(const intel_device_info *devinfo,
621                                    const elk_fs_inst *inst)
622 {
623    return has_dst_aligned_region_restriction(devinfo, inst, inst->dst.type);
624 }
625 
626 /**
627  * Return whether the LOAD_PAYLOAD instruction is a plain copy of bits from
628  * the specified register file into a VGRF.
629  *
630  * This implies identity register regions without any source-destination
631  * overlap, but otherwise has no implications on the location of sources and
632  * destination in the register file: Gathering any number of portions from
633  * multiple virtual registers in any order is allowed.
634  */
635 inline bool
is_copy_payload(elk_reg_file file,const elk_fs_inst * inst)636 is_copy_payload(elk_reg_file file, const elk_fs_inst *inst)
637 {
638    if (inst->opcode != ELK_SHADER_OPCODE_LOAD_PAYLOAD ||
639        inst->is_partial_write() || inst->saturate ||
640        inst->dst.file != VGRF)
641       return false;
642 
643    for (unsigned i = 0; i < inst->sources; i++) {
644       if (inst->src[i].file != file ||
645           inst->src[i].abs || inst->src[i].negate)
646          return false;
647 
648       if (!inst->src[i].is_contiguous())
649          return false;
650 
651       if (regions_overlap(inst->dst, inst->size_written,
652                           inst->src[i], inst->size_read(i)))
653          return false;
654    }
655 
656    return true;
657 }
658 
659 /**
660  * Like is_copy_payload(), but the instruction is required to copy a single
661  * contiguous block of registers from the given register file into the
662  * destination without any reordering.
663  */
664 inline bool
is_identity_payload(elk_reg_file file,const elk_fs_inst * inst)665 is_identity_payload(elk_reg_file file, const elk_fs_inst *inst) {
666    if (is_copy_payload(file, inst)) {
667       elk_fs_reg reg = inst->src[0];
668 
669       for (unsigned i = 0; i < inst->sources; i++) {
670          reg.type = inst->src[i].type;
671          if (!inst->src[i].equals(reg))
672             return false;
673 
674          reg = byte_offset(reg, inst->size_read(i));
675       }
676 
677       return true;
678    } else {
679       return false;
680    }
681 }
682 
683 /**
684  * Like is_copy_payload(), but the instruction is required to source data from
685  * at least two disjoint VGRFs.
686  *
687  * This doesn't necessarily rule out the elimination of this instruction
688  * through register coalescing, but due to limitations of the register
689  * coalesce pass it might be impossible to do so directly until a later stage,
690  * when the LOAD_PAYLOAD instruction is unrolled into a sequence of MOV
691  * instructions.
692  */
693 inline bool
is_multi_copy_payload(const elk_fs_inst * inst)694 is_multi_copy_payload(const elk_fs_inst *inst) {
695    if (is_copy_payload(VGRF, inst)) {
696       for (unsigned i = 0; i < inst->sources; i++) {
697             if (inst->src[i].nr != inst->src[0].nr)
698                return true;
699       }
700    }
701 
702    return false;
703 }
704 
705 /**
706  * Like is_identity_payload(), but the instruction is required to copy the
707  * whole contents of a single VGRF into the destination.
708  *
709  * This means that there is a good chance that the instruction will be
710  * eliminated through register coalescing, but it's neither a necessary nor a
711  * sufficient condition for that to happen -- E.g. consider the case where
712  * source and destination registers diverge due to other instructions in the
713  * program overwriting part of their contents, which isn't something we can
714  * predict up front based on a cheap strictly local test of the copy
715  * instruction.
716  */
717 inline bool
is_coalescing_payload(const elk::simple_allocator & alloc,const elk_fs_inst * inst)718 is_coalescing_payload(const elk::simple_allocator &alloc, const elk_fs_inst *inst)
719 {
720    return is_identity_payload(VGRF, inst) &&
721           inst->src[0].offset == 0 &&
722           alloc.sizes[inst->src[0].nr] * REG_SIZE == inst->size_written;
723 }
724 
/* Declaration only; the definition is not part of this header.
 * NOTE(review): presumably reports whether \p inst would hit a register
 * bank conflict on the given ISA; confirm against the definition.
 */
bool
elk_has_bank_conflict(const struct elk_isa_info *isa, const elk_fs_inst *inst);
727