/*
 * Copyright 2023 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#include "brw_fs.h"
#include "brw_fs_builder.h"

using namespace brw;

/**
 * Lower a DPAS whose matrix sources are half-float into a MUL followed by a
 * chain of MAC instructions.
 *
 * For every one of the inst->rcount rows, 2 * inst->sdepth products are
 * accumulated; the final MAC writes the sum to a temporary HF VGRF, which
 * is then moved (src[0] null) or added to src[0] and stored to the row of
 * the destination.
 *
 * \param bld   Builder used to emit the replacement instructions.
 * \param inst  The DPAS instruction being lowered; it is only read here.
 */
static void
f16_using_mac(const fs_builder &bld, fs_inst *inst)
{
   /* Only configurations in which the destination and the accumulator
    * source share one type are meant to be supported.
    */
   assert(inst->src[0].is_null() || inst->dst.type == inst->src[0].type);

   assert(inst->src[1].type == BRW_TYPE_HF);
   assert(inst->src[2].type == BRW_TYPE_HF);

   const brw_reg_type acc_type = inst->dst.type;

   const brw_reg dst = inst->dst;
   brw_reg addend = inst->src[0];
   const brw_reg lhs = retype(inst->src[1], BRW_TYPE_HF);
   const brw_reg rhs = retype(inst->src[2], BRW_TYPE_HF);

   /* Byte distance between consecutive rows of the destination / src[0]. */
   const unsigned row_bytes =
      dst.type == BRW_TYPE_HF ? REG_SIZE / 2 : REG_SIZE;

   for (unsigned row = 0; row < inst->rcount; row++) {
      brw_reg dot = bld.vgrf(BRW_TYPE_HF);

      for (unsigned half = 0; half < 2; half++) {
         for (unsigned step = 0; step < inst->sdepth; step++) {
            /* Operands shared by the MUL and the MAC forms below. */
            const brw_reg a =
               subscript(retype(byte_offset(lhs, step * REG_SIZE),
                                BRW_TYPE_UD),
                         BRW_TYPE_HF, half);
            const brw_reg b =
               component(retype(byte_offset(rhs, row * REG_SIZE),
                                BRW_TYPE_HF),
                         step * 2 + half);

            /* The first multiply of the dot product must name the
             * accumulator register as its explicit destination.  Each
             * following MAC implicitly reads *and* writes the accumulator,
             * and may additionally write some other register explicitly.
             *
             * FINISHME: The accumulator can actually hold 16 HF values. On
             * Gfx12 there are two accumulators. It should be possible to do
             * this in SIMD16 or even SIMD32. I was unable to get this to
             * work properly.
             */
            fs_inst *emitted;

            if (half == 0 && step == 0) {
               const unsigned acc_width = 8;
               const brw_reg acc_ud =
                  suboffset(retype(brw_acc_reg(inst->exec_size),
                                   BRW_TYPE_UD),
                            inst->group % acc_width);

               const brw_reg acc =
                  bld.shader->devinfo->verx10 >= 125 ?
                  subscript(acc_ud, BRW_TYPE_HF, half) :
                  retype(acc_ud, BRW_TYPE_HF);

               emitted = bld.MUL(acc, a, b);
            } else {
               /* Optimization passes do not understand the subtleties of
                * MAC's optional explicit destination, so only the very
                * last MAC of the sequence writes the result register; all
                * the others target the null register.
                */
               const bool final_mac =
                  half == 1 && (step + 1) == inst->sdepth;
               const brw_reg out =
                  final_mac ? dot : retype(bld.null_reg_ud(), BRW_TYPE_HF);

               emitted = bld.MAC(out, a, b);
            }

            emitted->writes_accumulator = true;
         }
      }

      const brw_reg dst_row = byte_offset(dst, row * row_bytes);

      if (addend.is_null()) {
         bld.MOV(dst_row, dot);
      } else if (acc_type == BRW_TYPE_HF) {
         bld.ADD(dst_row, dot, byte_offset(addend, row * row_bytes));
      } else {
         /* Convert the HF sum to the accumulator type before adding. */
         brw_reg converted = bld.vgrf(acc_type);

         bld.MOV(converted, dot);
         bld.ADD(dst_row, converted, byte_offset(addend, row * row_bytes));
      }
   }
}

/**
 * Lower a DPAS with byte-typed matrix sources into a sequence of DP4A
 * instructions.
 *
 * Each destination row is first initialized from src[0] (or zero when
 * src[0] is null) and then updated in place by inst->sdepth DP4As, each of
 * which inherits the saturate flag of the DPAS.
 *
 * \param bld   Builder used to emit the replacement instructions.
 * \param inst  The DPAS instruction being lowered; it is only read here.
 */
static void
int8_using_dp4a(const fs_builder &bld, fs_inst *inst)
{
   /* Only configurations in which the destination and the accumulator
    * source share one type are meant to be supported.
    */
   assert(inst->src[0].is_null() || inst->dst.type == inst->src[0].type);

   assert(inst->src[1].type == BRW_TYPE_B ||
          inst->src[1].type == BRW_TYPE_UB);
   assert(inst->src[2].type == BRW_TYPE_B ||
          inst->src[2].type == BRW_TYPE_UB);

   /* Packed bytes are handed to DP4A as dwords of matching signedness. */
   const brw_reg lhs =
      retype(inst->src[1],
             inst->src[1].type == BRW_TYPE_UB ? BRW_TYPE_UD : BRW_TYPE_D);
   const brw_reg rhs =
      retype(inst->src[2],
             inst->src[2].type == BRW_TYPE_UB ? BRW_TYPE_UD : BRW_TYPE_D);

   const unsigned row_bytes = reg_unit(bld.shader->devinfo) * REG_SIZE;

   brw_reg acc = inst->dst;
   brw_reg addend = inst->src[0];

   for (unsigned row = 0; row < inst->rcount; row++) {
      if (addend.is_null()) {
         bld.MOV(acc, retype(brw_imm_d(0), acc.type));
      } else {
         bld.MOV(acc, addend);
         addend = byte_offset(addend, row_bytes);
      }

      for (unsigned step = 0; step < inst->sdepth; step++) {
         const brw_reg a = byte_offset(lhs, step * inst->exec_size * 4);
         const brw_reg b =
            component(byte_offset(rhs, row * inst->sdepth * 4), step);

         fs_inst *dp4a = bld.DP4A(acc, acc, a, b);
         dp4a->saturate = inst->saturate;
      }

      /* Step to the next row of the destination. */
      acc = byte_offset(acc, row_bytes);
   }
}

/**
 * Lower a DPAS with byte-typed matrix sources into MUL and ADD
 * instructions.
 *
 * Each destination row is first initialized from src[0] (or zero when
 * src[0] is null).  Then, for each of the inst->sdepth steps, four
 * byte-by-word products are formed, summed pairwise, and the total is added
 * to the row; only that last ADD inherits the saturate flag of the DPAS.
 *
 * \param bld   Builder used to emit the replacement instructions.
 * \param inst  The DPAS instruction being lowered; it is only read here.
 */
static void
int8_using_mul_add(const fs_builder &bld, fs_inst *inst)
{
   /* Only configurations in which the destination and the accumulator
    * source share one type are meant to be supported.
    */
   assert(inst->src[0].is_null() || inst->dst.type == inst->src[0].type);

   assert(inst->src[1].type == BRW_TYPE_B ||
          inst->src[1].type == BRW_TYPE_UB);
   assert(inst->src[2].type == BRW_TYPE_B ||
          inst->src[2].type == BRW_TYPE_UB);

   const brw_reg_type acc_type = inst->dst.type;

   const brw_reg lhs =
      retype(inst->src[1],
             inst->src[1].type == BRW_TYPE_UB ? BRW_TYPE_UD : BRW_TYPE_D);
   const brw_reg rhs =
      retype(inst->src[2],
             inst->src[2].type == BRW_TYPE_UB ? BRW_TYPE_UD : BRW_TYPE_D);

   /* Products are signed words as soon as either source is signed. */
   const brw_reg_type word_type =
      (inst->src[1].type == BRW_TYPE_B ||
       inst->src[2].type == BRW_TYPE_B) ? BRW_TYPE_W : BRW_TYPE_UW;

   const unsigned row_bytes = REG_SIZE;

   brw_reg acc = inst->dst;
   brw_reg addend = inst->src[0];

   for (unsigned row = 0; row < inst->rcount; row++) {
      if (addend.is_null()) {
         bld.MOV(acc, retype(brw_imm_d(0), acc.type));
      } else {
         bld.MOV(acc, addend);
         addend = byte_offset(addend, row_bytes);
      }

      for (unsigned step = 0; step < inst->sdepth; step++) {
         /* pair_sum[0] collects bytes 0 and 1, pair_sum[1] bytes 2 and 3.
          * The braced initializer evaluates left to right, so the VGRFs
          * are allocated in a fixed order.
          */
         brw_reg pair_sum[2] = {
            bld.vgrf(BRW_TYPE_UD),
            bld.vgrf(BRW_TYPE_UD),
         };
         brw_reg widened = bld.vgrf(BRW_TYPE_UD, 2);

         /* Expand 8 dwords of packed bytes into 16 dwords of packed
          * words.
          *
          * FINISHME: Gfx9 should not need this work around. Gfx11
          * may be able to use integer MAD. Both platforms may be
          * able to use MAC.
          *
          * NOTE(review): the operands of this MOV depend on the row only,
          * not on the step, so it looks hoistable out of this inner loop
          * -- confirm before changing.
          */
         bld.group(32, 0).MOV(retype(widened, word_type),
                              retype(byte_offset(rhs, row * REG_SIZE),
                                     inst->src[2].type));

         const brw_reg lhs_dword =
            retype(byte_offset(lhs, step * REG_SIZE), BRW_TYPE_UD);

         /* One product per byte of the src[1] dword.  Byte `elem` lands in
          * word `elem % 2` of pair_sum[elem / 2] and is multiplied by the
          * same-numbered word of dword `step * 2 + elem / 2` of the
          * expanded src[2] data.
          */
         for (unsigned elem = 0; elem < 4; elem++) {
            const unsigned pair = elem / 2;
            const unsigned lane = elem % 2;

            bld.MUL(subscript(pair_sum[pair], word_type, lane),
                    subscript(lhs_dword, inst->src[1].type, elem),
                    subscript(component(retype(widened, BRW_TYPE_UD),
                                        step * 2 + pair),
                              word_type, lane));
         }

         /* Sum the two word products held in each temporary. */
         for (unsigned pair = 0; pair < 2; pair++) {
            bld.ADD(subscript(pair_sum[pair], acc_type, 0),
                    subscript(pair_sum[pair], word_type, 0),
                    subscript(pair_sum[pair], word_type, 1));
         }

         /* Combine both partial sums, then accumulate into the row. */
         bld.ADD(retype(pair_sum[0], acc_type),
                 retype(pair_sum[0], acc_type),
                 retype(pair_sum[1], acc_type));

         fs_inst *accumulate =
            bld.ADD(acc, acc, retype(pair_sum[0], acc_type));
         accumulate->saturate = inst->saturate;
      }

      /* Step to the next row of the destination. */
      acc = byte_offset(acc, row_bytes);
   }
}

/**
 * Replace every BRW_OPCODE_DPAS instruction in the shader with an
 * equivalent sequence of simpler instructions.
 *
 * Float destinations go through f16_using_mac(); other destinations use
 * int8_using_dp4a() when devinfo->ver >= 12 and int8_using_mul_add()
 * otherwise.
 *
 * \param v  Shader whose CFG is scanned and rewritten.
 * \return   true if at least one DPAS instruction was lowered.
 */
bool
brw_lower_dpas(fs_visitor &v)
{
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, v.cfg) {
      if (inst->opcode == BRW_OPCODE_DPAS) {
         const unsigned lowered_width = v.devinfo->ver >= 20 ? 16 : 8;
         const fs_builder bld =
            fs_builder(&v, block, inst).group(lowered_width, 0).exec_all();

         if (brw_type_is_float(inst->dst.type))
            f16_using_mac(bld, inst);
         else if (v.devinfo->ver >= 12)
            int8_using_dp4a(bld, inst);
         else
            int8_using_mul_add(bld, inst);

         /* The replacement sequence was emitted; drop the DPAS itself. */
         inst->remove(block);
         progress = true;
      }
   }

   if (progress)
      v.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);

   return progress;
}