; RUN: llc -march=amdgcn -mcpu=gfx900 -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX900
; RUN: llc -march=amdgcn -mcpu=gfx906 -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906-UNSAFE
; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906
; RUN: llc -march=amdgcn -mcpu=gfx906 -mattr=-fp64-fp16-denormals,-fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906-CONTRACT
; RUN: llc -march=amdgcn -mcpu=gfx906 -mattr=+fp64-fp16-denormals,+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906-DENORM-CONTRACT

; Pattern under test:
; (fadd (fmul S1.x, S2.x), (fadd (fmul S1.y, S2.y), z)) -> (fdot2 S1, S2, z)

; Tests to make sure fdot2 is not generated when vector elements of dot-product expressions
; are not converted from f16 to f32.
; The whole computation below stays in half precision (no fpext).
; GCN-LABEL: {{^}}dotproduct_f16
; GFX900: v_fma_legacy_f16
; GFX900: v_fma_legacy_f16

; GFX906: v_mul_f16_e32
; GFX906: v_mul_f16_e32

; GFX906-UNSAFE: v_fma_legacy_f16

; GFX906-CONTRACT: v_mac_f16_e32
; GFX906-DENORM-CONTRACT: v_fma_legacy_f16
define amdgpu_kernel void @dotproduct_f16(<2 x half> addrspace(1)* %src1,
                                          <2 x half> addrspace(1)* %src2,
                                          half addrspace(1)* nocapture %dst) {
entry:
  %src1.vec = load <2 x half>, <2 x half> addrspace(1)* %src1
  %src2.vec = load <2 x half>, <2 x half> addrspace(1)* %src2

  %src1.el1 = extractelement <2 x half> %src1.vec, i64 0
  %src2.el1 = extractelement <2 x half> %src2.vec, i64 0

  %src1.el2 = extractelement <2 x half> %src1.vec, i64 1
  %src2.el2 = extractelement <2 x half> %src2.vec, i64 1

  %mul2 = fmul half %src1.el2, %src2.el2
  %mul1 = fmul half %src1.el1, %src2.el1
  %acc = load half, half addrspace(1)* %dst, align 2
  %acc1 = fadd half %mul2, %acc
  %acc2 = fadd half %mul1, %acc1
  store half %acc2, half addrspace(1)* %dst, align 2
  ret void
}


; We only want to generate fdot2 if vector element of dot product is converted from f16 to f32
; and the vectors are of type <2 x half>
; Here every extracted element is extended to float before the multiply-add chain.
; GCN-LABEL: {{^}}dotproduct_f16_f32
; GFX900: v_mad_mix_f32
; GFX900: v_mad_mix_f32

; GFX906: v_mad_f32
; GFX906: v_mac_f32_e32

; GFX906-UNSAFE: v_dot2_f32_f16

; GFX906-CONTRACT: v_dot2_f32_f16

; GFX906-DENORM-CONTRACT: v_dot2_f32_f16
define amdgpu_kernel void @dotproduct_f16_f32(<2 x half> addrspace(1)* %src1,
                                              <2 x half> addrspace(1)* %src2,
                                              float addrspace(1)* nocapture %dst) {
entry:
  %src1.vec = load <2 x half>, <2 x half> addrspace(1)* %src1
  %src2.vec = load <2 x half>, <2 x half> addrspace(1)* %src2

  %src1.el1 = extractelement <2 x half> %src1.vec, i64 0
  %csrc1.el1 = fpext half %src1.el1 to float
  %src2.el1 = extractelement <2 x half> %src2.vec, i64 0
  %csrc2.el1 = fpext half %src2.el1 to float

  %src1.el2 = extractelement <2 x half> %src1.vec, i64 1
  %csrc1.el2 = fpext half %src1.el2 to float
  %src2.el2 = extractelement <2 x half> %src2.vec, i64 1
  %csrc2.el2 = fpext half %src2.el2 to float

  %mul2 = fmul float %csrc1.el2, %csrc2.el2
  %mul1 = fmul float %csrc1.el1, %csrc2.el1
  %acc = load float, float addrspace(1)* %dst, align 4
  %acc1 = fadd float %mul2, %acc
  %acc2 = fadd float %mul1, %acc1
  store float %acc2, float addrspace(1)* %dst, align 4
  ret void
}

; We only want to generate fdot2 if vector element of dot product is converted from f16 to f32
; and the vectors are of type <2 x half>
; Same computation as the previous kernel, except that the operands of the
; second-element multiply are written in the opposite order (src2 first).
; GCN-LABEL: {{^}}dotproduct_diffvecorder
; GFX900: v_mad_mix_f32
; GFX900: v_mad_mix_f32

; GFX906: v_mad_f32
; GFX906: v_mac_f32_e32

; GFX906-UNSAFE: v_dot2_f32_f16

; GFX906-CONTRACT: v_dot2_f32_f16
; GFX906-DENORM-CONTRACT: v_dot2_f32_f16
define amdgpu_kernel void @dotproduct_diffvecorder(<2 x half> addrspace(1)* %src1,
                                                   <2 x half> addrspace(1)* %src2,
                                                   float addrspace(1)* nocapture %dst) {
entry:
  %src1.vec = load <2 x half>, <2 x half> addrspace(1)* %src1
  %src2.vec = load <2 x half>, <2 x half> addrspace(1)* %src2

  %src1.el1 = extractelement <2 x half> %src1.vec, i64 0
  %csrc1.el1 = fpext half %src1.el1 to float
  %src2.el1 = extractelement <2 x half> %src2.vec, i64 0
  %csrc2.el1 = fpext half %src2.el1 to float

  %src1.el2 = extractelement <2 x half> %src1.vec, i64 1
  %csrc1.el2 = fpext half %src1.el2 to float
  %src2.el2 = extractelement <2 x half> %src2.vec, i64 1
  %csrc2.el2 = fpext half %src2.el2 to float

  %mul2 = fmul float %csrc2.el2, %csrc1.el2
  %mul1 = fmul float %csrc1.el1, %csrc2.el1
  %acc = load float, float addrspace(1)* %dst, align 4
  %acc1 = fadd float %mul2, %acc
  %acc2 = fadd float %mul1, %acc1
  store float %acc2, float addrspace(1)* %dst, align 4
  ret void
}

; Tests to make sure dot product is not generated when the vectors are not of <2 x half>.
; The inputs are <4 x half>; only elements 0 and 1 are used.
; GCN-LABEL: {{^}}dotproduct_v4f16
; GFX900: v_mad_mix_f32

; GFX906: v_mad_f32
; GFX906: v_mac_f32_e32

; GFX906-UNSAFE: v_fma_mix_f32

; GFX906-CONTRACT: v_fma_mix_f32
; GFX906-DENORM-CONTRACT: v_fma_mix_f32
define amdgpu_kernel void @dotproduct_v4f16(<4 x half> addrspace(1)* %src1,
                                            <4 x half> addrspace(1)* %src2,
                                            float addrspace(1)* nocapture %dst) {
entry:
  %src1.vec = load <4 x half>, <4 x half> addrspace(1)* %src1
  %src2.vec = load <4 x half>, <4 x half> addrspace(1)* %src2

  %src1.el1 = extractelement <4 x half> %src1.vec, i64 0
  %csrc1.el1 = fpext half %src1.el1 to float
  %src2.el1 = extractelement <4 x half> %src2.vec, i64 0
  %csrc2.el1 = fpext half %src2.el1 to float

  %src1.el2 = extractelement <4 x half> %src1.vec, i64 1
  %csrc1.el2 = fpext half %src1.el2 to float
  %src2.el2 = extractelement <4 x half> %src2.vec, i64 1
  %csrc2.el2 = fpext half %src2.el2 to float

  %mul2 = fmul float %csrc1.el2, %csrc2.el2
  %mul1 = fmul float %csrc1.el1, %csrc2.el1
  %acc = load float, float addrspace(1)* %dst, align 4
  %acc1 = fadd float %mul2, %acc
  %acc2 = fadd float %mul1, %acc1
  store float %acc2, float addrspace(1)* %dst, align 4
  ret void
}

; Each multiply below takes both operands from the same source vector
; (src1[1] * src1[0] and src2[0] * src2[1]), so the expression is not a
; dot product of src1 and src2.
; GCN-LABEL: {{^}}NotAdotproduct
; GFX900: v_mad_mix_f32
; GFX900: v_mad_mix_f32

; GFX906: v_mad_f32
; GFX906: v_mac_f32_e32

; GFX906-UNSAFE: v_fma_mix_f32

; GFX906-CONTRACT: v_fma_mix_f32
; GFX906-DENORM-CONTRACT: v_fma_mix_f32
define amdgpu_kernel void @NotAdotproduct(<2 x half> addrspace(1)* %src1,
                                          <2 x half> addrspace(1)* %src2,
                                          float addrspace(1)* nocapture %dst) {
entry:
  %src1.vec = load <2 x half>, <2 x half> addrspace(1)* %src1
  %src2.vec = load <2 x half>, <2 x half> addrspace(1)* %src2

  %src1.el1 = extractelement <2 x half> %src1.vec, i64 0
  %csrc1.el1 = fpext half %src1.el1 to float
  %src2.el1 = extractelement <2 x half> %src2.vec, i64 0
  %csrc2.el1 = fpext half %src2.el1 to float

  %src1.el2 = extractelement <2 x half> %src1.vec, i64 1
  %csrc1.el2 = fpext half %src1.el2 to float
  %src2.el2 = extractelement <2 x half> %src2.vec, i64 1
  %csrc2.el2 = fpext half %src2.el2 to float

  %mul2 = fmul float %csrc1.el2, %csrc1.el1
  %mul1 = fmul float %csrc2.el1, %csrc2.el2
  %acc = load float, float addrspace(1)* %dst, align 4
  %acc1 = fadd float %mul2, %acc
  %acc2 = fadd float %mul1, %acc1
  store float %acc2, float addrspace(1)* %dst, align 4
  ret void
}

; Each multiply below pairs elements at different indices
; (src1[1] * src2[0] and src1[0] * src2[1]), so the expression is not a
; dot product of src1 and src2.
; GCN-LABEL: {{^}}Diff_Idx_NotAdotproduct
; GFX900: v_mad_mix_f32
; GFX900: v_mad_mix_f32

; GFX906: v_mad_f32
; GFX906: v_mac_f32_e32

; GFX906-UNSAFE: v_fma_mix_f32

; GFX906-CONTRACT: v_fma_mix_f32
; GFX906-DENORM-CONTRACT: v_fma_mix_f32
define amdgpu_kernel void @Diff_Idx_NotAdotproduct(<2 x half> addrspace(1)* %src1,
                                                   <2 x half> addrspace(1)* %src2,
                                                   float addrspace(1)* nocapture %dst) {
entry:
  %src1.vec = load <2 x half>, <2 x half> addrspace(1)* %src1
  %src2.vec = load <2 x half>, <2 x half> addrspace(1)* %src2

  %src1.el1 = extractelement <2 x half> %src1.vec, i64 0
  %csrc1.el1 = fpext half %src1.el1 to float
  %src2.el1 = extractelement <2 x half> %src2.vec, i64 0
  %csrc2.el1 = fpext half %src2.el1 to float

  %src1.el2 = extractelement <2 x half> %src1.vec, i64 1
  %csrc1.el2 = fpext half %src1.el2 to float
  %src2.el2 = extractelement <2 x half> %src2.vec, i64 1
  %csrc2.el2 = fpext half %src2.el2 to float

  %mul2 = fmul float %csrc1.el2, %csrc2.el1
  %mul1 = fmul float %csrc1.el1, %csrc2.el2
  %acc = load float, float addrspace(1)* %dst, align 4
  %acc1 = fadd float %mul2, %acc
  %acc2 = fadd float %mul1, %acc1
  store float %acc2, float addrspace(1)* %dst, align 4
  ret void
}