; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX8 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-NODL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-DL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-DL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-DL %s

; add(mul(S0.x, S1.x),
;     add(mul(S0.y, S1.y), S3)) -> v_dot2_{I|U}32_{I|U}16(S0, S1, S3)

; Unsigned case: both lanes of both vectors are zero-extended and the sum is
; formed as add(add(mul2, s3), mul1). The gfx906 and gfx101x check blocks expect
; a single v_dot2_u32_u16; the other targets expect two v_mad_u32_u24.
define amdgpu_kernel void @udot2(<2 x i16> addrspace(1)* %src1,
; GFX7-LABEL: udot2:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 s8, 0xffff
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_lshr_b32 s6, s4, 16
; GFX7-NEXT:    s_lshr_b32 s7, s5, 16
; GFX7-NEXT:    s_and_b32 s4, s4, s8
; GFX7-NEXT:    s_and_b32 s5, s5, s8
; GFX7-NEXT:    s_load_dword s8, s[0:1], 0x0
; GFX7-NEXT:    v_mov_b32_e32 v0, s6
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v1, s8
; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v0, v1
; GFX7-NEXT:    v_mov_b32_e32 v1, s4
; GFX7-NEXT:    v_mad_u32_u24 v0, s5, v1, v0
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: udot2:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT:    s_mov_b32 s2, 0xffff
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_load_dword s3, s[4:5], 0x0
; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x0
; GFX8-NEXT:    s_load_dword s5, s[0:1], 0x0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_and_b32 s6, s3, s2
; GFX8-NEXT:    s_lshr_b32 s3, s3, 16
; GFX8-NEXT:    s_and_b32 s2, s4, s2
; GFX8-NEXT:    s_lshr_b32 s4, s4, 16
; GFX8-NEXT:    v_mov_b32_e32 v0, s5
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
; GFX8-NEXT:    v_mov_b32_e32 v1, s6
; GFX8-NEXT:    v_mad_u32_u24 v2, s2, v1, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-NODL-LABEL: udot2:
; GFX9-NODL:       ; %bb.0: ; %entry
; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT:    s_mov_b32 s2, 0xffff
; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT:    s_load_dword s3, s[4:5], 0x0
; GFX9-NODL-NEXT:    s_load_dword s4, s[6:7], 0x0
; GFX9-NODL-NEXT:    s_load_dword s5, s[0:1], 0x0
; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT:    s_and_b32 s6, s3, s2
; GFX9-NODL-NEXT:    s_lshr_b32 s3, s3, 16
; GFX9-NODL-NEXT:    s_and_b32 s2, s4, s2
; GFX9-NODL-NEXT:    s_lshr_b32 s4, s4, 16
; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s5
; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s4, v1, v2
; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s6
; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s2, v2, v1
; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NODL-NEXT:    s_endpgm
;
; GFX9-DL-LABEL: udot2:
; GFX9-DL:       ; %bb.0: ; %entry
; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX9-DL-NEXT:    s_load_dword s3, s[0:1], 0x0
; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
; GFX9-DL-NEXT:    v_dot2_u32_u16 v1, s4, v1, v2
; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-DL-NEXT:    s_endpgm
;
; GFX10-DL-LABEL: udot2:
; GFX10-DL:       ; %bb.0: ; %entry
; GFX10-DL-NEXT:    s_clause 0x1
; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
; GFX10-DL-NEXT:    v_dot2_u32_u16 v0, s1, s0, v0
; GFX10-DL-NEXT:    global_store_dword v1, v0, s[4:5]
; GFX10-DL-NEXT:    s_endpgm
                                 <2 x i16> addrspace(1)* %src2,
                                 i32 addrspace(1)* nocapture %dst) {
entry:
  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2

  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
  %conv = zext i16 %s1.elt1 to i32
  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
  %conv2 = zext i16 %s2.elt1 to i32
  %mul1 = mul nuw i32 %conv2, %conv

  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
  %conv3 = zext i16 %s1.elt2 to i32
  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
  %conv4 = zext i16 %s2.elt2 to i32
  %mul2 = mul nuw i32 %conv4, %conv3

  %s3 = load i32, i32 addrspace(1)* %dst, align 4
  %add = add i32 %mul2, %s3
  %add6 = add i32 %add, %mul1
  store i32 %add6, i32 addrspace(1)* %dst, align 4
  ret void
}

; TODO: Support this pattern
;      add(add(mul(S0.y, S1.y), mul(S0.x, S1.x)),
;          S3) -> v_dot2_{I|U}32_{I|U}16(S0, S1, S3)
;
; Here the two products are added to each other first and the accumulator is
; added last. Every check block below expects a 24-bit multiply, a 24-bit mad
; and a separate add; none expects a v_dot2 instruction.
define amdgpu_kernel void @udot2_MulMul(<2 x i16> addrspace(1)* %src1,
; GFX7-LABEL: udot2_MulMul:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 s8, 0xffff
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_lshr_b32 s6, s4, 16
; GFX7-NEXT:    s_and_b32 s4, s4, s8
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX7-NEXT:    s_lshr_b32 s7, s5, 16
; GFX7-NEXT:    s_and_b32 s5, s5, s8
; GFX7-NEXT:    v_mul_u32_u24_e32 v0, s5, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, s6
; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v1, v0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: udot2_MulMul:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT:    s_mov_b32 s2, 0xffff
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_load_dword s3, s[4:5], 0x0
; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x0
; GFX8-NEXT:    s_load_dword s5, s[0:1], 0x0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_and_b32 s6, s3, s2
; GFX8-NEXT:    s_and_b32 s2, s4, s2
; GFX8-NEXT:    v_mov_b32_e32 v0, s6
; GFX8-NEXT:    s_lshr_b32 s3, s3, 16
; GFX8-NEXT:    s_lshr_b32 s4, s4, 16
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_mul_u32_u24_e32 v0, s2, v0
; GFX8-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s5, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-NODL-LABEL: udot2_MulMul:
; GFX9-NODL:       ; %bb.0: ; %entry
; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT:    s_mov_b32 s2, 0xffff
; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT:    s_load_dword s3, s[4:5], 0x0
; GFX9-NODL-NEXT:    s_load_dword s4, s[6:7], 0x0
; GFX9-NODL-NEXT:    s_load_dword s5, s[0:1], 0x0
; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT:    s_and_b32 s6, s3, s2
; GFX9-NODL-NEXT:    s_and_b32 s2, s4, s2
; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s6
; GFX9-NODL-NEXT:    s_lshr_b32 s3, s3, 16
; GFX9-NODL-NEXT:    s_lshr_b32 s4, s4, 16
; GFX9-NODL-NEXT:    v_mul_u32_u24_e32 v1, s2, v1
; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s3
; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s4, v2, v1
; GFX9-NODL-NEXT:    v_add_u32_e32 v1, s5, v1
; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NODL-NEXT:    s_endpgm
;
; GFX9-DL-LABEL: udot2_MulMul:
; GFX9-DL:       ; %bb.0: ; %entry
; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT:    s_mov_b32 s2, 0xffff
; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
; GFX9-DL-NEXT:    s_load_dword s5, s[0:1], 0x0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    s_and_b32 s6, s3, s2
; GFX9-DL-NEXT:    s_and_b32 s2, s4, s2
; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s6
; GFX9-DL-NEXT:    s_lshr_b32 s3, s3, 16
; GFX9-DL-NEXT:    s_lshr_b32 s4, s4, 16
; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v1, s2, v1
; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s4, v2, v1
; GFX9-DL-NEXT:    v_add_u32_e32 v1, s5, v1
; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-DL-NEXT:    s_endpgm
;
; GFX10-DL-LABEL: udot2_MulMul:
; GFX10-DL:       ; %bb.0: ; %entry
; GFX10-DL-NEXT:    s_clause 0x1
; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX10-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
; GFX10-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX10-DL-NEXT:    s_mov_b32 s5, 0xffff
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    s_and_b32 s6, s2, s5
; GFX10-DL-NEXT:    s_and_b32 s5, s3, s5
; GFX10-DL-NEXT:    s_lshr_b32 s2, s2, 16
; GFX10-DL-NEXT:    v_mul_u32_u24_e64 v0, s5, s6
; GFX10-DL-NEXT:    s_lshr_b32 s3, s3, 16
; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s3, s2, v0
; GFX10-DL-NEXT:    v_add_nc_u32_e32 v0, s4, v0
; GFX10-DL-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-DL-NEXT:    s_endpgm
                                        <2 x i16> addrspace(1)* %src2,
                                        i32 addrspace(1)* nocapture %dst) {
entry:
  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2

  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
  %conv = zext i16 %s1.elt1 to i32
  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
  %conv2 = zext i16 %s2.elt1 to i32
  %mul1 = mul nuw i32 %conv2, %conv

  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
  %conv3 = zext i16 %s1.elt2 to i32
  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
  %conv4 = zext i16 %s2.elt2 to i32
  %mul2 = mul nuw i32 %conv4, %conv3
  %s3 = load i32, i32 addrspace(1)* %dst, align 4
  %add = add i32 %mul2, %mul1
  %add6 = add i32 %add, %s3
  store i32 %add6, i32 addrspace(1)* %dst, align 4
  ret void
}

; Signed case: same shape as @udot2 but every lane is sign-extended. The gfx906
; and gfx101x check blocks expect a single v_dot2_i32_i16; the other targets
; expect two v_mad_i32_i24.
define amdgpu_kernel void @idot2(<2 x i16> addrspace(1)* %src1,
; GFX7-LABEL: idot2:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT:    s_load_dword s6, s[0:1], 0x0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_sext_i32_i16 s7, s4
; GFX7-NEXT:    s_ashr_i32 s4, s4, 16
; GFX7-NEXT:    s_sext_i32_i16 s8, s5
; GFX7-NEXT:    s_ashr_i32 s5, s5, 16
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    v_mov_b32_e32 v1, s6
; GFX7-NEXT:    v_mad_i32_i24 v0, s5, v0, v1
; GFX7-NEXT:    v_mov_b32_e32 v1, s7
; GFX7-NEXT:    v_mad_i32_i24 v0, s8, v1, v0
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: idot2:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX8-NEXT:    s_load_dword s3, s[6:7], 0x0
; GFX8-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_sext_i32_i16 s5, s2
; GFX8-NEXT:    s_ashr_i32 s2, s2, 16
; GFX8-NEXT:    s_sext_i32_i16 s6, s3
; GFX8-NEXT:    s_ashr_i32 s3, s3, 16
; GFX8-NEXT:    v_mov_b32_e32 v0, s4
; GFX8-NEXT:    v_mov_b32_e32 v1, s2
; GFX8-NEXT:    v_mov_b32_e32 v2, s5
; GFX8-NEXT:    v_mad_i32_i24 v0, s3, v1, v0
; GFX8-NEXT:    v_mad_i32_i24 v2, s6, v2, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-NODL-LABEL: idot2:
; GFX9-NODL:       ; %bb.0: ; %entry
; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX9-NODL-NEXT:    s_load_dword s3, s[6:7], 0x0
; GFX9-NODL-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT:    s_sext_i32_i16 s5, s2
; GFX9-NODL-NEXT:    s_ashr_i32 s2, s2, 16
; GFX9-NODL-NEXT:    s_sext_i32_i16 s6, s3
; GFX9-NODL-NEXT:    s_ashr_i32 s3, s3, 16
; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s4
; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s3, v2, v1
; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s5
; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s6, v2, v1
; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NODL-NEXT:    s_endpgm
;
; GFX9-DL-LABEL: idot2:
; GFX9-DL:       ; %bb.0: ; %entry
; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX9-DL-NEXT:    s_load_dword s3, s[0:1], 0x0
; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
; GFX9-DL-NEXT:    v_dot2_i32_i16 v1, s4, v1, v2
; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-DL-NEXT:    s_endpgm
;
; GFX10-DL-LABEL: idot2:
; GFX10-DL:       ; %bb.0: ; %entry
; GFX10-DL-NEXT:    s_clause 0x1
; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
; GFX10-DL-NEXT:    v_dot2_i32_i16 v0, s1, s0, v0
; GFX10-DL-NEXT:    global_store_dword v1, v0, s[4:5]
; GFX10-DL-NEXT:    s_endpgm
                                 <2 x i16> addrspace(1)* %src2,
                                 i32 addrspace(1)* nocapture %dst) {
entry:
  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2

  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
  %conv = sext i16 %s1.elt1 to i32
  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
  %conv2 = sext i16 %s2.elt1 to i32
  %mul1 = mul nuw i32 %conv2, %conv

  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
  %conv3 = sext i16 %s1.elt2 to i32
  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
  %conv4 = sext i16 %s2.elt2 to i32
  %mul2 = mul nuw i32 %conv4, %conv3

  %s3 = load i32, i32 addrspace(1)* %dst, align 4
  %add = add i32 %mul2, %s3
  %add6 = add i32 %add, %mul1
  store i32 %add6, i32 addrspace(1)* %dst, align 4
  ret void
}

; Mixed multiplies: the lane-0 product uses sign-extended operands while the
; lane-1 product uses zero-extended operands. Every check block expects a
; v_mad_u32_u24 followed by a v_mad_i32_i24; none expects a v_dot2 instruction.
define amdgpu_kernel void @idot2_MixedTypedMul(<2 x i16> addrspace(1)* %src1,
; GFX7-LABEL: idot2_MixedTypedMul:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT:    s_load_dword s6, s[0:1], 0x0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_lshr_b32 s7, s4, 16
; GFX7-NEXT:    s_lshr_b32 s8, s5, 16
; GFX7-NEXT:    s_sext_i32_i16 s4, s4
; GFX7-NEXT:    v_mov_b32_e32 v0, s7
; GFX7-NEXT:    v_mov_b32_e32 v1, s6
; GFX7-NEXT:    v_mad_u32_u24 v0, s8, v0, v1
; GFX7-NEXT:    s_sext_i32_i16 s5, s5
; GFX7-NEXT:    v_mov_b32_e32 v1, s4
; GFX7-NEXT:    v_mad_i32_i24 v0, s5, v1, v0
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: idot2_MixedTypedMul:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX8-NEXT:    s_load_dword s3, s[6:7], 0x0
; GFX8-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_sext_i32_i16 s5, s2
; GFX8-NEXT:    s_lshr_b32 s2, s2, 16
; GFX8-NEXT:    s_sext_i32_i16 s6, s3
; GFX8-NEXT:    s_lshr_b32 s3, s3, 16
; GFX8-NEXT:    v_mov_b32_e32 v0, s4
; GFX8-NEXT:    v_mov_b32_e32 v1, s2
; GFX8-NEXT:    v_mov_b32_e32 v2, s5
; GFX8-NEXT:    v_mad_u32_u24 v0, s3, v1, v0
; GFX8-NEXT:    v_mad_i32_i24 v2, s6, v2, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-NODL-LABEL: idot2_MixedTypedMul:
; GFX9-NODL:       ; %bb.0: ; %entry
; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX9-NODL-NEXT:    s_load_dword s3, s[6:7], 0x0
; GFX9-NODL-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT:    s_sext_i32_i16 s5, s2
; GFX9-NODL-NEXT:    s_lshr_b32 s2, s2, 16
; GFX9-NODL-NEXT:    s_sext_i32_i16 s6, s3
; GFX9-NODL-NEXT:    s_lshr_b32 s3, s3, 16
; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s4
; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s3, v2, v1
; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s5
; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s6, v2, v1
; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NODL-NEXT:    s_endpgm
;
; GFX9-DL-LABEL: idot2_MixedTypedMul:
; GFX9-DL:       ; %bb.0: ; %entry
; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    s_sext_i32_i16 s5, s2
; GFX9-DL-NEXT:    s_lshr_b32 s2, s2, 16
; GFX9-DL-NEXT:    s_sext_i32_i16 s6, s3
; GFX9-DL-NEXT:    s_lshr_b32 s3, s3, 16
; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s4
; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s3, v2, v1
; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s5
; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s6, v2, v1
; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-DL-NEXT:    s_endpgm
;
; GFX10-DL-LABEL: idot2_MixedTypedMul:
; GFX10-DL:       ; %bb.0: ; %entry
; GFX10-DL-NEXT:    s_clause 0x1
; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
; GFX10-DL-NEXT:    s_lshr_b32 s2, s0, 16
; GFX10-DL-NEXT:    s_lshr_b32 s3, s1, 16
; GFX10-DL-NEXT:    s_sext_i32_i16 s0, s0
; GFX10-DL-NEXT:    s_sext_i32_i16 s1, s1
; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s3, s2, v0
; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s1, s0, v0
; GFX10-DL-NEXT:    global_store_dword v1, v0, s[4:5]
; GFX10-DL-NEXT:    s_endpgm
                                               <2 x i16> addrspace(1)* %src2,
                                               i32 addrspace(1)* nocapture %dst) {
entry:
  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2

  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
  %conv = sext i16 %s1.elt1 to i32
  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
  %conv2 = sext i16 %s2.elt1 to i32
  %mul1 = mul nuw i32 %conv2, %conv

  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
  %conv3 = zext i16 %s1.elt2 to i32
  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
  %conv4 = zext i16 %s2.elt2 to i32
  %mul2 = mul nuw i32 %conv4, %conv3

  %s3 = load i32, i32 addrspace(1)* %dst, align 4
  %add = add i32 %mul2, %s3
  %add6 = add i32 %add, %mul1
  store i32 %add6, i32 addrspace(1)* %dst, align 4
  ret void
}

; Same computation as @udot2 with the operands of both adds commuted
; (add(s3, mul2) and add(mul1, add)). The gfx906 and gfx101x check blocks still
; expect a single v_dot2_u32_u16.
define amdgpu_kernel void @udot2_alt_AddOperands(<2 x i16> addrspace(1)* %src1,
; GFX7-LABEL: udot2_alt_AddOperands:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 s8, 0xffff
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_lshr_b32 s6, s4, 16
; GFX7-NEXT:    s_lshr_b32 s7, s5, 16
; GFX7-NEXT:    s_and_b32 s4, s4, s8
; GFX7-NEXT:    s_and_b32 s5, s5, s8
; GFX7-NEXT:    s_load_dword s8, s[0:1], 0x0
; GFX7-NEXT:    v_mov_b32_e32 v0, s6
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v1, s8
; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v0, v1
; GFX7-NEXT:    v_mov_b32_e32 v1, s4
; GFX7-NEXT:    v_mad_u32_u24 v0, s5, v1, v0
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: udot2_alt_AddOperands:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT:    s_mov_b32 s2, 0xffff
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_load_dword s3, s[4:5], 0x0
; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x0
; GFX8-NEXT:    s_load_dword s5, s[0:1], 0x0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_and_b32 s6, s3, s2
; GFX8-NEXT:    s_lshr_b32 s3, s3, 16
; GFX8-NEXT:    s_and_b32 s2, s4, s2
; GFX8-NEXT:    s_lshr_b32 s4, s4, 16
; GFX8-NEXT:    v_mov_b32_e32 v0, s5
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
; GFX8-NEXT:    v_mov_b32_e32 v1, s6
; GFX8-NEXT:    v_mad_u32_u24 v2, s2, v1, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-NODL-LABEL: udot2_alt_AddOperands:
; GFX9-NODL:       ; %bb.0: ; %entry
; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT:    s_mov_b32 s2, 0xffff
; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT:    s_load_dword s3, s[4:5], 0x0
; GFX9-NODL-NEXT:    s_load_dword s4, s[6:7], 0x0
; GFX9-NODL-NEXT:    s_load_dword s5, s[0:1], 0x0
; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT:    s_and_b32 s6, s3, s2
; GFX9-NODL-NEXT:    s_lshr_b32 s3, s3, 16
; GFX9-NODL-NEXT:    s_and_b32 s2, s4, s2
; GFX9-NODL-NEXT:    s_lshr_b32 s4, s4, 16
; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s5
; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s4, v1, v2
; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s6
; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s2, v2, v1
; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NODL-NEXT:    s_endpgm
;
; GFX9-DL-LABEL: udot2_alt_AddOperands:
; GFX9-DL:       ; %bb.0: ; %entry
; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX9-DL-NEXT:    s_load_dword s3, s[0:1], 0x0
; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
; GFX9-DL-NEXT:    v_dot2_u32_u16 v1, s4, v1, v2
; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-DL-NEXT:    s_endpgm
;
; GFX10-DL-LABEL: udot2_alt_AddOperands:
; GFX10-DL:       ; %bb.0: ; %entry
; GFX10-DL-NEXT:    s_clause 0x1
; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
; GFX10-DL-NEXT:    v_dot2_u32_u16 v0, s1, s0, v0
; GFX10-DL-NEXT:    global_store_dword v1, v0, s[4:5]
; GFX10-DL-NEXT:    s_endpgm
                                                 <2 x i16> addrspace(1)* %src2,
                                                 i32 addrspace(1)* nocapture %dst) {
entry:
  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2

  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
  %conv = zext i16 %s1.elt1 to i32
  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
  %conv2 = zext i16 %s2.elt1 to i32
  %mul1 = mul nuw i32 %conv2, %conv

  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
  %conv3 = zext i16 %s1.elt2 to i32
  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
  %conv4 = zext i16 %s2.elt2 to i32
  %mul2 = mul nuw i32 %conv4, %conv3

  %s3 = load i32, i32 addrspace(1)* %dst, align 4
  %add = add i32 %s3, %mul2
  %add6 = add i32 %mul1, %add
  store i32 %add6, i32 addrspace(1)* %dst, align 4
  ret void
}

; Mixed extensions inside one multiply: the lane-0 product multiplies a
; sign-extended element of %vec1 by a zero-extended element of %vec2, while the
; lane-1 product sign-extends both. Every check block expects two v_mad_i32_i24;
; none expects a v_dot2 instruction.
define amdgpu_kernel void @idot2_MixedExt(<2 x i16> addrspace(1)* %src1,
; GFX7-LABEL: idot2_MixedExt:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT:    s_load_dword s6, s[0:1], 0x0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_sext_i32_i16 s7, s4
; GFX7-NEXT:    s_ashr_i32 s4, s4, 16
; GFX7-NEXT:    s_and_b32 s8, s5, 0xffff
; GFX7-NEXT:    s_ashr_i32 s5, s5, 16
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    v_mov_b32_e32 v1, s6
; GFX7-NEXT:    v_mad_i32_i24 v0, s5, v0, v1
; GFX7-NEXT:    v_mov_b32_e32 v1, s7
; GFX7-NEXT:    v_mad_i32_i24 v0, s8, v1, v0
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: idot2_MixedExt:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX8-NEXT:    s_load_dword s3, s[6:7], 0x0
; GFX8-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_sext_i32_i16 s5, s2
; GFX8-NEXT:    s_ashr_i32 s2, s2, 16
; GFX8-NEXT:    s_and_b32 s6, s3, 0xffff
; GFX8-NEXT:    s_ashr_i32 s3, s3, 16
; GFX8-NEXT:    v_mov_b32_e32 v0, s4
; GFX8-NEXT:    v_mov_b32_e32 v1, s2
; GFX8-NEXT:    v_mov_b32_e32 v2, s5
; GFX8-NEXT:    v_mad_i32_i24 v0, s3, v1, v0
; GFX8-NEXT:    v_mad_i32_i24 v2, s6, v2, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-NODL-LABEL: idot2_MixedExt:
; GFX9-NODL:       ; %bb.0: ; %entry
; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX9-NODL-NEXT:    s_load_dword s3, s[6:7], 0x0
; GFX9-NODL-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT:    s_sext_i32_i16 s5, s2
; GFX9-NODL-NEXT:    s_ashr_i32 s2, s2, 16
; GFX9-NODL-NEXT:    s_and_b32 s6, s3, 0xffff
; GFX9-NODL-NEXT:    s_ashr_i32 s3, s3, 16
; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s4
; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s3, v2, v1
; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s5
; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s6, v2, v1
; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NODL-NEXT:    s_endpgm
;
; GFX9-DL-LABEL: idot2_MixedExt:
; GFX9-DL:       ; %bb.0: ; %entry
; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    s_sext_i32_i16 s5, s2
; GFX9-DL-NEXT:    s_ashr_i32 s2, s2, 16
; GFX9-DL-NEXT:    s_and_b32 s6, s3, 0xffff
; GFX9-DL-NEXT:    s_ashr_i32 s3, s3, 16
; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s4
; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s3, v2, v1
; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s5
; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s6, v2, v1
; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-DL-NEXT:    s_endpgm
;
; GFX10-DL-LABEL: idot2_MixedExt:
; GFX10-DL:       ; %bb.0: ; %entry
; GFX10-DL-NEXT:    s_clause 0x1
; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
; GFX10-DL-NEXT:    s_ashr_i32 s2, s0, 16
; GFX10-DL-NEXT:    s_ashr_i32 s3, s1, 16
; GFX10-DL-NEXT:    s_sext_i32_i16 s0, s0
; GFX10-DL-NEXT:    s_and_b32 s1, s1, 0xffff
; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s3, s2, v0
; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s1, s0, v0
; GFX10-DL-NEXT:    global_store_dword v1, v0, s[4:5]
; GFX10-DL-NEXT:    s_endpgm
                                          <2 x i16> addrspace(1)* %src2,
                                          i32 addrspace(1)* nocapture %dst) {
entry:
  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2

  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
  %conv = sext i16 %s1.elt1 to i32
  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
  %conv2 = zext i16 %s2.elt1 to i32
  %mul1 = mul nuw i32 %conv2, %conv

  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
  %conv3 = sext i16 %s1.elt2 to i32
  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
  %conv4 = sext i16 %s2.elt2 to i32
  %mul2 = mul nuw i32 %conv4, %conv3

  %s3 = load i32, i32 addrspace(1)* %dst, align 4
  %add = add i32 %mul2, %s3
  %add6 = add i32 %add, %mul1
  store i32 %add6, i32 addrspace(1)* %dst, align 4
  ret void
}

define amdgpu_kernel void @notudot2_SameVec(<2 x i16> addrspace(1)* %src1,
; GFX7-LABEL: notudot2_SameVec:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x0
; GFX7-NEXT:    s_load_dword s7, s[0:1], 0x0
; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_lshr_b32 s5, s6, 16
; GFX7-NEXT:    v_mov_b32_e32 v0, s7
; GFX7-NEXT:    s_and_b32 s4, s4, 0xffff
; GFX7-NEXT:    v_mad_u32_u24 v0, s5, s5, v0
; GFX7-NEXT:    v_mad_u32_u24 v0, s4, s4, v0
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: notudot2_SameVec:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x0
; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
; GFX8-NEXT:    s_load_dword s4, s[4:5], 0x0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_lshr_b32 s2, s2, 16
; GFX8-NEXT:    v_mov_b32_e32 v0, s3
; GFX8-NEXT:    s_and_b32 s4, s4, 0xffff
; GFX8-NEXT:    v_mad_u32_u24 v0, s2, s2, v0
; GFX8-NEXT:    v_mad_u32_u24 v2, s4, s4, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-NODL-LABEL: notudot2_SameVec:
; GFX9-NODL:       ; %bb.0: ; %entry
; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT:    s_load_dword s2, s[6:7], 0x0
; GFX9-NODL-NEXT:    s_load_dword s3, s[0:1], 0x0
; GFX9-NODL-NEXT:    s_load_dword s4, s[4:5], 0x0
; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT:    s_lshr_b32 s2, s2, 16
; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NODL-NEXT:    s_and_b32 s4, s4, 0xffff
; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s2, s2, v1
; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s4, s4, v1
; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NODL-NEXT:    s_endpgm
;
; GFX9-DL-LABEL: notudot2_SameVec:
; GFX9-DL:       ; %bb.0: ; %entry
; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
; GFX9-DL-NEXT:    s_load_dword s3, s[0:1], 0x0
; GFX9-DL-NEXT:    s_load_dword s4, s[4:5], 0x0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    s_lshr_b32 s2, s2, 16
; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-DL-NEXT:    s_and_b32 s4, s4, 0xffff
; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s2, s2, v1
; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s4, s4, v1
; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-DL-NEXT:    s_endpgm
;
; GFX10-DL-LABEL: notudot2_SameVec:
; GFX10-DL:       ; %bb.0: ; %entry
; GFX10-DL-NEXT:    s_clause 0x1
; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
; GFX10-DL-NEXT:    s_load_dword s3, s[0:1], 0x0
; GFX10-DL-NEXT:    s_load_dword s4, s[4:5],
0x0 896; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 897; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 16 898; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s2, s3 899; GFX10-DL-NEXT: s_and_b32 s2, s4, 0xffff 900; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s2, v0 901; GFX10-DL-NEXT: global_store_dword v1, v0, s[0:1] 902; GFX10-DL-NEXT: s_endpgm 903 <2 x i16> addrspace(1)* %src2, 904 i32 addrspace(1)* nocapture %dst) { 905entry: 906 %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1 907 %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2 908 909 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0 910 %conv = zext i16 %s1.elt1 to i32 911 %s2.elt1 = extractelement <2 x i16> %vec1, i64 0 912 %conv2 = zext i16 %s2.elt1 to i32 913 %mul1 = mul i32 %conv2, %conv 914 915 %s1.elt2 = extractelement <2 x i16> %vec2, i64 1 916 %conv3 = zext i16 %s1.elt2 to i32 917 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1 918 %conv4 = zext i16 %s2.elt2 to i32 919 %mul2 = mul i32 %conv4, %conv3 920 921 %s3 = load i32, i32 addrspace(1)* %dst, align 4 922 %add = add i32 %mul2, %s3 923 %add6 = add i32 %add, %mul1 924 store i32 %add6, i32 addrspace(1)* %dst, align 4 925 ret void 926} 927 928define amdgpu_kernel void @udot2_v4i16(<4 x i16> addrspace(1)* %src1, 929; GFX7-LABEL: udot2_v4i16: 930; GFX7: ; %bb.0: ; %entry 931; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 932; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 933; GFX7-NEXT: s_mov_b32 s8, 0xffff 934; GFX7-NEXT: s_mov_b32 s3, 0xf000 935; GFX7-NEXT: s_mov_b32 s2, -1 936; GFX7-NEXT: s_waitcnt lgkmcnt(0) 937; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 938; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 939; GFX7-NEXT: s_waitcnt lgkmcnt(0) 940; GFX7-NEXT: s_and_b32 s6, s4, s8 941; GFX7-NEXT: s_and_b32 s7, s5, s8 942; GFX7-NEXT: s_load_dword s8, s[0:1], 0x0 943; GFX7-NEXT: s_lshr_b32 s4, s4, 16 944; GFX7-NEXT: s_lshr_b32 s5, s5, 16 945; GFX7-NEXT: v_mov_b32_e32 v0, s4 946; GFX7-NEXT: s_waitcnt lgkmcnt(0) 947; GFX7-NEXT: v_mov_b32_e32 v1, s8 948; GFX7-NEXT: v_mad_u32_u24 v0, s5, v0, 
v1 949; GFX7-NEXT: v_mov_b32_e32 v1, s6 950; GFX7-NEXT: v_mad_u32_u24 v0, s7, v1, v0 951; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 952; GFX7-NEXT: s_endpgm 953; 954; GFX8-LABEL: udot2_v4i16: 955; GFX8: ; %bb.0: ; %entry 956; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 957; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 958; GFX8-NEXT: s_mov_b32 s2, 0xffff 959; GFX8-NEXT: s_waitcnt lgkmcnt(0) 960; GFX8-NEXT: s_load_dword s3, s[4:5], 0x0 961; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 962; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0 963; GFX8-NEXT: s_waitcnt lgkmcnt(0) 964; GFX8-NEXT: s_and_b32 s6, s3, s2 965; GFX8-NEXT: s_lshr_b32 s3, s3, 16 966; GFX8-NEXT: s_and_b32 s2, s4, s2 967; GFX8-NEXT: s_lshr_b32 s4, s4, 16 968; GFX8-NEXT: v_mov_b32_e32 v0, s5 969; GFX8-NEXT: v_mov_b32_e32 v1, s3 970; GFX8-NEXT: v_mad_u32_u24 v0, s4, v1, v0 971; GFX8-NEXT: v_mov_b32_e32 v1, s6 972; GFX8-NEXT: v_mad_u32_u24 v2, s2, v1, v0 973; GFX8-NEXT: v_mov_b32_e32 v0, s0 974; GFX8-NEXT: v_mov_b32_e32 v1, s1 975; GFX8-NEXT: flat_store_dword v[0:1], v2 976; GFX8-NEXT: s_endpgm 977; 978; GFX9-NODL-LABEL: udot2_v4i16: 979; GFX9-NODL: ; %bb.0: ; %entry 980; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 981; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 982; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff 983; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 984; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 985; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 986; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 987; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0 988; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 989; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2 990; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 991; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 992; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 993; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 994; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 995; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v1, v2 996; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6 997; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v2, v1 998; GFX9-NODL-NEXT: 
global_store_dword v0, v1, s[0:1] 999; GFX9-NODL-NEXT: s_endpgm 1000; 1001; GFX9-DL-LABEL: udot2_v4i16: 1002; GFX9-DL: ; %bb.0: ; %entry 1003; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1004; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1005; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 1006; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1007; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 1008; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 1009; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 1010; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1011; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 1012; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 1013; GFX9-DL-NEXT: v_dot2_u32_u16 v1, s4, v1, v2 1014; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] 1015; GFX9-DL-NEXT: s_endpgm 1016; 1017; GFX10-DL-LABEL: udot2_v4i16: 1018; GFX10-DL: ; %bb.0: ; %entry 1019; GFX10-DL-NEXT: s_clause 0x1 1020; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 1021; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1022; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 1023; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1024; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 1025; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 1026; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 1027; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1028; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 1029; GFX10-DL-NEXT: v_dot2_u32_u16 v0, s1, s0, v0 1030; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] 1031; GFX10-DL-NEXT: s_endpgm 1032 <4 x i16> addrspace(1)* %src2, 1033 i32 addrspace(1)* nocapture %dst) { 1034entry: 1035 %vec1 = load <4 x i16>, <4 x i16> addrspace(1)* %src1 1036 %vec2 = load <4 x i16>, <4 x i16> addrspace(1)* %src2 1037 1038 %s1.elt1 = extractelement <4 x i16> %vec1, i64 0 1039 %conv = zext i16 %s1.elt1 to i32 1040 %s2.elt1 = extractelement <4 x i16> %vec2, i64 0 1041 %conv2 = zext i16 %s2.elt1 to i32 1042 %mul1 = mul i32 %conv2, %conv 1043 1044 %s1.elt2 = extractelement <4 x i16> %vec1, i64 1 1045 %conv3 = zext i16 %s1.elt2 to i32 1046 %s2.elt2 = extractelement <4 x i16> %vec2, i64 1 1047 
%conv4 = zext i16 %s2.elt2 to i32 1048 %mul2 = mul i32 %conv4, %conv3 1049 1050 %s3 = load i32, i32 addrspace(1)* %dst, align 4 1051 %add = add i32 %mul2, %s3 1052 %add6 = add i32 %add, %mul1 1053 store i32 %add6, i32 addrspace(1)* %dst, align 4 1054 ret void 1055} 1056 1057define amdgpu_kernel void @udot2_v4i16_Hi(<4 x i16> addrspace(1)* %src1, 1058; GFX7-LABEL: udot2_v4i16_Hi: 1059; GFX7: ; %bb.0: ; %entry 1060; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1061; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 1062; GFX7-NEXT: s_mov_b32 s8, 0xffff 1063; GFX7-NEXT: s_mov_b32 s3, 0xf000 1064; GFX7-NEXT: s_mov_b32 s2, -1 1065; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1066; GFX7-NEXT: s_load_dword s4, s[4:5], 0x1 1067; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 1068; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1069; GFX7-NEXT: s_and_b32 s6, s4, s8 1070; GFX7-NEXT: s_and_b32 s7, s5, s8 1071; GFX7-NEXT: s_load_dword s8, s[0:1], 0x0 1072; GFX7-NEXT: s_lshr_b32 s4, s4, 16 1073; GFX7-NEXT: s_lshr_b32 s5, s5, 16 1074; GFX7-NEXT: v_mov_b32_e32 v0, s4 1075; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1076; GFX7-NEXT: v_mov_b32_e32 v1, s8 1077; GFX7-NEXT: v_mad_u32_u24 v0, s5, v0, v1 1078; GFX7-NEXT: v_mov_b32_e32 v1, s6 1079; GFX7-NEXT: v_mad_u32_u24 v0, s7, v1, v0 1080; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 1081; GFX7-NEXT: s_endpgm 1082; 1083; GFX8-LABEL: udot2_v4i16_Hi: 1084; GFX8: ; %bb.0: ; %entry 1085; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1086; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1087; GFX8-NEXT: s_mov_b32 s2, 0xffff 1088; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1089; GFX8-NEXT: s_load_dword s3, s[4:5], 0x4 1090; GFX8-NEXT: s_load_dword s4, s[6:7], 0x4 1091; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0 1092; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1093; GFX8-NEXT: s_and_b32 s6, s3, s2 1094; GFX8-NEXT: s_lshr_b32 s3, s3, 16 1095; GFX8-NEXT: s_and_b32 s2, s4, s2 1096; GFX8-NEXT: s_lshr_b32 s4, s4, 16 1097; GFX8-NEXT: v_mov_b32_e32 v0, s5 1098; GFX8-NEXT: v_mov_b32_e32 v1, s3 1099; GFX8-NEXT: 
v_mad_u32_u24 v0, s4, v1, v0 1100; GFX8-NEXT: v_mov_b32_e32 v1, s6 1101; GFX8-NEXT: v_mad_u32_u24 v2, s2, v1, v0 1102; GFX8-NEXT: v_mov_b32_e32 v0, s0 1103; GFX8-NEXT: v_mov_b32_e32 v1, s1 1104; GFX8-NEXT: flat_store_dword v[0:1], v2 1105; GFX8-NEXT: s_endpgm 1106; 1107; GFX9-NODL-LABEL: udot2_v4i16_Hi: 1108; GFX9-NODL: ; %bb.0: ; %entry 1109; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1110; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1111; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff 1112; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 1113; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1114; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x4 1115; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x4 1116; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0 1117; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1118; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2 1119; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 1120; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 1121; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 1122; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 1123; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 1124; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v1, v2 1125; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6 1126; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v2, v1 1127; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] 1128; GFX9-NODL-NEXT: s_endpgm 1129; 1130; GFX9-DL-LABEL: udot2_v4i16_Hi: 1131; GFX9-DL: ; %bb.0: ; %entry 1132; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1133; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1134; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 1135; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1136; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x4 1137; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 1138; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x4 1139; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1140; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 1141; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 1142; GFX9-DL-NEXT: v_dot2_u32_u16 v1, s4, v1, v2 1143; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] 1144; GFX9-DL-NEXT: s_endpgm 1145; 1146; GFX10-DL-LABEL: udot2_v4i16_Hi: 1147; 
GFX10-DL: ; %bb.0: ; %entry 1148; GFX10-DL-NEXT: s_clause 0x1 1149; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 1150; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1151; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 1152; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1153; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 1154; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x4 1155; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x4 1156; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1157; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 1158; GFX10-DL-NEXT: v_dot2_u32_u16 v0, s1, s0, v0 1159; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] 1160; GFX10-DL-NEXT: s_endpgm 1161 <4 x i16> addrspace(1)* %src2, 1162 i32 addrspace(1)* nocapture %dst) { 1163entry: 1164 %vec1 = load <4 x i16>, <4 x i16> addrspace(1)* %src1 1165 %vec2 = load <4 x i16>, <4 x i16> addrspace(1)* %src2 1166 1167 %s1.elt1 = extractelement <4 x i16> %vec1, i64 2 1168 %conv = zext i16 %s1.elt1 to i32 1169 %s2.elt1 = extractelement <4 x i16> %vec2, i64 2 1170 %conv2 = zext i16 %s2.elt1 to i32 1171 %mul1 = mul i32 %conv2, %conv 1172 1173 %s1.elt2 = extractelement <4 x i16> %vec1, i64 3 1174 %conv3 = zext i16 %s1.elt2 to i32 1175 %s2.elt2 = extractelement <4 x i16> %vec2, i64 3 1176 %conv4 = zext i16 %s2.elt2 to i32 1177 %mul2 = mul i32 %conv4, %conv3 1178 1179 %s3 = load i32, i32 addrspace(1)* %dst, align 4 1180 %add = add i32 %mul2, %s3 1181 %add6 = add i32 %add, %mul1 1182 store i32 %add6, i32 addrspace(1)* %dst, align 4 1183 ret void 1184} 1185 1186define amdgpu_kernel void @notudot2_v4i16_Even(<4 x i16> addrspace(1)* %src1, 1187; GFX7-LABEL: notudot2_v4i16_Even: 1188; GFX7: ; %bb.0: ; %entry 1189; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1190; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 1191; GFX7-NEXT: s_mov_b32 s8, 0xffff 1192; GFX7-NEXT: s_mov_b32 s3, 0xf000 1193; GFX7-NEXT: s_mov_b32 s2, -1 1194; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1195; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 1196; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 
1197; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1198; GFX7-NEXT: s_and_b32 s5, s5, s8 1199; GFX7-NEXT: s_and_b32 s4, s4, s8 1200; GFX7-NEXT: s_and_b32 s6, s6, s8 1201; GFX7-NEXT: s_and_b32 s7, s7, s8 1202; GFX7-NEXT: s_load_dword s8, s[0:1], 0x0 1203; GFX7-NEXT: v_mov_b32_e32 v0, s5 1204; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1205; GFX7-NEXT: v_mov_b32_e32 v1, s8 1206; GFX7-NEXT: v_mad_u32_u24 v0, s7, v0, v1 1207; GFX7-NEXT: v_mov_b32_e32 v1, s4 1208; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0 1209; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 1210; GFX7-NEXT: s_endpgm 1211; 1212; GFX8-LABEL: notudot2_v4i16_Even: 1213; GFX8: ; %bb.0: ; %entry 1214; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1215; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1216; GFX8-NEXT: s_mov_b32 s8, 0xffff 1217; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1218; GFX8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1219; GFX8-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 1220; GFX8-NEXT: s_load_dword s6, s[0:1], 0x0 1221; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1222; GFX8-NEXT: s_and_b32 s3, s3, s8 1223; GFX8-NEXT: s_and_b32 s2, s2, s8 1224; GFX8-NEXT: s_and_b32 s5, s5, s8 1225; GFX8-NEXT: v_mov_b32_e32 v0, s6 1226; GFX8-NEXT: v_mov_b32_e32 v1, s3 1227; GFX8-NEXT: v_mad_u32_u24 v0, s5, v1, v0 1228; GFX8-NEXT: s_and_b32 s4, s4, s8 1229; GFX8-NEXT: v_mov_b32_e32 v1, s2 1230; GFX8-NEXT: v_mad_u32_u24 v2, s4, v1, v0 1231; GFX8-NEXT: v_mov_b32_e32 v0, s0 1232; GFX8-NEXT: v_mov_b32_e32 v1, s1 1233; GFX8-NEXT: flat_store_dword v[0:1], v2 1234; GFX8-NEXT: s_endpgm 1235; 1236; GFX9-NODL-LABEL: notudot2_v4i16_Even: 1237; GFX9-NODL: ; %bb.0: ; %entry 1238; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1239; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1240; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff 1241; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 1242; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1243; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1244; GFX9-NODL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 1245; GFX9-NODL-NEXT: s_load_dword s6, 
s[0:1], 0x0 1246; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1247; GFX9-NODL-NEXT: s_and_b32 s3, s3, s8 1248; GFX9-NODL-NEXT: s_and_b32 s2, s2, s8 1249; GFX9-NODL-NEXT: s_and_b32 s5, s5, s8 1250; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 1251; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6 1252; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s5, v1, v2 1253; GFX9-NODL-NEXT: s_and_b32 s4, s4, s8 1254; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2 1255; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 1256; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] 1257; GFX9-NODL-NEXT: s_endpgm 1258; 1259; GFX9-DL-LABEL: notudot2_v4i16_Even: 1260; GFX9-DL: ; %bb.0: ; %entry 1261; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1262; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1263; GFX9-DL-NEXT: s_mov_b32 s8, 0xffff 1264; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 1265; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1266; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1267; GFX9-DL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 1268; GFX9-DL-NEXT: s_load_dword s6, s[0:1], 0x0 1269; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1270; GFX9-DL-NEXT: s_and_b32 s3, s3, s8 1271; GFX9-DL-NEXT: s_and_b32 s2, s2, s8 1272; GFX9-DL-NEXT: s_and_b32 s5, s5, s8 1273; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 1274; GFX9-DL-NEXT: v_mov_b32_e32 v2, s6 1275; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v1, v2 1276; GFX9-DL-NEXT: s_and_b32 s4, s4, s8 1277; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 1278; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 1279; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] 1280; GFX9-DL-NEXT: s_endpgm 1281; 1282; GFX10-DL-LABEL: notudot2_v4i16_Even: 1283; GFX10-DL: ; %bb.0: ; %entry 1284; GFX10-DL-NEXT: s_clause 0x1 1285; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 1286; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1287; GFX10-DL-NEXT: s_mov_b32 s7, 0xffff 1288; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 1289; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1290; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 1291; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 1292; 
GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1293; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1294; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 1295; GFX10-DL-NEXT: s_and_b32 s1, s1, s7 1296; GFX10-DL-NEXT: s_and_b32 s3, s3, s7 1297; GFX10-DL-NEXT: s_and_b32 s0, s0, s7 1298; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s1, v0 1299; GFX10-DL-NEXT: s_and_b32 s1, s2, s7 1300; GFX10-DL-NEXT: v_mad_u32_u24 v0, s1, s0, v0 1301; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] 1302; GFX10-DL-NEXT: s_endpgm 1303 <4 x i16> addrspace(1)* %src2, 1304 i32 addrspace(1)* nocapture %dst) { 1305entry: 1306 %vec1 = load <4 x i16>, <4 x i16> addrspace(1)* %src1 1307 %vec2 = load <4 x i16>, <4 x i16> addrspace(1)* %src2 1308 1309 %s1.elt1 = extractelement <4 x i16> %vec1, i64 0 1310 %conv = zext i16 %s1.elt1 to i32 1311 %s2.elt1 = extractelement <4 x i16> %vec2, i64 0 1312 %conv2 = zext i16 %s2.elt1 to i32 1313 %mul1 = mul i32 %conv2, %conv 1314 1315 %s1.elt2 = extractelement <4 x i16> %vec1, i64 2 1316 %conv3 = zext i16 %s1.elt2 to i32 1317 %s2.elt2 = extractelement <4 x i16> %vec2, i64 2 1318 %conv4 = zext i16 %s2.elt2 to i32 1319 %mul2 = mul i32 %conv4, %conv3 1320 1321 %s3 = load i32, i32 addrspace(1)* %dst, align 4 1322 %add = add i32 %mul2, %s3 1323 %add6 = add i32 %add, %mul1 1324 store i32 %add6, i32 addrspace(1)* %dst, align 4 1325 ret void 1326} 1327 1328define amdgpu_kernel void @notudot2_v4i16_Middle(<4 x i16> addrspace(1)* %src1, 1329; GFX7-LABEL: notudot2_v4i16_Middle: 1330; GFX7: ; %bb.0: ; %entry 1331; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1332; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 1333; GFX7-NEXT: s_mov_b32 s8, 0xffff 1334; GFX7-NEXT: s_mov_b32 s3, 0xf000 1335; GFX7-NEXT: s_mov_b32 s2, -1 1336; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1337; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 1338; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 1339; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1340; GFX7-NEXT: s_and_b32 s5, s5, s8 1341; GFX7-NEXT: s_and_b32 s7, s7, s8 1342; GFX7-NEXT: 
s_load_dword s8, s[0:1], 0x0 1343; GFX7-NEXT: s_lshr_b32 s4, s4, 16 1344; GFX7-NEXT: v_mov_b32_e32 v0, s5 1345; GFX7-NEXT: s_lshr_b32 s6, s6, 16 1346; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1347; GFX7-NEXT: v_mov_b32_e32 v1, s8 1348; GFX7-NEXT: v_mad_u32_u24 v0, s7, v0, v1 1349; GFX7-NEXT: v_mov_b32_e32 v1, s4 1350; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0 1351; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 1352; GFX7-NEXT: s_endpgm 1353; 1354; GFX8-LABEL: notudot2_v4i16_Middle: 1355; GFX8: ; %bb.0: ; %entry 1356; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1357; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1358; GFX8-NEXT: s_mov_b32 s8, 0xffff 1359; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1360; GFX8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1361; GFX8-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 1362; GFX8-NEXT: s_load_dword s6, s[0:1], 0x0 1363; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1364; GFX8-NEXT: s_and_b32 s3, s3, s8 1365; GFX8-NEXT: s_lshr_b32 s2, s2, 16 1366; GFX8-NEXT: s_and_b32 s5, s5, s8 1367; GFX8-NEXT: v_mov_b32_e32 v0, s6 1368; GFX8-NEXT: v_mov_b32_e32 v1, s3 1369; GFX8-NEXT: v_mad_u32_u24 v0, s5, v1, v0 1370; GFX8-NEXT: s_lshr_b32 s4, s4, 16 1371; GFX8-NEXT: v_mov_b32_e32 v1, s2 1372; GFX8-NEXT: v_mad_u32_u24 v2, s4, v1, v0 1373; GFX8-NEXT: v_mov_b32_e32 v0, s0 1374; GFX8-NEXT: v_mov_b32_e32 v1, s1 1375; GFX8-NEXT: flat_store_dword v[0:1], v2 1376; GFX8-NEXT: s_endpgm 1377; 1378; GFX9-NODL-LABEL: notudot2_v4i16_Middle: 1379; GFX9-NODL: ; %bb.0: ; %entry 1380; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1381; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1382; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff 1383; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 1384; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1385; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1386; GFX9-NODL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 1387; GFX9-NODL-NEXT: s_load_dword s6, s[0:1], 0x0 1388; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1389; GFX9-NODL-NEXT: s_and_b32 s3, s3, s8 1390; GFX9-NODL-NEXT: 
s_lshr_b32 s2, s2, 16 1391; GFX9-NODL-NEXT: s_and_b32 s5, s5, s8 1392; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 1393; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6 1394; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s5, v1, v2 1395; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 1396; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2 1397; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 1398; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] 1399; GFX9-NODL-NEXT: s_endpgm 1400; 1401; GFX9-DL-LABEL: notudot2_v4i16_Middle: 1402; GFX9-DL: ; %bb.0: ; %entry 1403; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1404; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1405; GFX9-DL-NEXT: s_mov_b32 s8, 0xffff 1406; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 1407; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1408; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1409; GFX9-DL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 1410; GFX9-DL-NEXT: s_load_dword s6, s[0:1], 0x0 1411; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1412; GFX9-DL-NEXT: s_and_b32 s3, s3, s8 1413; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 16 1414; GFX9-DL-NEXT: s_and_b32 s5, s5, s8 1415; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 1416; GFX9-DL-NEXT: v_mov_b32_e32 v2, s6 1417; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v1, v2 1418; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 16 1419; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 1420; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 1421; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] 1422; GFX9-DL-NEXT: s_endpgm 1423; 1424; GFX10-DL-LABEL: notudot2_v4i16_Middle: 1425; GFX10-DL: ; %bb.0: ; %entry 1426; GFX10-DL-NEXT: s_clause 0x1 1427; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 1428; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1429; GFX10-DL-NEXT: s_mov_b32 s7, 0xffff 1430; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 1431; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1432; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 1433; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 1434; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1435; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1436; GFX10-DL-NEXT: 
v_mov_b32_e32 v0, s6 1437; GFX10-DL-NEXT: s_and_b32 s1, s1, s7 1438; GFX10-DL-NEXT: s_and_b32 s3, s3, s7 1439; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 16 1440; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s1, v0 1441; GFX10-DL-NEXT: s_lshr_b32 s1, s2, 16 1442; GFX10-DL-NEXT: v_mad_u32_u24 v0, s1, s0, v0 1443; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] 1444; GFX10-DL-NEXT: s_endpgm 1445 <4 x i16> addrspace(1)* %src2, 1446 i32 addrspace(1)* nocapture %dst) { 1447entry: 1448 %vec1 = load <4 x i16>, <4 x i16> addrspace(1)* %src1 1449 %vec2 = load <4 x i16>, <4 x i16> addrspace(1)* %src2 1450 1451 %s1.elt1 = extractelement <4 x i16> %vec1, i64 1 1452 %conv = zext i16 %s1.elt1 to i32 1453 %s2.elt1 = extractelement <4 x i16> %vec2, i64 1 1454 %conv2 = zext i16 %s2.elt1 to i32 1455 %mul1 = mul i32 %conv2, %conv 1456 1457 %s1.elt2 = extractelement <4 x i16> %vec1, i64 2 1458 %conv3 = zext i16 %s1.elt2 to i32 1459 %s2.elt2 = extractelement <4 x i16> %vec2, i64 2 1460 %conv4 = zext i16 %s2.elt2 to i32 1461 %mul2 = mul i32 %conv4, %conv3 1462 1463 %s3 = load i32, i32 addrspace(1)* %dst, align 4 1464 %add = add i32 %mul2, %s3 1465 %add6 = add i32 %add, %mul1 1466 store i32 %add6, i32 addrspace(1)* %dst, align 4 1467 ret void 1468} 1469 1470define amdgpu_kernel void @notudot2_DiffIndex(<2 x i16> addrspace(1)* %src1, 1471; GFX7-LABEL: notudot2_DiffIndex: 1472; GFX7: ; %bb.0: ; %entry 1473; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1474; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 1475; GFX7-NEXT: s_mov_b32 s8, 0xffff 1476; GFX7-NEXT: s_mov_b32 s3, 0xf000 1477; GFX7-NEXT: s_mov_b32 s2, -1 1478; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1479; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 1480; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 1481; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1482; GFX7-NEXT: s_lshr_b32 s6, s4, 16 1483; GFX7-NEXT: s_lshr_b32 s7, s5, 16 1484; GFX7-NEXT: s_and_b32 s4, s4, s8 1485; GFX7-NEXT: s_and_b32 s5, s5, s8 1486; GFX7-NEXT: s_load_dword s8, s[0:1], 0x0 1487; GFX7-NEXT: v_mov_b32_e32 
v0, s6 1488; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1489; GFX7-NEXT: v_mov_b32_e32 v1, s8 1490; GFX7-NEXT: v_mad_u32_u24 v0, s5, v0, v1 1491; GFX7-NEXT: v_mov_b32_e32 v1, s4 1492; GFX7-NEXT: v_mad_u32_u24 v0, s7, v1, v0 1493; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 1494; GFX7-NEXT: s_endpgm 1495; 1496; GFX8-LABEL: notudot2_DiffIndex: 1497; GFX8: ; %bb.0: ; %entry 1498; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1499; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1500; GFX8-NEXT: s_mov_b32 s2, 0xffff 1501; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1502; GFX8-NEXT: s_load_dword s3, s[4:5], 0x0 1503; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 1504; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0 1505; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1506; GFX8-NEXT: s_and_b32 s6, s3, s2 1507; GFX8-NEXT: s_lshr_b32 s3, s3, 16 1508; GFX8-NEXT: s_and_b32 s2, s4, s2 1509; GFX8-NEXT: v_mov_b32_e32 v0, s5 1510; GFX8-NEXT: v_mov_b32_e32 v1, s3 1511; GFX8-NEXT: v_mad_u32_u24 v0, s2, v1, v0 1512; GFX8-NEXT: s_lshr_b32 s7, s4, 16 1513; GFX8-NEXT: v_mov_b32_e32 v1, s6 1514; GFX8-NEXT: v_mad_u32_u24 v2, s7, v1, v0 1515; GFX8-NEXT: v_mov_b32_e32 v0, s0 1516; GFX8-NEXT: v_mov_b32_e32 v1, s1 1517; GFX8-NEXT: flat_store_dword v[0:1], v2 1518; GFX8-NEXT: s_endpgm 1519; 1520; GFX9-NODL-LABEL: notudot2_DiffIndex: 1521; GFX9-NODL: ; %bb.0: ; %entry 1522; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1523; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1524; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff 1525; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 1526; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1527; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 1528; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 1529; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0 1530; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1531; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2 1532; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 1533; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 1534; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 1535; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 1536; GFX9-NODL-NEXT: 
v_mad_u32_u24 v1, s2, v1, v2 1537; GFX9-NODL-NEXT: s_lshr_b32 s7, s4, 16 1538; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6 1539; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s7, v2, v1 1540; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] 1541; GFX9-NODL-NEXT: s_endpgm 1542; 1543; GFX9-DL-LABEL: notudot2_DiffIndex: 1544; GFX9-DL: ; %bb.0: ; %entry 1545; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1546; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1547; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff 1548; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 1549; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1550; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 1551; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 1552; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0 1553; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1554; GFX9-DL-NEXT: s_and_b32 s6, s3, s2 1555; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16 1556; GFX9-DL-NEXT: s_and_b32 s2, s4, s2 1557; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 1558; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 1559; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v1, v2 1560; GFX9-DL-NEXT: s_lshr_b32 s7, s4, 16 1561; GFX9-DL-NEXT: v_mov_b32_e32 v2, s6 1562; GFX9-DL-NEXT: v_mad_u32_u24 v1, s7, v2, v1 1563; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] 1564; GFX9-DL-NEXT: s_endpgm 1565; 1566; GFX10-DL-LABEL: notudot2_DiffIndex: 1567; GFX10-DL: ; %bb.0: ; %entry 1568; GFX10-DL-NEXT: s_clause 0x1 1569; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 1570; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1571; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 1572; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1573; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 1574; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 1575; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 1576; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff 1577; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1578; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 1579; GFX10-DL-NEXT: s_lshr_b32 s3, s0, 16 1580; GFX10-DL-NEXT: s_and_b32 s6, s1, s2 1581; GFX10-DL-NEXT: s_and_b32 s0, s0, s2 1582; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 16 1583; 
GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s3, v0
; GFX10-DL-NEXT: v_mad_u32_u24 v0, s1, s0, v0
; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: s_endpgm
    <2 x i16> addrspace(1)* %src2,
    i32 addrspace(1)* nocapture %dst) {
entry:
  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2

  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
  %conv = zext i16 %s1.elt1 to i32
  %s2.elt1 = extractelement <2 x i16> %vec2, i64 1
  %conv2 = zext i16 %s2.elt1 to i32
  %mul1 = mul i32 %conv2, %conv

  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
  %conv3 = zext i16 %s1.elt2 to i32
  %s2.elt2 = extractelement <2 x i16> %vec2, i64 0
  %conv4 = zext i16 %s2.elt2 to i32
  %mul2 = mul i32 %conv4, %conv3

  %s3 = load i32, i32 addrspace(1)* %dst, align 4
  %add = add i32 %mul2, %s3
  %add6 = add i32 %add, %mul1
  store i32 %add6, i32 addrspace(1)* %dst, align 4
  ret void
}

; Unsigned (zext) two-lane i16 multiply-accumulate in which the intermediate
; sum %add1 has two users. The expected output for every RUN configuration is
; two v_mad_u32_u24 followed by a separate add; no v_dot2_u32_u16 appears.
define amdgpu_kernel void @udot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1,
; GFX7-LABEL: udot2_MultipleUses_add1:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT: s_mov_b32 s8, 0xffff
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshr_b32 s6, s4, 16
; GFX7-NEXT: s_lshr_b32 s7, s5, 16
; GFX7-NEXT: s_and_b32 s4, s4, s8
; GFX7-NEXT: s_and_b32 s5, s5, s8
; GFX7-NEXT: s_load_dword s8, s[0:1], 0x0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v1, s8
; GFX7-NEXT: v_mad_u32_u24 v0, s7, v0, v1
; GFX7-NEXT: v_mov_b32_e32 v1, s4
; GFX7-NEXT: v_mad_u32_u24 v1, s5, v1, v0
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: udot2_MultipleUses_add1:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_mov_b32 s2, 0xffff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_load_dword s3, s[4:5], 0x0
; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_and_b32 s6, s3, s2
; GFX8-NEXT: s_lshr_b32 s3, s3, 16
; GFX8-NEXT: s_and_b32 s2, s4, s2
; GFX8-NEXT: s_lshr_b32 s4, s4, 16
; GFX8-NEXT: v_mov_b32_e32 v0, s5
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_mad_u32_u24 v0, s4, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v1, s6
; GFX8-NEXT: v_mad_u32_u24 v1, s2, v1, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v1
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NODL-LABEL: udot2_MultipleUses_add1:
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0
; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2
; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16
; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2
; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5
; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v1, v2
; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v2, v1
; GFX9-NODL-NEXT: v_add_u32_e32 v1, v2, v1
; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot2_MultipleUses_add1:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_and_b32 s6, s3, s2
; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16
; GFX9-DL-NEXT: s_and_b32 s2, s4, s2
; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 16
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3
; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5
; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v1, v2
; GFX9-DL-NEXT: v_mov_b32_e32 v2, s6
; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v2, v1
; GFX9-DL-NEXT: v_add_u32_e32 v1, v2, v1
; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot2_MultipleUses_add1:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6
; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 16
; GFX10-DL-NEXT: s_lshr_b32 s3, s1, 16
; GFX10-DL-NEXT: s_mov_b32 s6, 0xffff
; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s2, v0
; GFX10-DL-NEXT: s_and_b32 s0, s0, s6
; GFX10-DL-NEXT: s_and_b32 s1, s1, s6
; GFX10-DL-NEXT: v_mad_u32_u24 v1, s1, s0, v0
; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, v1, v0
; GFX10-DL-NEXT: global_store_dword v2, v0, s[4:5]
; GFX10-DL-NEXT: s_endpgm
    <2 x i16> addrspace(1)* %src2,
    i32 addrspace(1)* nocapture %dst) {
entry:
  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2

  ; Product of lane 0 of both vectors, zero-extended to i32.
  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
  %conv = zext i16 %s1.elt1 to i32
  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
  %conv2 = zext i16 %s2.elt1 to i32
  %mul1 = mul i32 %conv2, %conv

  ; Product of lane 1 of both vectors, zero-extended to i32.
  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
  %conv3 = zext i16 %s1.elt2 to i32
  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
  %conv4 = zext i16 %s2.elt2 to i32
  %mul2 = mul i32 %conv4, %conv3

  ; The accumulator is the value currently stored at %dst.
  %s3 = load i32, i32 addrspace(1)* %dst, align 4
  %add1 = add i32 %mul2, %s3
  %add2 = add i32 %add1, %mul1

  ; Second use of %add1: the partial sum is needed on its own as well.
  %res = add i32 %add2, %add1
  store i32 %res, i32 addrspace(1)* %dst, align 4
  ret void
}

; Signed (sext) counterpart of the test above. The expected output for every
; RUN configuration is two v_mad_i32_i24 followed by a separate add.
define amdgpu_kernel void @idot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1,
; GFX7-LABEL: idot2_MultipleUses_add1:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_sext_i32_i16 s7, s4
; GFX7-NEXT: s_ashr_i32 s4, s4, 16
; GFX7-NEXT: s_sext_i32_i16 s8, s5
; GFX7-NEXT: s_ashr_i32 s5, s5, 16
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s6
; GFX7-NEXT: v_mad_i32_i24 v0, s5, v0, v1
; GFX7-NEXT:
v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mad_i32_i24 v1, s8, v1, v0
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: idot2_MultipleUses_add1:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_sext_i32_i16 s5, s2
; GFX8-NEXT: s_ashr_i32 s2, s2, 16
; GFX8-NEXT: s_sext_i32_i16 s6, s3
; GFX8-NEXT: s_ashr_i32 s3, s3, 16
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s2
; GFX8-NEXT: v_mov_b32_e32 v2, s5
; GFX8-NEXT: v_mad_i32_i24 v0, s3, v1, v0
; GFX8-NEXT: v_mad_i32_i24 v1, s6, v2, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v1
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NODL-LABEL: idot2_MultipleUses_add1:
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s2
; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16
; GFX9-NODL-NEXT: s_sext_i32_i16 s6, s3
; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s3, v2, v1
; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5
; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s6, v2, v1
; GFX9-NODL-NEXT: v_add_u32_e32 v1, v2, v1
; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: idot2_MultipleUses_add1:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_sext_i32_i16 s5, s2
; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 16
; GFX9-DL-NEXT: s_sext_i32_i16 s6, s3
; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 16
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4
; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2
; GFX9-DL-NEXT: v_mad_i32_i24 v1, s3, v2, v1
; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5
; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v2, v1
; GFX9-DL-NEXT: v_add_u32_e32 v1, v2, v1
; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: idot2_MultipleUses_add1:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6
; GFX10-DL-NEXT: s_ashr_i32 s2, s0, 16
; GFX10-DL-NEXT: s_ashr_i32 s3, s1, 16
; GFX10-DL-NEXT: s_sext_i32_i16 s0, s0
; GFX10-DL-NEXT: s_sext_i32_i16 s1, s1
; GFX10-DL-NEXT: v_mad_i32_i24 v0, s3, s2, v0
; GFX10-DL-NEXT: v_mad_i32_i24 v1, s1, s0, v0
; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, v1, v0
; GFX10-DL-NEXT: global_store_dword v2, v0, s[4:5]
; GFX10-DL-NEXT: s_endpgm
    <2 x i16> addrspace(1)* %src2,
    i32 addrspace(1)* nocapture %dst) {
entry:
  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2

  ; Product of lane 0 of both vectors, sign-extended to i32.
  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
  %conv = sext i16 %s1.elt1 to i32
  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
  %conv2 = sext i16 %s2.elt1 to i32
  %mul1 = mul i32 %conv2, %conv

  ; Product of lane 1 of both vectors, sign-extended to i32.
  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
  %conv3 = sext i16 %s1.elt2 to i32
  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
  %conv4 = sext i16 %s2.elt2 to i32
  %mul2 = mul i32 %conv4, %conv3

  ; The accumulator is the value currently stored at %dst.
  %s3 = load i32, i32 addrspace(1)* %dst, align 4
  %add1 = add i32 %mul2, %s3
  %add2 = add i32 %add1, %mul1

  ; Second use of %add1: the partial sum is needed on its own as well.
  %res = add i32 %add2, %add1
  store i32 %res, i32 addrspace(1)* %dst, align 4
  ret void
}

; Unsigned (zext) two-lane i16 multiply-accumulate in which the lane-0 product
; %mul1 is added twice. The expected output for every RUN configuration is
; three v_mad_u32_u24; no v_dot2_u32_u16 appears.
define amdgpu_kernel void @udot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1,
; GFX7-LABEL: udot2_MultipleUses_mul1:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT: s_mov_b32 s8, 0xffff
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshr_b32 s6, s4, 16
; GFX7-NEXT: s_lshr_b32 s7, s5, 16
; GFX7-NEXT: s_and_b32 s4, s4, s8
; GFX7-NEXT: s_and_b32 s5, s5, s8
; GFX7-NEXT: s_load_dword s8, s[0:1], 0x0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v2, s6
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v1, s8
; GFX7-NEXT: v_mad_u32_u24 v1, s5, v0, v1
; GFX7-NEXT: v_mad_u32_u24 v1,
s7, v2, v1
; GFX7-NEXT: v_mad_u32_u24 v0, s5, v0, v1
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: udot2_MultipleUses_mul1:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_mov_b32 s2, 0xffff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_load_dword s3, s[4:5], 0x0
; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_and_b32 s6, s3, s2
; GFX8-NEXT: s_and_b32 s2, s4, s2
; GFX8-NEXT: s_lshr_b32 s3, s3, 16
; GFX8-NEXT: v_mov_b32_e32 v0, s5
; GFX8-NEXT: v_mov_b32_e32 v1, s6
; GFX8-NEXT: s_lshr_b32 s4, s4, 16
; GFX8-NEXT: v_mad_u32_u24 v0, s2, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v2, s3
; GFX8-NEXT: v_mad_u32_u24 v0, s4, v2, v0
; GFX8-NEXT: v_mad_u32_u24 v2, s2, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NODL-LABEL: udot2_MultipleUses_mul1:
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0
; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2
; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2
; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6
; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5
; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v1, v2
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v3, v2
; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v1, v2
; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot2_MultipleUses_mul1:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_and_b32 s6, s3, s2
; GFX9-DL-NEXT: s_and_b32 s2, s4, s2
; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6
; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5
; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 16
; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v1, v2
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3
; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v3, v2
; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v1, v2
; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot2_MultipleUses_mul1:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6
; GFX10-DL-NEXT: s_and_b32 s3, s0, s2
; GFX10-DL-NEXT: s_and_b32 s2, s1, s2
; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 16
; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 16
; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s3, v0
; GFX10-DL-NEXT: v_mad_u32_u24 v0, s1, s0, v0
; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s3, v0
; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: s_endpgm
    <2 x i16> addrspace(1)* %src2,
    i32 addrspace(1)* nocapture %dst) {
entry:
  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2

  ; Product of lane 0 of both vectors, zero-extended to i32.
  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
  %conv = zext i16 %s1.elt1 to i32
  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
  %conv2 = zext i16 %s2.elt1 to i32
  %mul1 = mul i32 %conv2, %conv

  ; Product of lane 1 of both vectors, zero-extended to i32.
  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
  %conv3 = zext i16 %s1.elt2 to i32
  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
  %conv4 = zext i16 %s2.elt2 to i32
  %mul2 = mul i32 %conv4, %conv3

  ; First use of %mul1: folded into the accumulator loaded from %dst.
  %s3 = load i32, i32 addrspace(1)* %dst, align 4
  %add0 = add i32 %mul1, %s3

  ; Second use of %mul1 is in %add2.
  %add1 = add i32 %mul2, %add0
  %add2 = add i32 %add1, %mul1

  store i32 %add2, i32 addrspace(1)* %dst, align 4
  ret void
}

; Signed (sext) two-lane i16 multiply-accumulate in which the lane-0 product
; %mul1 is added twice. The expected output for every RUN configuration is
; three v_mad_i32_i24.
define amdgpu_kernel void @idot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1,
; GFX7-LABEL: idot2_MultipleUses_mul1:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_sext_i32_i16 s7, s4
; GFX7-NEXT: s_sext_i32_i16 s8, s5
; GFX7-NEXT: s_ashr_i32 s4, s4, 16
; GFX7-NEXT: v_mov_b32_e32 v0, s7
; GFX7-NEXT: v_mov_b32_e32 v1, s6
; GFX7-NEXT: s_ashr_i32 s5, s5, 16
; GFX7-NEXT: v_mad_i32_i24 v1, s8, v0, v1
; GFX7-NEXT: v_mov_b32_e32
v2, s4
; GFX7-NEXT: v_mad_i32_i24 v1, s5, v2, v1
; GFX7-NEXT: v_mad_i32_i24 v0, s8, v0, v1
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: idot2_MultipleUses_mul1:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_sext_i32_i16 s5, s2
; GFX8-NEXT: s_sext_i32_i16 s6, s3
; GFX8-NEXT: s_ashr_i32 s2, s2, 16
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_ashr_i32 s3, s3, 16
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mad_i32_i24 v0, s6, v1, v0
; GFX8-NEXT: v_mad_i32_i24 v0, s3, v2, v0
; GFX8-NEXT: v_mad_i32_i24 v2, s6, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NODL-LABEL: idot2_MultipleUses_mul1:
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s2
; GFX9-NODL-NEXT: s_sext_i32_i16 s6, s3
; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5
; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16
; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s6, v2, v1
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s3, v3, v1
; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s6, v2, v1
; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: idot2_MultipleUses_mul1:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_sext_i32_i16 s5, s2
; GFX9-DL-NEXT: s_sext_i32_i16 s6, s3
; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 16
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4
; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5
; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 16
; GFX9-DL-NEXT: v_mad_i32_i24 v1, s6, v2, v1
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2
; GFX9-DL-NEXT: v_mad_i32_i24 v1, s3, v3, v1
; GFX9-DL-NEXT: v_mad_i32_i24 v1, s6, v2, v1
; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: idot2_MultipleUses_mul1:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6
; GFX10-DL-NEXT: s_sext_i32_i16 s2, s0
; GFX10-DL-NEXT: s_sext_i32_i16 s3, s1
; GFX10-DL-NEXT: s_ashr_i32 s0, s0, 16
; GFX10-DL-NEXT: s_ashr_i32 s1, s1, 16
; GFX10-DL-NEXT: v_mad_i32_i24 v0, s3, s2, v0
; GFX10-DL-NEXT: v_mad_i32_i24 v0, s1, s0, v0
; GFX10-DL-NEXT: v_mad_i32_i24 v0, s3, s2, v0
; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: s_endpgm
    <2 x i16> addrspace(1)* %src2,
    i32 addrspace(1)* nocapture %dst) {
entry:
  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2

  ; Product of lane 0 of both vectors, sign-extended to i32.
  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
  %conv = sext i16 %s1.elt1 to i32
  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
  %conv2 = sext i16 %s2.elt1 to i32
  %mul1 = mul i32 %conv2, %conv

  ; Product of lane 1 of both vectors, sign-extended to i32.
  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
  %conv3 = sext i16 %s1.elt2 to i32
  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
  %conv4 = sext i16 %s2.elt2 to i32
  %mul2 = mul i32 %conv4, %conv3

  ; First use of %mul1: folded into the accumulator loaded from %dst.
  %s3 = load i32, i32 addrspace(1)* %dst, align 4
  %add0 = add i32 %mul1, %s3

  ; Second use of %mul1 is in %add2.
  %add1 = add i32 %mul2, %add0
  %add2 = add i32 %add1, %mul1

  store i32 %add2, i32 addrspace(1)* %dst, align 4
  ret void
}

; Unsigned (zext) two-lane i16 multiply-accumulate in which the lane-1 product
; %mul2 is added twice. The expected output for every RUN configuration is
; three v_mad_u32_u24; no v_dot2_u32_u16 appears.
define amdgpu_kernel void @udot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1,
; GFX7-LABEL: udot2_MultipleUses_mul2:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT: s_mov_b32 s8, 0xffff
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshr_b32 s6, s4, 16
; GFX7-NEXT: s_lshr_b32 s7, s5, 16
; GFX7-NEXT: s_and_b32 s4, s4, s8
; GFX7-NEXT: s_and_b32 s5, s5, s8
; GFX7-NEXT: s_load_dword s8, s[0:1], 0x0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v1, s8
; GFX7-NEXT: v_mad_u32_u24 v1, s7, v0, v1
; GFX7-NEXT: v_mad_u32_u24 v0, s7, v0, v1
; GFX7-NEXT: v_mov_b32_e32
v1, s4
; GFX7-NEXT: v_mad_u32_u24 v0, s5, v1, v0
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: udot2_MultipleUses_mul2:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_mov_b32 s2, 0xffff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_load_dword s3, s[4:5], 0x0
; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_and_b32 s6, s3, s2
; GFX8-NEXT: s_lshr_b32 s3, s3, 16
; GFX8-NEXT: s_and_b32 s2, s4, s2
; GFX8-NEXT: s_lshr_b32 s4, s4, 16
; GFX8-NEXT: v_mov_b32_e32 v0, s5
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_mad_u32_u24 v0, s4, v1, v0
; GFX8-NEXT: v_mad_u32_u24 v0, s4, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v1, s6
; GFX8-NEXT: v_mad_u32_u24 v2, s2, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NODL-LABEL: udot2_MultipleUses_mul2:
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0
; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2
; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16
; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2
; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v1, v2
; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v1, v2
; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6
; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v2, v1
; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot2_MultipleUses_mul2:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_and_b32 s6, s3, s2
; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16
; GFX9-DL-NEXT: s_and_b32 s2, s4, s2
; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 16
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3
; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5
; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v1, v2
; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v1, v2
; GFX9-DL-NEXT: v_mov_b32_e32 v2, s6
; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v2, v1
; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot2_MultipleUses_mul2:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6
; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 16
; GFX10-DL-NEXT: s_lshr_b32 s3, s1, 16
; GFX10-DL-NEXT: s_mov_b32 s6, 0xffff
; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s2, v0
; GFX10-DL-NEXT: s_and_b32 s0, s0, s6
; GFX10-DL-NEXT: s_and_b32 s1, s1, s6
; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s2, v0
; GFX10-DL-NEXT: v_mad_u32_u24 v0, s1, s0, v0
; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: s_endpgm
    <2 x i16> addrspace(1)* %src2,
    i32 addrspace(1)* nocapture %dst) {
entry:
  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2

  ; Product of lane 0 of both vectors, zero-extended to i32.
  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
  %conv = zext i16 %s1.elt1 to i32
  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
  %conv2 = zext i16 %s2.elt1 to i32
  %mul1 = mul i32 %conv2, %conv

  ; Product of lane 1 of both vectors, zero-extended to i32.
  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
  %conv3 = zext i16 %s1.elt2 to i32
  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
  %conv4 = zext i16 %s2.elt2 to i32
  %mul2 = mul i32 %conv4, %conv3

  ; First use of %mul2: folded into the accumulator loaded from %dst.
  %s3 = load i32, i32 addrspace(1)* %dst, align 4
  %add0 = add i32 %mul2, %s3

  ; Second use of %mul2 is in %add1.
  %add1 = add i32 %mul2, %add0
  %add2 = add i32 %add1, %mul1

  store i32 %add2, i32 addrspace(1)* %dst, align 4
  ret void
}

; Signed (sext) two-lane i16 multiply-accumulate in which the lane-1 product
; %mul2 is added twice. The expected output for every RUN configuration is
; three v_mad_i32_i24.
define amdgpu_kernel void @idot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1,
; GFX7-LABEL: idot2_MultipleUses_mul2:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_sext_i32_i16 s7, s4
; GFX7-NEXT: s_ashr_i32 s4, s4, 16
; GFX7-NEXT: s_sext_i32_i16 s8, s5
; GFX7-NEXT: s_ashr_i32 s5, s5, 16
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s6
; GFX7-NEXT: v_mad_i32_i24 v1, s5, v0, v1
; GFX7-NEXT: v_mad_i32_i24 v0, s5, v0, v1
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mad_i32_i24 v0, s8, v1, v0
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: idot2_MultipleUses_mul2:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_sext_i32_i16 s5, s2
; GFX8-NEXT: s_ashr_i32 s2, s2, 16
; GFX8-NEXT: s_sext_i32_i16 s6, s3
; GFX8-NEXT: s_ashr_i32 s3, s3, 16
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s2
; GFX8-NEXT: v_mad_i32_i24 v0, s3, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v2, s5
; GFX8-NEXT: v_mad_i32_i24 v0, s3, v1, v0
; GFX8-NEXT: v_mad_i32_i24 v2, s6, v2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NODL-LABEL: idot2_MultipleUses_mul2:
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s2
; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16
; GFX9-NODL-NEXT: s_sext_i32_i16 s6, s3
; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s3, v2, v1
; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s3, v2, v1
; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5
; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s6, v2, v1
; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: idot2_MultipleUses_mul2:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_sext_i32_i16 s5, s2
; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 16
; GFX9-DL-NEXT: s_sext_i32_i16 s6, s3
; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 16
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4
; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2
; GFX9-DL-NEXT: v_mad_i32_i24 v1, s3, v2, v1
; GFX9-DL-NEXT: v_mad_i32_i24 v1, s3, v2, v1
; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5
; GFX9-DL-NEXT: v_mad_i32_i24 v1, s6, v2, v1
; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: idot2_MultipleUses_mul2:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6
; GFX10-DL-NEXT: s_ashr_i32 s2, s0, 16
; GFX10-DL-NEXT: s_ashr_i32 s3, s1, 16
; GFX10-DL-NEXT: s_sext_i32_i16 s0, s0
; GFX10-DL-NEXT: s_sext_i32_i16 s1, s1
; GFX10-DL-NEXT: v_mad_i32_i24 v0, s3, s2, v0
; GFX10-DL-NEXT: v_mad_i32_i24 v0, s3, s2, v0
; GFX10-DL-NEXT: v_mad_i32_i24 v0, s1, s0, v0
; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: s_endpgm
    <2 x i16> addrspace(1)* %src2,
    i32 addrspace(1)* nocapture %dst) {
entry:
  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2

  ; Product of lane 0 of both vectors, sign-extended to i32.
  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
  %conv = sext i16 %s1.elt1 to i32
  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
  %conv2 = sext i16 %s2.elt1 to i32
  %mul1 = mul i32 %conv2, %conv

  ; Product of lane 1 of both vectors, sign-extended to i32.
  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
  %conv3 = sext i16 %s1.elt2 to i32
  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
  %conv4 = sext i16 %s2.elt2 to i32
  %mul2 = mul i32 %conv4, %conv3

  ; First use of %mul2: folded into the accumulator loaded from %dst.
  %s3 = load i32, i32 addrspace(1)* %dst, align 4
  %add0 = add i32 %mul2, %s3

  ; Second use of %mul2 is in %add1.
  %add1 = add i32 %mul2, %add0
  %add2 = add i32 %add1, %mul1

  store i32 %add2, i32 addrspace(1)* %dst, align 4
  ret void
}

; Two-lane multiply-accumulate carried out entirely in i16 (no extension), with
; an i16 accumulator at %dst. The expected output uses v_dot2_u32_u16 for the
; gfx906 and gfx1011/gfx1012 RUN lines and two v_mad_u32_u24 for the others.
define amdgpu_kernel void @udot2_acc16(<2 x i16> addrspace(1)* %src1,
; GFX7-LABEL: udot2_acc16:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_mov_b32 s8, 0xffff
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0
; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshr_b32 s6, s4, 16
; GFX7-NEXT: s_lshr_b32 s7, s5, 16
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: s_and_b32 s5, s5, s8
; GFX7-NEXT: s_and_b32 s4, s4, s8
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0
; GFX7-NEXT: buffer_store_short v0, off,
s[0:3], 0 2515; GFX7-NEXT: s_endpgm 2516; 2517; GFX8-LABEL: udot2_acc16: 2518; GFX8: ; %bb.0: ; %entry 2519; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2520; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 2521; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2522; GFX8-NEXT: v_mov_b32_e32 v0, s0 2523; GFX8-NEXT: v_mov_b32_e32 v1, s1 2524; GFX8-NEXT: flat_load_ushort v2, v[0:1] 2525; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 2526; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 2527; GFX8-NEXT: s_mov_b32 s0, 0xffff 2528; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2529; GFX8-NEXT: s_and_b32 s3, s2, s0 2530; GFX8-NEXT: s_lshr_b32 s2, s2, 16 2531; GFX8-NEXT: s_and_b32 s0, s1, s0 2532; GFX8-NEXT: s_lshr_b32 s1, s1, 16 2533; GFX8-NEXT: v_mov_b32_e32 v3, s2 2534; GFX8-NEXT: s_waitcnt vmcnt(0) 2535; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 2536; GFX8-NEXT: v_mov_b32_e32 v3, s3 2537; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 2538; GFX8-NEXT: flat_store_short v[0:1], v2 2539; GFX8-NEXT: s_endpgm 2540; 2541; GFX9-NODL-LABEL: udot2_acc16: 2542; GFX9-NODL: ; %bb.0: ; %entry 2543; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2544; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 2545; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 2546; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff 2547; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 2548; GFX9-NODL-NEXT: global_load_ushort v1, v0, s[0:1] 2549; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 2550; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 2551; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 2552; GFX9-NODL-NEXT: s_and_b32 s5, s4, s2 2553; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 2554; GFX9-NODL-NEXT: s_and_b32 s2, s3, s2 2555; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 2556; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 2557; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 2558; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s3, v2, v1 2559; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 2560; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v2, v1 2561; GFX9-NODL-NEXT: global_store_short v0, v1, s[0:1] 2562; GFX9-NODL-NEXT: s_endpgm 2563; 2564; 
GFX9-DL-LABEL: udot2_acc16: 2565; GFX9-DL: ; %bb.0: ; %entry 2566; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2567; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 2568; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 2569; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 2570; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 2571; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 2572; GFX9-DL-NEXT: global_load_ushort v1, v0, s[0:1] 2573; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 2574; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 2575; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 2576; GFX9-DL-NEXT: v_dot2_u32_u16 v1, s2, v2, v1 2577; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] 2578; GFX9-DL-NEXT: s_endpgm 2579; 2580; GFX10-DL-LABEL: udot2_acc16: 2581; GFX10-DL: ; %bb.0: ; %entry 2582; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 2583; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 2584; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2585; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 2586; GFX10-DL-NEXT: global_load_ushort v1, v0, s[4:5] 2587; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 2588; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 2589; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2590; GFX10-DL-NEXT: v_dot2_u32_u16 v1, s0, s1, v1 2591; GFX10-DL-NEXT: global_store_short v0, v1, s[4:5] 2592; GFX10-DL-NEXT: s_endpgm 2593 <2 x i16> addrspace(1)* %src2, 2594 i16 addrspace(1)* nocapture %dst) { 2595entry: 2596 %v1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1 2597 %v2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2 2598 2599 %v1e1 = extractelement <2 x i16> %v1, i64 0 2600 %v2e1 = extractelement <2 x i16> %v2, i64 0 2601 %mul1 = mul i16 %v1e1, %v2e1 2602 2603 %v1e2 = extractelement <2 x i16> %v1, i64 1 2604 %v2e2 = extractelement <2 x i16> %v2, i64 1 2605 %mul2 = mul i16 %v1e2, %v2e2 2606 2607 %s2 = load i16, i16 addrspace(1)* %dst, align 2 2608 %add1 = add i16 %mul2, %s2 2609 %add2 = add i16 %add1, %mul1 2610 store i16 %add2, i16 addrspace(1)* %dst, align 2 2611 ret void 2612} 2613 2614define amdgpu_kernel void 
@notsdot2_sext8(<2 x i8> addrspace(1)* %src1, 2615; GFX7-LABEL: notsdot2_sext8: 2616; GFX7: ; %bb.0: ; %entry 2617; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 2618; GFX7-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd 2619; GFX7-NEXT: s_mov_b32 s3, 0xf000 2620; GFX7-NEXT: s_mov_b32 s2, -1 2621; GFX7-NEXT: s_mov_b32 s10, s2 2622; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2623; GFX7-NEXT: s_mov_b32 s0, s4 2624; GFX7-NEXT: s_mov_b32 s1, s5 2625; GFX7-NEXT: s_mov_b32 s4, s6 2626; GFX7-NEXT: s_mov_b32 s5, s7 2627; GFX7-NEXT: s_mov_b32 s6, s2 2628; GFX7-NEXT: s_mov_b32 s7, s3 2629; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 2630; GFX7-NEXT: buffer_load_ushort v1, off, s[4:7], 0 2631; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 2632; GFX7-NEXT: s_mov_b32 s11, s3 2633; GFX7-NEXT: s_waitcnt vmcnt(1) 2634; GFX7-NEXT: v_bfe_i32 v2, v0, 0, 8 2635; GFX7-NEXT: s_waitcnt vmcnt(0) 2636; GFX7-NEXT: v_bfe_i32 v3, v1, 0, 8 2637; GFX7-NEXT: v_bfe_i32 v0, v0, 8, 8 2638; GFX7-NEXT: v_bfe_i32 v1, v1, 8, 8 2639; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2640; GFX7-NEXT: v_mad_i32_i24 v0, v1, v0, s0 2641; GFX7-NEXT: v_mad_i32_i24 v0, v3, v2, v0 2642; GFX7-NEXT: buffer_store_dword v0, off, s[8:11], 0 2643; GFX7-NEXT: s_endpgm 2644; 2645; GFX8-LABEL: notsdot2_sext8: 2646; GFX8: ; %bb.0: ; %entry 2647; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2648; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 2649; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2650; GFX8-NEXT: v_mov_b32_e32 v0, s4 2651; GFX8-NEXT: v_mov_b32_e32 v1, s5 2652; GFX8-NEXT: v_mov_b32_e32 v2, s6 2653; GFX8-NEXT: v_mov_b32_e32 v3, s7 2654; GFX8-NEXT: flat_load_ushort v0, v[0:1] 2655; GFX8-NEXT: flat_load_ushort v1, v[2:3] 2656; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 2657; GFX8-NEXT: s_waitcnt vmcnt(1) 2658; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 8 2659; GFX8-NEXT: v_lshrrev_b16_e32 v0, 8, v0 2660; GFX8-NEXT: s_waitcnt vmcnt(0) 2661; GFX8-NEXT: v_bfe_i32 v3, v1, 0, 8 2662; GFX8-NEXT: v_lshrrev_b16_e32 v1, 8, v1 2663; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 8 2664; GFX8-NEXT: 
v_bfe_i32 v1, v1, 0, 8 2665; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2666; GFX8-NEXT: v_mad_i32_i24 v0, v1, v0, s2 2667; GFX8-NEXT: v_mad_i32_i24 v2, v3, v2, v0 2668; GFX8-NEXT: v_mov_b32_e32 v0, s0 2669; GFX8-NEXT: v_mov_b32_e32 v1, s1 2670; GFX8-NEXT: flat_store_dword v[0:1], v2 2671; GFX8-NEXT: s_endpgm 2672; 2673; GFX9-NODL-LABEL: notsdot2_sext8: 2674; GFX9-NODL: ; %bb.0: ; %entry 2675; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2676; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 2677; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 2678; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 2679; GFX9-NODL-NEXT: global_load_ushort v1, v0, s[4:5] 2680; GFX9-NODL-NEXT: global_load_ushort v2, v0, s[6:7] 2681; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 2682; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) 2683; GFX9-NODL-NEXT: v_bfe_i32 v3, v1, 0, 8 2684; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v1, 8, v1 2685; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 2686; GFX9-NODL-NEXT: v_bfe_i32 v4, v2, 0, 8 2687; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v2, 8, v2 2688; GFX9-NODL-NEXT: v_bfe_i32 v1, v1, 0, 8 2689; GFX9-NODL-NEXT: v_bfe_i32 v2, v2, 0, 8 2690; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 2691; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v2, v1, s2 2692; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v4, v3, v1 2693; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] 2694; GFX9-NODL-NEXT: s_endpgm 2695; 2696; GFX9-DL-LABEL: notsdot2_sext8: 2697; GFX9-DL: ; %bb.0: ; %entry 2698; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2699; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 2700; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 2701; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 2702; GFX9-DL-NEXT: global_load_ushort v1, v0, s[4:5] 2703; GFX9-DL-NEXT: global_load_ushort v2, v0, s[6:7] 2704; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 2705; GFX9-DL-NEXT: s_waitcnt vmcnt(1) 2706; GFX9-DL-NEXT: v_bfe_i32 v3, v1, 0, 8 2707; GFX9-DL-NEXT: v_lshrrev_b16_e32 v1, 8, v1 2708; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 2709; GFX9-DL-NEXT: v_bfe_i32 v4, v2, 0, 8 2710; GFX9-DL-NEXT: 
v_lshrrev_b16_e32 v2, 8, v2 2711; GFX9-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 2712; GFX9-DL-NEXT: v_bfe_i32 v2, v2, 0, 8 2713; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 2714; GFX9-DL-NEXT: v_mad_i32_i24 v1, v2, v1, s2 2715; GFX9-DL-NEXT: v_mad_i32_i24 v1, v4, v3, v1 2716; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] 2717; GFX9-DL-NEXT: s_endpgm 2718; 2719; GFX10-DL-LABEL: notsdot2_sext8: 2720; GFX10-DL: ; %bb.0: ; %entry 2721; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2722; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 2723; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 2724; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 2725; GFX10-DL-NEXT: s_clause 0x1 2726; GFX10-DL-NEXT: global_load_ushort v1, v0, s[4:5] 2727; GFX10-DL-NEXT: global_load_ushort v2, v0, s[6:7] 2728; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 2729; GFX10-DL-NEXT: s_waitcnt vmcnt(1) 2730; GFX10-DL-NEXT: v_lshrrev_b16_e64 v3, 8, v1 2731; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 2732; GFX10-DL-NEXT: v_lshrrev_b16_e64 v4, 8, v2 2733; GFX10-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 2734; GFX10-DL-NEXT: v_bfe_i32 v2, v2, 0, 8 2735; GFX10-DL-NEXT: v_bfe_i32 v3, v3, 0, 8 2736; GFX10-DL-NEXT: v_bfe_i32 v4, v4, 0, 8 2737; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 2738; GFX10-DL-NEXT: v_mad_i32_i24 v3, v4, v3, s2 2739; GFX10-DL-NEXT: v_mad_i32_i24 v1, v2, v1, v3 2740; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] 2741; GFX10-DL-NEXT: s_endpgm 2742 <2 x i8> addrspace(1)* %src2, 2743 i32 addrspace(1)* nocapture %dst) { 2744entry: 2745 %vec1 = load <2 x i8>, <2 x i8> addrspace(1)* %src1 2746 %vec2 = load <2 x i8>, <2 x i8> addrspace(1)* %src2 2747 2748 %s1.elt1 = extractelement <2 x i8> %vec1, i64 0 2749 %conv = sext i8 %s1.elt1 to i32 2750 %s2.elt1 = extractelement <2 x i8> %vec2, i64 0 2751 %conv2 = sext i8 %s2.elt1 to i32 2752 %mul1 = mul nuw i32 %conv2, %conv 2753 2754 %s1.elt2 = extractelement <2 x i8> %vec1, i64 1 2755 %conv3 = sext i8 %s1.elt2 to i32 2756 %s2.elt2 = extractelement <2 x i8> %vec2, i64 1 2757 %conv4 = sext i8 %s2.elt2 to i32 
2758 %mul2 = mul nuw i32 %conv4, %conv3 2759 2760 %s3 = load i32, i32 addrspace(1)* %dst, align 4 2761 %add = add i32 %mul2, %s3 2762 %add6 = add i32 %add, %mul1 2763 store i32 %add6, i32 addrspace(1)* %dst, align 4 2764 ret void 2765} 2766