; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -mattr=-unaligned-scratch-access < %s | FileCheck -check-prefixes=GCN,GFX7-ALIGNED %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -mattr=+unaligned-scratch-access < %s | FileCheck -check-prefixes=GCN,GFX7-UNALIGNED %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+unaligned-scratch-access < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+unaligned-scratch-access -amdgpu-enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,GFX9-FLASTSCR %s

; Each function below reads or writes two adjacent i16 values through a
; private (addrspace(5)) pointer. The checks record, per run configuration,
; whether the pair is emitted as one dword access or as narrower accesses.
; The only thing that varies between the load (and between the store)
; functions is the alignment stated on the two accesses.

; Should not merge this to a dword load.
; Both halves are only 2-byte aligned; every run configuration expects two
; separate ushort loads, including the runs with unaligned scratch access.
define i32 @private_load_2xi16_align2(i16 addrspace(5)* %p) #0 {
; GFX7-ALIGNED-LABEL: private_load_2xi16_align2:
; GFX7-ALIGNED:       ; %bb.0:
; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v1, vcc, 2, v0
; GFX7-ALIGNED-NEXT:    buffer_load_ushort v0, v0, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT:    buffer_load_ushort v1, v1, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0)
; GFX7-ALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX7-ALIGNED-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX7-ALIGNED-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-UNALIGNED-LABEL: private_load_2xi16_align2:
; GFX7-UNALIGNED:       ; %bb.0:
; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-UNALIGNED-NEXT:    v_add_i32_e32 v1, vcc, 2, v0
; GFX7-UNALIGNED-NEXT:    buffer_load_ushort v0, v0, s[0:3], 0 offen
; GFX7-UNALIGNED-NEXT:    buffer_load_ushort v1, v1, s[0:3], 0 offen
; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
; GFX7-UNALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX7-UNALIGNED-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX7-UNALIGNED-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: private_load_2xi16_align2:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    buffer_load_ushort v1, v0, s[0:3], 0 offen
; GFX9-NEXT:    buffer_load_ushort v0, v0, s[0:3], 0 offen offset:2
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-FLASTSCR-LABEL: private_load_2xi16_align2:
; GFX9-FLASTSCR:       ; %bb.0:
; GFX9-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-FLASTSCR-NEXT:    scratch_load_ushort v1, v0, off
; GFX9-FLASTSCR-NEXT:    scratch_load_ushort v0, v0, off offset:2
; GFX9-FLASTSCR-NEXT:    s_waitcnt vmcnt(0)
; GFX9-FLASTSCR-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
; GFX9-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
  %gep.p = getelementptr i16, i16 addrspace(5)* %p, i64 1
  %p.0 = load i16, i16 addrspace(5)* %p, align 2
  %p.1 = load i16, i16 addrspace(5)* %gep.p, align 2
  %zext.0 = zext i16 %p.0 to i32
  %zext.1 = zext i16 %p.1 to i32
  %shl.1 = shl i32 %zext.1, 16
  %or = or i32 %zext.0, %shl.1
  ret i32 %or
}

; Should not merge this to a dword store.
; Both halves are only 2-byte aligned; every run configuration expects two
; separate short stores. Only %r is written; %p is unused by the body.
define void @private_store_2xi16_align2(i16 addrspace(5)* %p, i16 addrspace(5)* %r) #0 {
; GFX7-ALIGNED-LABEL: private_store_2xi16_align2:
; GFX7-ALIGNED:       ; %bb.0:
; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v3, 1
; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v0, 2
; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v2, vcc, 2, v1
; GFX7-ALIGNED-NEXT:    buffer_store_short v3, v1, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT:    buffer_store_short v0, v2, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0)
; GFX7-ALIGNED-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-UNALIGNED-LABEL: private_store_2xi16_align2:
; GFX7-UNALIGNED:       ; %bb.0:
; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v3, 1
; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v0, 2
; GFX7-UNALIGNED-NEXT:    v_add_i32_e32 v2, vcc, 2, v1
; GFX7-UNALIGNED-NEXT:    buffer_store_short v3, v1, s[0:3], 0 offen
; GFX7-UNALIGNED-NEXT:    buffer_store_short v0, v2, s[0:3], 0 offen
; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
; GFX7-UNALIGNED-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: private_store_2xi16_align2:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, 1
; GFX9-NEXT:    buffer_store_short v0, v1, s[0:3], 0 offen
; GFX9-NEXT:    v_mov_b32_e32 v0, 2
; GFX9-NEXT:    buffer_store_short v0, v1, s[0:3], 0 offen offset:2
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-FLASTSCR-LABEL: private_store_2xi16_align2:
; GFX9-FLASTSCR:       ; %bb.0:
; GFX9-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-FLASTSCR-NEXT:    v_mov_b32_e32 v0, 1
; GFX9-FLASTSCR-NEXT:    scratch_store_short v1, v0, off
; GFX9-FLASTSCR-NEXT:    v_mov_b32_e32 v0, 2
; GFX9-FLASTSCR-NEXT:    scratch_store_short v1, v0, off offset:2
; GFX9-FLASTSCR-NEXT:    s_waitcnt vmcnt(0)
; GFX9-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
  %gep.r = getelementptr i16, i16 addrspace(5)* %r, i64 1
  store i16 1, i16 addrspace(5)* %r, align 2
  store i16 2, i16 addrspace(5)* %gep.r, align 2
  ret void
}

; Should produce align 1 dword when legal.
; The runs with +unaligned-scratch-access expect a single dword load; the run
; with -unaligned-scratch-access expects the pair split into four byte loads.
define i32 @private_load_2xi16_align1(i16 addrspace(5)* %p) #0 {
; GFX7-ALIGNED-LABEL: private_load_2xi16_align1:
; GFX7-ALIGNED:       ; %bb.0:
; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v1, vcc, 2, v0
; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
; GFX7-ALIGNED-NEXT:    buffer_load_ubyte v3, v0, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v0, vcc, 3, v0
; GFX7-ALIGNED-NEXT:    buffer_load_ubyte v0, v0, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT:    buffer_load_ubyte v2, v2, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT:    buffer_load_ubyte v1, v1, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(2)
; GFX7-ALIGNED-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(1)
; GFX7-ALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0)
; GFX7-ALIGNED-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX7-ALIGNED-NEXT:    v_or_b32_e32 v2, v2, v3
; GFX7-ALIGNED-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; GFX7-ALIGNED-NEXT:    v_or_b32_e32 v0, v2, v0
; GFX7-ALIGNED-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-UNALIGNED-LABEL: private_load_2xi16_align1:
; GFX7-UNALIGNED:       ; %bb.0:
; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-UNALIGNED-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
; GFX7-UNALIGNED-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: private_load_2xi16_align1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
; GFX9-NEXT:    s_mov_b32 s4, 0xffff
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_bfi_b32 v1, v1, 0, v0
; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-FLASTSCR-LABEL: private_load_2xi16_align1:
; GFX9-FLASTSCR:       ; %bb.0:
; GFX9-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-FLASTSCR-NEXT:    scratch_load_dword v0, v0, off
; GFX9-FLASTSCR-NEXT:    v_mov_b32_e32 v1, 0xffff
; GFX9-FLASTSCR-NEXT:    s_mov_b32 s0, 0xffff
; GFX9-FLASTSCR-NEXT:    s_waitcnt vmcnt(0)
; GFX9-FLASTSCR-NEXT:    v_bfi_b32 v1, v1, 0, v0
; GFX9-FLASTSCR-NEXT:    v_and_or_b32 v0, v0, s0, v1
; GFX9-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
  %gep.p = getelementptr i16, i16 addrspace(5)* %p, i64 1
  %p.0 = load i16, i16 addrspace(5)* %p, align 1
  %p.1 = load i16, i16 addrspace(5)* %gep.p, align 1
  %zext.0 = zext i16 %p.0 to i32
  %zext.1 = zext i16 %p.1 to i32
  %shl.1 = shl i32 %zext.1, 16
  %or = or i32 %zext.0, %shl.1
  ret i32 %or
}

; Should produce align 1 dword when legal.
; The runs with +unaligned-scratch-access expect a single dword store of the
; combined constant 0x20001; the run with -unaligned-scratch-access expects
; four byte stores. Only %r is written; %p is unused by the body.
define void @private_store_2xi16_align1(i16 addrspace(5)* %p, i16 addrspace(5)* %r) #0 {
; GFX7-ALIGNED-LABEL: private_store_2xi16_align1:
; GFX7-ALIGNED:       ; %bb.0:
; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v3, 1
; GFX7-ALIGNED-NEXT:    buffer_store_byte v3, v1, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v2, vcc, 2, v1
; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v4, 0
; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v1, vcc, 3, v1
; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v0, 2
; GFX7-ALIGNED-NEXT:    buffer_store_byte v4, v3, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT:    buffer_store_byte v4, v1, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT:    buffer_store_byte v0, v2, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0)
; GFX7-ALIGNED-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-UNALIGNED-LABEL: private_store_2xi16_align1:
; GFX7-UNALIGNED:       ; %bb.0:
; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v0, 0x20001
; GFX7-UNALIGNED-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
; GFX7-UNALIGNED-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: private_store_2xi16_align1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, 0x20001
; GFX9-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-FLASTSCR-LABEL: private_store_2xi16_align1:
; GFX9-FLASTSCR:       ; %bb.0:
; GFX9-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-FLASTSCR-NEXT:    v_mov_b32_e32 v0, 0x20001
; GFX9-FLASTSCR-NEXT:    scratch_store_dword v1, v0, off
; GFX9-FLASTSCR-NEXT:    s_waitcnt vmcnt(0)
; GFX9-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
  %gep.r = getelementptr i16, i16 addrspace(5)* %r, i64 1
  store i16 1, i16 addrspace(5)* %r, align 1
  store i16 2, i16 addrspace(5)* %gep.r, align 1
  ret void
}

; Should merge this to a dword load.
; The first half is 4-byte aligned (the second is align 2), and every run
; configuration expects a single dword load.
define i32 @private_load_2xi16_align4(i16 addrspace(5)* %p) #0 {
; GFX7-ALIGNED-LABEL: private_load_2xi16_align4:
; GFX7-ALIGNED:       ; %bb.0:
; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-ALIGNED-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0)
; GFX7-ALIGNED-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-UNALIGNED-LABEL: private_load_2xi16_align4:
; GFX7-UNALIGNED:       ; %bb.0:
; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-UNALIGNED-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
; GFX7-UNALIGNED-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: private_load_2xi16_align4:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
; GFX9-NEXT:    s_mov_b32 s4, 0xffff
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_bfi_b32 v1, v1, 0, v0
; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-FLASTSCR-LABEL: private_load_2xi16_align4:
; GFX9-FLASTSCR:       ; %bb.0:
; GFX9-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-FLASTSCR-NEXT:    scratch_load_dword v0, v0, off
; GFX9-FLASTSCR-NEXT:    v_mov_b32_e32 v1, 0xffff
; GFX9-FLASTSCR-NEXT:    s_mov_b32 s0, 0xffff
; GFX9-FLASTSCR-NEXT:    s_waitcnt vmcnt(0)
; GFX9-FLASTSCR-NEXT:    v_bfi_b32 v1, v1, 0, v0
; GFX9-FLASTSCR-NEXT:    v_and_or_b32 v0, v0, s0, v1
; GFX9-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
  %gep.p = getelementptr i16, i16 addrspace(5)* %p, i64 1
  %p.0 = load i16, i16 addrspace(5)* %p, align 4
  %p.1 = load i16, i16 addrspace(5)* %gep.p, align 2
  %zext.0 = zext i16 %p.0 to i32
  %zext.1 = zext i16 %p.1 to i32
  %shl.1 = shl i32 %zext.1, 16
  %or = or i32 %zext.0, %shl.1
  ret i32 %or
}

; Should merge this to a dword store.
; The first half is 4-byte aligned (the second is align 2), and every run
; configuration expects a single dword store of the combined constant 0x20001.
; Only %r is written; %p is unused by the body.
define void @private_store_2xi16_align4(i16 addrspace(5)* %p, i16 addrspace(5)* %r) #0 {
; GFX7-ALIGNED-LABEL: private_store_2xi16_align4:
; GFX7-ALIGNED:       ; %bb.0:
; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v0, 0x20001
; GFX7-ALIGNED-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0)
; GFX7-ALIGNED-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-UNALIGNED-LABEL: private_store_2xi16_align4:
; GFX7-UNALIGNED:       ; %bb.0:
; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v0, 0x20001
; GFX7-UNALIGNED-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
; GFX7-UNALIGNED-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: private_store_2xi16_align4:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, 0x20001
; GFX9-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-FLASTSCR-LABEL: private_store_2xi16_align4:
; GFX9-FLASTSCR:       ; %bb.0:
; GFX9-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-FLASTSCR-NEXT:    v_mov_b32_e32 v0, 0x20001
; GFX9-FLASTSCR-NEXT:    scratch_store_dword v1, v0, off
; GFX9-FLASTSCR-NEXT:    s_waitcnt vmcnt(0)
; GFX9-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
  %gep.r = getelementptr i16, i16 addrspace(5)* %r, i64 1
  store i16 1, i16 addrspace(5)* %r, align 4
  store i16 2, i16 addrspace(5)* %gep.r, align 2
  ret void
}