; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX6 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s

; Checks the code generated for the llvm.sin intrinsic on half and <2 x half>
; operands. Each RUN line above compiles this file for one -mcpu value (tahiti,
; fiji, gfx900) and verifies the output with its own check prefix.
;
; The check lines are produced by the script named in the NOTE line; regenerate
; them with that script rather than editing them by hand.
;
; NOTE(review): every RUN line also lists a shared "GCN" prefix, but no check
; line in the portion of the file visible here uses it - confirm whether it is
; still wanted.

; Scalar case: load one half from %a, apply llvm.sin.f16, store the result to %r.
; What the expected output pins down per target:
;   tahiti - the value is converted to f32, multiplied by 0x3e22f983, passed
;            through v_fract_f32 and v_sin_f32, then converted back to f16.
;   fiji   - multiply by 0.15915494, v_fract_f16, v_sin_f16, all in f16.
;   gfx900 - multiply by 0.15915494, then v_sin_f16, with no fract step.
; 0x3e22f983 read as an IEEE single is about 0.15915494, i.e. roughly 1/(2*pi).
; NOTE(review): presumably the hardware sine takes its input scaled by
; 1/(2*pi) - confirm against the ISA documentation.
define amdgpu_kernel void @sin_f16(half addrspace(1)* %r, half addrspace(1)* %a) {
; GFX6-LABEL: sin_f16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_mov_b32 s10, s2
; GFX6-NEXT:    s_mov_b32 s11, s3
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_mov_b32 s8, s6
; GFX6-NEXT:    s_mov_b32 s9, s7
; GFX6-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
; GFX6-NEXT:    s_mov_b32 s0, s4
; GFX6-NEXT:    s_mov_b32 s1, s5
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT:    v_mul_f32_e32 v0, 0x3e22f983, v0
; GFX6-NEXT:    v_fract_f32_e32 v0, v0
; GFX6-NEXT:    v_sin_f32_e32 v0, v0
; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: sin_f16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_mul_f16_e32 v0, 0.15915494, v0
; GFX8-NEXT:    v_fract_f16_e32 v0, v0
; GFX8-NEXT:    v_sin_f16_e32 v2, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    flat_store_short v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: sin_f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
; GFX9-NEXT:    v_sin_f16_e32 v1, v1
; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
  %a.val = load half, half addrspace(1)* %a
  %r.val = call half @llvm.sin.f16(half %a.val)
  store half %r.val, half addrspace(1)* %r
  ret void
}

; Vector case: the same load / llvm.sin / store pattern on a <2 x half> value.
; The expected output processes the two 16-bit lanes separately on every target:
;   tahiti - the low lane is converted to f32 directly and the high lane after a
;            16-bit right shift; the two f16 results are recombined with
;            v_lshlrev_b32 by 16 and v_or_b32.
;   fiji   - the high lane uses SDWA forms (src0_sel:WORD_1 on the multiply,
;            dst_sel:WORD_1 on the sine); the halves are merged with v_or_b32.
;   gfx900 - the high lane uses an SDWA multiply; the halves are merged with
;            v_and_b32 0xffff and v_lshl_or_b32. As in the scalar case there is
;            no fract step.
; 0x3118 is the multiplier placed in a register for the SDWA multiply; read as
; an IEEE half it is about 0.1592, the f16 rounding of the scalar multiplier.
define amdgpu_kernel void @sin_v2f16(<2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) {
; GFX6-LABEL: sin_v2f16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_mov_b32 s10, s2
; GFX6-NEXT:    s_mov_b32 s11, s3
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_mov_b32 s8, s6
; GFX6-NEXT:    s_mov_b32 s9, s7
; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; GFX6-NEXT:    s_mov_b32 s0, 0x3e22f983
; GFX6-NEXT:    s_mov_b32 s1, s5
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v0
; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT:    v_mul_f32_e32 v1, s0, v1
; GFX6-NEXT:    v_fract_f32_e32 v1, v1
; GFX6-NEXT:    v_mul_f32_e32 v0, s0, v0
; GFX6-NEXT:    v_fract_f32_e32 v0, v0
; GFX6-NEXT:    v_sin_f32_e32 v0, v0
; GFX6-NEXT:    v_sin_f32_e32 v1, v1
; GFX6-NEXT:    s_mov_b32 s0, s4
; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: sin_v2f16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    flat_load_dword v0, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v1, 0x3118
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_mul_f16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT:    v_mul_f16_e32 v0, 0.15915494, v0
; GFX8-NEXT:    v_fract_f16_e32 v1, v1
; GFX8-NEXT:    v_fract_f16_e32 v0, v0
; GFX8-NEXT:    v_sin_f16_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; GFX8-NEXT:    v_sin_f16_e32 v3, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: sin_v2f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v2, 0x3118
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mul_f16_e32 v3, 0.15915494, v1
; GFX9-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT:    v_sin_f16_e32 v3, v3
; GFX9-NEXT:    v_sin_f16_e32 v1, v1
; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v3
; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %r.val = call <2 x half> @llvm.sin.v2f16(<2 x half> %a.val)
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; Declarations of the two intrinsic overloads called by the kernels above.
declare half @llvm.sin.f16(half %a)
declare <2 x half> @llvm.sin.v2f16(<2 x half> %a)