; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VIPLUS %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=VIPLUS %s

; Scalar i16 signed max written as icmp sge + select. The check below uses the
; prefix shared by both llc invocations, so fiji and gfx900 are each expected
; to emit a v_max_i16_e32 with three VGPR operands.
; FIXME: Need to handle non-uniform case for function below (load without gep).
; GCN-LABEL: {{^}}v_test_imax_sge_i16:
; VIPLUS: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @v_test_imax_sge_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind {
  ; All three buffers are indexed by the work-item id.
  %lane = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %a.addr = getelementptr i16, i16 addrspace(1)* %aptr, i32 %lane
  %b.addr = getelementptr i16, i16 addrspace(1)* %bptr, i32 %lane
  %out.addr = getelementptr i16, i16 addrspace(1)* %out, i32 %lane
  %lhs = load i16, i16 addrspace(1)* %a.addr, align 4
  %rhs = load i16, i16 addrspace(1)* %b.addr, align 4
  ; select(lhs >= rhs, lhs, rhs) with a signed compare.
  %is.ge = icmp sge i16 %lhs, %rhs
  %max = select i1 %is.ge, i16 %lhs, i16 %rhs
  store i16 %max, i16 addrspace(1)* %out.addr, align 4
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
; <2 x i16> signed max written as icmp sge + select. The fiji checks expect one
; v_max_i16_e32 followed by one SDWA v_max_i16 that reads and writes WORD_1;
; the gfx900 check expects a single packed v_pk_max_i16.
; GCN-LABEL: {{^}}v_test_imax_sge_v2i16:
; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI: v_max_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1

; GFX9: v_pk_max_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @v_test_imax_sge_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %aptr, <2 x i16> addrspace(1)* %bptr) nounwind {
  ; All three buffers are indexed by the work-item id.
  %lane = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %a.addr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %aptr, i32 %lane
  %b.addr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %bptr, i32 %lane
  %out.addr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %lane
  %lhs = load <2 x i16>, <2 x i16> addrspace(1)* %a.addr, align 4
  %rhs = load <2 x i16>, <2 x i16> addrspace(1)* %b.addr, align 4
  ; Element-wise select(lhs >= rhs, lhs, rhs) with a signed compare.
  %is.ge = icmp sge <2 x i16> %lhs, %rhs
  %max = select <2 x i1> %is.ge, <2 x i16> %lhs, <2 x i16> %rhs
  store <2 x i16> %max, <2 x i16> addrspace(1)* %out.addr, align 4
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
; <3 x i16> signed max written as icmp sge + select. The fiji checks expect
; v_max_i16_e32, then the WORD_1 SDWA form, then another v_max_i16_e32, and no
; further v_max_i16 after that; the gfx900 checks expect two v_pk_max_i16.
; GCN-LABEL: {{^}}v_test_imax_sge_v3i16:
; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI: v_max_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI-NOT: v_max_i16

; GFX9: v_pk_max_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GFX9: v_pk_max_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @v_test_imax_sge_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %aptr, <3 x i16> addrspace(1)* %bptr) nounwind {
  ; All three buffers are indexed by the work-item id.
  %lane = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %a.addr = getelementptr <3 x i16>, <3 x i16> addrspace(1)* %aptr, i32 %lane
  %b.addr = getelementptr <3 x i16>, <3 x i16> addrspace(1)* %bptr, i32 %lane
  %out.addr = getelementptr <3 x i16>, <3 x i16> addrspace(1)* %out, i32 %lane
  %lhs = load <3 x i16>, <3 x i16> addrspace(1)* %a.addr, align 4
  %rhs = load <3 x i16>, <3 x i16> addrspace(1)* %b.addr, align 4
  ; Element-wise select(lhs >= rhs, lhs, rhs) with a signed compare.
  %is.ge = icmp sge <3 x i16> %lhs, %rhs
  %max = select <3 x i1> %is.ge, <3 x i16> %lhs, <3 x i16> %rhs
  store <3 x i16> %max, <3 x i16> addrspace(1)* %out.addr, align 4
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
; <4 x i16> signed max written as icmp sge + select. The fiji checks expect the
; v_max_i16_e32 / WORD_1 SDWA pair twice in a row; the gfx900 checks expect two
; v_pk_max_i16.
; GCN-LABEL: {{^}}v_test_imax_sge_v4i16:
; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI: v_max_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI: v_max_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1

; GFX9: v_pk_max_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GFX9: v_pk_max_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @v_test_imax_sge_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %aptr, <4 x i16> addrspace(1)* %bptr) nounwind {
  ; All three buffers are indexed by the work-item id.
  %lane = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %a.addr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %aptr, i32 %lane
  %b.addr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %bptr, i32 %lane
  %out.addr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %out, i32 %lane
  %lhs = load <4 x i16>, <4 x i16> addrspace(1)* %a.addr, align 4
  %rhs = load <4 x i16>, <4 x i16> addrspace(1)* %b.addr, align 4
  ; Element-wise select(lhs >= rhs, lhs, rhs) with a signed compare.
  %is.ge = icmp sge <4 x i16> %lhs, %rhs
  %max = select <4 x i1> %is.ge, <4 x i16> %lhs, <4 x i16> %rhs
  store <4 x i16> %max, <4 x i16> addrspace(1)* %out.addr, align 4
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
; Scalar i16 signed max written with a strict compare (icmp sgt + select).
; Both llc invocations are expected to emit v_max_i16_e32; operands are not
; checked here.
; GCN-LABEL: {{^}}v_test_imax_sgt_i16:
; VIPLUS: v_max_i16_e32
define amdgpu_kernel void @v_test_imax_sgt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind {
  ; All three buffers are indexed by the work-item id.
  %lane = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %a.addr = getelementptr i16, i16 addrspace(1)* %aptr, i32 %lane
  %b.addr = getelementptr i16, i16 addrspace(1)* %bptr, i32 %lane
  %out.addr = getelementptr i16, i16 addrspace(1)* %out, i32 %lane
  %lhs = load i16, i16 addrspace(1)* %a.addr, align 4
  %rhs = load i16, i16 addrspace(1)* %b.addr, align 4
  ; select(lhs > rhs, lhs, rhs) with a signed compare.
  %is.gt = icmp sgt i16 %lhs, %rhs
  %max = select i1 %is.gt, i16 %lhs, i16 %rhs
  store i16 %max, i16 addrspace(1)* %out.addr, align 4
  ret void
}

; Scalar i16 unsigned max written as icmp uge + select. Both llc invocations
; are expected to emit v_max_u16_e32; operands are not checked here.
; FIXME: Need to handle non-uniform case for function below (load without gep).
; GCN-LABEL: {{^}}v_test_umax_uge_i16:
; VIPLUS: v_max_u16_e32
define amdgpu_kernel void @v_test_umax_uge_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind {
  ; All three buffers are indexed by the work-item id.
  %lane = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %a.addr = getelementptr i16, i16 addrspace(1)* %aptr, i32 %lane
  %b.addr = getelementptr i16, i16 addrspace(1)* %bptr, i32 %lane
  %out.addr = getelementptr i16, i16 addrspace(1)* %out, i32 %lane
  %lhs = load i16, i16 addrspace(1)* %a.addr, align 4
  %rhs = load i16, i16 addrspace(1)* %b.addr, align 4
  ; select(lhs >= rhs, lhs, rhs) with an unsigned compare.
  %is.ge = icmp uge i16 %lhs, %rhs
  %max = select i1 %is.ge, i16 %lhs, i16 %rhs
  store i16 %max, i16 addrspace(1)* %out.addr, align 4
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
; Scalar i16 unsigned max written with a strict compare (icmp ugt + select).
; Both llc invocations are expected to emit v_max_u16_e32; operands are not
; checked here.
; GCN-LABEL: {{^}}v_test_umax_ugt_i16:
; VIPLUS: v_max_u16_e32
define amdgpu_kernel void @v_test_umax_ugt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind {
  ; All three buffers are indexed by the work-item id.
  %lane = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %a.addr = getelementptr i16, i16 addrspace(1)* %aptr, i32 %lane
  %b.addr = getelementptr i16, i16 addrspace(1)* %bptr, i32 %lane
  %out.addr = getelementptr i16, i16 addrspace(1)* %out, i32 %lane
  %lhs = load i16, i16 addrspace(1)* %a.addr, align 4
  %rhs = load i16, i16 addrspace(1)* %b.addr, align 4
  ; select(lhs > rhs, lhs, rhs) with an unsigned compare.
  %is.gt = icmp ugt i16 %lhs, %rhs
  %max = select i1 %is.gt, i16 %lhs, i16 %rhs
  store i16 %max, i16 addrspace(1)* %out.addr, align 4
  ret void
}

; <2 x i16> unsigned max written as icmp ugt + select. The fiji checks expect
; v_max_u16_e32 followed by the SDWA form; the gfx900 check expects a single
; packed v_pk_max_u16.
; GCN-LABEL: {{^}}v_test_umax_ugt_v2i16:
; VI: v_max_u16_e32
; VI: v_max_u16_sdwa

; GFX9: v_pk_max_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @v_test_umax_ugt_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %aptr, <2 x i16> addrspace(1)* %bptr) nounwind {
  ; All three buffers are indexed by the work-item id.
  %lane = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %a.addr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %aptr, i32 %lane
  %b.addr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %bptr, i32 %lane
  %out.addr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %lane
  %lhs = load <2 x i16>, <2 x i16> addrspace(1)* %a.addr, align 4
  %rhs = load <2 x i16>, <2 x i16> addrspace(1)* %b.addr, align 4
  ; Element-wise select(lhs > rhs, lhs, rhs) with an unsigned compare.
  %is.gt = icmp ugt <2 x i16> %lhs, %rhs
  %max = select <2 x i1> %is.gt, <2 x i16> %lhs, <2 x i16> %rhs
  store <2 x i16> %max, <2 x i16> addrspace(1)* %out.addr, align 4
  ret void
}

; Work-item id intrinsic used by every kernel in this file.
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone