; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd -mattr=+avx512vl | FileCheck %s

declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1) nounwind readonly

declare <4 x i32> @llvm.x86.avx512.mask.lzcnt.d.128(<4 x i32>, <4 x i32>, i8)

define <4 x i32>@test_int_x86_avx512_mask_vplzcnt_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vplzcnt_d_128:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vplzcntd %xmm0, %xmm1 {%k1}
; CHECK-NEXT:    vplzcntd %xmm0, %xmm2 {%k1} {z}
; CHECK-NEXT:    vplzcntd %xmm0, %xmm0
; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x i32> @llvm.x86.avx512.mask.lzcnt.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2)
  %res1 = call <4 x i32> @llvm.x86.avx512.mask.lzcnt.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 -1)
  %res3 = call <4 x i32> @llvm.x86.avx512.mask.lzcnt.d.128(<4 x i32> %x0, <4 x i32> zeroinitializer, i8 %x2)
  %res2 = add <4 x i32> %res, %res1
  %res4 = add <4 x i32> %res2, %res3
  ret <4 x i32> %res4
}

declare <8 x i32> @llvm.x86.avx512.mask.lzcnt.d.256(<8 x i32>, <8 x i32>, i8)

define <8 x i32>@test_int_x86_avx512_mask_vplzcnt_d_256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vplzcnt_d_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vplzcntd %ymm0, %ymm1 {%k1}
; CHECK-NEXT:    vplzcntd %ymm0, %ymm0
; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    retq
  %res = call <8 x i32> @llvm.x86.avx512.mask.lzcnt.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2)
  %res1 = call <8 x i32> @llvm.x86.avx512.mask.lzcnt.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 -1)
  %res2 = add <8 x i32> %res, %res1
  ret <8 x i32> %res2
}

declare <2 x i64> @llvm.x86.avx512.mask.lzcnt.q.128(<2 x i64>, <2 x i64>, i8)

define <2 x i64>@test_int_x86_avx512_mask_vplzcnt_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vplzcnt_q_128:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vplzcntq %xmm0, %xmm1 {%k1}
; CHECK-NEXT:    vplzcntq %xmm0, %xmm0
; CHECK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %res = call <2 x i64> @llvm.x86.avx512.mask.lzcnt.q.128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2)
  %res1 = call <2 x i64> @llvm.x86.avx512.mask.lzcnt.q.128(<2 x i64> %x0, <2 x i64> %x1, i8 -1)
  %res2 = add <2 x i64> %res, %res1
  ret <2 x i64> %res2
}

declare <4 x i64> @llvm.x86.avx512.mask.lzcnt.q.256(<4 x i64>, <4 x i64>, i8)

define <4 x i64>@test_int_x86_avx512_mask_vplzcnt_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vplzcnt_q_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vplzcntq %ymm0, %ymm1 {%k1}
; CHECK-NEXT:    vplzcntq %ymm0, %ymm0
; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    retq
  %res = call <4 x i64> @llvm.x86.avx512.mask.lzcnt.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2)
  %res1 = call <4 x i64> @llvm.x86.avx512.mask.lzcnt.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 -1)
  %res2 = add <4 x i64> %res, %res1
  ret <4 x i64> %res2
}
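
; VPCONFLICTD/VPCONFLICTQ (AVX512CD) compare each element of the source with
; all less-significant elements and produce, per lane, a bit mask of the lanes
; holding a duplicate value. As with the lzcnt tests above, each test adds the
; merge-masked (%x1 as pass-through) and unmasked (mask = -1) results, and the
; 128-bit dword test also covers the zero-masked form, so one return value
; exercises every lowering.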

declare <4 x i32> @llvm.x86.avx512.mask.conflict.d.128(<4 x i32>, <4 x i32>, i8)

define <4 x i32>@test_int_x86_avx512_mask_vpconflict_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpconflict_d_128:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpconflictd %xmm0, %xmm1 {%k1}
; CHECK-NEXT:    vpconflictd %xmm0, %xmm2 {%k1} {z}
; CHECK-NEXT:    vpconflictd %xmm0, %xmm0
; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x i32> @llvm.x86.avx512.mask.conflict.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2)
  %res1 = call <4 x i32> @llvm.x86.avx512.mask.conflict.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 -1)
  %res3 = call <4 x i32> @llvm.x86.avx512.mask.conflict.d.128(<4 x i32> %x0, <4 x i32> zeroinitializer, i8 %x2)
  %res2 = add <4 x i32> %res, %res1
  %res4 = add <4 x i32> %res2, %res3
  ret <4 x i32> %res4
}

declare <8 x i32> @llvm.x86.avx512.mask.conflict.d.256(<8 x i32>, <8 x i32>, i8)

define <8 x i32>@test_int_x86_avx512_mask_vpconflict_d_256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpconflict_d_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpconflictd %ymm0, %ymm1 {%k1}
; CHECK-NEXT:    vpconflictd %ymm0, %ymm0
; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    retq
  %res = call <8 x i32> @llvm.x86.avx512.mask.conflict.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2)
  %res1 = call <8 x i32> @llvm.x86.avx512.mask.conflict.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 -1)
  %res2 = add <8 x i32> %res, %res1
  ret <8 x i32> %res2
}

declare <2 x i64> @llvm.x86.avx512.mask.conflict.q.128(<2 x i64>, <2 x i64>, i8)

define <2 x i64>@test_int_x86_avx512_mask_vpconflict_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpconflict_q_128:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpconflictq %xmm0, %xmm1 {%k1}
; CHECK-NEXT:    vpconflictq %xmm0, %xmm0
; CHECK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %res = call <2 x i64> @llvm.x86.avx512.mask.conflict.q.128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2)
  %res1 = call <2 x i64> @llvm.x86.avx512.mask.conflict.q.128(<2 x i64> %x0, <2 x i64> %x1, i8 -1)
  %res2 = add <2 x i64> %res, %res1
  ret <2 x i64> %res2
}

declare <4 x i64> @llvm.x86.avx512.mask.conflict.q.256(<4 x i64>, <4 x i64>, i8)

define <4 x i64>@test_int_x86_avx512_mask_vpconflict_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpconflict_q_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpconflictq %ymm0, %ymm1 {%k1}
; CHECK-NEXT:    vpconflictq %ymm0, %ymm0
; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    retq
  %res = call <4 x i64> @llvm.x86.avx512.mask.conflict.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2)
  %res1 = call <4 x i64> @llvm.x86.avx512.mask.conflict.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 -1)
  %res2 = add <4 x i64> %res, %res1
  ret <4 x i64> %res2
}
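
; VPBROADCASTMW2D/VPBROADCASTMB2Q (AVX512CD) broadcast the zero-extended value
; of a mask register to every dword/qword element of the destination. The
; scalar i16/i8 argument is moved into a k-register first, which is why the
; checks below expect %k0 as the source operand.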

define <8 x i32> @test_x86_vbroadcastmw_256(i16 %a0) {
  ; CHECK-LABEL: test_x86_vbroadcastmw_256:
  ; CHECK: vpbroadcastmw2d %k0, %ymm0
  %res = call <8 x i32> @llvm.x86.avx512.broadcastmw.256(i16 %a0)
  ret <8 x i32> %res
}
declare <8 x i32> @llvm.x86.avx512.broadcastmw.256(i16)

define <4 x i32> @test_x86_vbroadcastmw_128(i16 %a0) {
  ; CHECK-LABEL: test_x86_vbroadcastmw_128:
  ; CHECK: vpbroadcastmw2d %k0, %xmm0
  %res = call <4 x i32> @llvm.x86.avx512.broadcastmw.128(i16 %a0)
  ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.avx512.broadcastmw.128(i16)

define <4 x i64> @test_x86_broadcastmb_256(i8 %a0) {
  ; CHECK-LABEL: test_x86_broadcastmb_256:
  ; CHECK: vpbroadcastmb2q %k0, %ymm0
  %res = call <4 x i64> @llvm.x86.avx512.broadcastmb.256(i8 %a0)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx512.broadcastmb.256(i8)

define <2 x i64> @test_x86_broadcastmb_128(i8 %a0) {
  ; CHECK-LABEL: test_x86_broadcastmb_128:
  ; CHECK: vpbroadcastmb2q %k0, %xmm0
  %res = call <2 x i64> @llvm.x86.avx512.broadcastmb.128(i8 %a0)
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.avx512.broadcastmb.128(i8)