; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw -mattr=+avx512vl --show-mc-encoding | FileCheck %s
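; Tests AVX-512 BW/VL masked compare (pcmpeq/pcmpgt/cmp/ucmp) and FMA-family intrinsics at 128-bit and 256-bit vector widths.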
; 256-bit

define i32 @test_pcmpeq_b_256(<32 x i8> %a, <32 x i8> %b) {
; CHECK-LABEL: test_pcmpeq_b_256
; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 ##
  %res = call i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8> %a, <32 x i8> %b, i32 -1)
  ret i32 %res
}

define i32 @test_mask_pcmpeq_b_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
; CHECK-LABEL: test_mask_pcmpeq_b_256
; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} ##
  %res = call i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8> %a, <32 x i8> %b, i32 %mask)
  ret i32 %res
}

declare i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8>, <32 x i8>, i32)

define i16 @test_pcmpeq_w_256(<16 x i16> %a, <16 x i16> %b) {
; CHECK-LABEL: test_pcmpeq_w_256
; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 ##
  %res = call i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16> %a, <16 x i16> %b, i16 -1)
  ret i16 %res
}

define i16 @test_mask_pcmpeq_w_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
; CHECK-LABEL: test_mask_pcmpeq_w_256
; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} ##
  %res = call i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16> %a, <16 x i16> %b, i16 %mask)
  ret i16 %res
}

declare i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16>, <16 x i16>, i16)

define i32 @test_pcmpgt_b_256(<32 x i8> %a, <32 x i8> %b) {
; CHECK-LABEL: test_pcmpgt_b_256
; CHECK: vpcmpgtb %ymm1, %ymm0, %k0 ##
  %res = call i32 @llvm.x86.avx512.mask.pcmpgt.b.256(<32 x i8> %a, <32 x i8> %b, i32 -1)
  ret i32 %res
}

define i32 @test_mask_pcmpgt_b_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
; CHECK-LABEL: test_mask_pcmpgt_b_256
; CHECK: vpcmpgtb %ymm1, %ymm0, %k0 {%k1} ##
  %res = call i32 @llvm.x86.avx512.mask.pcmpgt.b.256(<32 x i8> %a, <32 x i8> %b, i32 %mask)
  ret i32 %res
}

declare i32 @llvm.x86.avx512.mask.pcmpgt.b.256(<32 x i8>, <32 x i8>, i32)

define i16 @test_pcmpgt_w_256(<16 x i16> %a, <16 x i16> %b) {
; CHECK-LABEL: test_pcmpgt_w_256
; CHECK: vpcmpgtw %ymm1, %ymm0, %k0 ##
  %res = call i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16> %a, <16 x i16> %b, i16 -1)
  ret i16 %res
}

define i16 @test_mask_pcmpgt_w_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
; CHECK-LABEL: test_mask_pcmpgt_w_256
; CHECK: vpcmpgtw %ymm1, %ymm0, %k0 {%k1} ##
  %res = call i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16> %a, <16 x i16> %b, i16 %mask)
  ret i16 %res
}

declare i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16>, <16 x i16>, i16)

define <8 x i32> @test_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: test_cmp_b_256
; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 ##
  %res0 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 -1)
  %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
; CHECK: vpcmpltb %ymm1, %ymm0, %k0 ##
  %res1 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 -1)
  %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
; CHECK: vpcmpleb %ymm1, %ymm0, %k0 ##
  %res2 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 -1)
  %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
; CHECK: vpcmpunordb %ymm1, %ymm0, %k0 ##
  %res3 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 -1)
  %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
; CHECK: vpcmpneqb %ymm1, %ymm0, %k0 ##
  %res4 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 -1)
  %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
; CHECK: vpcmpnltb %ymm1, %ymm0, %k0 ##
  %res5 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 -1)
  %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
; CHECK: vpcmpnleb %ymm1, %ymm0, %k0 ##
  %res6 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 6, i32 -1)
  %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
; CHECK: vpcmpordb %ymm1, %ymm0, %k0 ##
  %res7 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 -1)
  %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
  ret <8 x i32> %vec7
}

define <8 x i32> @test_mask_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
; CHECK-LABEL: test_mask_cmp_b_256
; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} ##
  %res0 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 %mask)
  %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
; CHECK: vpcmpltb %ymm1, %ymm0, %k0 {%k1} ##
  %res1 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 %mask)
  %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
; CHECK: vpcmpleb %ymm1, %ymm0, %k0 {%k1} ##
  %res2 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 %mask)
  %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
; CHECK: vpcmpunordb %ymm1, %ymm0, %k0 {%k1} ##
  %res3 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 %mask)
  %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
; CHECK: vpcmpneqb %ymm1, %ymm0, %k0 {%k1} ##
  %res4 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 %mask)
  %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
; CHECK: vpcmpnltb %ymm1, %ymm0, %k0 {%k1} ##
  %res5 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 %mask)
  %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
; CHECK: vpcmpnleb %ymm1, %ymm0, %k0 {%k1} ##
  %res6 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 6, i32 %mask)
  %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
; CHECK: vpcmpordb %ymm1, %ymm0, %k0 {%k1} ##
  %res7 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 %mask)
  %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
  ret <8 x i32> %vec7
}

declare i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8>, <32 x i8>, i32, i32) nounwind readnone

define <8 x i32> @test_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: test_ucmp_b_256
; CHECK: vpcmpequb %ymm1, %ymm0, %k0 ##
  %res0 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 -1)
  %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
; CHECK: vpcmpltub %ymm1, %ymm0, %k0 ##
  %res1 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 -1)
  %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
; CHECK: vpcmpleub %ymm1, %ymm0, %k0 ##
  %res2 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 -1)
  %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
; CHECK: vpcmpunordub %ymm1, %ymm0, %k0 ##
  %res3 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 -1)
  %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
; CHECK: vpcmpnequb %ymm1, %ymm0, %k0 ##
  %res4 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 -1)
  %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
; CHECK: vpcmpnltub %ymm1, %ymm0, %k0 ##
  %res5 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 -1)
  %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
; CHECK: vpcmpnleub %ymm1, %ymm0, %k0 ##
  %res6 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 6, i32 -1)
  %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
; CHECK: vpcmpordub %ymm1, %ymm0, %k0 ##
  %res7 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 -1)
  %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
  ret <8 x i32> %vec7
}

define <8 x i32> @test_mask_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
; CHECK-LABEL: test_mask_ucmp_b_256
; CHECK: vpcmpequb %ymm1, %ymm0, %k0 {%k1} ##
  %res0 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 %mask)
  %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
; CHECK: vpcmpltub %ymm1, %ymm0, %k0 {%k1} ##
  %res1 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 %mask)
  %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
; CHECK: vpcmpleub %ymm1, %ymm0, %k0 {%k1} ##
  %res2 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 %mask)
  %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
; CHECK: vpcmpunordub %ymm1, %ymm0, %k0 {%k1} ##
  %res3 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 %mask)
  %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
; CHECK: vpcmpnequb %ymm1, %ymm0, %k0 {%k1} ##
  %res4 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 %mask)
  %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
; CHECK: vpcmpnltub %ymm1, %ymm0, %k0 {%k1} ##
  %res5 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 %mask)
  %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
; CHECK: vpcmpnleub %ymm1, %ymm0, %k0 {%k1} ##
  %res6 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 6, i32 %mask)
  %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
; CHECK: vpcmpordub %ymm1, %ymm0, %k0 {%k1} ##
  %res7 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 %mask)
  %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
  ret <8 x i32> %vec7
}

declare i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8>, <32 x i8>, i32, i32) nounwind readnone

define <8 x i16> @test_cmp_w_256(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: test_cmp_w_256
; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 ##
  %res0 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 -1)
  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
; CHECK: vpcmpltw %ymm1, %ymm0, %k0 ##
  %res1 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 -1)
  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
; CHECK: vpcmplew %ymm1, %ymm0, %k0 ##
  %res2 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 -1)
  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
; CHECK: vpcmpunordw %ymm1, %ymm0, %k0 ##
  %res3 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 -1)
  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
; CHECK: vpcmpneqw %ymm1, %ymm0, %k0 ##
  %res4 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 -1)
  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
; CHECK: vpcmpnltw %ymm1, %ymm0, %k0 ##
  %res5 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 -1)
  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
; CHECK: vpcmpnlew %ymm1, %ymm0, %k0 ##
  %res6 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 -1)
  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
; CHECK: vpcmpordw %ymm1, %ymm0, %k0 ##
  %res7 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 -1)
  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
  ret <8 x i16> %vec7
}

define <8 x i16> @test_mask_cmp_w_256(<16 x i16> %a0, <16 x i16> %a1, i16 %mask) {
; CHECK-LABEL: test_mask_cmp_w_256
; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} ##
  %res0 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 %mask)
  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
; CHECK: vpcmpltw %ymm1, %ymm0, %k0 {%k1} ##
  %res1 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 %mask)
  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
; CHECK: vpcmplew %ymm1, %ymm0, %k0 {%k1} ##
  %res2 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 %mask)
  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
; CHECK: vpcmpunordw %ymm1, %ymm0, %k0 {%k1} ##
  %res3 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 %mask)
  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
; CHECK: vpcmpneqw %ymm1, %ymm0, %k0 {%k1} ##
  %res4 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 %mask)
  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
; CHECK: vpcmpnltw %ymm1, %ymm0, %k0 {%k1} ##
  %res5 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 %mask)
  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
; CHECK: vpcmpnlew %ymm1, %ymm0, %k0 {%k1} ##
  %res6 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 %mask)
  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
; CHECK: vpcmpordw %ymm1, %ymm0, %k0 {%k1} ##
  %res7 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 %mask)
  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
  ret <8 x i16> %vec7
}

declare i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16>, <16 x i16>, i32, i16) nounwind readnone

define <8 x i16> @test_ucmp_w_256(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: test_ucmp_w_256
; CHECK: vpcmpequw %ymm1, %ymm0, %k0 ##
  %res0 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 -1)
  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
; CHECK: vpcmpltuw %ymm1, %ymm0, %k0 ##
  %res1 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 -1)
  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
; CHECK: vpcmpleuw %ymm1, %ymm0, %k0 ##
  %res2 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 -1)
  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
; CHECK: vpcmpunorduw %ymm1, %ymm0, %k0 ##
  %res3 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 -1)
  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
; CHECK: vpcmpnequw %ymm1, %ymm0, %k0 ##
  %res4 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 -1)
  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
; CHECK: vpcmpnltuw %ymm1, %ymm0, %k0 ##
  %res5 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 -1)
  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
; CHECK: vpcmpnleuw %ymm1, %ymm0, %k0 ##
  %res6 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 -1)
  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
; CHECK: vpcmporduw %ymm1, %ymm0, %k0 ##
  %res7 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 -1)
  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
  ret <8 x i16> %vec7
}

define <8 x i16> @test_mask_ucmp_w_256(<16 x i16> %a0, <16 x i16> %a1, i16 %mask) {
; CHECK-LABEL: test_mask_ucmp_w_256
; CHECK: vpcmpequw %ymm1, %ymm0, %k0 {%k1} ##
  %res0 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 %mask)
  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
; CHECK: vpcmpltuw %ymm1, %ymm0, %k0 {%k1} ##
  %res1 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 %mask)
  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
; CHECK: vpcmpleuw %ymm1, %ymm0, %k0 {%k1} ##
  %res2 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 %mask)
  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
; CHECK: vpcmpunorduw %ymm1, %ymm0, %k0 {%k1} ##
  %res3 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 %mask)
  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
; CHECK: vpcmpnequw %ymm1, %ymm0, %k0 {%k1} ##
  %res4 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 %mask)
  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
; CHECK: vpcmpnltuw %ymm1, %ymm0, %k0 {%k1} ##
  %res5 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 %mask)
  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
; CHECK: vpcmpnleuw %ymm1, %ymm0, %k0 {%k1} ##
  %res6 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 %mask)
  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
; CHECK: vpcmporduw %ymm1, %ymm0, %k0 {%k1} ##
  %res7 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 %mask)
  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
  ret <8 x i16> %vec7
}

declare i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16>, <16 x i16>, i32, i16) nounwind readnone

; 128-bit

define i16 @test_pcmpeq_b_128(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: test_pcmpeq_b_128
; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 ##
  %res = call i16 @llvm.x86.avx512.mask.pcmpeq.b.128(<16 x i8> %a, <16 x i8> %b, i16 -1)
  ret i16 %res
}

define i16 @test_mask_pcmpeq_b_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
; CHECK-LABEL: test_mask_pcmpeq_b_128
; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} ##
  %res = call i16 @llvm.x86.avx512.mask.pcmpeq.b.128(<16 x i8> %a, <16 x i8> %b, i16 %mask)
  ret i16 %res
}

declare i16 @llvm.x86.avx512.mask.pcmpeq.b.128(<16 x i8>, <16 x i8>, i16)

define i8 @test_pcmpeq_w_128(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_pcmpeq_w_128
; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpeq.w.128(<8 x i16> %a, <8 x i16> %b, i8 -1)
  ret i8 %res
}

define i8 @test_mask_pcmpeq_w_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
; CHECK-LABEL: test_mask_pcmpeq_w_128
; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpeq.w.128(<8 x i16> %a, <8 x i16> %b, i8 %mask)
  ret i8 %res
}

declare i8 @llvm.x86.avx512.mask.pcmpeq.w.128(<8 x i16>, <8 x i16>, i8)

define i16 @test_pcmpgt_b_128(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: test_pcmpgt_b_128
; CHECK: vpcmpgtb %xmm1, %xmm0, %k0 ##
  %res = call i16 @llvm.x86.avx512.mask.pcmpgt.b.128(<16 x i8> %a, <16 x i8> %b, i16 -1)
  ret i16 %res
}

define i16 @test_mask_pcmpgt_b_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
; CHECK-LABEL: test_mask_pcmpgt_b_128
; CHECK: vpcmpgtb %xmm1, %xmm0, %k0 {%k1} ##
  %res = call i16 @llvm.x86.avx512.mask.pcmpgt.b.128(<16 x i8> %a, <16 x i8> %b, i16 %mask)
  ret i16 %res
}

declare i16 @llvm.x86.avx512.mask.pcmpgt.b.128(<16 x i8>, <16 x i8>, i16)

define i8 @test_pcmpgt_w_128(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_pcmpgt_w_128
; CHECK: vpcmpgtw %xmm1, %xmm0, %k0 ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpgt.w.128(<8 x i16> %a, <8 x i16> %b, i8 -1)
  ret i8 %res
}

define i8 @test_mask_pcmpgt_w_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
; CHECK-LABEL: test_mask_pcmpgt_w_128
; CHECK: vpcmpgtw %xmm1, %xmm0, %k0 {%k1} ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpgt.w.128(<8 x i16> %a, <8 x i16> %b, i8 %mask)
  ret i8 %res
}

declare i8 @llvm.x86.avx512.mask.pcmpgt.w.128(<8 x i16>, <8 x i16>, i8)

define <8 x i16> @test_cmp_b_128(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: test_cmp_b_128
; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 ##
  %res0 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 -1)
  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
; CHECK: vpcmpltb %xmm1, %xmm0, %k0 ##
  %res1 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 -1)
  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
; CHECK: vpcmpleb %xmm1, %xmm0, %k0 ##
  %res2 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 -1)
  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
; CHECK: vpcmpunordb %xmm1, %xmm0, %k0 ##
  %res3 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 -1)
  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
; CHECK: vpcmpneqb %xmm1, %xmm0, %k0 ##
  %res4 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 -1)
  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
; CHECK: vpcmpnltb %xmm1, %xmm0, %k0 ##
  %res5 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 -1)
  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
; CHECK: vpcmpnleb %xmm1, %xmm0, %k0 ##
  %res6 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 -1)
  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
; CHECK: vpcmpordb %xmm1, %xmm0, %k0 ##
  %res7 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 -1)
  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
  ret <8 x i16> %vec7
}

define <8 x i16> @test_mask_cmp_b_128(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
; CHECK-LABEL: test_mask_cmp_b_128
; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} ##
  %res0 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 %mask)
  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
; CHECK: vpcmpltb %xmm1, %xmm0, %k0 {%k1} ##
  %res1 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 %mask)
  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
; CHECK: vpcmpleb %xmm1, %xmm0, %k0 {%k1} ##
  %res2 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 %mask)
  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
; CHECK: vpcmpunordb %xmm1, %xmm0, %k0 {%k1} ##
  %res3 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 %mask)
  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
; CHECK: vpcmpneqb %xmm1, %xmm0, %k0 {%k1} ##
  %res4 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 %mask)
  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
; CHECK: vpcmpnltb %xmm1, %xmm0, %k0 {%k1} ##
  %res5 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 %mask)
  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
; CHECK: vpcmpnleb %xmm1, %xmm0, %k0 {%k1} ##
  %res6 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 %mask)
  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
; CHECK: vpcmpordb %xmm1, %xmm0, %k0 {%k1} ##
  %res7 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 %mask)
  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
  ret <8 x i16> %vec7
}

declare i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8>, <16 x i8>, i32, i16) nounwind readnone

define <8 x i16> @test_ucmp_b_128(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: test_ucmp_b_128
; CHECK: vpcmpequb %xmm1, %xmm0, %k0 ##
  %res0 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 -1)
  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
; CHECK: vpcmpltub %xmm1, %xmm0, %k0 ##
  %res1 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 -1)
  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
; CHECK: vpcmpleub %xmm1, %xmm0, %k0 ##
  %res2 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 -1)
  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
; CHECK: vpcmpunordub %xmm1, %xmm0, %k0 ##
  %res3 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 -1)
  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
; CHECK: vpcmpnequb %xmm1, %xmm0, %k0 ##
  %res4 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 -1)
  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
; CHECK: vpcmpnltub %xmm1, %xmm0, %k0 ##
  %res5 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 -1)
  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
; CHECK: vpcmpnleub %xmm1, %xmm0, %k0 ##
  %res6 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 -1)
  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
; CHECK: vpcmpordub %xmm1, %xmm0, %k0 ##
  %res7 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 -1)
  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
  ret <8 x i16> %vec7
}

define <8 x i16> @test_mask_ucmp_b_128(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
; CHECK-LABEL: test_mask_ucmp_b_128
; CHECK: vpcmpequb %xmm1, %xmm0, %k0 {%k1} ##
  %res0 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 %mask)
  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
; CHECK: vpcmpltub %xmm1, %xmm0, %k0 {%k1} ##
  %res1 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 %mask)
  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
; CHECK: vpcmpleub %xmm1, %xmm0, %k0 {%k1} ##
  %res2 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 %mask)
  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
; CHECK: vpcmpunordub %xmm1, %xmm0, %k0 {%k1} ##
  %res3 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 %mask)
  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
; CHECK: vpcmpnequb %xmm1, %xmm0, %k0 {%k1} ##
  %res4 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 %mask)
  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
; CHECK: vpcmpnltub %xmm1, %xmm0, %k0 {%k1} ##
  %res5 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 %mask)
  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
; CHECK: vpcmpnleub %xmm1, %xmm0, %k0 {%k1} ##
  %res6 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 %mask)
  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
; CHECK: vpcmpordub %xmm1, %xmm0, %k0 {%k1} ##
  %res7 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 %mask)
  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
  ret <8 x i16> %vec7
}

declare i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8>, <16 x i8>, i32, i16) nounwind readnone

define <8 x i8> @test_cmp_w_128(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: test_cmp_w_128
; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 ##
  %res0 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 -1)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltw %xmm1, %xmm0, %k0 ##
  %res1 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 -1)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmplew %xmm1, %xmm0, %k0 ##
  %res2 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 -1)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunordw %xmm1, %xmm0, %k0 ##
  %res3 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 -1)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpneqw %xmm1, %xmm0, %k0 ##
  %res4 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 -1)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltw %xmm1, %xmm0, %k0 ##
  %res5 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 -1)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnlew %xmm1, %xmm0, %k0 ##
  %res6 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 -1)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmpordw %xmm1, %xmm0, %k0 ##
  %res7 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 -1)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

define <8 x i8> @test_mask_cmp_w_128(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) {
; CHECK-LABEL: test_mask_cmp_w_128
; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} ##
  %res0 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 %mask)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltw %xmm1, %xmm0, %k0 {%k1} ##
  %res1 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 %mask)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmplew %xmm1, %xmm0, %k0 {%k1} ##
  %res2 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 %mask)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunordw %xmm1, %xmm0, %k0 {%k1} ##
  %res3 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 %mask)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpneqw %xmm1, %xmm0, %k0 {%k1} ##
  %res4 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 %mask)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltw %xmm1, %xmm0, %k0 {%k1} ##
  %res5 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 %mask)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnlew %xmm1, %xmm0, %k0 {%k1} ##
  %res6 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 %mask)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmpordw %xmm1, %xmm0, %k0 {%k1} ##
  %res7 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 %mask)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

declare i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16>, <8 x i16>, i32, i8) nounwind readnone

define <8 x i8> @test_ucmp_w_128(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: test_ucmp_w_128
; CHECK: vpcmpequw %xmm1, %xmm0, %k0 ##
  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 -1)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltuw %xmm1, %xmm0, %k0 ##
  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 -1)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmpleuw %xmm1, %xmm0, %k0 ##
  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 -1)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunorduw %xmm1, %xmm0, %k0 ##
  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 -1)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpnequw %xmm1, %xmm0, %k0 ##
  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 -1)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltuw %xmm1, %xmm0, %k0 ##
  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 -1)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnleuw %xmm1, %xmm0, %k0 ##
  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 -1)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmporduw %xmm1, %xmm0, %k0 ##
  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 -1)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

define <8 x i8> @test_mask_ucmp_w_128(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) {
; CHECK-LABEL: test_mask_ucmp_w_128
; CHECK: vpcmpequw %xmm1, %xmm0, %k0 {%k1} ##
  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 %mask)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltuw %xmm1, %xmm0, %k0 {%k1} ##
  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 %mask)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmpleuw %xmm1, %xmm0, %k0 {%k1} ##
  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 %mask)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunorduw %xmm1, %xmm0, %k0 {%k1} ##
  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 %mask)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpnequw %xmm1, %xmm0, %k0 {%k1} ##
  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 %mask)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltuw %xmm1, %xmm0, %k0 {%k1} ##
  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 %mask)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnleuw %xmm1, %xmm0, %k0 {%k1} ##
  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 %mask)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmporduw %xmm1, %xmm0, %k0 {%k1} ##
  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 %mask)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

declare i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16>, <8 x i16>, i32, i8) nounwind readnone

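; FMA (fused multiply-add) intrinsics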
declare <8 x float> @llvm.x86.avx512.mask.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone

define <8 x float> @test_mask_vfmadd256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_vfmadd256_ps
; CHECK: vfmadd213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xa8,0xc2]
  %res = call <8 x float> @llvm.x86.avx512.mask.vfmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind
  ret <8 x float> %res
}

declare <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone

define <4 x float> @test_mask_vfmadd128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_vfmadd128_ps
; CHECK: vfmadd213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0xc2]
  %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
  ret <4 x float> %res
}

declare <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)

define <4 x double> @test_mask_fmadd256_pd(<4 x double> %a, <4 x double> %b, <4 x double> %c, i8 %mask) {
; CHECK-LABEL: test_mask_fmadd256_pd:
; CHECK: vfmadd213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa8,0xc2]
  %res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c, i8 %mask)
  ret <4 x double> %res
}

declare <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)

define <2 x double> @test_mask_fmadd128_pd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
; CHECK-LABEL: test_mask_fmadd128_pd:
; CHECK: vfmadd213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa8,0xc2]
  %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask)
  ret <2 x double> %res
}

define <2 x double>@test_int_x86_avx512_mask_vfmadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_pd_128:
; CHECK: ## BB#0:
; CHECK-NEXT: movzbl %dil, %eax
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vmovaps %zmm0, %zmm3
; CHECK-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm3 {%k1}
; CHECK-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0
; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
; CHECK-NEXT: retq
  %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
  %res1 = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
  %res2 = fadd <2 x double> %res, %res1
  ret <2 x double> %res2
}

declare <2 x double> @llvm.x86.avx512.mask3.vfmadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)

define <2 x double>@test_int_x86_avx512_mask3_vfmadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_pd_128:
; CHECK: ## BB#0:
; CHECK-NEXT: movzbl %dil, %eax
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vmovaps %zmm2, %zmm3
; CHECK-NEXT: vfmadd231pd %xmm1, %xmm0, %xmm3 {%k1}
; CHECK-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0
; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
; CHECK-NEXT: retq
  %res = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
  %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
  %res2 = fadd <2 x double> %res, %res1
  ret <2 x double> %res2
}

declare <2 x double> @llvm.x86.avx512.maskz.vfmadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)

define <2 x double>@test_int_x86_avx512_maskz_vfmadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_pd_128:
; CHECK: ## BB#0:
; CHECK-NEXT: movzbl %dil, %eax
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vmovaps %zmm0, %zmm3
; CHECK-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm3 {%k1} {z}
; CHECK-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0
; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
; CHECK-NEXT: retq
  %res = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
  %res1 = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
  %res2 = fadd <2 x double> %res, %res1
  ret <2 x double> %res2
}

define <4 x double>@test_int_x86_avx512_mask_vfmadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_pd_256:
; CHECK: ## BB#0:
; CHECK-NEXT: movzbl %dil, %eax
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vmovaps %zmm0, %zmm3
; CHECK-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm3 {%k1}
; CHECK-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0
; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0
; CHECK-NEXT: retq
  %res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
  %res1 = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
  %res2 = fadd <4 x double> %res, %res1
  ret <4 x double> %res2
}

declare <4 x double> @llvm.x86.avx512.mask3.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)

define <4 x double>@test_int_x86_avx512_mask3_vfmadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_pd_256:
; CHECK: ## BB#0:
; CHECK-NEXT: movzbl %dil, %eax
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vmovaps %zmm2, %zmm3
; CHECK-NEXT: vfmadd231pd %ymm1, %ymm0, %ymm3 {%k1}
; CHECK-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0
; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0
; CHECK-NEXT: retq
  %res = call <4 x double> @llvm.x86.avx512.mask3.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
  %res1 = call <4 x double> @llvm.x86.avx512.mask3.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
  %res2 = fadd <4 x double> %res, %res1
  ret <4 x double> %res2
}

declare <4 x double> @llvm.x86.avx512.maskz.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)

define <4 x double>@test_int_x86_avx512_maskz_vfmadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_pd_256:
; CHECK: ## BB#0:
; CHECK-NEXT: movzbl %dil, %eax
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vmovaps %zmm0, %zmm3
; CHECK-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm3 {%k1} {z}
; CHECK-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0
; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0
; CHECK-NEXT: retq
  %res = call <4 x double> @llvm.x86.avx512.maskz.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
  %res1 = call <4 x double> @llvm.x86.avx512.maskz.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
  %res2 = fadd <4 x double> %res, %res1
  ret <4 x double> %res2
}

define <4 x float>@test_int_x86_avx512_mask_vfmadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_ps_128:
; CHECK: ## BB#0:
; CHECK-NEXT: movzbl %dil, %eax
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vmovaps %zmm0, %zmm3
; CHECK-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm3 {%k1}
; CHECK-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0
; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
; CHECK-NEXT: retq
  %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
  %res1 = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
  %res2 = fadd <4 x float> %res, %res1
  ret <4 x float> %res2
}

declare <4 x float> @llvm.x86.avx512.mask3.vfmadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)

define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ps_128:
; CHECK: ## BB#0:
; CHECK-NEXT: movzbl %dil, %eax
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vmovaps %zmm2, %zmm3
; CHECK-NEXT: vfmadd231ps %xmm1, %xmm0, %xmm3 {%k1}
; CHECK-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0
; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
; CHECK-NEXT: retq
  %res = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
  %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
  %res2 = fadd <4 x float> %res, %res1
  ret <4 x float> %res2
}

declare <4 x float> @llvm.x86.avx512.maskz.vfmadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)

define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ps_128:
; CHECK: ## BB#0:
; CHECK-NEXT: movzbl %dil, %eax
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vmovaps %zmm0, %zmm3
; CHECK-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm3 {%k1} {z}
; CHECK-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0
; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
; CHECK-NEXT: retq
  %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
  %res1 = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
  %res2 = fadd <4 x float> %res, %res1
  ret <4 x float> %res2
}

define <8 x float>@test_int_x86_avx512_mask_vfmadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_ps_256:
; CHECK: ## BB#0:
; CHECK-NEXT: movzbl %dil, %eax
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vmovaps %zmm0, %zmm3
; CHECK-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm3 {%k1}
; CHECK-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0
; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0
; CHECK-NEXT: retq
  %res = call <8 x float> @llvm.x86.avx512.mask.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
  %res1 = call <8 x float> @llvm.x86.avx512.mask.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
  %res2 = fadd <8 x float> %res, %res1
  ret <8 x float> %res2
}

declare <8 x float> @llvm.x86.avx512.mask3.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)

define <8 x float>@test_int_x86_avx512_mask3_vfmadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ps_256:
; CHECK: ## BB#0:
; CHECK-NEXT: movzbl %dil, %eax
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vmovaps %zmm2, %zmm3
; CHECK-NEXT: vfmadd231ps %ymm1, %ymm0, %ymm3 {%k1}
; CHECK-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0
; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0
; CHECK-NEXT: retq
  %res = call <8 x float> @llvm.x86.avx512.mask3.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
  %res1 = call <8 x float> @llvm.x86.avx512.mask3.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
  %res2 = fadd <8 x float> %res, %res1
  ret <8 x float> %res2
}

declare <8 x float> @llvm.x86.avx512.maskz.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)

define <8 x float>@test_int_x86_avx512_maskz_vfmadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ps_256:
; CHECK: ## BB#0:
; CHECK-NEXT: movzbl %dil, %eax
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vmovaps %zmm0, %zmm3
; CHECK-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm3 {%k1} {z}
; CHECK-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0
; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0
; CHECK-NEXT: retq
  %res = call <8 x float> @llvm.x86.avx512.maskz.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
  %res1 = call <8 x float> @llvm.x86.avx512.maskz.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
  %res2 = fadd <8 x float> %res, %res1
  ret <8 x float> %res2
}


declare <2 x double> @llvm.x86.avx512.mask3.vfmsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)

define <2 x double>@test_int_x86_avx512_mask3_vfmsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_pd_128:
; CHECK: ## BB#0:
; CHECK-NEXT: movzbl %dil, %eax
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vmovaps %zmm2, %zmm3
; CHECK-NEXT: vfmsub231pd %xmm1, %xmm0, %xmm3 {%k1}
; CHECK-NEXT: vfmsub213pd %xmm2, %xmm1, %xmm0
; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
; CHECK-NEXT: retq
  %res = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
  %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
  %res2 = fadd <2 x double> %res, %res1
  ret <2 x double> %res2
}


declare <4 x double> @llvm.x86.avx512.mask3.vfmsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)

define <4 x double>@test_int_x86_avx512_mask3_vfmsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_pd_256:
; CHECK: ## BB#0:
; CHECK-NEXT: movzbl %dil, %eax
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vmovaps %zmm2, %zmm3
; CHECK-NEXT: vfmsub231pd %ymm1, %ymm0, %ymm3 {%k1}
; CHECK-NEXT: vfmsub213pd %ymm2, %ymm1, %ymm0
; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0
; CHECK-NEXT: retq
  %res = call <4 x double> @llvm.x86.avx512.mask3.vfmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
  %res1 = call <4 x double> @llvm.x86.avx512.mask3.vfmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
  %res2 = fadd <4 x double> %res, %res1
  ret <4 x double> %res2
}

declare <4 x float> @llvm.x86.avx512.mask3.vfmsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)

define <4 x float>@test_int_x86_avx512_mask3_vfmsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_ps_128:
; CHECK: ## BB#0:
; CHECK-NEXT: movzbl %dil, %eax
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vmovaps %zmm2, %zmm3
; CHECK-NEXT: vfmsub231ps %xmm1, %xmm0, %xmm3 {%k1}
; CHECK-NEXT: vfmsub213ps %xmm2, %xmm1, %xmm0
; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
; CHECK-NEXT: retq
  %res = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
  %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
  %res2 = fadd <4 x float> %res, %res1
  ret <4 x float> %res2
}

declare <8 x float> @llvm.x86.avx512.mask3.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)

define <8 x float>@test_int_x86_avx512_mask3_vfmsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_ps_256:
; CHECK: ## BB#0:
; CHECK-NEXT: movzbl %dil, %eax
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vmovaps %zmm2, %zmm3
; CHECK-NEXT: vfmsub231ps %ymm1, %ymm0, %ymm3 {%k1}
; CHECK-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0
; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0
; CHECK-NEXT: retq
  %res = call <8 x float> @llvm.x86.avx512.mask3.vfmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
  %res1 = call <8 x float> @llvm.x86.avx512.mask3.vfmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
  %res2 = fadd <8 x float> %res, %res1
  ret <8 x float> %res2
}

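; FNMADD/FNMSUB (negated fused multiply-add/subtract) intrinsics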
declare <8 x float> @llvm.x86.avx512.mask.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone

define <8 x float> @test_mask_vfnmadd256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_vfnmadd256_ps
; CHECK: vfnmadd213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xac,0xc2]
  %res = call <8 x float> @llvm.x86.avx512.mask.vfnmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind
  ret <8 x float> %res
}

declare <4 x float> @llvm.x86.avx512.mask.vfnmadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone

define <4 x float> @test_mask_vfnmadd128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_vfnmadd128_ps
; CHECK: vfnmadd213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xac,0xc2]
  %res = call <4 x float> @llvm.x86.avx512.mask.vfnmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
  ret <4 x float> %res
}

declare <4 x double> @llvm.x86.avx512.mask.vfnmadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone

define <4 x double> @test_mask_vfnmadd256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_vfnmadd256_pd
; CHECK: vfnmadd213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xac,0xc2]
  %res = call <4 x double> @llvm.x86.avx512.mask.vfnmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
  ret <4 x double> %res
}

declare <2 x double> @llvm.x86.avx512.mask.vfnmadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone

define <2 x double> @test_mask_vfnmadd128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_vfnmadd128_pd
; CHECK: vfnmadd213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xac,0xc2]
  %res = call <2 x double> @llvm.x86.avx512.mask.vfnmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
  ret <2 x double> %res
}

declare <8 x float> @llvm.x86.avx512.mask.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone

define <8 x float> @test_mask_vfnmsub256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_vfnmsub256_ps
; CHECK: vfnmsub213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xae,0xc2]
  %res = call <8 x float> @llvm.x86.avx512.mask.vfnmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind
  ret <8 x float> %res
}

declare <4 x float> @llvm.x86.avx512.mask.vfnmsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone

define <4 x float> @test_mask_vfnmsub128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_vfnmsub128_ps
; CHECK: vfnmsub213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xae,0xc2]
  %res = call <4 x float> @llvm.x86.avx512.mask.vfnmsub.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
  ret <4 x float> %res
}

declare <4 x double> @llvm.x86.avx512.mask.vfnmsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone

define <4 x double> @test_mask_vfnmsub256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_vfnmsub256_pd
; CHECK: vfnmsub213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xae,0xc2]
  %res = call <4 x double> @llvm.x86.avx512.mask.vfnmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
  ret <4 x double> %res
}

declare <2 x double> @llvm.x86.avx512.mask.vfnmsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone

define <2 x double> @test_mask_vfnmsub128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_vfnmsub128_pd
; CHECK: vfnmsub213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xae,0xc2]
  %res = call <2 x double> @llvm.x86.avx512.mask.vfnmsub.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
  ret <2 x double> %res
}


define <2 x double>@test_int_x86_avx512_mask_vfnmsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_pd_128:
; CHECK: ## BB#0:
; CHECK-NEXT: movzbl %dil, %eax
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vmovaps %zmm0, %zmm3
; CHECK-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm3 {%k1}
; CHECK-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm0
; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
; CHECK-NEXT: retq
  %res = call <2 x double> @llvm.x86.avx512.mask.vfnmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
  %res1 = call <2 x double> @llvm.x86.avx512.mask.vfnmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
  %res2 = fadd <2 x double> %res, %res1
  ret <2 x double> %res2
}

declare <2 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)

define <2 x double>@test_int_x86_avx512_mask3_vfnmsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_pd_128:
; CHECK: ## BB#0:
; CHECK-NEXT: movzbl %dil, %eax
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vmovaps %zmm2, %zmm3
; CHECK-NEXT: vfnmsub231pd %xmm1, %xmm0, %xmm3 {%k1}
; CHECK-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm0
; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
; CHECK-NEXT: retq
  %res = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
  %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
  %res2 = fadd <2 x double> %res, %res1
  ret <2 x double> %res2
}

define <4 x double>@test_int_x86_avx512_mask_vfnmsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_pd_256:
; CHECK: ## BB#0:
; CHECK-NEXT: movzbl %dil, %eax
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vmovaps %zmm0, %zmm3
; CHECK-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm3 {%k1}
; CHECK-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0
; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0
; CHECK-NEXT: retq
  %res = call <4 x double> @llvm.x86.avx512.mask.vfnmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
  %res1 = call <4 x double> @llvm.x86.avx512.mask.vfnmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
  %res2 = fadd <4 x double> %res, %res1
  ret <4 x double> %res2
}

declare <4 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)

define <4 x double>@test_int_x86_avx512_mask3_vfnmsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_pd_256:
; CHECK: ## BB#0:
; CHECK-NEXT: movzbl %dil, %eax
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vmovaps %zmm2, %zmm3
; CHECK-NEXT: vfnmsub231pd %ymm1, %ymm0, %ymm3 {%k1}
; CHECK-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0
; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0
; CHECK-NEXT: retq
  %res = call <4 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
  %res1 = call <4 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
  %res2 = fadd <4 x double> %res, %res1
  ret <4 x double> %res2
}

define <4 x float>@test_int_x86_avx512_mask_vfnmsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_ps_128:
; CHECK: ## BB#0:
; CHECK-NEXT: movzbl %dil, %eax
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vmovaps %zmm0, %zmm3
; CHECK-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm3 {%k1}
; CHECK-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0
; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
; CHECK-NEXT: retq
  %res = call <4 x float> @llvm.x86.avx512.mask.vfnmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
  %res1 = call <4 x float> @llvm.x86.avx512.mask.vfnmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
  %res2 = fadd <4 x float> %res, %res1
  ret <4 x float> %res2
}

declare <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)

define <4 x float>@test_int_x86_avx512_mask3_vfnmsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_ps_128:
; CHECK: ## BB#0:
; CHECK-NEXT: movzbl %dil, %eax
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vmovaps %zmm2, %zmm3
; CHECK-NEXT: vfnmsub231ps %xmm1, %xmm0, %xmm3 {%k1}
; CHECK-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0
; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
; CHECK-NEXT: retq
  %res = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
  %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
  %res2 = fadd <4 x float> %res, %res1
  ret <4 x float> %res2
}

define <8 x float>@test_int_x86_avx512_mask_vfnmsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_ps_256:
; CHECK: ## BB#0:
; CHECK-NEXT: movzbl %dil, %eax
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vmovaps %zmm0, %zmm3
; CHECK-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm3 {%k1}
; CHECK-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm0
; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0
; CHECK-NEXT: retq
  %res = call <8 x float> @llvm.x86.avx512.mask.vfnmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
  %res1 = call <8 x float> @llvm.x86.avx512.mask.vfnmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
  %res2 = fadd <8 x float> %res, %res1
  ret <8 x float> %res2
}

declare <8 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)

define <8 x float>@test_int_x86_avx512_mask3_vfnmsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_ps_256:
; CHECK: ## BB#0:
; CHECK-NEXT: movzbl %dil, %eax
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vmovaps %zmm2, %zmm3
; CHECK-NEXT: vfnmsub231ps %ymm1, %ymm0, %ymm3 {%k1}
; CHECK-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm0
; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0
; CHECK-NEXT: retq
  %res = call <8 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
  %res1 = call <8 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
  %res2 = fadd <8 x float> %res, %res1
  ret <8 x float> %res2
}

define <2 x double>@test_int_x86_avx512_mask_vfnmadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_pd_128:
; CHECK: ## BB#0:
; CHECK-NEXT: movzbl %dil, %eax
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vmovaps %zmm0, %zmm3
; CHECK-NEXT: vfnmadd213pd %xmm2, %xmm1, %xmm3 {%k1}
; CHECK-NEXT: vfnmadd213pd %xmm2, %xmm1, %xmm0
; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
; CHECK-NEXT: retq
  %res = call <2 x double> @llvm.x86.avx512.mask.vfnmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
  %res1 = call <2 x double> @llvm.x86.avx512.mask.vfnmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
  %res2 = fadd <2 x double> %res, %res1
  ret <2 x double> %res2
}

define <4 x double>@test_int_x86_avx512_mask_vfnmadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_pd_256:
; CHECK: ## BB#0:
; CHECK-NEXT: movzbl %dil, %eax
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vmovaps %zmm0, %zmm3
; CHECK-NEXT: vfnmadd213pd %ymm2, %ymm1, %ymm3 {%k1}
; CHECK-NEXT: vfnmadd213pd %ymm2, %ymm1, %ymm0
; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0
; CHECK-NEXT: retq
  %res = call <4 x double> @llvm.x86.avx512.mask.vfnmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
  %res1 = call <4 x double> @llvm.x86.avx512.mask.vfnmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
  %res2 = fadd <4 x double> %res, %res1
  ret <4 x double> %res2
}

define <4 x float>@test_int_x86_avx512_mask_vfnmadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_ps_128:
; CHECK: ## BB#0:
; CHECK-NEXT: movzbl %dil, %eax
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vmovaps %zmm0, %zmm3
; CHECK-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm3 {%k1}
; CHECK-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0
; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
; CHECK-NEXT: retq
  %res = call <4 x float> @llvm.x86.avx512.mask.vfnmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
  %res1 = call <4 x float> @llvm.x86.avx512.mask.vfnmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
  %res2 = fadd <4 x float> %res, %res1
  ret <4 x float> %res2
}

define <8 x float>@test_int_x86_avx512_mask_vfnmadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_ps_256:
; CHECK: ## BB#0:
; CHECK-NEXT: movzbl %dil, %eax
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vmovaps %zmm0, %zmm3
; CHECK-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm3 {%k1}
; CHECK-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0
; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0
; CHECK-NEXT: retq
  %res = call <8 x float> @llvm.x86.avx512.mask.vfnmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
  %res1 = call <8 x float> @llvm.x86.avx512.mask.vfnmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
  %res2 = fadd <8 x float> %res, %res1
  ret <8 x float> %res2
}

1169 %res1 = call <4 x double> @llvm.x86.avx512.mask.vfnmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1) 1170 %res2 = fadd <4 x double> %res, %res1 1171 ret <4 x double> %res2 1172} 1173 1174define <4 x float>@test_int_x86_avx512_mask_vfnmadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) { 1175; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_ps_128: 1176; CHECK: ## BB#0: 1177; CHECK-NEXT: movzbl %dil, %eax 1178; CHECK-NEXT: kmovw %eax, %k1 1179; CHECK-NEXT: vmovaps %zmm0, %zmm3 1180; CHECK-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm3 {%k1} 1181; CHECK-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 1182; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 1183; CHECK-NEXT: retq 1184 %res = call <4 x float> @llvm.x86.avx512.mask.vfnmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) 1185 %res1 = call <4 x float> @llvm.x86.avx512.mask.vfnmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1) 1186 %res2 = fadd <4 x float> %res, %res1 1187 ret <4 x float> %res2 1188} 1189 1190define <8 x float>@test_int_x86_avx512_mask_vfnmadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) { 1191; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_ps_256: 1192; CHECK: ## BB#0: 1193; CHECK-NEXT: movzbl %dil, %eax 1194; CHECK-NEXT: kmovw %eax, %k1 1195; CHECK-NEXT: vmovaps %zmm0, %zmm3 1196; CHECK-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm3 {%k1} 1197; CHECK-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 1198; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0 1199; CHECK-NEXT: retq 1200 %res = call <8 x float> @llvm.x86.avx512.mask.vfnmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) 1201 %res1 = call <8 x float> @llvm.x86.avx512.mask.vfnmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1) 1202 %res2 = fadd <8 x float> %res, %res1 1203 ret <8 x float> %res2 1204} 1205 1206declare <8 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone 1207 1208define <8 x float> @test_mask_fmaddsub256_ps(<8 x float> %a, <8 x float> %b, <8 x float> %c, i8 %mask) { 1209; CHECK-LABEL: test_mask_fmaddsub256_ps: 1210; CHECK: vfmaddsub213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xa6,0xc2] 1211 %res = call <8 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c, i8 %mask) 1212 ret <8 x float> %res 1213} 1214 1215declare <4 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone 1216 1217define <4 x float> @test_mask_fmaddsub128_ps(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { 1218; CHECK-LABEL: test_mask_fmaddsub128_ps: 1219; CHECK: vfmaddsub213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa6,0xc2] 1220 %res = call <4 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) 1221 ret <4 x float> %res 1222} 1223 1224declare <4 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone 1225 1226define <4 x double> @test_mask_vfmaddsub256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) { 1227 ; CHECK-LABEL: test_mask_vfmaddsub256_pd 1228 ; CHECK: vfmaddsub213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa6,0xc2] 1229 %res = call <4 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind 1230 ret <4 x double> %res 1231} 1232 1233declare <2 x double> 
@llvm.x86.avx512.mask.vfmaddsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone 1234 1235define <2 x double> @test_mask_vfmaddsub128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) { 1236 ; CHECK-LABEL: test_mask_vfmaddsub128_pd 1237 ; CHECK: vfmaddsub213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa6,0xc2] 1238 %res = call <2 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind 1239 ret <2 x double> %res 1240} 1241 1242define <2 x double>@test_int_x86_avx512_mask_vfmaddsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) { 1243; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_pd_128: 1244; CHECK: ## BB#0: 1245; CHECK-NEXT: movzbl %dil, %eax 1246; CHECK-NEXT: kmovw %eax, %k1 1247; CHECK-NEXT: vmovaps %zmm0, %zmm3 1248; CHECK-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm3 {%k1} 1249; CHECK-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0 1250; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 1251; CHECK-NEXT: retq 1252 %res = call <2 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) 1253 %res1 = call <2 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1) 1254 %res2 = fadd <2 x double> %res, %res1 1255 ret <2 x double> %res2 1256} 1257 1258declare <2 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) 1259 1260define <2 x double>@test_int_x86_avx512_mask3_vfmaddsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) { 1261; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_pd_128: 1262; CHECK: ## BB#0: 1263; CHECK-NEXT: movzbl %dil, %eax 1264; CHECK-NEXT: kmovw %eax, %k1 1265; CHECK-NEXT: vmovaps %zmm2, %zmm3 1266; CHECK-NEXT: vfmaddsub231pd %xmm1, %xmm0, %xmm3 {%k1} 1267; CHECK-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0 1268; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 1269; CHECK-NEXT: retq 1270 %res = call <2 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) 1271 %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1) 1272 %res2 = fadd <2 x double> %res, %res1 1273 ret <2 x double> %res2 1274} 1275 1276declare <2 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) 1277 1278define <2 x double>@test_int_x86_avx512_maskz_vfmaddsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) { 1279; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_pd_128: 1280; CHECK: ## BB#0: 1281; CHECK-NEXT: movzbl %dil, %eax 1282; CHECK-NEXT: kmovw %eax, %k1 1283; CHECK-NEXT: vmovaps %zmm0, %zmm3 1284; CHECK-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm3 {%k1} {z} 1285; CHECK-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0 1286; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 1287; CHECK-NEXT: retq 1288 %res = call <2 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) 1289 %res1 = call <2 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1) 1290 %res2 = fadd <2 x double> %res, %res1 1291 ret <2 x double> %res2 1292} 1293 1294define <4 x double>@test_int_x86_avx512_mask_vfmaddsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) { 1295; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_pd_256: 1296; CHECK: ## BB#0: 
1297; CHECK-NEXT: movzbl %dil, %eax 1298; CHECK-NEXT: kmovw %eax, %k1 1299; CHECK-NEXT: vmovaps %zmm0, %zmm3 1300; CHECK-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm3 {%k1} 1301; CHECK-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0 1302; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 1303; CHECK-NEXT: retq 1304 %res = call <4 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) 1305 %res1 = call <4 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1) 1306 %res2 = fadd <4 x double> %res, %res1 1307 ret <4 x double> %res2 1308} 1309 1310declare <4 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) 1311 1312define <4 x double>@test_int_x86_avx512_mask3_vfmaddsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) { 1313; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_pd_256: 1314; CHECK: ## BB#0: 1315; CHECK-NEXT: movzbl %dil, %eax 1316; CHECK-NEXT: kmovw %eax, %k1 1317; CHECK-NEXT: vmovaps %zmm2, %zmm3 1318; CHECK-NEXT: vfmaddsub231pd %ymm1, %ymm0, %ymm3 {%k1} 1319; CHECK-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0 1320; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 1321; CHECK-NEXT: retq 1322 %res = call <4 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) 1323 %res1 = call <4 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1) 1324 %res2 = fadd <4 x double> %res, %res1 1325 ret <4 x double> %res2 1326} 1327 1328declare <4 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) 1329 1330define <4 x double>@test_int_x86_avx512_maskz_vfmaddsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) { 1331; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_pd_256: 1332; CHECK: ## BB#0: 1333; CHECK-NEXT: movzbl %dil, %eax 1334; CHECK-NEXT: kmovw %eax, %k1 1335; CHECK-NEXT: vmovaps %zmm0, %zmm3 1336; CHECK-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm3 {%k1} {z} 1337; CHECK-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0 1338; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 1339; CHECK-NEXT: retq 1340 %res = call <4 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) 1341 %res1 = call <4 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1) 1342 %res2 = fadd <4 x double> %res, %res1 1343 ret <4 x double> %res2 1344} 1345 1346define <4 x float>@test_int_x86_avx512_mask_vfmaddsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) { 1347; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_ps_128: 1348; CHECK: ## BB#0: 1349; CHECK-NEXT: movzbl %dil, %eax 1350; CHECK-NEXT: kmovw %eax, %k1 1351; CHECK-NEXT: vmovaps %zmm0, %zmm3 1352; CHECK-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm3 {%k1} 1353; CHECK-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0 1354; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 1355; CHECK-NEXT: retq 1356 %res = call <4 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) 1357 %res1 = call <4 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1) 1358 %res2 = fadd <4 x float> %res, %res1 1359 ret <4 x float> %res2 1360} 1361 1362declare <4 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) 1363 1364define <4 x 
float>@test_int_x86_avx512_mask3_vfmaddsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) { 1365; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_ps_128: 1366; CHECK: ## BB#0: 1367; CHECK-NEXT: movzbl %dil, %eax 1368; CHECK-NEXT: kmovw %eax, %k1 1369; CHECK-NEXT: vmovaps %zmm2, %zmm3 1370; CHECK-NEXT: vfmaddsub231ps %xmm1, %xmm0, %xmm3 {%k1} 1371; CHECK-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0 1372; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 1373; CHECK-NEXT: retq 1374 %res = call <4 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) 1375 %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1) 1376 %res2 = fadd <4 x float> %res, %res1 1377 ret <4 x float> %res2 1378} 1379 1380declare <4 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) 1381 1382define <4 x float>@test_int_x86_avx512_maskz_vfmaddsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) { 1383; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_ps_128: 1384; CHECK: ## BB#0: 1385; CHECK-NEXT: movzbl %dil, %eax 1386; CHECK-NEXT: kmovw %eax, %k1 1387; CHECK-NEXT: vmovaps %zmm0, %zmm3 1388; CHECK-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm3 {%k1} {z} 1389; CHECK-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0 1390; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 1391; CHECK-NEXT: retq 1392 %res = call <4 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) 1393 %res1 = call <4 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1) 1394 %res2 = fadd <4 x float> %res, %res1 1395 ret <4 x float> %res2 1396} 1397 1398define <8 x float>@test_int_x86_avx512_mask_vfmaddsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) { 1399; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_ps_256: 1400; CHECK: ## BB#0: 1401; CHECK-NEXT: movzbl %dil, %eax 1402; CHECK-NEXT: kmovw %eax, %k1 1403; CHECK-NEXT: vmovaps %zmm0, %zmm3 1404; CHECK-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm3 {%k1} 1405; CHECK-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0 1406; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0 1407; CHECK-NEXT: retq 1408 %res = call <8 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) 1409 %res1 = call <8 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1) 1410 %res2 = fadd <8 x float> %res, %res1 1411 ret <8 x float> %res2 1412} 1413 1414declare <8 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) 1415 1416define <8 x float>@test_int_x86_avx512_mask3_vfmaddsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) { 1417; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_ps_256: 1418; CHECK: ## BB#0: 1419; CHECK-NEXT: movzbl %dil, %eax 1420; CHECK-NEXT: kmovw %eax, %k1 1421; CHECK-NEXT: vmovaps %zmm2, %zmm3 1422; CHECK-NEXT: vfmaddsub231ps %ymm1, %ymm0, %ymm3 {%k1} 1423; CHECK-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0 1424; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0 1425; CHECK-NEXT: retq 1426 %res = call <8 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) 1427 %res1 = call <8 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1) 1428 %res2 = fadd <8 x float> %res, %res1 1429 ret <8 x float> %res2 1430} 
1431 1432declare <8 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) 1433 1434define <8 x float>@test_int_x86_avx512_maskz_vfmaddsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) { 1435; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_ps_256: 1436; CHECK: ## BB#0: 1437; CHECK-NEXT: movzbl %dil, %eax 1438; CHECK-NEXT: kmovw %eax, %k1 1439; CHECK-NEXT: vmovaps %zmm0, %zmm3 1440; CHECK-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm3 {%k1} {z} 1441; CHECK-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0 1442; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0 1443; CHECK-NEXT: retq 1444 %res = call <8 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) 1445 %res1 = call <8 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1) 1446 %res2 = fadd <8 x float> %res, %res1 1447 ret <8 x float> %res2 1448} 1449 1450declare <2 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) 1451 1452define <2 x double>@test_int_x86_avx512_mask3_vfmsubadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) { 1453; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_pd_128: 1454; CHECK: ## BB#0: 1455; CHECK-NEXT: movzbl %dil, %eax 1456; CHECK-NEXT: kmovw %eax, %k1 1457; CHECK-NEXT: vmovaps %zmm2, %zmm3 1458; CHECK-NEXT: vfmsubadd231pd %xmm1, %xmm0, %xmm3 {%k1} 1459; CHECK-NEXT: vfmsubadd213pd %xmm2, %xmm1, %xmm0 1460; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 1461; CHECK-NEXT: retq 1462 %res = call <2 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) 1463 %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1) 1464 %res2=fadd <2 x double> %res, %res1 1465 ret <2 x double> %res2 1466} 1467 1468declare <4 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) 1469 1470define <4 x double>@test_int_x86_avx512_mask3_vfmsubadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) { 1471; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_pd_256: 1472; CHECK: ## BB#0: 1473; CHECK-NEXT: movzbl %dil, %eax 1474; CHECK-NEXT: kmovw %eax, %k1 1475; CHECK-NEXT: vmovaps %zmm2, %zmm3 1476; CHECK-NEXT: vfmsubadd231pd %ymm1, %ymm0, %ymm3 {%k1} 1477; CHECK-NEXT: vfmsubadd213pd %ymm2, %ymm1, %ymm0 1478; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 1479; CHECK-NEXT: retq 1480 %res = call <4 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) 1481 %res1 = call <4 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1) 1482 %res2=fadd <4 x double> %res, %res1 1483 ret <4 x double> %res2 1484} 1485 1486declare <4 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) 1487 1488define <4 x float>@test_int_x86_avx512_mask3_vfmsubadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) { 1489; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_ps_128: 1490; CHECK: ## BB#0: 1491; CHECK-NEXT: movzbl %dil, %eax 1492; CHECK-NEXT: kmovw %eax, %k1 1493; CHECK-NEXT: vmovaps %zmm2, %zmm3 1494; CHECK-NEXT: vfmsubadd231ps %xmm1, %xmm0, %xmm3 {%k1} 1495; CHECK-NEXT: vfmsubadd213ps %xmm2, %xmm1, %xmm0 1496; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 1497; CHECK-NEXT: retq 1498 %res = call <4 x float> 
@llvm.x86.avx512.mask3.vfmsubadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) 1499 %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1) 1500 %res2=fadd <4 x float> %res, %res1 1501 ret <4 x float> %res2 1502} 1503 1504declare <8 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) 1505 1506define <8 x float>@test_int_x86_avx512_mask3_vfmsubadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) { 1507; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_ps_256: 1508; CHECK: ## BB#0: 1509; CHECK-NEXT: movzbl %dil, %eax 1510; CHECK-NEXT: kmovw %eax, %k1 1511; CHECK-NEXT: vmovaps %zmm2, %zmm3 1512; CHECK-NEXT: vfmsubadd231ps %ymm1, %ymm0, %ymm3 {%k1} 1513; CHECK-NEXT: vfmsubadd213ps %ymm2, %ymm1, %ymm0 1514; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0 1515; CHECK-NEXT: retq 1516 %res = call <8 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) 1517 %res1 = call <8 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1) 1518 %res2=fadd <8 x float> %res, %res1 1519 ret <8 x float> %res2 1520} 1521 1522 1523define <4 x float> @test_mask_vfmadd128_ps_r(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) { 1524 ; CHECK-LABEL: test_mask_vfmadd128_ps_r 1525 ; CHECK: vfmadd213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0xc2] 1526 %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind 1527 ret <4 x float> %res 1528} 1529 1530define <4 x float> @test_mask_vfmadd128_ps_rz(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) { 1531 ; CHECK-LABEL: test_mask_vfmadd128_ps_rz 1532 ; CHECK: vfmadd213ps %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x08,0xa8,0xc2] 1533 %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind 1534 ret <4 x float> %res 1535} 1536 1537define <4 x float> @test_mask_vfmadd128_ps_rmk(<4 x float> %a0, <4 x float> %a1, <4 x float>* %ptr_a2, i8 %mask) { 1538 ; CHECK-LABEL: test_mask_vfmadd128_ps_rmk 1539 ; CHECK: vfmadd213ps (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0x07] 1540 %a2 = load <4 x float>, <4 x float>* %ptr_a2 1541 %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind 1542 ret <4 x float> %res 1543} 1544 1545define <4 x float> @test_mask_vfmadd128_ps_rmka(<4 x float> %a0, <4 x float> %a1, <4 x float>* %ptr_a2, i8 %mask) { 1546 ; CHECK-LABEL: test_mask_vfmadd128_ps_rmka 1547 ; CHECK: vfmadd213ps (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0x07] 1548 %a2 = load <4 x float>, <4 x float>* %ptr_a2, align 8 1549 %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind 1550 ret <4 x float> %res 1551} 1552 1553define <4 x float> @test_mask_vfmadd128_ps_rmkz(<4 x float> %a0, <4 x float> %a1, <4 x float>* %ptr_a2) { 1554 ; CHECK-LABEL: test_mask_vfmadd128_ps_rmkz 1555 ; CHECK: vfmadd213ps (%rdi), %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x71,0xa8,0x07] 1556 %a2 = load <4 x float>, <4 x float>* %ptr_a2 1557 %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind 1558 ret <4 x float> %res 1559} 1560 1561define <4 x 
float> @test_mask_vfmadd128_ps_rmkza(<4 x float> %a0, <4 x float> %a1, <4 x float>* %ptr_a2) { 1562 ; CHECK-LABEL: test_mask_vfmadd128_ps_rmkza 1563 ; CHECK: vfmadd213ps (%rdi), %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x71,0xa8,0x07] 1564 %a2 = load <4 x float>, <4 x float>* %ptr_a2, align 4 1565 %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind 1566 ret <4 x float> %res 1567} 1568 1569define <4 x float> @test_mask_vfmadd128_ps_rmb(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2, i8 %mask) { 1570 ; CHECK-LABEL: test_mask_vfmadd128_ps_rmb 1571 ; CHECK: vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x19,0xa8,0x07] 1572 %q = load float, float* %ptr_a2 1573 %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 1574 %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1 1575 %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2 1576 %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3 1577 %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 %mask) nounwind 1578 ret <4 x float> %res 1579} 1580 1581define <4 x float> @test_mask_vfmadd128_ps_rmba(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2, i8 %mask) { 1582 ; CHECK-LABEL: test_mask_vfmadd128_ps_rmba 1583 ; CHECK: vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x19,0xa8,0x07] 1584 %q = load float, float* %ptr_a2, align 4 1585 %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 1586 %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1 1587 %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2 1588 %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3 1589 %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 %mask) nounwind 1590 ret <4 x float> %res 1591} 1592 1593define <4 x float> @test_mask_vfmadd128_ps_rmbz(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2) { 1594 ; CHECK-LABEL: test_mask_vfmadd128_ps_rmbz 1595 ; CHECK: vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x18,0xa8,0x07] 1596 %q = load float, float* %ptr_a2 1597 %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 1598 %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1 1599 %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2 1600 %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3 1601 %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 -1) nounwind 1602 ret <4 x float> %res 1603} 1604 1605define <4 x float> @test_mask_vfmadd128_ps_rmbza(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2) { 1606 ; CHECK-LABEL: test_mask_vfmadd128_ps_rmbza 1607 ; CHECK: vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x18,0xa8,0x07] 1608 %q = load float, float* %ptr_a2, align 4 1609 %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 1610 %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1 1611 %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2 1612 %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3 1613 %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 -1) nounwind 1614 ret <4 x float> %res 1615} 1616 1617define <2 x double> 
@test_mask_vfmadd128_pd_r(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) { 1618 ; CHECK-LABEL: test_mask_vfmadd128_pd_r 1619 ; CHECK: vfmadd213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa8,0xc2] 1620 %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind 1621 ret <2 x double> %res 1622} 1623 1624define <2 x double> @test_mask_vfmadd128_pd_rz(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) { 1625 ; CHECK-LABEL: test_mask_vfmadd128_pd_rz 1626 ; CHECK: vfmadd213pd %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0xf5,0x08,0xa8,0xc2] 1627 %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 -1) nounwind 1628 ret <2 x double> %res 1629} 1630 1631define <2 x double> @test_mask_vfmadd128_pd_rmk(<2 x double> %a0, <2 x double> %a1, <2 x double>* %ptr_a2, i8 %mask) { 1632 ; CHECK-LABEL: test_mask_vfmadd128_pd_rmk 1633 ; CHECK: vfmadd213pd (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa8,0x07] 1634 %a2 = load <2 x double>, <2 x double>* %ptr_a2 1635 %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind 1636 ret <2 x double> %res 1637} 1638 1639define <2 x double> @test_mask_vfmadd128_pd_rmkz(<2 x double> %a0, <2 x double> %a1, <2 x double>* %ptr_a2) { 1640 ; CHECK-LABEL: test_mask_vfmadd128_pd_rmkz 1641 ; CHECK: vfmadd213pd (%rdi), %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0xf1,0xa8,0x07] 1642 %a2 = load <2 x double>, <2 x double>* %ptr_a2 1643 %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 -1) nounwind 1644 ret <2 x double> %res 1645} 1646 1647define <4 x double> @test_mask_vfmadd256_pd_r(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) { 1648 ; CHECK-LABEL: test_mask_vfmadd256_pd_r 1649 ; CHECK: vfmadd213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa8,0xc2] 1650 %res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind 1651 ret <4 x double> %res 1652} 1653 1654define <4 x double> @test_mask_vfmadd256_pd_rz(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) { 1655 ; CHECK-LABEL: test_mask_vfmadd256_pd_rz 1656 ; CHECK: vfmadd213pd %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0xf5,0x28,0xa8,0xc2] 1657 %res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 -1) nounwind 1658 ret <4 x double> %res 1659} 1660 1661define <4 x double> @test_mask_vfmadd256_pd_rmk(<4 x double> %a0, <4 x double> %a1, <4 x double>* %ptr_a2, i8 %mask) { 1662 ; CHECK-LABEL: test_mask_vfmadd256_pd_rmk 1663 ; CHECK: vfmadd213pd (%rdi), %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa8,0x07] 1664 %a2 = load <4 x double>, <4 x double>* %ptr_a2 1665 %res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind 1666 ret <4 x double> %res 1667} 1668 1669define <4 x double> @test_mask_vfmadd256_pd_rmkz(<4 x double> %a0, <4 x double> %a1, <4 x double>* %ptr_a2) { 1670 ; CHECK-LABEL: test_mask_vfmadd256_pd_rmkz 1671 ; CHECK: vfmadd213pd (%rdi), %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0xf5,0xa8,0x07] 1672 %a2 = load <4 x double>, <4 x double>* %ptr_a2 1673 %res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, 
<4 x double> %a2, i8 -1) nounwind 1674 ret <4 x double> %res 1675} 1676define <8 x i16> @test_mask_add_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) { 1677 ;CHECK-LABEL: test_mask_add_epi16_rr_128 1678 ;CHECK: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfd,0xc1] 1679 %res = call <8 x i16> @llvm.x86.avx512.mask.padd.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1) 1680 ret <8 x i16> %res 1681} 1682 1683define <8 x i16> @test_mask_add_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) { 1684 ;CHECK-LABEL: test_mask_add_epi16_rrk_128 1685 ;CHECK: vpaddw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xfd,0xd1] 1686 %res = call <8 x i16> @llvm.x86.avx512.mask.padd.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) 1687 ret <8 x i16> %res 1688} 1689 1690define <8 x i16> @test_mask_add_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) { 1691 ;CHECK-LABEL: test_mask_add_epi16_rrkz_128 1692 ;CHECK: vpaddw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xfd,0xc1] 1693 %res = call <8 x i16> @llvm.x86.avx512.mask.padd.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask) 1694 ret <8 x i16> %res 1695} 1696 1697define <8 x i16> @test_mask_add_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) { 1698 ;CHECK-LABEL: test_mask_add_epi16_rm_128 1699 ;CHECK: vpaddw (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfd,0x07] 1700 %b = load <8 x i16>, <8 x i16>* %ptr_b 1701 %res = call <8 x i16> @llvm.x86.avx512.mask.padd.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1) 1702 ret <8 x i16> %res 1703} 1704 1705define <8 x i16> @test_mask_add_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) { 1706 ;CHECK-LABEL: test_mask_add_epi16_rmk_128 1707 ;CHECK: vpaddw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xfd,0x0f] 1708 %b = load <8 x i16>, <8 x i16>* %ptr_b 1709 %res = call <8 x i16> @llvm.x86.avx512.mask.padd.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) 1710 ret <8 x i16> %res 1711} 1712 1713define <8 x i16> @test_mask_add_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) { 1714 ;CHECK-LABEL: test_mask_add_epi16_rmkz_128 1715 ;CHECK: vpaddw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xfd,0x07] 1716 %b = load <8 x i16>, <8 x i16>* %ptr_b 1717 %res = call <8 x i16> @llvm.x86.avx512.mask.padd.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask) 1718 ret <8 x i16> %res 1719} 1720 1721declare <8 x i16> @llvm.x86.avx512.mask.padd.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) 1722 1723define <16 x i16> @test_mask_add_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) { 1724 ;CHECK-LABEL: test_mask_add_epi16_rr_256 1725 ;CHECK: vpaddw %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xfd,0xc1] 1726 %res = call <16 x i16> @llvm.x86.avx512.mask.padd.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1) 1727 ret <16 x i16> %res 1728} 1729 1730define <16 x i16> @test_mask_add_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) { 1731 ;CHECK-LABEL: test_mask_add_epi16_rrk_256 1732 ;CHECK: vpaddw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xfd,0xd1] 1733 %res = call <16 x i16> @llvm.x86.avx512.mask.padd.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) 1734 ret <16 x i16> %res 1735} 1736 1737define <16 x i16> @test_mask_add_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) { 1738 ;CHECK-LABEL: 
test_mask_add_epi16_rrkz_256 1739 ;CHECK: vpaddw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xfd,0xc1] 1740 %res = call <16 x i16> @llvm.x86.avx512.mask.padd.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask) 1741 ret <16 x i16> %res 1742} 1743 1744define <16 x i16> @test_mask_add_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) { 1745 ;CHECK-LABEL: test_mask_add_epi16_rm_256 1746 ;CHECK: vpaddw (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xfd,0x07] 1747 %b = load <16 x i16>, <16 x i16>* %ptr_b 1748 %res = call <16 x i16> @llvm.x86.avx512.mask.padd.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1) 1749 ret <16 x i16> %res 1750} 1751 1752define <16 x i16> @test_mask_add_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) { 1753 ;CHECK-LABEL: test_mask_add_epi16_rmk_256 1754 ;CHECK: vpaddw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xfd,0x0f] 1755 %b = load <16 x i16>, <16 x i16>* %ptr_b 1756 %res = call <16 x i16> @llvm.x86.avx512.mask.padd.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) 1757 ret <16 x i16> %res 1758} 1759 1760define <16 x i16> @test_mask_add_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) { 1761 ;CHECK-LABEL: test_mask_add_epi16_rmkz_256 1762 ;CHECK: vpaddw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xfd,0x07] 1763 %b = load <16 x i16>, <16 x i16>* %ptr_b 1764 %res = call <16 x i16> @llvm.x86.avx512.mask.padd.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask) 1765 ret <16 x i16> %res 1766} 1767 1768declare <16 x i16> @llvm.x86.avx512.mask.padd.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) 1769 1770define <8 x i16> @test_mask_sub_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) { 1771 ;CHECK-LABEL: test_mask_sub_epi16_rr_128 1772 ;CHECK: vpsubw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xf9,0xc1] 1773 %res = call <8 x i16> @llvm.x86.avx512.mask.psub.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1) 1774 ret <8 x i16> %res 1775} 1776 1777define <8 x i16> @test_mask_sub_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) { 1778 ;CHECK-LABEL: test_mask_sub_epi16_rrk_128 1779 ;CHECK: vpsubw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xf9,0xd1] 1780 %res = call <8 x i16> @llvm.x86.avx512.mask.psub.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) 1781 ret <8 x i16> %res 1782} 1783 1784define <8 x i16> @test_mask_sub_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) { 1785 ;CHECK-LABEL: test_mask_sub_epi16_rrkz_128 1786 ;CHECK: vpsubw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xf9,0xc1] 1787 %res = call <8 x i16> @llvm.x86.avx512.mask.psub.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask) 1788 ret <8 x i16> %res 1789} 1790 1791define <8 x i16> @test_mask_sub_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) { 1792 ;CHECK-LABEL: test_mask_sub_epi16_rm_128 1793 ;CHECK: vpsubw (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xf9,0x07] 1794 %b = load <8 x i16>, <8 x i16>* %ptr_b 1795 %res = call <8 x i16> @llvm.x86.avx512.mask.psub.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1) 1796 ret <8 x i16> %res 1797} 1798 1799define <8 x i16> @test_mask_sub_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) { 1800 ;CHECK-LABEL: test_mask_sub_epi16_rmk_128 1801 ;CHECK: vpsubw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: 
[0x62,0xf1,0x7d,0x09,0xf9,0x0f] 1802 %b = load <8 x i16>, <8 x i16>* %ptr_b 1803 %res = call <8 x i16> @llvm.x86.avx512.mask.psub.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) 1804 ret <8 x i16> %res 1805} 1806 1807define <8 x i16> @test_mask_sub_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) { 1808 ;CHECK-LABEL: test_mask_sub_epi16_rmkz_128 1809 ;CHECK: vpsubw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xf9,0x07] 1810 %b = load <8 x i16>, <8 x i16>* %ptr_b 1811 %res = call <8 x i16> @llvm.x86.avx512.mask.psub.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask) 1812 ret <8 x i16> %res 1813} 1814 1815declare <8 x i16> @llvm.x86.avx512.mask.psub.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) 1816 1817define <16 x i16> @test_mask_sub_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) { 1818 ;CHECK-LABEL: test_mask_sub_epi16_rr_256 1819 ;CHECK: vpsubw %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xf9,0xc1] 1820 %res = call <16 x i16> @llvm.x86.avx512.mask.psub.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1) 1821 ret <16 x i16> %res 1822} 1823 1824define <16 x i16> @test_mask_sub_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) { 1825 ;CHECK-LABEL: test_mask_sub_epi16_rrk_256 1826 ;CHECK: vpsubw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xf9,0xd1] 1827 %res = call <16 x i16> @llvm.x86.avx512.mask.psub.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) 1828 ret <16 x i16> %res 1829} 1830 1831define <16 x i16> @test_mask_sub_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) { 1832 ;CHECK-LABEL: test_mask_sub_epi16_rrkz_256 1833 ;CHECK: vpsubw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xf9,0xc1] 1834 %res = call <16 x i16> @llvm.x86.avx512.mask.psub.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask) 1835 ret <16 x i16> %res 1836} 1837 1838define <16 x i16> @test_mask_sub_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) { 1839 ;CHECK-LABEL: test_mask_sub_epi16_rm_256 1840 ;CHECK: vpsubw (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xf9,0x07] 1841 %b = load <16 x i16>, <16 x i16>* %ptr_b 1842 %res = call <16 x i16> @llvm.x86.avx512.mask.psub.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1) 1843 ret <16 x i16> %res 1844} 1845 1846define <16 x i16> @test_mask_sub_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) { 1847 ;CHECK-LABEL: test_mask_sub_epi16_rmk_256 1848 ;CHECK: vpsubw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xf9,0x0f] 1849 %b = load <16 x i16>, <16 x i16>* %ptr_b 1850 %res = call <16 x i16> @llvm.x86.avx512.mask.psub.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) 1851 ret <16 x i16> %res 1852} 1853 1854define <16 x i16> @test_mask_sub_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) { 1855 ;CHECK-LABEL: test_mask_sub_epi16_rmkz_256 1856 ;CHECK: vpsubw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xf9,0x07] 1857 %b = load <16 x i16>, <16 x i16>* %ptr_b 1858 %res = call <16 x i16> @llvm.x86.avx512.mask.psub.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask) 1859 ret <16 x i16> %res 1860} 1861 1862declare <16 x i16> @llvm.x86.avx512.mask.psub.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) 1863 1864define <32 x i16> @test_mask_add_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) { 1865 ;CHECK-LABEL: test_mask_add_epi16_rr_512 1866 
;CHECK: vpaddw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc1] 1867 %res = call <32 x i16> @llvm.x86.avx512.mask.padd.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) 1868 ret <32 x i16> %res 1869} 1870 1871define <32 x i16> @test_mask_add_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) { 1872 ;CHECK-LABEL: test_mask_add_epi16_rrk_512 1873 ;CHECK: vpaddw %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xfd,0xd1] 1874 %res = call <32 x i16> @llvm.x86.avx512.mask.padd.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) 1875 ret <32 x i16> %res 1876} 1877 1878define <32 x i16> @test_mask_add_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) { 1879 ;CHECK-LABEL: test_mask_add_epi16_rrkz_512 1880 ;CHECK: vpaddw %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xfd,0xc1] 1881 %res = call <32 x i16> @llvm.x86.avx512.mask.padd.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) 1882 ret <32 x i16> %res 1883} 1884 1885define <32 x i16> @test_mask_add_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) { 1886 ;CHECK-LABEL: test_mask_add_epi16_rm_512 1887 ;CHECK: vpaddw (%rdi), %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfd,0x07] 1888 %b = load <32 x i16>, <32 x i16>* %ptr_b 1889 %res = call <32 x i16> @llvm.x86.avx512.mask.padd.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) 1890 ret <32 x i16> %res 1891} 1892 1893define <32 x i16> @test_mask_add_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) { 1894 ;CHECK-LABEL: test_mask_add_epi16_rmk_512 1895 ;CHECK: vpaddw (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xfd,0x0f] 1896 %b = load <32 x i16>, <32 x i16>* %ptr_b 1897 %res = call <32 x i16> @llvm.x86.avx512.mask.padd.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) 1898 ret <32 x i16> %res 1899} 1900 1901define <32 x i16> @test_mask_add_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) { 1902 ;CHECK-LABEL: test_mask_add_epi16_rmkz_512 1903 ;CHECK: vpaddw (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xfd,0x07] 1904 %b = load <32 x i16>, <32 x i16>* %ptr_b 1905 %res = call <32 x i16> @llvm.x86.avx512.mask.padd.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) 1906 ret <32 x i16> %res 1907} 1908 1909declare <32 x i16> @llvm.x86.avx512.mask.padd.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) 1910 1911define <32 x i16> @test_mask_sub_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) { 1912 ;CHECK-LABEL: test_mask_sub_epi16_rr_512 1913 ;CHECK: vpsubw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xf9,0xc1] 1914 %res = call <32 x i16> @llvm.x86.avx512.mask.psub.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) 1915 ret <32 x i16> %res 1916} 1917 1918define <32 x i16> @test_mask_sub_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) { 1919 ;CHECK-LABEL: test_mask_sub_epi16_rrk_512 1920 ;CHECK: vpsubw %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xf9,0xd1] 1921 %res = call <32 x i16> @llvm.x86.avx512.mask.psub.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) 1922 ret <32 x i16> %res 1923} 1924 1925define <32 x i16> @test_mask_sub_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) { 1926 ;CHECK-LABEL: test_mask_sub_epi16_rrkz_512 1927 ;CHECK: vpsubw %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xf9,0xc1] 1928 %res = 
call <32 x i16> @llvm.x86.avx512.mask.psub.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) 1929 ret <32 x i16> %res 1930} 1931 1932define <32 x i16> @test_mask_sub_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) { 1933 ;CHECK-LABEL: test_mask_sub_epi16_rm_512 1934 ;CHECK: vpsubw (%rdi), %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xf9,0x07] 1935 %b = load <32 x i16>, <32 x i16>* %ptr_b 1936 %res = call <32 x i16> @llvm.x86.avx512.mask.psub.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) 1937 ret <32 x i16> %res 1938} 1939 1940define <32 x i16> @test_mask_sub_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) { 1941 ;CHECK-LABEL: test_mask_sub_epi16_rmk_512 1942 ;CHECK: vpsubw (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xf9,0x0f] 1943 %b = load <32 x i16>, <32 x i16>* %ptr_b 1944 %res = call <32 x i16> @llvm.x86.avx512.mask.psub.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) 1945 ret <32 x i16> %res 1946} 1947 1948define <32 x i16> @test_mask_sub_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) { 1949 ;CHECK-LABEL: test_mask_sub_epi16_rmkz_512 1950 ;CHECK: vpsubw (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xf9,0x07] 1951 %b = load <32 x i16>, <32 x i16>* %ptr_b 1952 %res = call <32 x i16> @llvm.x86.avx512.mask.psub.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) 1953 ret <32 x i16> %res 1954} 1955 1956declare <32 x i16> @llvm.x86.avx512.mask.psub.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) 1957 1958define <32 x i16> @test_mask_mullo_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) { 1959 ;CHECK-LABEL: test_mask_mullo_epi16_rr_512 1960 ;CHECK: vpmullw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xd5,0xc1] 1961 %res = call <32 x i16> @llvm.x86.avx512.mask.pmull.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) 1962 ret <32 x i16> %res 1963} 1964 1965define <32 x i16> @test_mask_mullo_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) { 1966 ;CHECK-LABEL: test_mask_mullo_epi16_rrk_512 1967 ;CHECK: vpmullw %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xd5,0xd1] 1968 %res = call <32 x i16> @llvm.x86.avx512.mask.pmull.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) 1969 ret <32 x i16> %res 1970} 1971 1972define <32 x i16> @test_mask_mullo_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) { 1973 ;CHECK-LABEL: test_mask_mullo_epi16_rrkz_512 1974 ;CHECK: vpmullw %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xd5,0xc1] 1975 %res = call <32 x i16> @llvm.x86.avx512.mask.pmull.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) 1976 ret <32 x i16> %res 1977} 1978 1979define <32 x i16> @test_mask_mullo_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) { 1980 ;CHECK-LABEL: test_mask_mullo_epi16_rm_512 1981 ;CHECK: vpmullw (%rdi), %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xd5,0x07] 1982 %b = load <32 x i16>, <32 x i16>* %ptr_b 1983 %res = call <32 x i16> @llvm.x86.avx512.mask.pmull.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) 1984 ret <32 x i16> %res 1985} 1986 1987define <32 x i16> @test_mask_mullo_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) { 1988 ;CHECK-LABEL: test_mask_mullo_epi16_rmk_512 1989 ;CHECK: vpmullw (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xd5,0x0f] 1990 %b = load <32 x i16>, <32 x 
i16>* %ptr_b 1991 %res = call <32 x i16> @llvm.x86.avx512.mask.pmull.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) 1992 ret <32 x i16> %res 1993} 1994 1995define <32 x i16> @test_mask_mullo_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) { 1996 ;CHECK-LABEL: test_mask_mullo_epi16_rmkz_512 1997 ;CHECK: vpmullw (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xd5,0x07] 1998 %b = load <32 x i16>, <32 x i16>* %ptr_b 1999 %res = call <32 x i16> @llvm.x86.avx512.mask.pmull.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) 2000 ret <32 x i16> %res 2001} 2002 2003declare <32 x i16> @llvm.x86.avx512.mask.pmull.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) 2004 2005define <8 x i16> @test_mask_mullo_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) { 2006 ;CHECK-LABEL: test_mask_mullo_epi16_rr_128 2007 ;CHECK: vpmullw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xd5,0xc1] 2008 %res = call <8 x i16> @llvm.x86.avx512.mask.pmull.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1) 2009 ret <8 x i16> %res 2010} 2011 2012define <8 x i16> @test_mask_mullo_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) { 2013 ;CHECK-LABEL: test_mask_mullo_epi16_rrk_128 2014 ;CHECK: vpmullw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd5,0xd1] 2015 %res = call <8 x i16> @llvm.x86.avx512.mask.pmull.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) 2016 ret <8 x i16> %res 2017} 2018 2019define <8 x i16> @test_mask_mullo_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) { 2020 ;CHECK-LABEL: test_mask_mullo_epi16_rrkz_128 2021 ;CHECK: vpmullw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd5,0xc1] 2022 %res = call <8 x i16> @llvm.x86.avx512.mask.pmull.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask) 2023 ret <8 x i16> %res 2024} 2025 2026define <8 x i16> @test_mask_mullo_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) { 2027 ;CHECK-LABEL: test_mask_mullo_epi16_rm_128 2028 ;CHECK: vpmullw (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xd5,0x07] 2029 %b = load <8 x i16>, <8 x i16>* %ptr_b 2030 %res = call <8 x i16> @llvm.x86.avx512.mask.pmull.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1) 2031 ret <8 x i16> %res 2032} 2033 2034define <8 x i16> @test_mask_mullo_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) { 2035 ;CHECK-LABEL: test_mask_mullo_epi16_rmk_128 2036 ;CHECK: vpmullw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd5,0x0f] 2037 %b = load <8 x i16>, <8 x i16>* %ptr_b 2038 %res = call <8 x i16> @llvm.x86.avx512.mask.pmull.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) 2039 ret <8 x i16> %res 2040} 2041 2042define <8 x i16> @test_mask_mullo_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) { 2043 ;CHECK-LABEL: test_mask_mullo_epi16_rmkz_128 2044 ;CHECK: vpmullw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd5,0x07] 2045 %b = load <8 x i16>, <8 x i16>* %ptr_b 2046 %res = call <8 x i16> @llvm.x86.avx512.mask.pmull.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask) 2047 ret <8 x i16> %res 2048} 2049 2050declare <8 x i16> @llvm.x86.avx512.mask.pmull.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) 2051 2052define <16 x i16> @test_mask_mullo_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) { 2053 ;CHECK-LABEL: test_mask_mullo_epi16_rr_256 2054 ;CHECK: vpmullw %ymm1, %ymm0, %ymm0 ## encoding: 
[0x62,0xf1,0x7d,0x28,0xd5,0xc1]
  %res = call <16 x i16> @llvm.x86.avx512.mask.pmull.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
  ret <16 x i16> %res
}

define <16 x i16> @test_mask_mullo_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
  ;CHECK-LABEL: test_mask_mullo_epi16_rrk_256
  ;CHECK: vpmullw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd5,0xd1]
  %res = call <16 x i16> @llvm.x86.avx512.mask.pmull.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
  ret <16 x i16> %res
}

define <16 x i16> @test_mask_mullo_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
  ;CHECK-LABEL: test_mask_mullo_epi16_rrkz_256
  ;CHECK: vpmullw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd5,0xc1]
  %res = call <16 x i16> @llvm.x86.avx512.mask.pmull.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
  ret <16 x i16> %res
}

define <16 x i16> @test_mask_mullo_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
  ;CHECK-LABEL: test_mask_mullo_epi16_rm_256
  ;CHECK: vpmullw (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xd5,0x07]
  %b = load <16 x i16>, <16 x i16>* %ptr_b
  %res = call <16 x i16> @llvm.x86.avx512.mask.pmull.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
  ret <16 x i16> %res
}

define <16 x i16> @test_mask_mullo_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
  ;CHECK-LABEL: test_mask_mullo_epi16_rmk_256
  ;CHECK: vpmullw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd5,0x0f]
  %b = load <16 x i16>, <16 x i16>* %ptr_b
  %res = call <16 x i16> @llvm.x86.avx512.mask.pmull.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
  ret <16 x i16> %res
}

define <16 x i16> @test_mask_mullo_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
  ;CHECK-LABEL: test_mask_mullo_epi16_rmkz_256
  ;CHECK: vpmullw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd5,0x07]
  %b = load <16 x i16>, <16 x i16>* %ptr_b
  %res = call <16 x i16> @llvm.x86.avx512.mask.pmull.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
  ret <16 x i16> %res
}

declare <16 x i16> @llvm.x86.avx512.mask.pmull.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)

define <8 x i16> @test_mask_packs_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
  ;CHECK-LABEL: test_mask_packs_epi32_rr_128
  ;CHECK: vpackssdw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x6b,0xc1]
  %res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 -1)
  ret <8 x i16> %res
}

define <8 x i16> @test_mask_packs_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask) {
  ;CHECK-LABEL: test_mask_packs_epi32_rrk_128
  ;CHECK: vpackssdw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x6b,0xd1]
  %res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask)
  ret <8 x i16> %res
}

define <8 x i16> @test_mask_packs_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
  ;CHECK-LABEL: test_mask_packs_epi32_rrkz_128
  ;CHECK: vpackssdw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x6b,0xc1]
  %res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 %mask)
  ret <8 x i16> %res
}

define <8 x i16> @test_mask_packs_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
  ;CHECK-LABEL: test_mask_packs_epi32_rm_128
  ;CHECK: vpackssdw (%rdi), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x6b,0x07]
  %b = load <4 x i32>, <4 x i32>* %ptr_b
  %res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 -1)
  ret <8 x i16> %res
}

define <8 x i16> @test_mask_packs_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
  ;CHECK-LABEL: test_mask_packs_epi32_rmk_128
  ;CHECK: vpackssdw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x6b,0x0f]
  %b = load <4 x i32>, <4 x i32>* %ptr_b
  %res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask)
  ret <8 x i16> %res
}

define <8 x i16> @test_mask_packs_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) {
  ;CHECK-LABEL: test_mask_packs_epi32_rmkz_128
  ;CHECK: vpackssdw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x6b,0x07]
  %b = load <4 x i32>, <4 x i32>* %ptr_b
  %res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 %mask)
  ret <8 x i16> %res
}

define <8 x i16> @test_mask_packs_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
  ;CHECK-LABEL: test_mask_packs_epi32_rmb_128
  ;CHECK: vpackssdw (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x18,0x6b,0x07]
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
  %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
  %res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 -1)
  ret <8 x i16> %res
}

define <8 x i16> @test_mask_packs_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <8 x i16> %passThru, i8 %mask) {
  ;CHECK-LABEL: test_mask_packs_epi32_rmbk_128
  ;CHECK: vpackssdw (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0x6b,0x0f]
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
  %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
  %res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask)
  ret <8 x i16> %res
}

define <8 x i16> @test_mask_packs_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) {
  ;CHECK-LABEL: test_mask_packs_epi32_rmbkz_128
  ;CHECK: vpackssdw (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0x6b,0x07]
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
  %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
  %res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 %mask)
  ret <8 x i16> %res
}

declare <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32>, <4 x i32>, <8 x i16>, i8)

define <16 x i16> @test_mask_packs_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
  ;CHECK-LABEL: test_mask_packs_epi32_rr_256
  ;CHECK: vpackssdw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x6b,0xc1]
  %res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 -1)
  ret <16 x i16> %res
}

define <16 x i16> @test_mask_packs_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask) {
  ;CHECK-LABEL: test_mask_packs_epi32_rrk_256
  ;CHECK: vpackssdw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x6b,0xd1]
  %res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask)
  ret <16 x i16> %res
}

define <16 x i16> @test_mask_packs_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i16 %mask) {
  ;CHECK-LABEL: test_mask_packs_epi32_rrkz_256
  ;CHECK: vpackssdw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x6b,0xc1]
  %res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 %mask)
  ret <16 x i16> %res
}

define <16 x i16> @test_mask_packs_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
  ;CHECK-LABEL: test_mask_packs_epi32_rm_256
  ;CHECK: vpackssdw (%rdi), %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x6b,0x07]
  %b = load <8 x i32>, <8 x i32>* %ptr_b
  %res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 -1)
  ret <16 x i16> %res
}

define <16 x i16> @test_mask_packs_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
  ;CHECK-LABEL: test_mask_packs_epi32_rmk_256
  ;CHECK: vpackssdw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x6b,0x0f]
  %b = load <8 x i32>, <8 x i32>* %ptr_b
  %res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask)
  ret <16 x i16> %res
}

define <16 x i16> @test_mask_packs_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i16 %mask) {
  ;CHECK-LABEL: test_mask_packs_epi32_rmkz_256
  ;CHECK: vpackssdw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x6b,0x07]
  %b = load <8 x i32>, <8 x i32>* %ptr_b
  %res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 %mask)
  ret <16 x i16> %res
}

define <16 x i16> @test_mask_packs_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
  ;CHECK-LABEL: test_mask_packs_epi32_rmb_256
  ;CHECK: vpackssdw (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x38,0x6b,0x07]
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
  %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
  %res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 -1)
  ret <16 x i16> %res
}

define <16 x i16> @test_mask_packs_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <16 x i16> %passThru, i16 %mask) {
  ;CHECK-LABEL: test_mask_packs_epi32_rmbk_256
  ;CHECK: vpackssdw (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0x6b,0x0f]
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
  %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
  %res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask)
  ret <16 x i16> %res
}

define <16 x i16>
@test_mask_packs_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i16 %mask) {
  ;CHECK-LABEL: test_mask_packs_epi32_rmbkz_256
  ;CHECK: vpackssdw (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0x6b,0x07]
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
  %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
  %res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 %mask)
  ret <16 x i16> %res
}

declare <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32>, <8 x i32>, <16 x i16>, i16)

define <16 x i8> @test_mask_packs_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
  ;CHECK-LABEL: test_mask_packs_epi16_rr_128
  ;CHECK: vpacksswb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x63,0xc1]
  %res = call <16 x i8> @llvm.x86.avx512.mask.packsswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 -1)
  ret <16 x i8> %res
}

define <16 x i8> @test_mask_packs_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask) {
  ;CHECK-LABEL: test_mask_packs_epi16_rrk_128
  ;CHECK: vpacksswb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x63,0xd1]
  %res = call <16 x i8> @llvm.x86.avx512.mask.packsswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask)
  ret <16 x i8> %res
}

define <16 x i8> @test_mask_packs_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i16 %mask) {
  ;CHECK-LABEL: test_mask_packs_epi16_rrkz_128
  ;CHECK: vpacksswb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x63,0xc1]
  %res = call <16 x i8> @llvm.x86.avx512.mask.packsswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 %mask)
  ret <16 x i8> %res
}

define <16 x i8> @test_mask_packs_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
  ;CHECK-LABEL: test_mask_packs_epi16_rm_128
  ;CHECK: vpacksswb (%rdi), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x63,0x07]
  %b = load <8 x i16>, <8 x i16>* %ptr_b
  %res = call <16 x i8> @llvm.x86.avx512.mask.packsswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 -1)
  ret <16 x i8> %res
}

define <16 x i8> @test_mask_packs_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
  ;CHECK-LABEL: test_mask_packs_epi16_rmk_128
  ;CHECK: vpacksswb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x63,0x0f]
  %b = load <8 x i16>, <8 x i16>* %ptr_b
  %res = call <16 x i8> @llvm.x86.avx512.mask.packsswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask)
  ret <16 x i8> %res
}

define <16 x i8> @test_mask_packs_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i16 %mask) {
  ;CHECK-LABEL: test_mask_packs_epi16_rmkz_128
  ;CHECK: vpacksswb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x63,0x07]
  %b = load <8 x i16>, <8 x i16>* %ptr_b
  %res = call <16 x i8> @llvm.x86.avx512.mask.packsswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 %mask)
  ret <16 x i8> %res
}

declare <16 x i8> @llvm.x86.avx512.mask.packsswb.128(<8 x i16>, <8 x i16>, <16 x i8>, i16)

define <32 x i8> @test_mask_packs_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
  ;CHECK-LABEL: test_mask_packs_epi16_rr_256
  ;CHECK: vpacksswb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x63,0xc1]
  %res = call <32 x i8> @llvm.x86.avx512.mask.packsswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 -1)
  ret <32 x i8> %res
}

define <32 x i8> @test_mask_packs_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask) {
  ;CHECK-LABEL: test_mask_packs_epi16_rrk_256
  ;CHECK: vpacksswb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x63,0xd1]
  %res = call <32 x i8> @llvm.x86.avx512.mask.packsswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask)
  ret <32 x i8> %res
}

define <32 x i8> @test_mask_packs_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i32 %mask) {
  ;CHECK-LABEL: test_mask_packs_epi16_rrkz_256
  ;CHECK: vpacksswb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x63,0xc1]
  %res = call <32 x i8> @llvm.x86.avx512.mask.packsswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 %mask)
  ret <32 x i8> %res
}

define <32 x i8> @test_mask_packs_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
  ;CHECK-LABEL: test_mask_packs_epi16_rm_256
  ;CHECK: vpacksswb (%rdi), %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x63,0x07]
  %b = load <16 x i16>, <16 x i16>* %ptr_b
  %res = call <32 x i8> @llvm.x86.avx512.mask.packsswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 -1)
  ret <32 x i8> %res
}

define <32 x i8> @test_mask_packs_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
  ;CHECK-LABEL: test_mask_packs_epi16_rmk_256
  ;CHECK: vpacksswb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x63,0x0f]
  %b = load <16 x i16>, <16 x i16>* %ptr_b
  %res = call <32 x i8> @llvm.x86.avx512.mask.packsswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask)
  ret <32 x i8> %res
}

define <32 x i8> @test_mask_packs_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i32 %mask) {
  ;CHECK-LABEL: test_mask_packs_epi16_rmkz_256
  ;CHECK: vpacksswb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x63,0x07]
  %b = load <16 x i16>, <16 x i16>* %ptr_b
  %res = call <32 x i8> @llvm.x86.avx512.mask.packsswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 %mask)
  ret <32 x i8> %res
}

declare <32 x i8> @llvm.x86.avx512.mask.packsswb.256(<16 x i16>, <16 x i16>, <32 x i8>, i32)

define <8 x i16> @test_mask_packus_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
  ;CHECK-LABEL: test_mask_packus_epi32_rr_128
  ;CHECK: vpackusdw %xmm1, %xmm0, %xmm0
  %res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 -1)
  ret <8 x i16> %res
}

define <8 x i16> @test_mask_packus_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask) {
  ;CHECK-LABEL: test_mask_packus_epi32_rrk_128
  ;CHECK: vpackusdw %xmm1, %xmm0, %xmm2 {%k1}
  %res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask)
  ret <8 x i16> %res
}

define <8 x i16> @test_mask_packus_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
  ;CHECK-LABEL: test_mask_packus_epi32_rrkz_128
  ;CHECK: vpackusdw %xmm1, %xmm0, %xmm0 {%k1} {z}
  %res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 %mask)
  ret <8 x i16> %res
}

define <8 x i16> @test_mask_packus_epi32_rm_128(<4 x i32> %a, <4
x i32>* %ptr_b) { 2371 ;CHECK-LABEL: test_mask_packus_epi32_rm_128 2372 ;CHECK: vpackusdw (%rdi), %xmm0, %xmm0 2373 %b = load <4 x i32>, <4 x i32>* %ptr_b 2374 %res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 -1) 2375 ret <8 x i16> %res 2376} 2377 2378define <8 x i16> @test_mask_packus_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <8 x i16> %passThru, i8 %mask) { 2379 ;CHECK-LABEL: test_mask_packus_epi32_rmk_128 2380 ;CHECK: vpackusdw (%rdi), %xmm0, %xmm1 {%k1} 2381 %b = load <4 x i32>, <4 x i32>* %ptr_b 2382 %res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask) 2383 ret <8 x i16> %res 2384} 2385 2386define <8 x i16> @test_mask_packus_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) { 2387 ;CHECK-LABEL: test_mask_packus_epi32_rmkz_128 2388 ;CHECK: vpackusdw (%rdi), %xmm0, %xmm0 {%k1} {z} 2389 %b = load <4 x i32>, <4 x i32>* %ptr_b 2390 %res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 %mask) 2391 ret <8 x i16> %res 2392} 2393 2394define <8 x i16> @test_mask_packus_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) { 2395 ;CHECK-LABEL: test_mask_packus_epi32_rmb_128 2396 ;CHECK: vpackusdw (%rdi){1to4}, %xmm0, %xmm0 2397 %q = load i32, i32* %ptr_b 2398 %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0 2399 %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer 2400 %res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 -1) 2401 ret <8 x i16> %res 2402} 2403 2404define <8 x i16> @test_mask_packus_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <8 x i16> %passThru, i8 %mask) { 2405 ;CHECK-LABEL: test_mask_packus_epi32_rmbk_128 2406 ;CHECK: vpackusdw (%rdi){1to4}, %xmm0, %xmm1 {%k1} 2407 %q = load i32, i32* %ptr_b 2408 %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0 2409 %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer 2410 %res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask) 2411 ret <8 x i16> %res 2412} 2413 2414define <8 x i16> @test_mask_packus_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) { 2415 ;CHECK-LABEL: test_mask_packus_epi32_rmbkz_128 2416 ;CHECK: vpackusdw (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} 2417 %q = load i32, i32* %ptr_b 2418 %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0 2419 %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer 2420 %res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 %mask) 2421 ret <8 x i16> %res 2422} 2423 2424declare <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32>, <4 x i32>, <8 x i16>, i8) 2425 2426define <16 x i16> @test_mask_packus_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) { 2427 ;CHECK-LABEL: test_mask_packus_epi32_rr_256 2428 ;CHECK: vpackusdw %ymm1, %ymm0, %ymm0 2429 %res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 -1) 2430 ret <16 x i16> %res 2431} 2432 2433define <16 x i16> @test_mask_packus_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask) { 2434 ;CHECK-LABEL: test_mask_packus_epi32_rrk_256 2435 ;CHECK: vpackusdw %ymm1, %ymm0, %ymm2 {%k1} 2436 %res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 
%mask) 2437 ret <16 x i16> %res 2438} 2439 2440define <16 x i16> @test_mask_packus_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i16 %mask) { 2441 ;CHECK-LABEL: test_mask_packus_epi32_rrkz_256 2442 ;CHECK: vpackusdw %ymm1, %ymm0, %ymm0 {%k1} {z} 2443 %res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 %mask) 2444 ret <16 x i16> %res 2445} 2446 2447define <16 x i16> @test_mask_packus_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) { 2448 ;CHECK-LABEL: test_mask_packus_epi32_rm_256 2449 ;CHECK: vpackusdw (%rdi), %ymm0, %ymm0 2450 %b = load <8 x i32>, <8 x i32>* %ptr_b 2451 %res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 -1) 2452 ret <16 x i16> %res 2453} 2454 2455define <16 x i16> @test_mask_packus_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <16 x i16> %passThru, i16 %mask) { 2456 ;CHECK-LABEL: test_mask_packus_epi32_rmk_256 2457 ;CHECK: vpackusdw (%rdi), %ymm0, %ymm1 {%k1} 2458 %b = load <8 x i32>, <8 x i32>* %ptr_b 2459 %res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask) 2460 ret <16 x i16> %res 2461} 2462 2463define <16 x i16> @test_mask_packus_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i16 %mask) { 2464 ;CHECK-LABEL: test_mask_packus_epi32_rmkz_256 2465 ;CHECK: vpackusdw (%rdi), %ymm0, %ymm0 {%k1} {z} 2466 %b = load <8 x i32>, <8 x i32>* %ptr_b 2467 %res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 %mask) 2468 ret <16 x i16> %res 2469} 2470 2471define <16 x i16> @test_mask_packus_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) { 2472 ;CHECK-LABEL: test_mask_packus_epi32_rmb_256 2473 ;CHECK: vpackusdw (%rdi){1to8}, %ymm0, %ymm0 2474 %q = load i32, i32* %ptr_b 2475 %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0 2476 %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer 2477 %res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 -1) 2478 ret <16 x i16> %res 2479} 2480 2481define <16 x i16> @test_mask_packus_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <16 x i16> %passThru, i16 %mask) { 2482 ;CHECK-LABEL: test_mask_packus_epi32_rmbk_256 2483 ;CHECK: vpackusdw (%rdi){1to8}, %ymm0, %ymm1 {%k1} 2484 %q = load i32, i32* %ptr_b 2485 %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0 2486 %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer 2487 %res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask) 2488 ret <16 x i16> %res 2489} 2490 2491define <16 x i16> @test_mask_packus_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i16 %mask) { 2492 ;CHECK-LABEL: test_mask_packus_epi32_rmbkz_256 2493 ;CHECK: vpackusdw (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} 2494 %q = load i32, i32* %ptr_b 2495 %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0 2496 %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer 2497 %res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 %mask) 2498 ret <16 x i16> %res 2499} 2500 2501declare <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32>, <8 x i32>, <16 x i16>, i16) 2502 2503define <16 x i8> @test_mask_packus_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) { 2504 ;CHECK-LABEL: test_mask_packus_epi16_rr_128 2505 ;CHECK: vpackuswb 
%xmm1, %xmm0, %xmm0 2506 %res = call <16 x i8> @llvm.x86.avx512.mask.packuswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 -1) 2507 ret <16 x i8> %res 2508} 2509 2510define <16 x i8> @test_mask_packus_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask) { 2511 ;CHECK-LABEL: test_mask_packus_epi16_rrk_128 2512 ;CHECK: vpackuswb %xmm1, %xmm0, %xmm2 {%k1} 2513 %res = call <16 x i8> @llvm.x86.avx512.mask.packuswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask) 2514 ret <16 x i8> %res 2515} 2516 2517define <16 x i8> @test_mask_packus_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i16 %mask) { 2518 ;CHECK-LABEL: test_mask_packus_epi16_rrkz_128 2519 ;CHECK: vpackuswb %xmm1, %xmm0, %xmm0 {%k1} {z} 2520 %res = call <16 x i8> @llvm.x86.avx512.mask.packuswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 %mask) 2521 ret <16 x i8> %res 2522} 2523 2524define <16 x i8> @test_mask_packus_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) { 2525 ;CHECK-LABEL: test_mask_packus_epi16_rm_128 2526 ;CHECK: vpackuswb (%rdi), %xmm0, %xmm0 2527 %b = load <8 x i16>, <8 x i16>* %ptr_b 2528 %res = call <16 x i8> @llvm.x86.avx512.mask.packuswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 -1) 2529 ret <16 x i8> %res 2530} 2531 2532define <16 x i8> @test_mask_packus_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <16 x i8> %passThru, i16 %mask) { 2533 ;CHECK-LABEL: test_mask_packus_epi16_rmk_128 2534 ;CHECK: vpackuswb (%rdi), %xmm0, %xmm1 {%k1} 2535 %b = load <8 x i16>, <8 x i16>* %ptr_b 2536 %res = call <16 x i8> @llvm.x86.avx512.mask.packuswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask) 2537 ret <16 x i8> %res 2538} 2539 2540define <16 x i8> @test_mask_packus_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i16 %mask) { 2541 ;CHECK-LABEL: test_mask_packus_epi16_rmkz_128 2542 ;CHECK: vpackuswb (%rdi), %xmm0, %xmm0 {%k1} {z} 2543 %b = load <8 x i16>, <8 x i16>* %ptr_b 2544 %res = call <16 x i8> @llvm.x86.avx512.mask.packuswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 %mask) 2545 ret <16 x i8> %res 2546} 2547 2548declare <16 x i8> @llvm.x86.avx512.mask.packuswb.128(<8 x i16>, <8 x i16>, <16 x i8>, i16) 2549 2550define <32 x i8> @test_mask_packus_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) { 2551 ;CHECK-LABEL: test_mask_packus_epi16_rr_256 2552 ;CHECK: vpackuswb %ymm1, %ymm0, %ymm0 2553 %res = call <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 -1) 2554 ret <32 x i8> %res 2555} 2556 2557define <32 x i8> @test_mask_packus_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask) { 2558 ;CHECK-LABEL: test_mask_packus_epi16_rrk_256 2559 ;CHECK: vpackuswb %ymm1, %ymm0, %ymm2 {%k1} 2560 %res = call <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask) 2561 ret <32 x i8> %res 2562} 2563 2564define <32 x i8> @test_mask_packus_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i32 %mask) { 2565 ;CHECK-LABEL: test_mask_packus_epi16_rrkz_256 2566 ;CHECK: vpackuswb %ymm1, %ymm0, %ymm0 {%k1} {z} 2567 %res = call <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 %mask) 2568 ret <32 x i8> %res 2569} 2570 2571define <32 x i8> @test_mask_packus_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) { 2572 ;CHECK-LABEL: test_mask_packus_epi16_rm_256 2573 ;CHECK: vpackuswb (%rdi), %ymm0, %ymm0 2574 %b = load <16 x i16>, <16 x i16>* %ptr_b 2575 %res = call 
<32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 -1) 2576 ret <32 x i8> %res 2577} 2578 2579define <32 x i8> @test_mask_packus_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <32 x i8> %passThru, i32 %mask) { 2580 ;CHECK-LABEL: test_mask_packus_epi16_rmk_256 2581 ;CHECK: vpackuswb (%rdi), %ymm0, %ymm1 {%k1} 2582 %b = load <16 x i16>, <16 x i16>* %ptr_b 2583 %res = call <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask) 2584 ret <32 x i8> %res 2585} 2586 2587define <32 x i8> @test_mask_packus_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i32 %mask) { 2588 ;CHECK-LABEL: test_mask_packus_epi16_rmkz_256 2589 ;CHECK: vpackuswb (%rdi), %ymm0, %ymm0 {%k1} {z} 2590 %b = load <16 x i16>, <16 x i16>* %ptr_b 2591 %res = call <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 %mask) 2592 ret <32 x i8> %res 2593} 2594 2595declare <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16>, <16 x i16>, <32 x i8>, i32) 2596 2597define <8 x i16> @test_mask_adds_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) { 2598 ;CHECK-LABEL: test_mask_adds_epi16_rr_128 2599 ;CHECK: vpaddsw %xmm1, %xmm0, %xmm0 2600 %res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1) 2601 ret <8 x i16> %res 2602} 2603 2604define <8 x i16> @test_mask_adds_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) { 2605 ;CHECK-LABEL: test_mask_adds_epi16_rrk_128 2606 ;CHECK: vpaddsw %xmm1, %xmm0, %xmm2 {%k1} 2607 %res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) 2608 ret <8 x i16> %res 2609} 2610 2611define <8 x i16> @test_mask_adds_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) { 2612 ;CHECK-LABEL: test_mask_adds_epi16_rrkz_128 2613 ;CHECK: vpaddsw %xmm1, %xmm0, %xmm0 {%k1} {z} 2614 %res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask) 2615 ret <8 x i16> %res 2616} 2617 2618define <8 x i16> @test_mask_adds_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) { 2619 ;CHECK-LABEL: test_mask_adds_epi16_rm_128 2620 ;CHECK: vpaddsw (%rdi), %xmm0, %xmm0 2621 %b = load <8 x i16>, <8 x i16>* %ptr_b 2622 %res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1) 2623 ret <8 x i16> %res 2624} 2625 2626define <8 x i16> @test_mask_adds_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) { 2627 ;CHECK-LABEL: test_mask_adds_epi16_rmk_128 2628 ;CHECK: vpaddsw (%rdi), %xmm0, %xmm1 {%k1} 2629 %b = load <8 x i16>, <8 x i16>* %ptr_b 2630 %res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) 2631 ret <8 x i16> %res 2632} 2633 2634define <8 x i16> @test_mask_adds_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) { 2635 ;CHECK-LABEL: test_mask_adds_epi16_rmkz_128 2636 ;CHECK: vpaddsw (%rdi), %xmm0, %xmm0 {%k1} {z} 2637 %b = load <8 x i16>, <8 x i16>* %ptr_b 2638 %res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask) 2639 ret <8 x i16> %res 2640} 2641 2642declare <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) 2643 2644define <16 x i16> @test_mask_adds_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) { 2645 ;CHECK-LABEL: test_mask_adds_epi16_rr_256 
2646 ;CHECK: vpaddsw %ymm1, %ymm0, %ymm0 2647 %res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1) 2648 ret <16 x i16> %res 2649} 2650 2651define <16 x i16> @test_mask_adds_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) { 2652 ;CHECK-LABEL: test_mask_adds_epi16_rrk_256 2653 ;CHECK: vpaddsw %ymm1, %ymm0, %ymm2 {%k1} 2654 %res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) 2655 ret <16 x i16> %res 2656} 2657 2658define <16 x i16> @test_mask_adds_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) { 2659 ;CHECK-LABEL: test_mask_adds_epi16_rrkz_256 2660 ;CHECK: vpaddsw %ymm1, %ymm0, %ymm0 {%k1} {z} 2661 %res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask) 2662 ret <16 x i16> %res 2663} 2664 2665define <16 x i16> @test_mask_adds_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) { 2666 ;CHECK-LABEL: test_mask_adds_epi16_rm_256 2667 ;CHECK: vpaddsw (%rdi), %ymm0, %ymm0 2668 %b = load <16 x i16>, <16 x i16>* %ptr_b 2669 %res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1) 2670 ret <16 x i16> %res 2671} 2672 2673define <16 x i16> @test_mask_adds_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) { 2674 ;CHECK-LABEL: test_mask_adds_epi16_rmk_256 2675 ;CHECK: vpaddsw (%rdi), %ymm0, %ymm1 {%k1} 2676 %b = load <16 x i16>, <16 x i16>* %ptr_b 2677 %res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) 2678 ret <16 x i16> %res 2679} 2680 2681define <16 x i16> @test_mask_adds_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) { 2682 ;CHECK-LABEL: test_mask_adds_epi16_rmkz_256 2683 ;CHECK: vpaddsw (%rdi), %ymm0, %ymm0 {%k1} {z} 2684 %b = load <16 x i16>, <16 x i16>* %ptr_b 2685 %res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask) 2686 ret <16 x i16> %res 2687} 2688 2689declare <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) 2690 2691define <8 x i16> @test_mask_subs_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) { 2692 ;CHECK-LABEL: test_mask_subs_epi16_rr_128 2693 ;CHECK: vpsubsw %xmm1, %xmm0, %xmm0 2694 %res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1) 2695 ret <8 x i16> %res 2696} 2697 2698define <8 x i16> @test_mask_subs_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) { 2699 ;CHECK-LABEL: test_mask_subs_epi16_rrk_128 2700 ;CHECK: vpsubsw %xmm1, %xmm0, %xmm2 {%k1} 2701 %res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) 2702 ret <8 x i16> %res 2703} 2704 2705define <8 x i16> @test_mask_subs_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) { 2706 ;CHECK-LABEL: test_mask_subs_epi16_rrkz_128 2707 ;CHECK: vpsubsw %xmm1, %xmm0, %xmm0 {%k1} {z} 2708 %res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask) 2709 ret <8 x i16> %res 2710} 2711 2712define <8 x i16> @test_mask_subs_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) { 2713 ;CHECK-LABEL: test_mask_subs_epi16_rm_128 2714 ;CHECK: vpsubsw (%rdi), %xmm0, %xmm0 2715 %b = load <8 x i16>, <8 x i16>* %ptr_b 2716 %res = call <8 x 
i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1) 2717 ret <8 x i16> %res 2718} 2719 2720define <8 x i16> @test_mask_subs_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) { 2721 ;CHECK-LABEL: test_mask_subs_epi16_rmk_128 2722 ;CHECK: vpsubsw (%rdi), %xmm0, %xmm1 {%k1} 2723 %b = load <8 x i16>, <8 x i16>* %ptr_b 2724 %res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) 2725 ret <8 x i16> %res 2726} 2727 2728define <8 x i16> @test_mask_subs_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) { 2729 ;CHECK-LABEL: test_mask_subs_epi16_rmkz_128 2730 ;CHECK: vpsubsw (%rdi), %xmm0, %xmm0 {%k1} {z} 2731 %b = load <8 x i16>, <8 x i16>* %ptr_b 2732 %res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask) 2733 ret <8 x i16> %res 2734} 2735 2736declare <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) 2737 2738define <16 x i16> @test_mask_subs_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) { 2739 ;CHECK-LABEL: test_mask_subs_epi16_rr_256 2740 ;CHECK: vpsubsw %ymm1, %ymm0, %ymm0 2741 %res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1) 2742 ret <16 x i16> %res 2743} 2744 2745define <16 x i16> @test_mask_subs_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) { 2746 ;CHECK-LABEL: test_mask_subs_epi16_rrk_256 2747 ;CHECK: vpsubsw %ymm1, %ymm0, %ymm2 {%k1} 2748 %res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) 2749 ret <16 x i16> %res 2750} 2751 2752define <16 x i16> @test_mask_subs_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) { 2753 ;CHECK-LABEL: test_mask_subs_epi16_rrkz_256 2754 ;CHECK: vpsubsw %ymm1, %ymm0, %ymm0 {%k1} {z} 2755 %res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask) 2756 ret <16 x i16> %res 2757} 2758 2759define <16 x i16> @test_mask_subs_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) { 2760 ;CHECK-LABEL: test_mask_subs_epi16_rm_256 2761 ;CHECK: vpsubsw (%rdi), %ymm0, %ymm0 2762 %b = load <16 x i16>, <16 x i16>* %ptr_b 2763 %res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1) 2764 ret <16 x i16> %res 2765} 2766 2767define <16 x i16> @test_mask_subs_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) { 2768 ;CHECK-LABEL: test_mask_subs_epi16_rmk_256 2769 ;CHECK: vpsubsw (%rdi), %ymm0, %ymm1 {%k1} 2770 %b = load <16 x i16>, <16 x i16>* %ptr_b 2771 %res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) 2772 ret <16 x i16> %res 2773} 2774 2775define <16 x i16> @test_mask_subs_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) { 2776 ;CHECK-LABEL: test_mask_subs_epi16_rmkz_256 2777 ;CHECK: vpsubsw (%rdi), %ymm0, %ymm0 {%k1} {z} 2778 %b = load <16 x i16>, <16 x i16>* %ptr_b 2779 %res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask) 2780 ret <16 x i16> %res 2781} 2782 2783declare <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) 2784 2785define <8 x i16> @test_mask_adds_epu16_rr_128(<8 x i16> %a, <8 x i16> %b) { 2786 ;CHECK-LABEL: 
test_mask_adds_epu16_rr_128 2787 ;CHECK: vpaddusw %xmm1, %xmm0, %xmm0 2788 %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1) 2789 ret <8 x i16> %res 2790} 2791 2792define <8 x i16> @test_mask_adds_epu16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) { 2793 ;CHECK-LABEL: test_mask_adds_epu16_rrk_128 2794 ;CHECK: vpaddusw %xmm1, %xmm0, %xmm2 {%k1} 2795 %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) 2796 ret <8 x i16> %res 2797} 2798 2799define <8 x i16> @test_mask_adds_epu16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) { 2800 ;CHECK-LABEL: test_mask_adds_epu16_rrkz_128 2801 ;CHECK: vpaddusw %xmm1, %xmm0, %xmm0 {%k1} {z} 2802 %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask) 2803 ret <8 x i16> %res 2804} 2805 2806define <8 x i16> @test_mask_adds_epu16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) { 2807 ;CHECK-LABEL: test_mask_adds_epu16_rm_128 2808 ;CHECK: vpaddusw (%rdi), %xmm0, %xmm0 2809 %b = load <8 x i16>, <8 x i16>* %ptr_b 2810 %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1) 2811 ret <8 x i16> %res 2812} 2813 2814define <8 x i16> @test_mask_adds_epu16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) { 2815 ;CHECK-LABEL: test_mask_adds_epu16_rmk_128 2816 ;CHECK: vpaddusw (%rdi), %xmm0, %xmm1 {%k1} 2817 %b = load <8 x i16>, <8 x i16>* %ptr_b 2818 %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) 2819 ret <8 x i16> %res 2820} 2821 2822define <8 x i16> @test_mask_adds_epu16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) { 2823 ;CHECK-LABEL: test_mask_adds_epu16_rmkz_128 2824 ;CHECK: vpaddusw (%rdi), %xmm0, %xmm0 {%k1} {z} 2825 %b = load <8 x i16>, <8 x i16>* %ptr_b 2826 %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask) 2827 ret <8 x i16> %res 2828} 2829 2830declare <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) 2831 2832define <16 x i16> @test_mask_adds_epu16_rr_256(<16 x i16> %a, <16 x i16> %b) { 2833 ;CHECK-LABEL: test_mask_adds_epu16_rr_256 2834 ;CHECK: vpaddusw %ymm1, %ymm0, %ymm0 2835 %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1) 2836 ret <16 x i16> %res 2837} 2838 2839define <16 x i16> @test_mask_adds_epu16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) { 2840 ;CHECK-LABEL: test_mask_adds_epu16_rrk_256 2841 ;CHECK: vpaddusw %ymm1, %ymm0, %ymm2 {%k1} 2842 %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) 2843 ret <16 x i16> %res 2844} 2845 2846define <16 x i16> @test_mask_adds_epu16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) { 2847 ;CHECK-LABEL: test_mask_adds_epu16_rrkz_256 2848 ;CHECK: vpaddusw %ymm1, %ymm0, %ymm0 {%k1} {z} 2849 %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask) 2850 ret <16 x i16> %res 2851} 2852 2853define <16 x i16> @test_mask_adds_epu16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) { 2854 ;CHECK-LABEL: test_mask_adds_epu16_rm_256 2855 ;CHECK: vpaddusw (%rdi), %ymm0, %ymm0 2856 %b = load <16 x i16>, <16 x i16>* %ptr_b 2857 %res 
= call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1) 2858 ret <16 x i16> %res 2859} 2860 2861define <16 x i16> @test_mask_adds_epu16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) { 2862 ;CHECK-LABEL: test_mask_adds_epu16_rmk_256 2863 ;CHECK: vpaddusw (%rdi), %ymm0, %ymm1 {%k1} 2864 %b = load <16 x i16>, <16 x i16>* %ptr_b 2865 %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) 2866 ret <16 x i16> %res 2867} 2868 2869define <16 x i16> @test_mask_adds_epu16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) { 2870 ;CHECK-LABEL: test_mask_adds_epu16_rmkz_256 2871 ;CHECK: vpaddusw (%rdi), %ymm0, %ymm0 {%k1} {z} 2872 %b = load <16 x i16>, <16 x i16>* %ptr_b 2873 %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask) 2874 ret <16 x i16> %res 2875} 2876 2877declare <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) 2878 2879define <8 x i16> @test_mask_subs_epu16_rr_128(<8 x i16> %a, <8 x i16> %b) { 2880 ;CHECK-LABEL: test_mask_subs_epu16_rr_128 2881 ;CHECK: vpsubusw %xmm1, %xmm0, %xmm0 2882 %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1) 2883 ret <8 x i16> %res 2884} 2885 2886define <8 x i16> @test_mask_subs_epu16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) { 2887 ;CHECK-LABEL: test_mask_subs_epu16_rrk_128 2888 ;CHECK: vpsubusw %xmm1, %xmm0, %xmm2 {%k1} 2889 %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) 2890 ret <8 x i16> %res 2891} 2892 2893define <8 x i16> @test_mask_subs_epu16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) { 2894 ;CHECK-LABEL: test_mask_subs_epu16_rrkz_128 2895 ;CHECK: vpsubusw %xmm1, %xmm0, %xmm0 {%k1} {z} 2896 %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask) 2897 ret <8 x i16> %res 2898} 2899 2900define <8 x i16> @test_mask_subs_epu16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) { 2901 ;CHECK-LABEL: test_mask_subs_epu16_rm_128 2902 ;CHECK: vpsubusw (%rdi), %xmm0, %xmm0 2903 %b = load <8 x i16>, <8 x i16>* %ptr_b 2904 %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1) 2905 ret <8 x i16> %res 2906} 2907 2908define <8 x i16> @test_mask_subs_epu16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) { 2909 ;CHECK-LABEL: test_mask_subs_epu16_rmk_128 2910 ;CHECK: vpsubusw (%rdi), %xmm0, %xmm1 {%k1} 2911 %b = load <8 x i16>, <8 x i16>* %ptr_b 2912 %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) 2913 ret <8 x i16> %res 2914} 2915 2916define <8 x i16> @test_mask_subs_epu16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) { 2917 ;CHECK-LABEL: test_mask_subs_epu16_rmkz_128 2918 ;CHECK: vpsubusw (%rdi), %xmm0, %xmm0 {%k1} {z} 2919 %b = load <8 x i16>, <8 x i16>* %ptr_b 2920 %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask) 2921 ret <8 x i16> %res 2922} 2923 2924declare <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) 2925 2926define <16 x i16> @test_mask_subs_epu16_rr_256(<16 x i16> %a, <16 x i16> %b) { 2927 ;CHECK-LABEL: 
test_mask_subs_epu16_rr_256 2928 ;CHECK: vpsubusw %ymm1, %ymm0, %ymm0 2929 %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1) 2930 ret <16 x i16> %res 2931} 2932 2933define <16 x i16> @test_mask_subs_epu16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) { 2934 ;CHECK-LABEL: test_mask_subs_epu16_rrk_256 2935 ;CHECK: vpsubusw %ymm1, %ymm0, %ymm2 {%k1} 2936 %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) 2937 ret <16 x i16> %res 2938} 2939 2940define <16 x i16> @test_mask_subs_epu16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) { 2941 ;CHECK-LABEL: test_mask_subs_epu16_rrkz_256 2942 ;CHECK: vpsubusw %ymm1, %ymm0, %ymm0 {%k1} {z} 2943 %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask) 2944 ret <16 x i16> %res 2945} 2946 2947define <16 x i16> @test_mask_subs_epu16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) { 2948 ;CHECK-LABEL: test_mask_subs_epu16_rm_256 2949 ;CHECK: vpsubusw (%rdi), %ymm0, %ymm0 2950 %b = load <16 x i16>, <16 x i16>* %ptr_b 2951 %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1) 2952 ret <16 x i16> %res 2953} 2954 2955define <16 x i16> @test_mask_subs_epu16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) { 2956 ;CHECK-LABEL: test_mask_subs_epu16_rmk_256 2957 ;CHECK: vpsubusw (%rdi), %ymm0, %ymm1 {%k1} 2958 %b = load <16 x i16>, <16 x i16>* %ptr_b 2959 %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) 2960 ret <16 x i16> %res 2961} 2962 2963define <16 x i16> @test_mask_subs_epu16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) { 2964 ;CHECK-LABEL: test_mask_subs_epu16_rmkz_256 2965 ;CHECK: vpsubusw (%rdi), %ymm0, %ymm0 {%k1} {z} 2966 %b = load <16 x i16>, <16 x i16>* %ptr_b 2967 %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask) 2968 ret <16 x i16> %res 2969} 2970 2971declare <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) 2972 2973define <16 x i8> @test_mask_adds_epi8_rr_128(<16 x i8> %a, <16 x i8> %b) { 2974 ;CHECK-LABEL: test_mask_adds_epi8_rr_128 2975 ;CHECK: vpaddsb %xmm1, %xmm0, %xmm0 2976 %res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1) 2977 ret <16 x i8> %res 2978} 2979 2980define <16 x i8> @test_mask_adds_epi8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) { 2981 ;CHECK-LABEL: test_mask_adds_epi8_rrk_128 2982 ;CHECK: vpaddsb %xmm1, %xmm0, %xmm2 {%k1} 2983 %res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) 2984 ret <16 x i8> %res 2985} 2986 2987define <16 x i8> @test_mask_adds_epi8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) { 2988 ;CHECK-LABEL: test_mask_adds_epi8_rrkz_128 2989 ;CHECK: vpaddsb %xmm1, %xmm0, %xmm0 {%k1} {z} 2990 %res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask) 2991 ret <16 x i8> %res 2992} 2993 2994define <16 x i8> @test_mask_adds_epi8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) { 2995 ;CHECK-LABEL: test_mask_adds_epi8_rm_128 2996 ;CHECK: vpaddsb (%rdi), %xmm0, %xmm0 2997 %b = load <16 x i8>, <16 
x i8>* %ptr_b 2998 %res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1) 2999 ret <16 x i8> %res 3000} 3001 3002define <16 x i8> @test_mask_adds_epi8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) { 3003 ;CHECK-LABEL: test_mask_adds_epi8_rmk_128 3004 ;CHECK: vpaddsb (%rdi), %xmm0, %xmm1 {%k1} 3005 %b = load <16 x i8>, <16 x i8>* %ptr_b 3006 %res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) 3007 ret <16 x i8> %res 3008} 3009 3010define <16 x i8> @test_mask_adds_epi8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) { 3011 ;CHECK-LABEL: test_mask_adds_epi8_rmkz_128 3012 ;CHECK: vpaddsb (%rdi), %xmm0, %xmm0 {%k1} {z} 3013 %b = load <16 x i8>, <16 x i8>* %ptr_b 3014 %res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask) 3015 ret <16 x i8> %res 3016} 3017 3018declare <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16) 3019 3020define <32 x i8> @test_mask_adds_epi8_rr_256(<32 x i8> %a, <32 x i8> %b) { 3021 ;CHECK-LABEL: test_mask_adds_epi8_rr_256 3022 ;CHECK: vpaddsb %ymm1, %ymm0, %ymm0 3023 %res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1) 3024 ret <32 x i8> %res 3025} 3026 3027define <32 x i8> @test_mask_adds_epi8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) { 3028 ;CHECK-LABEL: test_mask_adds_epi8_rrk_256 3029 ;CHECK: vpaddsb %ymm1, %ymm0, %ymm2 {%k1} 3030 %res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) 3031 ret <32 x i8> %res 3032} 3033 3034define <32 x i8> @test_mask_adds_epi8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) { 3035 ;CHECK-LABEL: test_mask_adds_epi8_rrkz_256 3036 ;CHECK: vpaddsb %ymm1, %ymm0, %ymm0 {%k1} {z} 3037 %res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask) 3038 ret <32 x i8> %res 3039} 3040 3041define <32 x i8> @test_mask_adds_epi8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) { 3042 ;CHECK-LABEL: test_mask_adds_epi8_rm_256 3043 ;CHECK: vpaddsb (%rdi), %ymm0, %ymm0 3044 %b = load <32 x i8>, <32 x i8>* %ptr_b 3045 %res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1) 3046 ret <32 x i8> %res 3047} 3048 3049define <32 x i8> @test_mask_adds_epi8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) { 3050 ;CHECK-LABEL: test_mask_adds_epi8_rmk_256 3051 ;CHECK: vpaddsb (%rdi), %ymm0, %ymm1 {%k1} 3052 %b = load <32 x i8>, <32 x i8>* %ptr_b 3053 %res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) 3054 ret <32 x i8> %res 3055} 3056 3057define <32 x i8> @test_mask_adds_epi8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) { 3058 ;CHECK-LABEL: test_mask_adds_epi8_rmkz_256 3059 ;CHECK: vpaddsb (%rdi), %ymm0, %ymm0 {%k1} {z} 3060 %b = load <32 x i8>, <32 x i8>* %ptr_b 3061 %res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask) 3062 ret <32 x i8> %res 3063} 3064 3065declare <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) 3066 3067define <16 x i8> @test_mask_subs_epi8_rr_128(<16 x i8> %a, <16 x i8> %b) { 3068 ;CHECK-LABEL: test_mask_subs_epi8_rr_128 3069 ;CHECK: 
vpsubsb %xmm1, %xmm0, %xmm0 3070 %res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1) 3071 ret <16 x i8> %res 3072} 3073 3074define <16 x i8> @test_mask_subs_epi8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) { 3075 ;CHECK-LABEL: test_mask_subs_epi8_rrk_128 3076 ;CHECK: vpsubsb %xmm1, %xmm0, %xmm2 {%k1} 3077 %res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) 3078 ret <16 x i8> %res 3079} 3080 3081define <16 x i8> @test_mask_subs_epi8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) { 3082 ;CHECK-LABEL: test_mask_subs_epi8_rrkz_128 3083 ;CHECK: vpsubsb %xmm1, %xmm0, %xmm0 {%k1} {z} 3084 %res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask) 3085 ret <16 x i8> %res 3086} 3087 3088define <16 x i8> @test_mask_subs_epi8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) { 3089 ;CHECK-LABEL: test_mask_subs_epi8_rm_128 3090 ;CHECK: vpsubsb (%rdi), %xmm0, %xmm0 3091 %b = load <16 x i8>, <16 x i8>* %ptr_b 3092 %res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1) 3093 ret <16 x i8> %res 3094} 3095 3096define <16 x i8> @test_mask_subs_epi8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) { 3097 ;CHECK-LABEL: test_mask_subs_epi8_rmk_128 3098 ;CHECK: vpsubsb (%rdi), %xmm0, %xmm1 {%k1} 3099 %b = load <16 x i8>, <16 x i8>* %ptr_b 3100 %res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) 3101 ret <16 x i8> %res 3102} 3103 3104define <16 x i8> @test_mask_subs_epi8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) { 3105 ;CHECK-LABEL: test_mask_subs_epi8_rmkz_128 3106 ;CHECK: vpsubsb (%rdi), %xmm0, %xmm0 {%k1} {z} 3107 %b = load <16 x i8>, <16 x i8>* %ptr_b 3108 %res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask) 3109 ret <16 x i8> %res 3110} 3111 3112declare <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16) 3113 3114define <32 x i8> @test_mask_subs_epi8_rr_256(<32 x i8> %a, <32 x i8> %b) { 3115 ;CHECK-LABEL: test_mask_subs_epi8_rr_256 3116 ;CHECK: vpsubsb %ymm1, %ymm0, %ymm0 3117 %res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1) 3118 ret <32 x i8> %res 3119} 3120 3121define <32 x i8> @test_mask_subs_epi8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) { 3122 ;CHECK-LABEL: test_mask_subs_epi8_rrk_256 3123 ;CHECK: vpsubsb %ymm1, %ymm0, %ymm2 {%k1} 3124 %res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) 3125 ret <32 x i8> %res 3126} 3127 3128define <32 x i8> @test_mask_subs_epi8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) { 3129 ;CHECK-LABEL: test_mask_subs_epi8_rrkz_256 3130 ;CHECK: vpsubsb %ymm1, %ymm0, %ymm0 {%k1} {z} 3131 %res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask) 3132 ret <32 x i8> %res 3133} 3134 3135define <32 x i8> @test_mask_subs_epi8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) { 3136 ;CHECK-LABEL: test_mask_subs_epi8_rm_256 3137 ;CHECK: vpsubsb (%rdi), %ymm0, %ymm0 3138 %b = load <32 x i8>, <32 x i8>* %ptr_b 3139 %res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> 
zeroinitializer, i32 -1) 3140 ret <32 x i8> %res 3141} 3142 3143define <32 x i8> @test_mask_subs_epi8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) { 3144 ;CHECK-LABEL: test_mask_subs_epi8_rmk_256 3145 ;CHECK: vpsubsb (%rdi), %ymm0, %ymm1 {%k1} 3146 %b = load <32 x i8>, <32 x i8>* %ptr_b 3147 %res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) 3148 ret <32 x i8> %res 3149} 3150 3151define <32 x i8> @test_mask_subs_epi8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) { 3152 ;CHECK-LABEL: test_mask_subs_epi8_rmkz_256 3153 ;CHECK: vpsubsb (%rdi), %ymm0, %ymm0 {%k1} {z} 3154 %b = load <32 x i8>, <32 x i8>* %ptr_b 3155 %res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask) 3156 ret <32 x i8> %res 3157} 3158 3159declare <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) 3160 3161define <16 x i8> @test_mask_adds_epu8_rr_128(<16 x i8> %a, <16 x i8> %b) { 3162 ;CHECK-LABEL: test_mask_adds_epu8_rr_128 3163 ;CHECK: vpaddusb %xmm1, %xmm0, %xmm0 3164 %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1) 3165 ret <16 x i8> %res 3166} 3167 3168define <16 x i8> @test_mask_adds_epu8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) { 3169 ;CHECK-LABEL: test_mask_adds_epu8_rrk_128 3170 ;CHECK: vpaddusb %xmm1, %xmm0, %xmm2 {%k1} 3171 %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) 3172 ret <16 x i8> %res 3173} 3174 3175define <16 x i8> @test_mask_adds_epu8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) { 3176 ;CHECK-LABEL: test_mask_adds_epu8_rrkz_128 3177 ;CHECK: vpaddusb %xmm1, %xmm0, %xmm0 {%k1} {z} 3178 %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask) 3179 ret <16 x i8> %res 3180} 3181 3182define <16 x i8> @test_mask_adds_epu8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) { 3183 ;CHECK-LABEL: test_mask_adds_epu8_rm_128 3184 ;CHECK: vpaddusb (%rdi), %xmm0, %xmm0 3185 %b = load <16 x i8>, <16 x i8>* %ptr_b 3186 %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1) 3187 ret <16 x i8> %res 3188} 3189 3190define <16 x i8> @test_mask_adds_epu8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) { 3191 ;CHECK-LABEL: test_mask_adds_epu8_rmk_128 3192 ;CHECK: vpaddusb (%rdi), %xmm0, %xmm1 {%k1} 3193 %b = load <16 x i8>, <16 x i8>* %ptr_b 3194 %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) 3195 ret <16 x i8> %res 3196} 3197 3198define <16 x i8> @test_mask_adds_epu8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) { 3199 ;CHECK-LABEL: test_mask_adds_epu8_rmkz_128 3200 ;CHECK: vpaddusb (%rdi), %xmm0, %xmm0 {%k1} {z} 3201 %b = load <16 x i8>, <16 x i8>* %ptr_b 3202 %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask) 3203 ret <16 x i8> %res 3204} 3205 3206declare <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16) 3207 3208define <32 x i8> @test_mask_adds_epu8_rr_256(<32 x i8> %a, <32 x i8> %b) { 3209 ;CHECK-LABEL: test_mask_adds_epu8_rr_256 3210 ;CHECK: vpaddusb %ymm1, %ymm0, %ymm0 3211 %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> 
%a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1) 3212 ret <32 x i8> %res 3213} 3214 3215define <32 x i8> @test_mask_adds_epu8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) { 3216 ;CHECK-LABEL: test_mask_adds_epu8_rrk_256 3217 ;CHECK: vpaddusb %ymm1, %ymm0, %ymm2 {%k1} 3218 %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) 3219 ret <32 x i8> %res 3220} 3221 3222define <32 x i8> @test_mask_adds_epu8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) { 3223 ;CHECK-LABEL: test_mask_adds_epu8_rrkz_256 3224 ;CHECK: vpaddusb %ymm1, %ymm0, %ymm0 {%k1} {z} 3225 %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask) 3226 ret <32 x i8> %res 3227} 3228 3229define <32 x i8> @test_mask_adds_epu8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) { 3230 ;CHECK-LABEL: test_mask_adds_epu8_rm_256 3231 ;CHECK: vpaddusb (%rdi), %ymm0, %ymm0 3232 %b = load <32 x i8>, <32 x i8>* %ptr_b 3233 %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1) 3234 ret <32 x i8> %res 3235} 3236 3237define <32 x i8> @test_mask_adds_epu8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) { 3238 ;CHECK-LABEL: test_mask_adds_epu8_rmk_256 3239 ;CHECK: vpaddusb (%rdi), %ymm0, %ymm1 {%k1} 3240 %b = load <32 x i8>, <32 x i8>* %ptr_b 3241 %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) 3242 ret <32 x i8> %res 3243} 3244 3245define <32 x i8> @test_mask_adds_epu8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) { 3246 ;CHECK-LABEL: test_mask_adds_epu8_rmkz_256 3247 ;CHECK: vpaddusb (%rdi), %ymm0, %ymm0 {%k1} {z} 3248 %b = load <32 x i8>, <32 x i8>* %ptr_b 3249 %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask) 3250 ret <32 x i8> %res 3251} 3252 3253declare <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) 3254 3255define <16 x i8> @test_mask_subs_epu8_rr_128(<16 x i8> %a, <16 x i8> %b) { 3256 ;CHECK-LABEL: test_mask_subs_epu8_rr_128 3257 ;CHECK: vpsubusb %xmm1, %xmm0, %xmm0 3258 %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1) 3259 ret <16 x i8> %res 3260} 3261 3262define <16 x i8> @test_mask_subs_epu8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) { 3263 ;CHECK-LABEL: test_mask_subs_epu8_rrk_128 3264 ;CHECK: vpsubusb %xmm1, %xmm0, %xmm2 {%k1} 3265 %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) 3266 ret <16 x i8> %res 3267} 3268 3269define <16 x i8> @test_mask_subs_epu8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) { 3270 ;CHECK-LABEL: test_mask_subs_epu8_rrkz_128 3271 ;CHECK: vpsubusb %xmm1, %xmm0, %xmm0 {%k1} {z} 3272 %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask) 3273 ret <16 x i8> %res 3274} 3275 3276define <16 x i8> @test_mask_subs_epu8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) { 3277 ;CHECK-LABEL: test_mask_subs_epu8_rm_128 3278 ;CHECK: vpsubusb (%rdi), %xmm0, %xmm0 3279 %b = load <16 x i8>, <16 x i8>* %ptr_b 3280 %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1) 3281 ret <16 x i8> %res 3282} 3283 3284define <16 x i8> 
@test_mask_subs_epu8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
  ;CHECK-LABEL: test_mask_subs_epu8_rmk_128
  ;CHECK: vpsubusb (%rdi), %xmm0, %xmm1 {%k1}
  %b = load <16 x i8>, <16 x i8>* %ptr_b
  %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
  ret <16 x i8> %res
}

define <16 x i8> @test_mask_subs_epu8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) {
  ;CHECK-LABEL: test_mask_subs_epu8_rmkz_128
  ;CHECK: vpsubusb (%rdi), %xmm0, %xmm0 {%k1} {z}
  %b = load <16 x i8>, <16 x i8>* %ptr_b
  %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
  ret <16 x i8> %res
}

declare <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)

define <32 x i8> @test_mask_subs_epu8_rr_256(<32 x i8> %a, <32 x i8> %b) {
  ;CHECK-LABEL: test_mask_subs_epu8_rr_256
  ;CHECK: vpsubusb %ymm1, %ymm0, %ymm0
  %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
  ret <32 x i8> %res
}

define <32 x i8> @test_mask_subs_epu8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) {
  ;CHECK-LABEL: test_mask_subs_epu8_rrk_256
  ;CHECK: vpsubusb %ymm1, %ymm0, %ymm2 {%k1}
  %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
  ret <32 x i8> %res
}

define <32 x i8> @test_mask_subs_epu8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
  ;CHECK-LABEL: test_mask_subs_epu8_rrkz_256
  ;CHECK: vpsubusb %ymm1, %ymm0, %ymm0 {%k1} {z}
  %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
  ret <32 x i8> %res
}

define <32 x i8> @test_mask_subs_epu8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) {
  ;CHECK-LABEL: test_mask_subs_epu8_rm_256
  ;CHECK: vpsubusb (%rdi), %ymm0, %ymm0
  %b = load <32 x i8>, <32 x i8>* %ptr_b
  %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
  ret <32 x i8> %res
}

define <32 x i8> @test_mask_subs_epu8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
  ;CHECK-LABEL: test_mask_subs_epu8_rmk_256
  ;CHECK: vpsubusb (%rdi), %ymm0, %ymm1 {%k1}
  %b = load <32 x i8>, <32 x i8>* %ptr_b
  %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
  ret <32 x i8> %res
}

define <32 x i8> @test_mask_subs_epu8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) {
  ;CHECK-LABEL: test_mask_subs_epu8_rmkz_256
  ;CHECK: vpsubusb (%rdi), %ymm0, %ymm0 {%k1} {z}
  %b = load <32 x i8>, <32 x i8>* %ptr_b
  %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
  ret <32 x i8> %res
}

declare <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)

declare <16 x i8> @llvm.x86.avx512.mask.pmaxs.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)

; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxs_b_128
; CHECK-NOT: call
; CHECK: vpmaxsb %xmm
; CHECK: {%k1}
define <16 x i8>@test_int_x86_avx512_mask_pmaxs_b_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %mask) {
  %res = call <16 x i8> @llvm.x86.avx512.mask.pmaxs.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %mask)
  %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmaxs.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> zeroinitializer, i16 %mask)
  %res2 = add <16 x i8> %res, %res1
  ret <16 x i8> %res2
}

declare <32 x i8> @llvm.x86.avx512.mask.pmaxs.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)

; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxs_b_256
; CHECK-NOT: call
; CHECK: vpmaxsb %ymm
; CHECK: {%k1}
define <32 x i8>@test_int_x86_avx512_mask_pmaxs_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
  %res = call <32 x i8> @llvm.x86.avx512.mask.pmaxs.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
  %res1 = call <32 x i8> @llvm.x86.avx512.mask.pmaxs.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
  %res2 = add <32 x i8> %res, %res1
  ret <32 x i8> %res2
}

declare <8 x i16> @llvm.x86.avx512.mask.pmaxs.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)

; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxs_w_128
; CHECK-NOT: call
; CHECK: vpmaxsw %xmm
; CHECK: {%k1}
define <8 x i16>@test_int_x86_avx512_mask_pmaxs_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
  %res = call <8 x i16> @llvm.x86.avx512.mask.pmaxs.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
  %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmaxs.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
  %res2 = add <8 x i16> %res, %res1
  ret <8 x i16> %res2
}

declare <16 x i16> @llvm.x86.avx512.mask.pmaxs.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)

; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxs_w_256
; CHECK-NOT: call
; CHECK: vpmaxsw %ymm
; CHECK: {%k1}
define <16 x i16>@test_int_x86_avx512_mask_pmaxs_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %mask) {
  %res = call <16 x i16> @llvm.x86.avx512.mask.pmaxs.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %mask)
  %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmaxs.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %mask)
  %res2 = add <16 x i16> %res, %res1
  ret <16 x i16> %res2
}

declare <16 x i8> @llvm.x86.avx512.mask.pmaxu.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)

; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxu_b_128
; CHECK-NOT: call
; CHECK: vpmaxub %xmm
; CHECK: {%k1}
define <16 x i8>@test_int_x86_avx512_mask_pmaxu_b_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %mask) {
  %res = call <16 x i8> @llvm.x86.avx512.mask.pmaxu.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %mask)
  %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmaxu.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> zeroinitializer, i16 %mask)
  %res2 = add <16 x i8> %res, %res1
  ret <16 x i8> %res2
}

declare <32 x i8> @llvm.x86.avx512.mask.pmaxu.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)

; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxu_b_256
; CHECK-NOT: call
; CHECK: vpmaxub %ymm
; CHECK: {%k1}
define <32 x i8>@test_int_x86_avx512_mask_pmaxu_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
  %res = call <32 x i8> @llvm.x86.avx512.mask.pmaxu.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
  %res1 = call <32 x i8> @llvm.x86.avx512.mask.pmaxu.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
  %res2 = add <32 x
i8> %res, %res1 3424 ret <32 x i8> %res2 3425} 3426 3427declare <8 x i16> @llvm.x86.avx512.mask.pmaxu.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) 3428 3429; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxu_w_128 3430; CHECK-NOT: call 3431; CHECK: vpmaxuw %xmm 3432; CHECK: {%k1} 3433define <8 x i16>@test_int_x86_avx512_mask_pmaxu_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { 3434 %res = call <8 x i16> @llvm.x86.avx512.mask.pmaxu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) 3435 %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmaxu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) 3436 %res2 = add <8 x i16> %res, %res1 3437 ret <8 x i16> %res2 3438} 3439 3440declare <16 x i16> @llvm.x86.avx512.mask.pmaxu.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) 3441 3442; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxu_w_256 3443; CHECK-NOT: call 3444; CHECK: vpmaxuw %ymm 3445; CHECK: {%k1} 3446define <16 x i16>@test_int_x86_avx512_mask_pmaxu_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %mask) { 3447 %res = call <16 x i16> @llvm.x86.avx512.mask.pmaxu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %mask) 3448 %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmaxu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %mask) 3449 %res2 = add <16 x i16> %res, %res1 3450 ret <16 x i16> %res2 3451} 3452 3453declare <16 x i8> @llvm.x86.avx512.mask.pmins.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16) 3454 3455; CHECK-LABEL: @test_int_x86_avx512_mask_pmins_b_128 3456; CHECK-NOT: call 3457; CHECK: vpminsb %xmm 3458; CHECK: {%k1} 3459define <16 x i8>@test_int_x86_avx512_mask_pmins_b_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %mask) { 3460 %res = call <16 x i8> @llvm.x86.avx512.mask.pmins.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %mask) 3461 %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmins.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> zeroinitializer, i16 %mask) 3462 %res2 = add <16 x i8> %res, %res1 3463 ret <16 x i8> %res2 3464} 3465 3466declare <32 x i8> @llvm.x86.avx512.mask.pmins.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) 3467 3468; CHECK-LABEL: @test_int_x86_avx512_mask_pmins_b_256 3469; CHECK-NOT: call 3470; CHECK: vpminsb %ymm 3471; CHECK: {%k1} 3472define <32 x i8>@test_int_x86_avx512_mask_pmins_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) { 3473 %res = call <32 x i8> @llvm.x86.avx512.mask.pmins.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) 3474 %res1 = call <32 x i8> @llvm.x86.avx512.mask.pmins.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) 3475 %res2 = add <32 x i8> %res, %res1 3476 ret <32 x i8> %res2 3477} 3478 3479declare <8 x i16> @llvm.x86.avx512.mask.pmins.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) 3480 3481; CHECK-LABEL: @test_int_x86_avx512_mask_pmins_w_128 3482; CHECK-NOT: call 3483; CHECK: vpminsw %xmm 3484; CHECK: {%k1} 3485define <8 x i16>@test_int_x86_avx512_mask_pmins_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { 3486 %res = call <8 x i16> @llvm.x86.avx512.mask.pmins.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) 3487 %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmins.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) 3488 %res2 = add <8 x i16> %res, %res1 3489 ret <8 x i16> %res2 3490} 3491 3492declare <16 x i16> @llvm.x86.avx512.mask.pmins.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) 3493 3494; CHECK-LABEL: @test_int_x86_avx512_mask_pmins_w_256 3495; CHECK-NOT: call 3496; CHECK: vpminsw %ymm 3497; CHECK: {%k1} 3498define <16 x 
i16>@test_int_x86_avx512_mask_pmins_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %mask) { 3499 %res = call <16 x i16> @llvm.x86.avx512.mask.pmins.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %mask) 3500 %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmins.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %mask) 3501 %res2 = add <16 x i16> %res, %res1 3502 ret <16 x i16> %res2 3503} 3504 3505declare <16 x i8> @llvm.x86.avx512.mask.pminu.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16) 3506 3507; CHECK-LABEL: @test_int_x86_avx512_mask_pminu_b_128 3508; CHECK-NOT: call 3509; CHECK: vpminub %xmm 3510; CHECK: {%k1} 3511define <16 x i8>@test_int_x86_avx512_mask_pminu_b_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %mask) { 3512 %res = call <16 x i8> @llvm.x86.avx512.mask.pminu.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %mask) 3513 %res1 = call <16 x i8> @llvm.x86.avx512.mask.pminu.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> zeroinitializer, i16 %mask) 3514 %res2 = add <16 x i8> %res, %res1 3515 ret <16 x i8> %res2 3516} 3517 3518declare <32 x i8> @llvm.x86.avx512.mask.pminu.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) 3519 3520; CHECK-LABEL: @test_int_x86_avx512_mask_pminu_b_256 3521; CHECK-NOT: call 3522; CHECK: vpminub %ymm 3523; CHECK: {%k1} 3524define <32 x i8>@test_int_x86_avx512_mask_pminu_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) { 3525 %res = call <32 x i8> @llvm.x86.avx512.mask.pminu.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) 3526 %res1 = call <32 x i8> @llvm.x86.avx512.mask.pminu.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) 3527 %res2 = add <32 x i8> %res, %res1 3528 ret <32 x i8> %res2 3529} 3530 3531declare <8 x i16> @llvm.x86.avx512.mask.pminu.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) 3532 3533; CHECK-LABEL: @test_int_x86_avx512_mask_pminu_w_128 3534; CHECK-NOT: call 3535; CHECK: vpminuw %xmm 3536; CHECK: {%k1} 3537define <8 x i16>@test_int_x86_avx512_mask_pminu_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { 3538 %res = call <8 x i16> @llvm.x86.avx512.mask.pminu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) 3539 %res1 = call <8 x i16> @llvm.x86.avx512.mask.pminu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) 3540 %res2 = add <8 x i16> %res, %res1 3541 ret <8 x i16> %res2 3542} 3543 3544declare <16 x i16> @llvm.x86.avx512.mask.pminu.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) 3545 3546; CHECK-LABEL: @test_int_x86_avx512_mask_pminu_w_256 3547; CHECK-NOT: call 3548; CHECK: vpminuw %ymm 3549; CHECK: {%k1} 3550define <16 x i16>@test_int_x86_avx512_mask_pminu_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %mask) { 3551 %res = call <16 x i16> @llvm.x86.avx512.mask.pminu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %mask) 3552 %res1 = call <16 x i16> @llvm.x86.avx512.mask.pminu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %mask) 3553 %res2 = add <16 x i16> %res, %res1 3554 ret <16 x i16> %res2 3555} 3556 3557declare <8 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) 3558 3559; CHECK-LABEL: @test_int_x86_avx512_mask_vpermt2var_hi_128 3560; CHECK-NOT: call 3561; CHECK: kmov 3562; CHECK: vpermt2w %xmm{{.*}}{%k1} 3563; CHECK-NOT: {z} 3564define <8 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { 3565 %res = call <8 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 
%x3) 3566 %res1 = call <8 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) 3567 %res2 = add <8 x i16> %res, %res1 3568 ret <8 x i16> %res2 3569} 3570 3571declare <8 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) 3572 3573; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_hi_128 3574; CHECK-NOT: call 3575; CHECK: kmov 3576; CHECK: vpermt2w %xmm{{.*}}{%k1} {z} 3577define <8 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { 3578 %res = call <8 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) 3579 %res1 = call <8 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) 3580 %res2 = add <8 x i16> %res, %res1 3581 ret <8 x i16> %res2 3582} 3583 3584declare <16 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) 3585 3586; CHECK-LABEL: @test_int_x86_avx512_mask_vpermt2var_hi_256 3587; CHECK-NOT: call 3588; CHECK: kmov 3589; CHECK: vpermt2w %ymm{{.*}}{%k1} 3590define <16 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) { 3591 %res = call <16 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) 3592 %res1 = call <16 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1) 3593 %res2 = add <16 x i16> %res, %res1 3594 ret <16 x i16> %res2 3595} 3596 3597declare <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) 3598 3599; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_hi_256 3600; CHECK-NOT: call 3601; CHECK: kmov 3602; CHECK: vpermt2w %ymm{{.*}}{%k1} {z} 3603define <16 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) { 3604 %res = call <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) 3605 %res1 = call <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1) 3606 %res2 = add <16 x i16> %res, %res1 3607 ret <16 x i16> %res2 3608} 3609 3610declare <8 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) 3611 3612; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_hi_128 3613; CHECK-NOT: call 3614; CHECK: kmov 3615; CHECK: vpermi2w %xmm{{.*}}{%k1} 3616define <8 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { 3617 %res = call <8 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) 3618 %res1 = call <8 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) 3619 %res2 = add <8 x i16> %res, %res1 3620 ret <8 x i16> %res2 3621} 3622 3623declare <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) 3624 3625; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_hi_256 3626; CHECK-NOT: call 3627; CHECK: kmov 3628; CHECK: vpermi2w %ymm{{.*}}{%k1} 3629define <16 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) { 3630 %res = call <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) 3631 %res1 = call <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x 
i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1) 3632 %res2 = add <16 x i16> %res, %res1 3633 ret <16 x i16> %res2 3634} 3635 3636declare <16 x i8> @llvm.x86.avx512.mask.pavg.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16) 3637 3638; CHECK-LABEL: @test_int_x86_avx512_mask_pavg_b_128 3639; CHECK-NOT: call 3640; CHECK: vpavgb %xmm 3641; CHECK: {%k1} 3642define <16 x i8>@test_int_x86_avx512_mask_pavg_b_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) { 3643 %res = call <16 x i8> @llvm.x86.avx512.mask.pavg.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) 3644 %res1 = call <16 x i8> @llvm.x86.avx512.mask.pavg.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1) 3645 %res2 = add <16 x i8> %res, %res1 3646 ret <16 x i8> %res2 3647} 3648 3649declare <32 x i8> @llvm.x86.avx512.mask.pavg.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) 3650 3651; CHECK-LABEL: @test_int_x86_avx512_mask_pavg_b_256 3652; CHECK-NOT: call 3653; CHECK: vpavgb %ymm 3654; CHECK: {%k1} 3655define <32 x i8>@test_int_x86_avx512_mask_pavg_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) { 3656 %res = call <32 x i8> @llvm.x86.avx512.mask.pavg.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) 3657 %res1 = call <32 x i8> @llvm.x86.avx512.mask.pavg.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) 3658 %res2 = add <32 x i8> %res, %res1 3659 ret <32 x i8> %res2 3660} 3661 3662declare <8 x i16> @llvm.x86.avx512.mask.pavg.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) 3663 3664; CHECK-LABEL: @test_int_x86_avx512_mask_pavg_w_128 3665; CHECK-NOT: call 3666; CHECK: vpavgw %xmm 3667; CHECK: {%k1} 3668define <8 x i16>@test_int_x86_avx512_mask_pavg_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { 3669 %res = call <8 x i16> @llvm.x86.avx512.mask.pavg.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) 3670 %res1 = call <8 x i16> @llvm.x86.avx512.mask.pavg.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) 3671 %res2 = add <8 x i16> %res, %res1 3672 ret <8 x i16> %res2 3673} 3674 3675declare <16 x i16> @llvm.x86.avx512.mask.pavg.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) 3676 3677; CHECK-LABEL: @test_int_x86_avx512_mask_pavg_w_256 3678; CHECK-NOT: call 3679; CHECK: vpavgw %ymm 3680; CHECK: {%k1} 3681define <16 x i16>@test_int_x86_avx512_mask_pavg_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) { 3682 %res = call <16 x i16> @llvm.x86.avx512.mask.pavg.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) 3683 %res1 = call <16 x i16> @llvm.x86.avx512.mask.pavg.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1) 3684 %res2 = add <16 x i16> %res, %res1 3685 ret <16 x i16> %res2 3686} 3687 3688declare <16 x i8> @llvm.x86.avx512.mask.pshuf.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16) 3689 3690; CHECK-LABEL: @test_int_x86_avx512_mask_pshuf_b_128 3691; CHECK-NOT: call 3692; CHECK: kmov 3693; CHECK: vpshufb %xmm{{.*}}{%k1} 3694define <16 x i8>@test_int_x86_avx512_mask_pshuf_b_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) { 3695 %res = call <16 x i8> @llvm.x86.avx512.mask.pshuf.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) 3696 %res1 = call <16 x i8> @llvm.x86.avx512.mask.pshuf.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1) 3697 %res2 = add <16 x i8> %res, %res1 3698 ret <16 x i8> %res2 3699} 3700 3701declare <32 x i8> @llvm.x86.avx512.mask.pshuf.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) 3702 3703; CHECK-LABEL: @test_int_x86_avx512_mask_pshuf_b_256 3704; CHECK-NOT: call 3705; CHECK: kmov 3706; CHECK: vpshufb 
%ymm{{.*}}{%k1} 3707define <32 x i8>@test_int_x86_avx512_mask_pshuf_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) { 3708 %res = call <32 x i8> @llvm.x86.avx512.mask.pshuf.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) 3709 %res1 = call <32 x i8> @llvm.x86.avx512.mask.pshuf.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) 3710 %res2 = add <32 x i8> %res, %res1 3711 ret <32 x i8> %res2 3712} 3713 3714declare <16 x i8> @llvm.x86.avx512.mask.pabs.b.128(<16 x i8>, <16 x i8>, i16) 3715 3716; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_b_128 3717; CHECK-NOT: call 3718; CHECK: kmov 3719; CHECK: vpabsb{{.*}}{%k1} 3720define <16 x i8>@test_int_x86_avx512_mask_pabs_b_128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2) { 3721 %res = call <16 x i8> @llvm.x86.avx512.mask.pabs.b.128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2) 3722 %res1 = call <16 x i8> @llvm.x86.avx512.mask.pabs.b.128(<16 x i8> %x0, <16 x i8> %x1, i16 -1) 3723 %res2 = add <16 x i8> %res, %res1 3724 ret <16 x i8> %res2 3725} 3726 3727declare <32 x i8> @llvm.x86.avx512.mask.pabs.b.256(<32 x i8>, <32 x i8>, i32) 3728 3729; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_b_256 3730; CHECK-NOT: call 3731; CHECK: kmov 3732; CHECK: vpabsb{{.*}}{%k1} 3733define <32 x i8>@test_int_x86_avx512_mask_pabs_b_256(<32 x i8> %x0, <32 x i8> %x1, i32 %x2) { 3734 %res = call <32 x i8> @llvm.x86.avx512.mask.pabs.b.256(<32 x i8> %x0, <32 x i8> %x1, i32 %x2) 3735 %res1 = call <32 x i8> @llvm.x86.avx512.mask.pabs.b.256(<32 x i8> %x0, <32 x i8> %x1, i32 -1) 3736 %res2 = add <32 x i8> %res, %res1 3737 ret <32 x i8> %res2 3738} 3739 3740declare <8 x i16> @llvm.x86.avx512.mask.pabs.w.128(<8 x i16>, <8 x i16>, i8) 3741 3742; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_w_128 3743; CHECK-NOT: call 3744; CHECK: kmov 3745; CHECK: vpabsw{{.*}}{%k1} 3746define <8 x i16>@test_int_x86_avx512_mask_pabs_w_128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2) { 3747 %res = call <8 x i16> @llvm.x86.avx512.mask.pabs.w.128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2) 3748 %res1 = call <8 x i16> @llvm.x86.avx512.mask.pabs.w.128(<8 x i16> %x0, <8 x i16> %x1, i8 -1) 3749 %res2 = add <8 x i16> %res, %res1 3750 ret <8 x i16> %res2 3751} 3752 3753declare <16 x i16> @llvm.x86.avx512.mask.pabs.w.256(<16 x i16>, <16 x i16>, i16) 3754 3755; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_w_256 3756; CHECK-NOT: call 3757; CHECK: kmov 3758; CHECK: vpabsw{{.*}}{%k1} 3759define <16 x i16>@test_int_x86_avx512_mask_pabs_w_256(<16 x i16> %x0, <16 x i16> %x1, i16 %x2) { 3760 %res = call <16 x i16> @llvm.x86.avx512.mask.pabs.w.256(<16 x i16> %x0, <16 x i16> %x1, i16 %x2) 3761 %res1 = call <16 x i16> @llvm.x86.avx512.mask.pabs.w.256(<16 x i16> %x0, <16 x i16> %x1, i16 -1) 3762 %res2 = add <16 x i16> %res, %res1 3763 ret <16 x i16> %res2 3764} 3765 3766; CHECK-LABEL: test_x86_mask_blend_b_256 3767; CHECK: vpblendmb 3768define <32 x i8> @test_x86_mask_blend_b_256(i32 %a0, <32 x i8> %a1, <32 x i8> %a2) { 3769 %res = call <32 x i8> @llvm.x86.avx512.mask.blend.b.256(<32 x i8> %a1, <32 x i8> %a2, i32 %a0) ; <<32 x i8>> [#uses=1] 3770 ret <32 x i8> %res 3771} 3772declare <32 x i8> @llvm.x86.avx512.mask.blend.b.256(<32 x i8>, <32 x i8>, i32) nounwind readonly 3773 3774; CHECK-LABEL: test_x86_mask_blend_w_256 3775define <16 x i16> @test_x86_mask_blend_w_256(i16 %mask, <16 x i16> %a1, <16 x i16> %a2) { 3776 ; CHECK: vpblendmw 3777 %res = call <16 x i16> @llvm.x86.avx512.mask.blend.w.256(<16 x i16> %a1, <16 x i16> %a2, i16 %mask) ; <<16 x i16>> [#uses=1] 3778 ret <16 x i16> %res 3779} 3780declare <16 x i16> 
@llvm.x86.avx512.mask.blend.w.256(<16 x i16>, <16 x i16>, i16) nounwind readonly 3781 3782; CHECK-LABEL: test_x86_mask_blend_b_128 3783; CHECK: vpblendmb 3784define <16 x i8> @test_x86_mask_blend_b_128(i16 %a0, <16 x i8> %a1, <16 x i8> %a2) { 3785 %res = call <16 x i8> @llvm.x86.avx512.mask.blend.b.128(<16 x i8> %a1, <16 x i8> %a2, i16 %a0) ; <<16 x i8>> [#uses=1] 3786 ret <16 x i8> %res 3787} 3788declare <16 x i8> @llvm.x86.avx512.mask.blend.b.128(<16 x i8>, <16 x i8>, i16) nounwind readonly 3789 3790; CHECK-LABEL: test_x86_mask_blend_w_128 3791define <8 x i16> @test_x86_mask_blend_w_128(i8 %mask, <8 x i16> %a1, <8 x i16> %a2) { 3792 ; CHECK: vpblendmw 3793 %res = call <8 x i16> @llvm.x86.avx512.mask.blend.w.128(<8 x i16> %a1, <8 x i16> %a2, i8 %mask) ; <<8 x i16>> [#uses=1] 3794 ret <8 x i16> %res 3795} 3796declare <8 x i16> @llvm.x86.avx512.mask.blend.w.128(<8 x i16>, <8 x i16>, i8) nounwind readonly 3797 3798declare <8 x i16> @llvm.x86.avx512.mask.pmulhu.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) 3799 3800; CHECK-LABEL: @test_int_x86_avx512_mask_pmulhu_w_128 3801; CHECK-NOT: call 3802; CHECK: kmov 3803; CHECK: {%k1} 3804; CHECK: vpmulhuw {{.*}}encoding: [0x62 3805define <8 x i16>@test_int_x86_avx512_mask_pmulhu_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { 3806 %res = call <8 x i16> @llvm.x86.avx512.mask.pmulhu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) 3807 %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmulhu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) 3808 %res2 = add <8 x i16> %res, %res1 3809 ret <8 x i16> %res2 3810} 3811 3812declare <16 x i16> @llvm.x86.avx512.mask.pmulhu.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) 3813 3814; CHECK-LABEL: @test_int_x86_avx512_mask_pmulhu_w_256 3815; CHECK-NOT: call 3816; CHECK: kmov 3817; CHECK: {%k1} 3818; CHECK: vpmulhuw {{.*}}encoding: [0x62 3819define <16 x i16>@test_int_x86_avx512_mask_pmulhu_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) { 3820 %res = call <16 x i16> @llvm.x86.avx512.mask.pmulhu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) 3821 %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmulhu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1) 3822 %res2 = add <16 x i16> %res, %res1 3823 ret <16 x i16> %res2 3824} 3825 3826declare <8 x i16> @llvm.x86.avx512.mask.pmulh.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) 3827 3828; CHECK-LABEL: @test_int_x86_avx512_mask_pmulh_w_128 3829; CHECK-NOT: call 3830; CHECK: kmov 3831; CHECK: {%k1} 3832; CHECK: vpmulhw {{.*}}encoding: [0x62 3833define <8 x i16>@test_int_x86_avx512_mask_pmulh_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { 3834 %res = call <8 x i16> @llvm.x86.avx512.mask.pmulh.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) 3835 %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmulh.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) 3836 %res2 = add <8 x i16> %res, %res1 3837 ret <8 x i16> %res2 3838} 3839 3840declare <16 x i16> @llvm.x86.avx512.mask.pmulh.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) 3841; CHECK-LABEL: @test_int_x86_avx512_mask_pmulh_w_256 3842; CHECK-NOT: call 3843; CHECK: kmov 3844; CHECK: {%k1} 3845; CHECK: vpmulhw {{.*}}encoding: [0x62 3846define <16 x i16>@test_int_x86_avx512_mask_pmulh_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) { 3847 %res = call <16 x i16> @llvm.x86.avx512.mask.pmulh.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) 3848 %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmulh.w.256(<16 x i16> %x0, <16 x 
i16> %x1, <16 x i16> %x2, i16 -1) 3849 %res2 = add <16 x i16> %res, %res1 3850 ret <16 x i16> %res2 3851} 3852 3853declare <8 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) 3854; CHECK-LABEL: @test_int_x86_avx512_mask_pmulhr_sw_128 3855; CHECK-NOT: call 3856; CHECK: kmov 3857; CHECK: {%k1} 3858; CHECK: vpmulhrsw {{.*}}encoding: [0x62 3859define <8 x i16>@test_int_x86_avx512_mask_pmulhr_sw_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { 3860 %res = call <8 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) 3861 %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) 3862 %res2 = add <8 x i16> %res, %res1 3863 ret <8 x i16> %res2 3864} 3865 3866declare <16 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) 3867; CHECK-LABEL: @test_int_x86_avx512_mask_pmulhr_sw_256 3868; CHECK-NOT: call 3869; CHECK: kmov 3870; CHECK: {%k1} 3871; CHECK: vpmulhrsw {{.*}}encoding: [0x62 3872define <16 x i16>@test_int_x86_avx512_mask_pmulhr_sw_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) { 3873 %res = call <16 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) 3874 %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1) 3875 %res2 = add <16 x i16> %res, %res1 3876 ret <16 x i16> %res2 3877} 3878 3879declare <16 x i8> @llvm.x86.avx512.mask.pmov.wb.128(<8 x i16>, <16 x i8>, i8) 3880 3881define <16 x i8>@test_int_x86_avx512_mask_pmov_wb_128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2) { 3882; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_128: 3883; CHECK: vpmovwb %xmm0, %xmm1 {%k1} 3884; CHECK-NEXT: vpmovwb %xmm0, %xmm2 {%k1} {z} 3885; CHECK-NEXT: vpmovwb %xmm0, %xmm0 3886 %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 -1) 3887 %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2) 3888 %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.128(<8 x i16> %x0, <16 x i8> zeroinitializer, i8 %x2) 3889 %res3 = add <16 x i8> %res0, %res1 3890 %res4 = add <16 x i8> %res3, %res2 3891 ret <16 x i8> %res4 3892} 3893 3894declare void @llvm.x86.avx512.mask.pmov.wb.mem.128(i8* %ptr, <8 x i16>, i8) 3895 3896define void @test_int_x86_avx512_mask_pmov_wb_mem_128(i8* %ptr, <8 x i16> %x1, i8 %x2) { 3897; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_mem_128: 3898; CHECK: vpmovwb %xmm0, (%rdi) 3899; CHECK: vpmovwb %xmm0, (%rdi) {%k1} 3900 call void @llvm.x86.avx512.mask.pmov.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 -1) 3901 call void @llvm.x86.avx512.mask.pmov.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 %x2) 3902 ret void 3903} 3904 3905declare <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.128(<8 x i16>, <16 x i8>, i8) 3906 3907define <16 x i8>@test_int_x86_avx512_mask_pmovs_wb_128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2) { 3908; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_128: 3909; CHECK: vpmovswb %xmm0, %xmm1 {%k1} 3910; CHECK-NEXT: vpmovswb %xmm0, %xmm2 {%k1} {z} 3911; CHECK-NEXT: vpmovswb %xmm0, %xmm0 3912 %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 -1) 3913 %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2) 3914 %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.128(<8 x i16> %x0, <16 x i8> zeroinitializer, i8 %x2) 3915 %res3 = add <16 x i8> %res0, %res1 3916 %res4 = add 
<16 x i8> %res3, %res2 3917 ret <16 x i8> %res4 3918} 3919 3920declare void @llvm.x86.avx512.mask.pmovs.wb.mem.128(i8* %ptr, <8 x i16>, i8) 3921 3922define void @test_int_x86_avx512_mask_pmovs_wb_mem_128(i8* %ptr, <8 x i16> %x1, i8 %x2) { 3923; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_mem_128: 3924; CHECK: vpmovswb %xmm0, (%rdi) 3925; CHECK: vpmovswb %xmm0, (%rdi) {%k1} 3926 call void @llvm.x86.avx512.mask.pmovs.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 -1) 3927 call void @llvm.x86.avx512.mask.pmovs.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 %x2) 3928 ret void 3929} 3930 3931declare <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.128(<8 x i16>, <16 x i8>, i8) 3932 3933define <16 x i8>@test_int_x86_avx512_mask_pmovus_wb_128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2) { 3934; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_128: 3935; CHECK: vpmovuswb %xmm0, %xmm1 {%k1} 3936; CHECK-NEXT: vpmovuswb %xmm0, %xmm2 {%k1} {z} 3937; CHECK-NEXT: vpmovuswb %xmm0, %xmm0 3938 %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 -1) 3939 %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2) 3940 %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.128(<8 x i16> %x0, <16 x i8> zeroinitializer, i8 %x2) 3941 %res3 = add <16 x i8> %res0, %res1 3942 %res4 = add <16 x i8> %res3, %res2 3943 ret <16 x i8> %res4 3944} 3945 3946declare void @llvm.x86.avx512.mask.pmovus.wb.mem.128(i8* %ptr, <8 x i16>, i8) 3947 3948define void @test_int_x86_avx512_mask_pmovus_wb_mem_128(i8* %ptr, <8 x i16> %x1, i8 %x2) { 3949; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_mem_128: 3950; CHECK: vpmovuswb %xmm0, (%rdi) 3951; CHECK: vpmovuswb %xmm0, (%rdi) {%k1} 3952 call void @llvm.x86.avx512.mask.pmovus.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 -1) 3953 call void @llvm.x86.avx512.mask.pmovus.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 %x2) 3954 ret void 3955} 3956 3957declare <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16>, <16 x i8>, i16) 3958 3959define <16 x i8>@test_int_x86_avx512_mask_pmov_wb_256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) { 3960; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_256: 3961; CHECK: vpmovwb %ymm0, %xmm1 {%k1} 3962; CHECK-NEXT: vpmovwb %ymm0, %xmm2 {%k1} {z} 3963; CHECK-NEXT: vpmovwb %ymm0, %xmm0 3964 %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 -1) 3965 %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) 3966 %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16> %x0, <16 x i8> zeroinitializer, i16 %x2) 3967 %res3 = add <16 x i8> %res0, %res1 3968 %res4 = add <16 x i8> %res3, %res2 3969 ret <16 x i8> %res4 3970} 3971 3972declare void @llvm.x86.avx512.mask.pmov.wb.mem.256(i8* %ptr, <16 x i16>, i16) 3973 3974define void @test_int_x86_avx512_mask_pmov_wb_mem_256(i8* %ptr, <16 x i16> %x1, i16 %x2) { 3975; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_mem_256: 3976; CHECK: vpmovwb %ymm0, (%rdi) 3977; CHECK: vpmovwb %ymm0, (%rdi) {%k1} 3978 call void @llvm.x86.avx512.mask.pmov.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 -1) 3979 call void @llvm.x86.avx512.mask.pmov.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 %x2) 3980 ret void 3981} 3982 3983declare <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16>, <16 x i8>, i16) 3984 3985define <16 x i8>@test_int_x86_avx512_mask_pmovs_wb_256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) { 3986; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_256: 3987; CHECK: vpmovswb %ymm0, %xmm1 {%k1} 3988; CHECK-NEXT: 
vpmovswb %ymm0, %xmm2 {%k1} {z} 3989; CHECK-NEXT: vpmovswb %ymm0, %xmm0 3990 %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 -1) 3991 %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) 3992 %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16> %x0, <16 x i8> zeroinitializer, i16 %x2) 3993 %res3 = add <16 x i8> %res0, %res1 3994 %res4 = add <16 x i8> %res3, %res2 3995 ret <16 x i8> %res4 3996} 3997 3998declare void @llvm.x86.avx512.mask.pmovs.wb.mem.256(i8* %ptr, <16 x i16>, i16) 3999 4000define void @test_int_x86_avx512_mask_pmovs_wb_mem_256(i8* %ptr, <16 x i16> %x1, i16 %x2) { 4001; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_mem_256: 4002; CHECK: vpmovswb %ymm0, (%rdi) 4003; CHECK: vpmovswb %ymm0, (%rdi) {%k1} 4004 call void @llvm.x86.avx512.mask.pmovs.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 -1) 4005 call void @llvm.x86.avx512.mask.pmovs.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 %x2) 4006 ret void 4007} 4008 4009declare <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16>, <16 x i8>, i16) 4010 4011define <16 x i8>@test_int_x86_avx512_mask_pmovus_wb_256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) { 4012; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_256: 4013; CHECK: vpmovuswb %ymm0, %xmm1 {%k1} 4014; CHECK-NEXT: vpmovuswb %ymm0, %xmm2 {%k1} {z} 4015; CHECK-NEXT: vpmovuswb %ymm0, %xmm0 4016 %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 -1) 4017 %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) 4018 %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16> %x0, <16 x i8> zeroinitializer, i16 %x2) 4019 %res3 = add <16 x i8> %res0, %res1 4020 %res4 = add <16 x i8> %res3, %res2 4021 ret <16 x i8> %res4 4022} 4023 4024declare void @llvm.x86.avx512.mask.pmovus.wb.mem.256(i8* %ptr, <16 x i16>, i16) 4025 4026define void @test_int_x86_avx512_mask_pmovus_wb_mem_256(i8* %ptr, <16 x i16> %x1, i16 %x2) { 4027; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_mem_256: 4028; CHECK: vpmovuswb %ymm0, (%rdi) 4029; CHECK: vpmovuswb %ymm0, (%rdi) {%k1} 4030 call void @llvm.x86.avx512.mask.pmovus.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 -1) 4031 call void @llvm.x86.avx512.mask.pmovus.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 %x2) 4032 ret void 4033} 4034 4035declare <4 x i32> @llvm.x86.avx512.mask.pmaddw.d.128(<8 x i16>, <8 x i16>, <4 x i32>, i8) 4036 4037define <4 x i32>@test_int_x86_avx512_mask_pmaddw_d_128(<8 x i16> %x0, <8 x i16> %x1, <4 x i32> %x2, i8 %x3) { 4038; CHECK-LABEL: test_int_x86_avx512_mask_pmaddw_d_128: 4039; CHECK: ## BB#0: 4040; CHECK-NEXT: movzbl %dil, %eax 4041; CHECK-NEXT: kmovw %eax, %k1 4042; CHECK-NEXT: vpmaddwd %xmm1, %xmm0, %xmm2 {%k1} 4043; CHECK-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 4044; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 4045; CHECK-NEXT: retq 4046 %res = call <4 x i32> @llvm.x86.avx512.mask.pmaddw.d.128(<8 x i16> %x0, <8 x i16> %x1, <4 x i32> %x2, i8 %x3) 4047 %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmaddw.d.128(<8 x i16> %x0, <8 x i16> %x1, <4 x i32> %x2, i8 -1) 4048 %res2 = add <4 x i32> %res, %res1 4049 ret <4 x i32> %res2 4050} 4051 4052declare <8 x i32> @llvm.x86.avx512.mask.pmaddw.d.256(<16 x i16>, <16 x i16>, <8 x i32>, i8) 4053 4054define <8 x i32>@test_int_x86_avx512_mask_pmaddw_d_256(<16 x i16> %x0, <16 x i16> %x1, <8 x i32> %x2, i8 %x3) { 4055; CHECK-LABEL: test_int_x86_avx512_mask_pmaddw_d_256: 4056; CHECK: ## BB#0: 4057; CHECK-NEXT: movzbl 
%dil, %eax 4058; CHECK-NEXT: kmovw %eax, %k1 4059; CHECK-NEXT: vpmaddwd %ymm1, %ymm0, %ymm2 {%k1} 4060; CHECK-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 4061; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 4062; CHECK-NEXT: retq 4063 %res = call <8 x i32> @llvm.x86.avx512.mask.pmaddw.d.256(<16 x i16> %x0, <16 x i16> %x1, <8 x i32> %x2, i8 %x3) 4064 %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmaddw.d.256(<16 x i16> %x0, <16 x i16> %x1, <8 x i32> %x2, i8 -1) 4065 %res2 = add <8 x i32> %res, %res1 4066 ret <8 x i32> %res2 4067} 4068 4069declare <8 x i16> @llvm.x86.avx512.mask.pmaddubs.w.128(<16 x i8>, <16 x i8>, <8 x i16>, i8) 4070 4071define <8 x i16>@test_int_x86_avx512_mask_pmaddubs_w_128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x2, i8 %x3) { 4072; CHECK-LABEL: test_int_x86_avx512_mask_pmaddubs_w_128: 4073; CHECK: ## BB#0: 4074; CHECK-NEXT: movzbl %dil, %eax 4075; CHECK-NEXT: kmovw %eax, %k1 4076; CHECK-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm2 {%k1} 4077; CHECK-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0 4078; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 4079; CHECK-NEXT: retq 4080 %res = call <8 x i16> @llvm.x86.avx512.mask.pmaddubs.w.128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x2, i8 %x3) 4081 %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmaddubs.w.128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x2, i8 -1) 4082 %res2 = add <8 x i16> %res, %res1 4083 ret <8 x i16> %res2 4084} 4085 4086declare <16 x i16> @llvm.x86.avx512.mask.pmaddubs.w.256(<32 x i8>, <32 x i8>, <16 x i16>, i16) 4087 4088define <16 x i16>@test_int_x86_avx512_mask_pmaddubs_w_256(<32 x i8> %x0, <32 x i8> %x1, <16 x i16> %x2, i16 %x3) { 4089; CHECK-LABEL: test_int_x86_avx512_mask_pmaddubs_w_256: 4090; CHECK: ## BB#0: 4091; CHECK-NEXT: kmovw %edi, %k1 4092; CHECK-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm2 {%k1} 4093; CHECK-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0 4094; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 4095; CHECK-NEXT: retq 4096 %res = call <16 x i16> @llvm.x86.avx512.mask.pmaddubs.w.256(<32 x i8> %x0, <32 x i8> %x1, <16 x i16> %x2, i16 %x3) 4097 %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmaddubs.w.256(<32 x i8> %x0, <32 x i8> %x1, <16 x i16> %x2, i16 -1) 4098 %res2 = add <16 x i16> %res, %res1 4099 ret <16 x i16> %res2 4100} 4101 4102declare <16 x i8> @llvm.x86.avx512.mask.punpckhb.w.128(<16 x i8>, <16 x i8>, <16 x i8>, i16) 4103 4104define <16 x i8>@test_int_x86_avx512_mask_punpckhb_w_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) { 4105; CHECK-LABEL: test_int_x86_avx512_mask_punpckhb_w_128: 4106; CHECK: vpunpckhbw %xmm1, %xmm0, %xmm2 {%k1} 4107; CHECK-NEXT: ## xmm2 = xmm2[8],k1[8],xmm2[9],k1[9],xmm2[10],k1[10],xmm2[11],k1[11],xmm2[12],k1[12],xmm2[13],k1[13],xmm2[14],k1[14],xmm2[15],k1[15] 4108; CHECK-NEXT: vpunpckhbw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x68,0xc1] 4109; CHECK-NEXT: ## xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 4110 %res = call <16 x i8> @llvm.x86.avx512.mask.punpckhb.w.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) 4111 %res1 = call <16 x i8> @llvm.x86.avx512.mask.punpckhb.w.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1) 4112 %res2 = add <16 x i8> %res, %res1 4113 ret <16 x i8> %res2 4114} 4115 4116declare <16 x i8> @llvm.x86.avx512.mask.punpcklb.w.128(<16 x i8>, <16 x i8>, <16 x i8>, i16) 4117 4118define <16 x i8>@test_int_x86_avx512_mask_punpcklb_w_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) { 4119; CHECK-LABEL: test_int_x86_avx512_mask_punpcklb_w_128: 4120; CHECK: 
vpunpcklbw %xmm1, %xmm0, %xmm2 {%k1} 4121; CHECK-NEXT: ## xmm2 = xmm2[0],k1[0],xmm2[1],k1[1],xmm2[2],k1[2],xmm2[3],k1[3],xmm2[4],k1[4],xmm2[5],k1[5],xmm2[6],k1[6],xmm2[7],k1[7] 4122; CHECK-NEXT: vpunpcklbw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x60,0xc1] 4123; CHECK-NEXT: ## xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 4124 %res = call <16 x i8> @llvm.x86.avx512.mask.punpcklb.w.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) 4125 %res1 = call <16 x i8> @llvm.x86.avx512.mask.punpcklb.w.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1) 4126 %res2 = add <16 x i8> %res, %res1 4127 ret <16 x i8> %res2 4128} 4129 4130declare <32 x i8> @llvm.x86.avx512.mask.punpckhb.w.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) 4131 4132define <32 x i8>@test_int_x86_avx512_mask_punpckhb_w_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) { 4133; CHECK-LABEL: test_int_x86_avx512_mask_punpckhb_w_256: 4134; CHECK: vpunpckhbw %ymm1, %ymm0, %ymm2 {%k1} 4135; CHECK-NEXT: ## ymm2 = ymm2[8],k1[8],ymm2[9],k1[9],ymm2[10],k1[10],ymm2[11],k1[11],ymm2[12],k1[12],ymm2[13],k1[13],ymm2[14],k1[14],ymm2[15],k1[15],ymm2[24],k1[24],ymm2[25],k1[25],ymm2[26],k1[26],ymm2[27],k1[27],ymm2[28],k1[28],ymm2[29],k1[29],ymm2[30],k1[30],ymm2[31],k1[31] 4136; CHECK-NEXT: vpunpckhbw %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x68,0xc1] 4137; CHECK-NEXT: ## ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] 4138 %res = call <32 x i8> @llvm.x86.avx512.mask.punpckhb.w.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) 4139 %res1 = call <32 x i8> @llvm.x86.avx512.mask.punpckhb.w.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) 4140 %res2 = add <32 x i8> %res, %res1 4141 ret <32 x i8> %res2 4142} 4143 4144declare <32 x i8> @llvm.x86.avx512.mask.punpcklb.w.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) 4145 4146define <32 x i8>@test_int_x86_avx512_mask_punpcklb_w_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) { 4147; CHECK-LABEL: test_int_x86_avx512_mask_punpcklb_w_256: 4148; CHECK: vpunpcklbw %ymm1, %ymm0, %ymm2 {%k1} 4149; CHECK-NEXT: ## ymm2 = ymm2[0],k1[0],ymm2[1],k1[1],ymm2[2],k1[2],ymm2[3],k1[3],ymm2[4],k1[4],ymm2[5],k1[5],ymm2[6],k1[6],ymm2[7],k1[7],ymm2[16],k1[16],ymm2[17],k1[17],ymm2[18],k1[18],ymm2[19],k1[19],ymm2[20],k1[20],ymm2[21],k1[21],ymm2[22],k1[22],ymm2[23],k1[23] 4150; CHECK-NEXT: vpunpcklbw %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x60,0xc1] 4151; CHECK-NEXT: ## ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] 4152 %res = call <32 x i8> @llvm.x86.avx512.mask.punpcklb.w.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) 4153 %res1 = call <32 x i8> @llvm.x86.avx512.mask.punpcklb.w.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) 4154 %res2 = add <32 x i8> %res, %res1 4155 ret <32 x i8> %res2 4156} 4157 4158declare <8 x i16> @llvm.x86.avx512.mask.punpcklw.d.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) 4159 4160define <8 x i16>@test_int_x86_avx512_mask_punpcklw_d_128(<8 x i16> %x0, <8 x i16> 
%x1, <8 x i16> %x2, i8 %x3) { 4161; CHECK-LABEL: test_int_x86_avx512_mask_punpcklw_d_128: 4162; CHECK: vpunpcklwd %xmm1, %xmm0, %xmm2 {%k1} 4163; CHECK-NEXT: ## xmm2 = xmm2[0],k1[0],xmm2[1],k1[1],xmm2[2],k1[2],xmm2[3],k1[3] 4164; CHECK-NEXT: vpunpcklwd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x61,0xc1] 4165; CHECK-NEXT: ## xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 4166 %res = call <8 x i16> @llvm.x86.avx512.mask.punpcklw.d.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) 4167 %res1 = call <8 x i16> @llvm.x86.avx512.mask.punpcklw.d.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) 4168 %res2 = add <8 x i16> %res, %res1 4169 ret <8 x i16> %res2 4170} 4171 4172declare <8 x i16> @llvm.x86.avx512.mask.punpckhw.d.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) 4173 4174define <8 x i16>@test_int_x86_avx512_mask_punpckhw_d_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { 4175; CHECK-LABEL: test_int_x86_avx512_mask_punpckhw_d_128: 4176; CHECK: vpunpckhwd %xmm1, %xmm0, %xmm2 {%k1} 4177; CHECK-NEXT: ## xmm2 = xmm2[4],k1[4],xmm2[5],k1[5],xmm2[6],k1[6],xmm2[7],k1[7] 4178; CHECK-NEXT: vpunpckhwd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x69,0xc1] 4179; CHECK-NEXT: ## xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 4180 %res = call <8 x i16> @llvm.x86.avx512.mask.punpckhw.d.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) 4181 %res1 = call <8 x i16> @llvm.x86.avx512.mask.punpckhw.d.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) 4182 %res2 = add <8 x i16> %res, %res1 4183 ret <8 x i16> %res2 4184} 4185 4186declare <16 x i16> @llvm.x86.avx512.mask.punpcklw.d.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) 4187 4188define <16 x i16>@test_int_x86_avx512_mask_punpcklw_d_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) { 4189; CHECK-LABEL: test_int_x86_avx512_mask_punpcklw_d_256: 4190; CHECK: vpunpcklwd %ymm1, %ymm0, %ymm2 {%k1} 4191; CHECK-NEXT: ## ymm2 = ymm2[0],k1[0],ymm2[1],k1[1],ymm2[2],k1[2],ymm2[3],k1[3],ymm2[8],k1[8],ymm2[9],k1[9],ymm2[10],k1[10],ymm2[11],k1[11] 4192; CHECK-NEXT: vpunpcklwd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x61,0xc1] 4193; CHECK-NEXT: ## ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] 4194 %res = call <16 x i16> @llvm.x86.avx512.mask.punpcklw.d.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) 4195 %res1 = call <16 x i16> @llvm.x86.avx512.mask.punpcklw.d.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1) 4196 %res2 = add <16 x i16> %res, %res1 4197 ret <16 x i16> %res2 4198} 4199 4200declare <16 x i16> @llvm.x86.avx512.mask.punpckhw.d.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) 4201 4202define <16 x i16>@test_int_x86_avx512_mask_punpckhw_d_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) { 4203; CHECK-LABEL: test_int_x86_avx512_mask_punpckhw_d_256: 4204; CHECK: vpunpckhwd %ymm1, %ymm0, %ymm2 {%k1} 4205; CHECK-NEXT: ## ymm2 = ymm2[4],k1[4],ymm2[5],k1[5],ymm2[6],k1[6],ymm2[7],k1[7],ymm2[12],k1[12],ymm2[13],k1[13],ymm2[14],k1[14],ymm2[15],k1[15] 4206; CHECK-NEXT: vpunpckhwd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x69,0xc1] 4207; CHECK-NEXT: ## ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] 4208 %res = call <16 x i16> @llvm.x86.avx512.mask.punpckhw.d.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) 4209 
%res1 = call <16 x i16> @llvm.x86.avx512.mask.punpckhw.d.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1) 4210 %res2 = add <16 x i16> %res, %res1 4211 ret <16 x i16> %res2 4212} 4213 4214declare <16 x i8> @llvm.x86.avx512.mask.palignr.128(<16 x i8>, <16 x i8>, i32, <16 x i8>, i16) 4215 4216define <16 x i8>@test_int_x86_avx512_mask_palignr_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x3, i16 %x4) { 4217; CHECK-LABEL: test_int_x86_avx512_mask_palignr_128: 4218; CHECK: ## BB#0: 4219; CHECK-NEXT: kmovw %edi, %k1 4220; CHECK-NEXT: vpalignr $2, %xmm1, %xmm0, %xmm2 {%k1} 4221; CHECK-NEXT: vpalignr $2, %xmm1, %xmm0, %xmm3 {%k1} {z} 4222; CHECK-NEXT: vpalignr $2, %xmm1, %xmm0, %xmm0 4223; CHECK-NEXT: vpaddb %xmm3, %xmm2, %xmm1 4224; CHECK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 4225; CHECK-NEXT: retq 4226 %res = call <16 x i8> @llvm.x86.avx512.mask.palignr.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <16 x i8> %x3, i16 %x4) 4227 %res1 = call <16 x i8> @llvm.x86.avx512.mask.palignr.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <16 x i8> zeroinitializer, i16 %x4) 4228 %res2 = call <16 x i8> @llvm.x86.avx512.mask.palignr.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <16 x i8> %x3, i16 -1) 4229 %res3 = add <16 x i8> %res, %res1 4230 %res4 = add <16 x i8> %res3, %res2 4231 ret <16 x i8> %res4 4232} 4233 4234declare <32 x i8> @llvm.x86.avx512.mask.palignr.256(<32 x i8>, <32 x i8>, i32, <32 x i8>, i32) 4235 4236define <32 x i8>@test_int_x86_avx512_mask_palignr_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x3, i32 %x4) { 4237; CHECK-LABEL: test_int_x86_avx512_mask_palignr_256: 4238; CHECK: ## BB#0: 4239; CHECK-NEXT: kmovd %edi, %k1 4240; CHECK-NEXT: vpalignr $2, %ymm1, %ymm0, %ymm2 {%k1} 4241; CHECK-NEXT: vpalignr $2, %ymm1, %ymm0, %ymm3 {%k1} {z} 4242; CHECK-NEXT: vpalignr $2, %ymm1, %ymm0, %ymm0 4243; CHECK-NEXT: vpaddb %ymm3, %ymm2, %ymm1 4244; CHECK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 4245; CHECK-NEXT: retq 4246 %res = call <32 x i8> @llvm.x86.avx512.mask.palignr.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <32 x i8> %x3, i32 %x4) 4247 %res1 = call <32 x i8> @llvm.x86.avx512.mask.palignr.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <32 x i8> zeroinitializer, i32 %x4) 4248 %res2 = call <32 x i8> @llvm.x86.avx512.mask.palignr.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <32 x i8> %x3, i32 -1) 4249 %res3 = add <32 x i8> %res, %res1 4250 %res4 = add <32 x i8> %res3, %res2 4251 ret <32 x i8> %res4 4252} 4253 4254declare <8 x i16> @llvm.x86.avx512.mask.dbpsadbw.128(<16 x i8>, <16 x i8>, i32, <8 x i16>, i8) 4255 4256define <8 x i16>@test_int_x86_avx512_mask_dbpsadbw_128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x3, i8 %x4) { 4257; CHECK-LABEL: test_int_x86_avx512_mask_dbpsadbw_128: 4258; CHECK: ## BB#0: 4259; CHECK-NEXT: movzbl %dil, %eax 4260; CHECK-NEXT: kmovw %eax, %k1 4261; CHECK-NEXT: vdbpsadbw $2, %xmm1, %xmm0, %xmm2 {%k1} 4262; CHECK-NEXT: vdbpsadbw $2, %xmm1, %xmm0, %xmm3 {%k1} {z} 4263; CHECK-NEXT: vdbpsadbw $2, %xmm1, %xmm0, %xmm0 4264; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm1 4265; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 4266; CHECK-NEXT: retq 4267 %res = call <8 x i16> @llvm.x86.avx512.mask.dbpsadbw.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <8 x i16> %x3, i8 %x4) 4268 %res1 = call <8 x i16> @llvm.x86.avx512.mask.dbpsadbw.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <8 x i16> zeroinitializer, i8 %x4) 4269 %res2 = call <8 x i16> @llvm.x86.avx512.mask.dbpsadbw.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <8 x i16> %x3, i8 -1) 4270 %res3 = add <8 x i16> %res, %res1 4271 %res4 = add <8 x i16> %res2, %res3 4272 ret <8 x i16> %res4 4273} 4274 
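; The 256-bit VDBPSADBW (double-block packed sum of absolute differences) tests below mirror the
; 128-bit case above: merge-masked, zero-masked, and unmasked results are computed and summed so a
; single return value exercises all three forms.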
4275declare <16 x i16> @llvm.x86.avx512.mask.dbpsadbw.256(<32 x i8>, <32 x i8>, i32, <16 x i16>, i16) 4276 4277define <16 x i16>@test_int_x86_avx512_mask_dbpsadbw_256(<32 x i8> %x0, <32 x i8> %x1, <16 x i16> %x3, i16 %x4) { 4278; CHECK-LABEL: test_int_x86_avx512_mask_dbpsadbw_256: 4279; CHECK: ## BB#0: 4280; CHECK-NEXT: kmovw %edi, %k1 4281; CHECK-NEXT: vdbpsadbw $2, %ymm1, %ymm0, %ymm2 {%k1} 4282; CHECK-NEXT: vdbpsadbw $2, %ymm1, %ymm0, %ymm3 {%k1} {z} 4283; CHECK-NEXT: vdbpsadbw $2, %ymm1, %ymm0, %ymm0 4284; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm1 4285; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 4286; CHECK-NEXT: retq 4287 %res = call <16 x i16> @llvm.x86.avx512.mask.dbpsadbw.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <16 x i16> %x3, i16 %x4) 4288 %res1 = call <16 x i16> @llvm.x86.avx512.mask.dbpsadbw.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <16 x i16> zeroinitializer, i16 %x4) 4289 %res2 = call <16 x i16> @llvm.x86.avx512.mask.dbpsadbw.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <16 x i16> %x3, i16 -1) 4290 %res3 = add <16 x i16> %res, %res1 4291 %res4 = add <16 x i16> %res3, %res2 4292 ret <16 x i16> %res4 4293} 4294 4295declare <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8>, <32 x i8>, i32) 4296 4297define <32 x i8>@test_int_x86_avx512_pbroadcastb_256(<16 x i8> %x0, <32 x i8> %x1, i32 %mask) { 4298; CHECK-LABEL: test_int_x86_avx512_pbroadcastb_256: 4299; CHECK: ## BB#0: 4300; CHECK-NEXT: kmovd %edi, %k1 4301; CHECK-NEXT: vpbroadcastb %xmm0, %ymm1 {%k1} 4302; CHECK-NEXT: vpbroadcastb %xmm0, %ymm2 {%k1} {z} 4303; CHECK-NEXT: vpbroadcastb %xmm0, %ymm0 4304; CHECK-NEXT: vpaddb %ymm1, %ymm0, %ymm0 4305; CHECK-NEXT: vpaddb %ymm0, %ymm2, %ymm0 4306; CHECK-NEXT: retq 4307 %res = call <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8> %x0, <32 x i8> %x1, i32 -1) 4308 %res1 = call <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8> %x0, <32 x i8> %x1, i32 %mask) 4309 %res2 = call <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8> %x0, <32 x i8> zeroinitializer, i32 %mask) 4310 %res3 = add <32 x i8> %res, %res1 4311 %res4 = add <32 x i8> %res2, %res3 4312 ret <32 x i8> %res4 4313} 4314 4315declare <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8>, <16 x i8>, i16) 4316 4317define <16 x i8>@test_int_x86_avx512_pbroadcastb_128(<16 x i8> %x0, <16 x i8> %x1, i16 %mask) { 4318; CHECK-LABEL: test_int_x86_avx512_pbroadcastb_128: 4319; CHECK: ## BB#0: 4320; CHECK-NEXT: kmovw %edi, %k1 4321; CHECK-NEXT: vpbroadcastb %xmm0, %xmm1 {%k1} 4322; CHECK-NEXT: vpbroadcastb %xmm0, %xmm2 {%k1} {z} 4323; CHECK-NEXT: vpbroadcastb %xmm0, %xmm0 4324; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 4325; CHECK-NEXT: vpaddb %xmm0, %xmm2, %xmm0 4326; CHECK-NEXT: retq 4327 %res = call <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8> %x0, <16 x i8> %x1, i16 -1) 4328 %res1 = call <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8> %x0, <16 x i8> %x1, i16 %mask) 4329 %res2 = call <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8> %x0, <16 x i8> zeroinitializer, i16 %mask) 4330 %res3 = add <16 x i8> %res, %res1 4331 %res4 = add <16 x i8> %res2, %res3 4332 ret <16 x i8> %res4 4333} 4334 4335declare <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16>, <16 x i16>, i16) 4336 4337define <16 x i16>@test_int_x86_avx512_pbroadcastw_256(<8 x i16> %x0, <16 x i16> %x1, i16 %mask) { 4338; CHECK-LABEL: test_int_x86_avx512_pbroadcastw_256: 4339; CHECK: ## BB#0: 4340; CHECK-NEXT: kmovw %edi, %k1 4341; CHECK-NEXT: vpbroadcastw %xmm0, %ymm1 {%k1} 4342; CHECK-NEXT: vpbroadcastw %xmm0, %ymm2 {%k1} {z} 4343; CHECK-NEXT: vpbroadcastw %xmm0, 
%ymm0 4344; CHECK-NEXT: vpaddw %ymm1, %ymm0, %ymm0 4345; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 4346; CHECK-NEXT: retq 4347 %res = call <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16> %x0, <16 x i16> %x1, i16 -1) 4348 %res1 = call <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16> %x0, <16 x i16> %x1, i16 %mask) 4349 %res2 = call <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16> %x0, <16 x i16> zeroinitializer, i16 %mask) 4350 %res3 = add <16 x i16> %res, %res1 4351 %res4 = add <16 x i16> %res2, %res3 4352 ret <16 x i16> %res4 4353} 4354 4355declare <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16>, <8 x i16>, i8) 4356 4357define <8 x i16>@test_int_x86_avx512_pbroadcastw_128(<8 x i16> %x0, <8 x i16> %x1, i8 %mask) { 4358; CHECK-LABEL: test_int_x86_avx512_pbroadcastw_128: 4359; CHECK: ## BB#0: 4360; CHECK-NEXT: movzbl %dil, %eax 4361; CHECK-NEXT: kmovw %eax, %k1 4362; CHECK-NEXT: vpbroadcastw %xmm0, %xmm1 {%k1} 4363; CHECK-NEXT: vpbroadcastw %xmm0, %xmm2 {%k1} {z} 4364; CHECK-NEXT: vpbroadcastw %xmm0, %xmm0 4365; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 4366; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 4367; CHECK-NEXT: retq 4368 %res = call <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16> %x0, <8 x i16> %x1, i8 -1) 4369 %res1 = call <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16> %x0, <8 x i16> %x1, i8 %mask) 4370 %res2 = call <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16> %x0, <8 x i16> zeroinitializer, i8 %mask) 4371 %res3 = add <8 x i16> %res, %res1 4372 %res4 = add <8 x i16> %res2, %res3 4373 ret <8 x i16> %res4 4374} 4375 4376declare <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8>, <64 x i8>, i64) 4377 4378define <64 x i8>@test_int_x86_avx512_pbroadcastb_512(<16 x i8> %x0, <64 x i8> %x1, i64 %mask) { 4379; CHECK-LABEL: test_int_x86_avx512_pbroadcastb_512: 4380; CHECK: ## BB#0: 4381; CHECK-NEXT: kmovq %rdi, %k1 ## encoding: [0xc4,0xe1,0xfb,0x92,0xcf] 4382; CHECK-NEXT: vpbroadcastb %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x78,0xc8] 4383; CHECK-NEXT: vpbroadcastb %xmm0, %zmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x78,0xd0] 4384; CHECK-NEXT: vpbroadcastb %xmm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x78,0xc0] 4385; CHECK-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc1] 4386; CHECK-NEXT: vpaddb %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc0] 4387; CHECK-NEXT: retq ## encoding: [0xc3] 4388 %res = call <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8> %x0, <64 x i8> %x1, i64 -1) 4389 %res1 = call <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8> %x0, <64 x i8> %x1, i64 %mask) 4390 %res2 = call <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8> %x0, <64 x i8> zeroinitializer, i64 %mask) 4391 %res3 = add <64 x i8> %res, %res1 4392 %res4 = add <64 x i8> %res2, %res3 4393 ret <64 x i8> %res4 4394} 4395 4396declare <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16>, <32 x i16>, i32) 4397 4398define <32 x i16>@test_int_x86_avx512_pbroadcastw_512(<8 x i16> %x0, <32 x i16> %x1, i32 %mask) { 4399; CHECK-LABEL: test_int_x86_avx512_pbroadcastw_512: 4400; CHECK: ## BB#0: 4401; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] 4402; CHECK-NEXT: vpbroadcastw %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x79,0xc8] 4403; CHECK-NEXT: vpbroadcastw %xmm0, %zmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x79,0xd0] 4404; CHECK-NEXT: vpbroadcastw %xmm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x79,0xc0] 4405; CHECK-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ## encoding: 
[0x62,0xf1,0x7d,0x48,0xfd,0xc1]
; CHECK-NEXT: vpaddw %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
  %res = call <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16> %x0, <32 x i16> %x1, i32 -1)
  %res1 = call <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16> %x0, <32 x i16> %x1, i32 %mask)
  %res2 = call <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16> %x0, <32 x i16> zeroinitializer, i32 %mask)
  %res3 = add <32 x i16> %res, %res1
  %res4 = add <32 x i16> %res2, %res3
  ret <32 x i16> %res4
}