; Check AVX2 instructions that are disabled when AVX512VL/AVX512BW are present.

; RUN: llc < %s -mtriple=x86_64-apple-darwin -show-mc-encoding -mcpu=core-avx2 -mattr=+avx2 -o /dev/null
; RUN: llc < %s -mtriple=x86_64-apple-darwin -show-mc-encoding -mcpu=knl -o /dev/null
; RUN: llc < %s -mtriple=x86_64-apple-darwin -show-mc-encoding -mcpu=knl -mattr=+avx512vl -o /dev/null
; RUN: llc < %s -mtriple=x86_64-apple-darwin -show-mc-encoding -mcpu=knl -mattr=+avx512bw -o /dev/null
; RUN: llc < %s -mtriple=x86_64-apple-darwin -show-mc-encoding -mcpu=knl -mattr=+avx512vl -mattr=+avx512bw -o /dev/null
; RUN: llc < %s -mtriple=x86_64-apple-darwin -show-mc-encoding -mcpu=skx -o /dev/null

define <4 x i64> @vpand_256(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
  ; Force the execution domain with an add.
  %a2 = add <4 x i64> %a, <i64 1, i64 1, i64 1, i64 1>
  %x = and <4 x i64> %a2, %b
  ret <4 x i64> %x
}

define <2 x i64> @vpand_128(<2 x i64> %a, <2 x i64> %b) nounwind uwtable readnone ssp {
  ; Force the execution domain with an add.
  %a2 = add <2 x i64> %a, <i64 1, i64 1>
  %x = and <2 x i64> %a2, %b
  ret <2 x i64> %x
}

define <4 x i64> @vpandn_256(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
  ; Force the execution domain with an add.
  %a2 = add <4 x i64> %a, <i64 1, i64 1, i64 1, i64 1>
  %y = xor <4 x i64> %a2, <i64 -1, i64 -1, i64 -1, i64 -1>
  %x = and <4 x i64> %a, %y
  ret <4 x i64> %x
}

define <2 x i64> @vpandn_128(<2 x i64> %a, <2 x i64> %b) nounwind uwtable readnone ssp {
  ; Force the execution domain with an add.
  %a2 = add <2 x i64> %a, <i64 1, i64 1>
  %y = xor <2 x i64> %a2, <i64 -1, i64 -1>
  %x = and <2 x i64> %a, %y
  ret <2 x i64> %x
}

define <4 x i64> @vpor_256(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
  ; Force the execution domain with an add.
  %a2 = add <4 x i64> %a, <i64 1, i64 1, i64 1, i64 1>
  %x = or <4 x i64> %a2, %b
  ret <4 x i64> %x
}

define <4 x i64> @vpxor_256(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
  ; Force the execution domain with an add.
  %a2 = add <4 x i64> %a, <i64 1, i64 1, i64 1, i64 1>
  %x = xor <4 x i64> %a2, %b
  ret <4 x i64> %x
}

define <2 x i64> @vpor_128(<2 x i64> %a, <2 x i64> %b) nounwind uwtable readnone ssp {
  ; Force the execution domain with an add.
  %a2 = add <2 x i64> %a, <i64 1, i64 1>
  %x = or <2 x i64> %a2, %b
  ret <2 x i64> %x
}

define <2 x i64> @vpxor_128(<2 x i64> %a, <2 x i64> %b) nounwind uwtable readnone ssp {
  ; Force the execution domain with an add.
  %a2 = add <2 x i64> %a, <i64 1, i64 1>
  %x = xor <2 x i64> %a2, %b
  ret <2 x i64> %x
}
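; Integer add/sub/mul and compare tests. The 64-bit and 32-bit element forms
; only need AVX512VL to get EVEX encodings; the 16-bit and 8-bit element
; forms also require AVX512BW.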
define <4 x i64> @test_vpaddq_256(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
  %x = add <4 x i64> %i, %j
  ret <4 x i64> %x
}

define <8 x i32> @test_vpaddd_256(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
  %x = add <8 x i32> %i, %j
  ret <8 x i32> %x
}

define <16 x i16> @test_vpaddw_256(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
  %x = add <16 x i16> %i, %j
  ret <16 x i16> %x
}

define <32 x i8> @test_vpaddb_256(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
  %x = add <32 x i8> %i, %j
  ret <32 x i8> %x
}

define <4 x i64> @test_vpsubq_256(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
  %x = sub <4 x i64> %i, %j
  ret <4 x i64> %x
}

define <8 x i32> @test_vpsubd_256(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
  %x = sub <8 x i32> %i, %j
  ret <8 x i32> %x
}

define <16 x i16> @test_vpsubw_256(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
  %x = sub <16 x i16> %i, %j
  ret <16 x i16> %x
}

define <32 x i8> @test_vpsubb_256(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
  %x = sub <32 x i8> %i, %j
  ret <32 x i8> %x
}

define <16 x i16> @test_vpmullw_256(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
  %x = mul <16 x i16> %i, %j
  ret <16 x i16> %x
}

define <8 x i32> @test_vpcmpgtd_256(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
  %bincmp = icmp slt <8 x i32> %i, %j
  %x = sext <8 x i1> %bincmp to <8 x i32>
  ret <8 x i32> %x
}

define <32 x i8> @test_vpcmpeqb_256(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
  %bincmp = icmp eq <32 x i8> %i, %j
  %x = sext <32 x i1> %bincmp to <32 x i8>
  ret <32 x i8> %x
}

define <16 x i16> @test_vpcmpeqw_256(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
  %bincmp = icmp eq <16 x i16> %i, %j
  %x = sext <16 x i1> %bincmp to <16 x i16>
  ret <16 x i16> %x
}

define <32 x i8> @test_vpcmpgtb_256(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
  %bincmp = icmp slt <32 x i8> %i, %j
  %x = sext <32 x i1> %bincmp to <32 x i8>
  ret <32 x i8> %x
}

define <16 x i16> @test_vpcmpgtw_256(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
  %bincmp = icmp slt <16 x i16> %i, %j
  %x = sext <16 x i1> %bincmp to <16 x i16>
  ret <16 x i16> %x
}

define <8 x i32> @test_vpcmpeqd_256(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
  %bincmp = icmp eq <8 x i32> %i, %j
  %x = sext <8 x i1> %bincmp to <8 x i32>
  ret <8 x i32> %x
}

define <2 x i64> @test_vpaddq_128(<2 x i64> %i, <2 x i64> %j) nounwind readnone {
  %x = add <2 x i64> %i, %j
  ret <2 x i64> %x
}

define <4 x i32> @test_vpaddd_128(<4 x i32> %i, <4 x i32> %j) nounwind readnone {
  %x = add <4 x i32> %i, %j
  ret <4 x i32> %x
}

define <8 x i16> @test_vpaddw_128(<8 x i16> %i, <8 x i16> %j) nounwind readnone {
  %x = add <8 x i16> %i, %j
  ret <8 x i16> %x
}

define <16 x i8> @test_vpaddb_128(<16 x i8> %i, <16 x i8> %j) nounwind readnone {
  %x = add <16 x i8> %i, %j
  ret <16 x i8> %x
}

define <2 x i64> @test_vpsubq_128(<2 x i64> %i, <2 x i64> %j) nounwind readnone {
  %x = sub <2 x i64> %i, %j
  ret <2 x i64> %x
}

define <4 x i32> @test_vpsubd_128(<4 x i32> %i, <4 x i32> %j) nounwind readnone {
  %x = sub <4 x i32> %i, %j
  ret <4 x i32> %x
}

define <8 x i16> @test_vpsubw_128(<8 x i16> %i, <8 x i16> %j) nounwind readnone {
  %x = sub <8 x i16> %i, %j
  ret <8 x i16> %x
}

define <16 x i8> @test_vpsubb_128(<16 x i8> %i, <16 x i8> %j) nounwind readnone {
  %x = sub <16 x i8> %i, %j
  ret <16 x i8> %x
}

define <8 x i16> @test_vpmullw_128(<8 x i16> %i, <8 x i16> %j) nounwind readnone {
  %x = mul <8 x i16> %i, %j
  ret <8 x i16> %x
}

define <8 x i16> @test_vpcmpgtw_128(<8 x i16> %i, <8 x i16> %j) nounwind readnone {
  %bincmp = icmp slt <8 x i16> %i, %j
  %x = sext <8 x i1> %bincmp to <8 x i16>
  ret <8 x i16> %x
}

define <16 x i8> @test_vpcmpgtb_128(<16 x i8> %i, <16 x i8> %j) nounwind readnone {
  %bincmp = icmp slt <16 x i8> %i, %j
  %x = sext <16 x i1> %bincmp to <16 x i8>
  ret <16 x i8> %x
}

define <8 x i16> @test_vpcmpeqw_128(<8 x i16> %i, <8 x i16> %j) nounwind readnone {
  %bincmp = icmp eq <8 x i16> %i, %j
  %x = sext <8 x i1> %bincmp to <8 x i16>
  ret <8 x i16> %x
}

define <16 x i8> @test_vpcmpeqb_128(<16 x i8> %i, <16 x i8> %j) nounwind readnone {
  %bincmp = icmp eq <16 x i8> %i, %j
  %x = sext <16 x i1> %bincmp to <16 x i8>
  ret <16 x i8> %x
}
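; Vector shuffle tests; the @shuffle_*_vpalignr functions should lower to
; vpalignr, whose EVEX form needs AVX512BW in addition to AVX512VL.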
define <8 x i16> @shuffle_v8i16_vpalignr(<8 x i16> %a, <8 x i16> %b) {
  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  ret <8 x i16> %shuffle
}

define <16 x i16> @shuffle_v16i16_vpalignr(<16 x i16> %a, <16 x i16> %b) {
  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 31, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
  ret <16 x i16> %shuffle
}

define <16 x i8> @shuffle_v16i8_vpalignr(<16 x i8> %a, <16 x i8> %b) {
  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
  ret <16 x i8> %shuffle
}

define <32 x i8> @shuffle_v32i8_vpalignr(<32 x i8> %a, <32 x i8> %b) {
  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 undef, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <32 x i8> %shuffle
}

define <2 x i64> @shuffle_v2i64_vpalignr(<2 x i64> %a, <2 x i64> %b) {
  %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 2>
  ret <2 x i64> %shuffle
}

define <4 x i32> @shuffle_v4i32_vpalignr(<4 x i32> %a, <4 x i32> %b) {
  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 7, i32 0, i32 1, i32 2>
  ret <4 x i32> %shuffle
}

define <8 x i32> @shuffle_v8i32_vpalignr(<8 x i32> %a, <8 x i32> %b) {
  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 11, i32 0, i32 1, i32 2, i32 15, i32 4, i32 5, i32 6>
  ret <8 x i32> %shuffle
}

define <4 x double> @shuffle_v4f64_5163(<4 x double> %a, <4 x double> %b) {
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 5, i32 1, i32 6, i32 3>
  ret <4 x double> %shuffle
}

define <2 x double> @shuffle_v2f64_bitcast_1z(<2 x double> %a) {
  %shuffle64 = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <2 x i32> <i32 2, i32 1>
  %bitcast32 = bitcast <2 x double> %shuffle64 to <4 x float>
  %shuffle32 = shufflevector <4 x float> %bitcast32, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %bitcast64 = bitcast <4 x float> %shuffle32 to <2 x double>
  ret <2 x double> %bitcast64
}

define <16 x i16> @shuffle_v16i16_zz_zz_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_24(<16 x i16> %a) {
  %shuffle = shufflevector <16 x i16> zeroinitializer, <16 x i16> %a, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 24>
  ret <16 x i16> %shuffle
}
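; Element extract/insert tests (vpextr*/vpinsr*-style patterns).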
define i64 @extract_v2i64(<2 x i64> %x, i64* %dst) {
  %r1 = extractelement <2 x i64> %x, i32 0
  %r2 = extractelement <2 x i64> %x, i32 1
  store i64 %r2, i64* %dst, align 1
  ret i64 %r1
}

define i32 @extract_v4i32(<4 x i32> %x, i32* %dst) {
  %r1 = extractelement <4 x i32> %x, i32 1
  %r2 = extractelement <4 x i32> %x, i32 3
  store i32 %r2, i32* %dst, align 1
  ret i32 %r1
}

define i16 @extract_v8i16(<8 x i16> %x, i16* %dst) {
  %r1 = extractelement <8 x i16> %x, i32 1
  %r2 = extractelement <8 x i16> %x, i32 3
  store i16 %r2, i16* %dst, align 1
  ret i16 %r1
}

define i8 @extract_v16i8(<16 x i8> %x, i8* %dst) {
  %r1 = extractelement <16 x i8> %x, i32 1
  %r2 = extractelement <16 x i8> %x, i32 3
  store i8 %r2, i8* %dst, align 1
  ret i8 %r1
}

define <2 x i64> @insert_v2i64(<2 x i64> %x, i64 %y, i64* %ptr) {
  %val = load i64, i64* %ptr
  %r1 = insertelement <2 x i64> %x, i64 %val, i32 1
  %r2 = insertelement <2 x i64> %r1, i64 %y, i32 0
  ret <2 x i64> %r2
}

define <4 x i32> @insert_v4i32(<4 x i32> %x, i32 %y, i32* %ptr) {
  %val = load i32, i32* %ptr
  %r1 = insertelement <4 x i32> %x, i32 %val, i32 1
  %r2 = insertelement <4 x i32> %r1, i32 %y, i32 3
  ret <4 x i32> %r2
}

define <8 x i16> @insert_v8i16(<8 x i16> %x, i16 %y, i16* %ptr) {
  %val = load i16, i16* %ptr
  %r1 = insertelement <8 x i16> %x, i16 %val, i32 1
  %r2 = insertelement <8 x i16> %r1, i16 %y, i32 5
  ret <8 x i16> %r2
}

define <16 x i8> @insert_v16i8(<16 x i8> %x, i8 %y, i8* %ptr) {
  %val = load i8, i8* %ptr
  %r1 = insertelement <16 x i8> %x, i8 %val, i32 3
  %r2 = insertelement <16 x i8> %r1, i8 %y, i32 10
  ret <16 x i8> %r2
}

define <4 x i32> @shuffle_v4i32_0451(<4 x i32> %a, <4 x i32> %b) {
  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 5, i32 1>
  ret <4 x i32> %shuffle
}

define <4 x i32> @shuffle_v4i32_0142(<4 x i32> %a, <4 x i32> %b) {
  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
  ret <4 x i32> %shuffle
}

define <16 x i8> @shuffle_v16i8_0101010101010101(<16 x i8> %a, <16 x i8> %b) {
  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  ret <16 x i8> %shuffle
}

define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  ret <16 x i16> %shuffle
}

define <8 x float> @shuffle_v8f32_11335577(<8 x float> %a, <8 x float> %b) {
; vmovshdup 256 test
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
  ret <8 x float> %shuffle
}

define <4 x float> @shuffle_v4f32_1133(<4 x float> %a, <4 x float> %b) {
; vmovshdup 128 test
  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  ret <4 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_00224466(<8 x float> %a, <8 x float> %b) {
; vmovsldup 256 test
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  ret <8 x float> %shuffle
}

define <4 x float> @shuffle_v4f32_0022(<4 x float> %a, <4 x float> %b) {
; vmovsldup 128 test
  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x float> %shuffle
}
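; Scalar load/insert, store, and splat tests (vmovlpd/vmovhpd/vmovddup-style
; patterns).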
define <2 x double> @insert_mem_lo_v2f64(double* %ptr, <2 x double> %b) {
  %a = load double, double* %ptr
  %v = insertelement <2 x double> undef, double %a, i32 0
  %shuffle = shufflevector <2 x double> %v, <2 x double> %b, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %shuffle
}

define <2 x double> @insert_mem_hi_v2f64(double* %ptr, <2 x double> %b) {
  %a = load double, double* %ptr
  %v = insertelement <2 x double> undef, double %a, i32 0
  %shuffle = shufflevector <2 x double> %v, <2 x double> %b, <2 x i32> <i32 2, i32 0>
  ret <2 x double> %shuffle
}

define void @store_floats(<4 x float> %x, i64* %p) {
  %a = fadd <4 x float> %x, %x
  %b = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 1>
  %c = bitcast <2 x float> %b to i64
  store i64 %c, i64* %p
  ret void
}

define void @store_double(<2 x double> %x, i64* %p) {
  %a = fadd <2 x double> %x, %x
  %b = extractelement <2 x double> %a, i32 0
  %c = bitcast double %b to i64
  store i64 %c, i64* %p
  ret void
}

define void @store_h_double(<2 x double> %x, i64* %p) {
  %a = fadd <2 x double> %x, %x
  %b = extractelement <2 x double> %a, i32 1
  %c = bitcast double %b to i64
  store i64 %c, i64* %p
  ret void
}

define <2 x double> @test39(double* %ptr) nounwind {
  %a = load double, double* %ptr
  %v = insertelement <2 x double> undef, double %a, i32 0
  %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 0, i32 0>
  ret <2 x double> %shuffle
}

define <2 x double> @test40(<2 x double>* %ptr) nounwind {
  %v = load <2 x double>, <2 x double>* %ptr
  %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 0, i32 0>
  ret <2 x double> %shuffle
}

define <2 x double> @shuffle_v2f64_00(<2 x double> %a, <2 x double> %b) {
  %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 0>
  ret <2 x double> %shuffle
}

define <4 x double> @shuffle_v4f64_0022(<4 x double> %a, <4 x double> %b) {
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x double> %shuffle
}