1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=X32 3; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=X64 4 5; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx2-builtins.c 6 7define <4 x i64> @test_mm256_abs_epi8(<4 x i64> %a0) { 8; X32-LABEL: test_mm256_abs_epi8: 9; X32: # BB#0: 10; X32-NEXT: vpabsb %ymm0, %ymm0 11; X32-NEXT: retl 12; 13; X64-LABEL: test_mm256_abs_epi8: 14; X64: # BB#0: 15; X64-NEXT: vpabsb %ymm0, %ymm0 16; X64-NEXT: retq 17 %arg = bitcast <4 x i64> %a0 to <32 x i8> 18 %call = call <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8> %arg) 19 %res = bitcast <32 x i8> %call to <4 x i64> 20 ret <4 x i64> %res 21} 22declare <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8>) nounwind readnone 23 24define <4 x i64> @test_mm256_abs_epi16(<4 x i64> %a0) { 25; X32-LABEL: test_mm256_abs_epi16: 26; X32: # BB#0: 27; X32-NEXT: vpabsw %ymm0, %ymm0 28; X32-NEXT: retl 29; 30; X64-LABEL: test_mm256_abs_epi16: 31; X64: # BB#0: 32; X64-NEXT: vpabsw %ymm0, %ymm0 33; X64-NEXT: retq 34 %arg = bitcast <4 x i64> %a0 to <16 x i16> 35 %call = call <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16> %arg) 36 %res = bitcast <16 x i16> %call to <4 x i64> 37 ret <4 x i64> %res 38} 39declare <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16>) nounwind readnone 40 41define <4 x i64> @test_mm256_abs_epi32(<4 x i64> %a0) { 42; X32-LABEL: test_mm256_abs_epi32: 43; X32: # BB#0: 44; X32-NEXT: vpabsd %ymm0, %ymm0 45; X32-NEXT: retl 46; 47; X64-LABEL: test_mm256_abs_epi32: 48; X64: # BB#0: 49; X64-NEXT: vpabsd %ymm0, %ymm0 50; X64-NEXT: retq 51 %arg = bitcast <4 x i64> %a0 to <8 x i32> 52 %call = call <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32> %arg) 53 %res = bitcast <8 x i32> %call to <4 x i64> 54 ret <4 x i64> %res 55} 56declare <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32>) nounwind readnone 57 58define <4 x i64> @test_mm256_add_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind { 59; X32-LABEL: test_mm256_add_epi8: 60; X32: # BB#0: 61; X32-NEXT: vpaddb %ymm1, %ymm0, %ymm0 62; X32-NEXT: retl 63; 64; X64-LABEL: test_mm256_add_epi8: 65; X64: # BB#0: 66; X64-NEXT: vpaddb %ymm1, %ymm0, %ymm0 67; X64-NEXT: retq 68 %arg0 = bitcast <4 x i64> %a0 to <32 x i8> 69 %arg1 = bitcast <4 x i64> %a1 to <32 x i8> 70 %res = add <32 x i8> %arg0, %arg1 71 %bc = bitcast <32 x i8> %res to <4 x i64> 72 ret <4 x i64> %bc 73} 74 75define <4 x i64> @test_mm256_add_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind { 76; X32-LABEL: test_mm256_add_epi16: 77; X32: # BB#0: 78; X32-NEXT: vpaddw %ymm1, %ymm0, %ymm0 79; X32-NEXT: retl 80; 81; X64-LABEL: test_mm256_add_epi16: 82; X64: # BB#0: 83; X64-NEXT: vpaddw %ymm1, %ymm0, %ymm0 84; X64-NEXT: retq 85 %arg0 = bitcast <4 x i64> %a0 to <16 x i16> 86 %arg1 = bitcast <4 x i64> %a1 to <16 x i16> 87 %res = add <16 x i16> %arg0, %arg1 88 %bc = bitcast <16 x i16> %res to <4 x i64> 89 ret <4 x i64> %bc 90} 91 92define <4 x i64> @test_mm256_add_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind { 93; X32-LABEL: test_mm256_add_epi32: 94; X32: # BB#0: 95; X32-NEXT: vpaddd %ymm1, %ymm0, %ymm0 96; X32-NEXT: retl 97; 98; X64-LABEL: test_mm256_add_epi32: 99; X64: # BB#0: 100; X64-NEXT: vpaddd %ymm1, %ymm0, %ymm0 101; X64-NEXT: retq 102 %arg0 = bitcast <4 x i64> %a0 to <8 x i32> 103 %arg1 = bitcast <4 x i64> %a1 to <8 x i32> 104 %res = add <8 x i32> %arg0, %arg1 105 %bc = bitcast <8 x i32> %res to <4 x i64> 
106 ret <4 x i64> %bc 107} 108 109define <4 x i64> @test_mm256_add_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind { 110; X32-LABEL: test_mm256_add_epi64: 111; X32: # BB#0: 112; X32-NEXT: vpaddq %ymm1, %ymm0, %ymm0 113; X32-NEXT: retl 114; 115; X64-LABEL: test_mm256_add_epi64: 116; X64: # BB#0: 117; X64-NEXT: vpaddq %ymm1, %ymm0, %ymm0 118; X64-NEXT: retq 119 %res = add <4 x i64> %a0, %a1 120 ret <4 x i64> %res 121} 122 123define <4 x i64> @test_mm256_adds_epi8(<4 x i64> %a0, <4 x i64> %a1) { 124; X32-LABEL: test_mm256_adds_epi8: 125; X32: # BB#0: 126; X32-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 127; X32-NEXT: retl 128; 129; X64-LABEL: test_mm256_adds_epi8: 130; X64: # BB#0: 131; X64-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 132; X64-NEXT: retq 133 %arg0 = bitcast <4 x i64> %a0 to <32 x i8> 134 %arg1 = bitcast <4 x i64> %a1 to <32 x i8> 135 %res = call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> %arg0, <32 x i8> %arg1) 136 %bc = bitcast <32 x i8> %res to <4 x i64> 137 ret <4 x i64> %bc 138} 139declare <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8>, <32 x i8>) nounwind readnone 140 141define <4 x i64> @test_mm256_adds_epi16(<4 x i64> %a0, <4 x i64> %a1) { 142; X32-LABEL: test_mm256_adds_epi16: 143; X32: # BB#0: 144; X32-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 145; X32-NEXT: retl 146; 147; X64-LABEL: test_mm256_adds_epi16: 148; X64: # BB#0: 149; X64-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 150; X64-NEXT: retq 151 %arg0 = bitcast <4 x i64> %a0 to <16 x i16> 152 %arg1 = bitcast <4 x i64> %a1 to <16 x i16> 153 %res = call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> %arg0, <16 x i16> %arg1) 154 %bc = bitcast <16 x i16> %res to <4 x i64> 155 ret <4 x i64> %bc 156} 157declare <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16>, <16 x i16>) nounwind readnone 158 159define <4 x i64> @test_mm256_adds_epu8(<4 x i64> %a0, <4 x i64> %a1) { 160; X32-LABEL: test_mm256_adds_epu8: 161; X32: # BB#0: 162; X32-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 163; X32-NEXT: retl 164; 165; X64-LABEL: test_mm256_adds_epu8: 166; X64: # BB#0: 167; X64-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 168; X64-NEXT: retq 169 %arg0 = bitcast <4 x i64> %a0 to <32 x i8> 170 %arg1 = bitcast <4 x i64> %a1 to <32 x i8> 171 %res = call <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8> %arg0, <32 x i8> %arg1) 172 %bc = bitcast <32 x i8> %res to <4 x i64> 173 ret <4 x i64> %bc 174} 175declare <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8>, <32 x i8>) nounwind readnone 176 177define <4 x i64> @test_mm256_adds_epu16(<4 x i64> %a0, <4 x i64> %a1) { 178; X32-LABEL: test_mm256_adds_epu16: 179; X32: # BB#0: 180; X32-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 181; X32-NEXT: retl 182; 183; X64-LABEL: test_mm256_adds_epu16: 184; X64: # BB#0: 185; X64-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 186; X64-NEXT: retq 187 %arg0 = bitcast <4 x i64> %a0 to <16 x i16> 188 %arg1 = bitcast <4 x i64> %a1 to <16 x i16> 189 %res = call <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16> %arg0, <16 x i16> %arg1) 190 %bc = bitcast <16 x i16> %res to <4 x i64> 191 ret <4 x i64> %bc 192} 193declare <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16>, <16 x i16>) nounwind readnone 194 195define <4 x i64> @test_mm256_alignr_epi8(<4 x i64> %a0, <4 x i64> %a1) { 196; X32-LABEL: test_mm256_alignr_epi8: 197; X32: # BB#0: 198; X32-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1],ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17] 199; X32-NEXT: retl 200; 201; X64-LABEL: test_mm256_alignr_epi8: 202; X64: # BB#0: 203; X64-NEXT: vpalignr {{.*#+}} ymm0 = 
ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1],ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17] 204; X64-NEXT: retq 205 %arg0 = bitcast <4 x i64> %a0 to <32 x i8> 206 %arg1 = bitcast <4 x i64> %a1 to <32 x i8> 207 %shuf = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49> 208 %res = bitcast <32 x i8> %shuf to <4 x i64> 209 ret <4 x i64> %res 210} 211 212define <4 x i64> @test2_mm256_alignr_epi8(<4 x i64> %a0, <4 x i64> %a1) { 213; X32-LABEL: test2_mm256_alignr_epi8: 214; X32: # BB#0: 215; X32-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0],ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16] 216; X32-NEXT: retl 217; 218; X64-LABEL: test2_mm256_alignr_epi8: 219; X64: # BB#0: 220; X64-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0],ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16] 221; X64-NEXT: retq 222 %arg0 = bitcast <4 x i64> %a0 to <32 x i8> 223 %arg1 = bitcast <4 x i64> %a1 to <32 x i8> 224 %shuf = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48> 225 %res = bitcast <32 x i8> %shuf to <4 x i64> 226 ret <4 x i64> %res 227} 228 229define <4 x i64> @test_mm256_and_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind { 230; X32-LABEL: test_mm256_and_si256: 231; X32: # BB#0: 232; X32-NEXT: vandps %ymm1, %ymm0, %ymm0 233; X32-NEXT: retl 234; 235; X64-LABEL: test_mm256_and_si256: 236; X64: # BB#0: 237; X64-NEXT: vandps %ymm1, %ymm0, %ymm0 238; X64-NEXT: retq 239 %res = and <4 x i64> %a0, %a1 240 ret <4 x i64> %res 241} 242 243define <4 x i64> @test_mm256_andnot_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind { 244; X32-LABEL: test_mm256_andnot_si256: 245; X32: # BB#0: 246; X32-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 247; X32-NEXT: vpxor %ymm2, %ymm0, %ymm0 248; X32-NEXT: vpand %ymm1, %ymm0, %ymm0 249; X32-NEXT: retl 250; 251; X64-LABEL: test_mm256_andnot_si256: 252; X64: # BB#0: 253; X64-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 254; X64-NEXT: vpxor %ymm2, %ymm0, %ymm0 255; X64-NEXT: vpand %ymm1, %ymm0, %ymm0 256; X64-NEXT: retq 257 %not = xor <4 x i64> %a0, <i64 -1, i64 -1, i64 -1, i64 -1> 258 %res = and <4 x i64> %not, %a1 259 ret <4 x i64> %res 260} 261 262define <4 x i64> @test_mm256_avg_epu8(<4 x i64> %a0, <4 x i64> %a1) { 263; X32-LABEL: test_mm256_avg_epu8: 264; X32: # BB#0: 265; X32-NEXT: vpavgb %ymm1, %ymm0, %ymm0 266; X32-NEXT: retl 267; 268; X64-LABEL: test_mm256_avg_epu8: 269; X64: # BB#0: 270; X64-NEXT: vpavgb %ymm1, %ymm0, %ymm0 271; X64-NEXT: retq 272 %arg0 = bitcast <4 x i64> %a0 to <32 x i8> 273 %arg1 = bitcast <4 x i64> %a1 to <32 x i8> 274 %res = call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %arg0, <32 x i8> %arg1) 275 %bc = bitcast <32 x i8> %res to <4 x i64> 276 ret <4 x i64> %bc 277} 278declare <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8>, <32 x i8>) nounwind readnone 279 280define <4 x i64> @test_mm256_avg_epu16(<4 x i64> %a0, <4 x i64> %a1) { 281; X32-LABEL: test_mm256_avg_epu16: 282; X32: # BB#0: 283; X32-NEXT: vpavgw %ymm1, %ymm0, %ymm0 284; X32-NEXT: retl 285; 286; X64-LABEL: 
test_mm256_avg_epu16: 287; X64: # BB#0: 288; X64-NEXT: vpavgw %ymm1, %ymm0, %ymm0 289; X64-NEXT: retq 290 %arg0 = bitcast <4 x i64> %a0 to <16 x i16> 291 %arg1 = bitcast <4 x i64> %a1 to <16 x i16> 292 %res = call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %arg0, <16 x i16> %arg1) 293 %bc = bitcast <16 x i16> %res to <4 x i64> 294 ret <4 x i64> %bc 295} 296declare <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16>, <16 x i16>) nounwind readnone 297 298define <4 x i64> @test_mm256_blend_epi16(<4 x i64> %a0, <4 x i64> %a1) { 299; X32-LABEL: test_mm256_blend_epi16: 300; X32: # BB#0: 301; X32-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15] 302; X32-NEXT: retl 303; 304; X64-LABEL: test_mm256_blend_epi16: 305; X64: # BB#0: 306; X64-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15] 307; X64-NEXT: retq 308 %arg0 = bitcast <4 x i64> %a0 to <16 x i16> 309 %arg1 = bitcast <4 x i64> %a1 to <16 x i16> 310 %shuf = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 25, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 311 %res = bitcast <16 x i16> %shuf to <4 x i64> 312 ret <4 x i64> %res 313} 314 315define <2 x i64> @test_mm_blend_epi32(<2 x i64> %a0, <2 x i64> %a1) { 316; X32-LABEL: test_mm_blend_epi32: 317; X32: # BB#0: 318; X32-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] 319; X32-NEXT: retl 320; 321; X64-LABEL: test_mm_blend_epi32: 322; X64: # BB#0: 323; X64-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] 324; X64-NEXT: retq 325 %arg0 = bitcast <2 x i64> %a0 to <4 x i32> 326 %arg1 = bitcast <2 x i64> %a1 to <4 x i32> 327 %shuf = shufflevector <4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 328 %res = bitcast <4 x i32> %shuf to <2 x i64> 329 ret <2 x i64> %res 330} 331 332define <4 x i64> @test_mm256_blend_epi32(<4 x i64> %a0, <4 x i64> %a1) { 333; X32-LABEL: test_mm256_blend_epi32: 334; X32: # BB#0: 335; X32-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6,7] 336; X32-NEXT: retl 337; 338; X64-LABEL: test_mm256_blend_epi32: 339; X64: # BB#0: 340; X64-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6,7] 341; X64-NEXT: retq 342 %arg0 = bitcast <4 x i64> %a0 to <8 x i32> 343 %arg1 = bitcast <4 x i64> %a1 to <8 x i32> 344 %shuf = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 13, i32 6, i32 7> 345 %res = bitcast <8 x i32> %shuf to <4 x i64> 346 ret <4 x i64> %res 347} 348 349define <4 x i64> @test_mm256_blendv_epi8(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) { 350; X32-LABEL: test_mm256_blendv_epi8: 351; X32: # BB#0: 352; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 353; X32-NEXT: retl 354; 355; X64-LABEL: test_mm256_blendv_epi8: 356; X64: # BB#0: 357; X64-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 358; X64-NEXT: retq 359 %arg0 = bitcast <4 x i64> %a0 to <32 x i8> 360 %arg1 = bitcast <4 x i64> %a1 to <32 x i8> 361 %arg2 = bitcast <4 x i64> %a2 to <32 x i8> 362 %call = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %arg0, <32 x i8> %arg1, <32 x i8> %arg2) 363 %res = bitcast <32 x i8> %call to <4 x i64> 364 ret <4 x i64> %res 365} 366declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>) nounwind readnone 367 368define <2 x i64> @test_mm_broadcastb_epi8(<2 x i64> %a0) { 369; X32-LABEL: test_mm_broadcastb_epi8: 370; X32: # BB#0: 371; X32-NEXT: 
vpbroadcastb %xmm0, %xmm0 372; X32-NEXT: retl 373; 374; X64-LABEL: test_mm_broadcastb_epi8: 375; X64: # BB#0: 376; X64-NEXT: vpbroadcastb %xmm0, %xmm0 377; X64-NEXT: retq 378 %arg0 = bitcast <2 x i64> %a0 to <16 x i8> 379 %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <16 x i32> zeroinitializer 380 %res = bitcast <16 x i8> %shuf to <2 x i64> 381 ret <2 x i64> %res 382} 383 384define <4 x i64> @test_mm256_broadcastb_epi8(<4 x i64> %a0) { 385; X32-LABEL: test_mm256_broadcastb_epi8: 386; X32: # BB#0: 387; X32-NEXT: vpbroadcastb %xmm0, %ymm0 388; X32-NEXT: retl 389; 390; X64-LABEL: test_mm256_broadcastb_epi8: 391; X64: # BB#0: 392; X64-NEXT: vpbroadcastb %xmm0, %ymm0 393; X64-NEXT: retq 394 %arg0 = bitcast <4 x i64> %a0 to <32 x i8> 395 %shuf = shufflevector <32 x i8> %arg0, <32 x i8> undef, <32 x i32> zeroinitializer 396 %res = bitcast <32 x i8> %shuf to <4 x i64> 397 ret <4 x i64> %res 398} 399 400define <2 x i64> @test_mm_broadcastd_epi32(<2 x i64> %a0) { 401; X32-LABEL: test_mm_broadcastd_epi32: 402; X32: # BB#0: 403; X32-NEXT: vbroadcastss %xmm0, %xmm0 404; X32-NEXT: retl 405; 406; X64-LABEL: test_mm_broadcastd_epi32: 407; X64: # BB#0: 408; X64-NEXT: vbroadcastss %xmm0, %xmm0 409; X64-NEXT: retq 410 %arg0 = bitcast <2 x i64> %a0 to <4 x i32> 411 %shuf = shufflevector <4 x i32> %arg0, <4 x i32> undef, <4 x i32> zeroinitializer 412 %res = bitcast <4 x i32> %shuf to <2 x i64> 413 ret <2 x i64> %res 414} 415 416define <4 x i64> @test_mm256_broadcastd_epi32(<4 x i64> %a0) { 417; X32-LABEL: test_mm256_broadcastd_epi32: 418; X32: # BB#0: 419; X32-NEXT: vbroadcastss %xmm0, %ymm0 420; X32-NEXT: retl 421; 422; X64-LABEL: test_mm256_broadcastd_epi32: 423; X64: # BB#0: 424; X64-NEXT: vbroadcastss %xmm0, %ymm0 425; X64-NEXT: retq 426 %arg0 = bitcast <4 x i64> %a0 to <8 x i32> 427 %shuf = shufflevector <8 x i32> %arg0, <8 x i32> undef, <8 x i32> zeroinitializer 428 %res = bitcast <8 x i32> %shuf to <4 x i64> 429 ret <4 x i64> %res 430} 431 432define <2 x i64> @test_mm_broadcastq_epi64(<2 x i64> %a0) { 433; X32-LABEL: test_mm_broadcastq_epi64: 434; X32: # BB#0: 435; X32-NEXT: vpbroadcastq %xmm0, %xmm0 436; X32-NEXT: retl 437; 438; X64-LABEL: test_mm_broadcastq_epi64: 439; X64: # BB#0: 440; X64-NEXT: vpbroadcastq %xmm0, %xmm0 441; X64-NEXT: retq 442 %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> zeroinitializer 443 ret <2 x i64> %res 444} 445 446define <4 x i64> @test_mm256_broadcastq_epi64(<4 x i64> %a0) { 447; X32-LABEL: test_mm256_broadcastq_epi64: 448; X32: # BB#0: 449; X32-NEXT: vbroadcastsd %xmm0, %ymm0 450; X32-NEXT: retl 451; 452; X64-LABEL: test_mm256_broadcastq_epi64: 453; X64: # BB#0: 454; X64-NEXT: vbroadcastsd %xmm0, %ymm0 455; X64-NEXT: retq 456 %res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> zeroinitializer 457 ret <4 x i64> %res 458} 459 460define <2 x double> @test_mm_broadcastsd_pd(<2 x double> %a0) { 461; X32-LABEL: test_mm_broadcastsd_pd: 462; X32: # BB#0: 463; X32-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 464; X32-NEXT: retl 465; 466; X64-LABEL: test_mm_broadcastsd_pd: 467; X64: # BB#0: 468; X64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 469; X64-NEXT: retq 470 %res = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer 471 ret <2 x double> %res 472} 473 474define <4 x double> @test_mm256_broadcastsd_pd(<4 x double> %a0) { 475; X32-LABEL: test_mm256_broadcastsd_pd: 476; X32: # BB#0: 477; X32-NEXT: vbroadcastsd %xmm0, %ymm0 478; X32-NEXT: retl 479; 480; X64-LABEL: test_mm256_broadcastsd_pd: 481; X64: # BB#0: 482; X64-NEXT: 
vbroadcastsd %xmm0, %ymm0 483; X64-NEXT: retq 484 %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> zeroinitializer 485 ret <4 x double> %res 486} 487 488define <4 x i64> @test_mm256_broadcastsi128_si256(<4 x i64> %a0) { 489; X32-LABEL: test_mm256_broadcastsi128_si256: 490; X32: # BB#0: 491; X32-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 492; X32-NEXT: retl 493; 494; X64-LABEL: test_mm256_broadcastsi128_si256: 495; X64: # BB#0: 496; X64-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 497; X64-NEXT: retq 498 %res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1> 499 ret <4 x i64> %res 500} 501 502define <4 x float> @test_mm_broadcastss_ps(<4 x float> %a0) { 503; X32-LABEL: test_mm_broadcastss_ps: 504; X32: # BB#0: 505; X32-NEXT: vbroadcastss %xmm0, %xmm0 506; X32-NEXT: retl 507; 508; X64-LABEL: test_mm_broadcastss_ps: 509; X64: # BB#0: 510; X64-NEXT: vbroadcastss %xmm0, %xmm0 511; X64-NEXT: retq 512 %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer 513 ret <4 x float> %res 514} 515 516define <8 x float> @test_mm256_broadcastss_ps(<8 x float> %a0) { 517; X32-LABEL: test_mm256_broadcastss_ps: 518; X32: # BB#0: 519; X32-NEXT: vbroadcastss %xmm0, %ymm0 520; X32-NEXT: retl 521; 522; X64-LABEL: test_mm256_broadcastss_ps: 523; X64: # BB#0: 524; X64-NEXT: vbroadcastss %xmm0, %ymm0 525; X64-NEXT: retq 526 %res = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> zeroinitializer 527 ret <8 x float> %res 528} 529 530define <2 x i64> @test_mm_broadcastw_epi16(<2 x i64> %a0) { 531; X32-LABEL: test_mm_broadcastw_epi16: 532; X32: # BB#0: 533; X32-NEXT: vpbroadcastw %xmm0, %xmm0 534; X32-NEXT: retl 535; 536; X64-LABEL: test_mm_broadcastw_epi16: 537; X64: # BB#0: 538; X64-NEXT: vpbroadcastw %xmm0, %xmm0 539; X64-NEXT: retq 540 %arg0 = bitcast <2 x i64> %a0 to <8 x i16> 541 %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <8 x i32> zeroinitializer 542 %res = bitcast <8 x i16> %shuf to <2 x i64> 543 ret <2 x i64> %res 544} 545 546define <4 x i64> @test_mm256_broadcastw_epi16(<4 x i64> %a0) { 547; X32-LABEL: test_mm256_broadcastw_epi16: 548; X32: # BB#0: 549; X32-NEXT: vpbroadcastw %xmm0, %ymm0 550; X32-NEXT: retl 551; 552; X64-LABEL: test_mm256_broadcastw_epi16: 553; X64: # BB#0: 554; X64-NEXT: vpbroadcastw %xmm0, %ymm0 555; X64-NEXT: retq 556 %arg0 = bitcast <4 x i64> %a0 to <16 x i16> 557 %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> zeroinitializer 558 %res = bitcast <16 x i16> %shuf to <4 x i64> 559 ret <4 x i64> %res 560} 561 562define <4 x i64> @test_mm256_bslli_epi128(<4 x i64> %a0) { 563; X32-LABEL: test_mm256_bslli_epi128: 564; X32: # BB#0: 565; X32-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28] 566; X32-NEXT: retl 567; 568; X64-LABEL: test_mm256_bslli_epi128: 569; X64: # BB#0: 570; X64-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28] 571; X64-NEXT: retq 572 %arg0 = bitcast <4 x i64> %a0 to <32 x i8> 573 %shuf = shufflevector <32 x i8> zeroinitializer, <32 x i8> %arg0, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60> 574 %res = bitcast <32 x i8> %shuf to <4 x i64> 575 ret <4 x i64> 
%res 576} 577 578define <4 x i64> @test_mm256_bsrli_epi128(<4 x i64> %a0) { 579; X32-LABEL: test_mm256_bsrli_epi128: 580; X32: # BB#0: 581; X32-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero 582; X32-NEXT: retl 583; 584; X64-LABEL: test_mm256_bsrli_epi128: 585; X64: # BB#0: 586; X64-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero 587; X64-NEXT: retq 588 %arg0 = bitcast <4 x i64> %a0 to <32 x i8> 589 %shuf = shufflevector <32 x i8> %arg0, <32 x i8> zeroinitializer, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50> 590 %res = bitcast <32 x i8> %shuf to <4 x i64> 591 ret <4 x i64> %res 592} 593 594define <4 x i64> @test_mm256_cmpeq_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind { 595; X32-LABEL: test_mm256_cmpeq_epi8: 596; X32: # BB#0: 597; X32-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 598; X32-NEXT: retl 599; 600; X64-LABEL: test_mm256_cmpeq_epi8: 601; X64: # BB#0: 602; X64-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 603; X64-NEXT: retq 604 %arg0 = bitcast <4 x i64> %a0 to <32 x i8> 605 %arg1 = bitcast <4 x i64> %a1 to <32 x i8> 606 %cmp = icmp eq <32 x i8> %arg0, %arg1 607 %res = sext <32 x i1> %cmp to <32 x i8> 608 %bc = bitcast <32 x i8> %res to <4 x i64> 609 ret <4 x i64> %bc 610} 611 612define <4 x i64> @test_mm256_cmpeq_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind { 613; X32-LABEL: test_mm256_cmpeq_epi16: 614; X32: # BB#0: 615; X32-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 616; X32-NEXT: retl 617; 618; X64-LABEL: test_mm256_cmpeq_epi16: 619; X64: # BB#0: 620; X64-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 621; X64-NEXT: retq 622 %arg0 = bitcast <4 x i64> %a0 to <16 x i16> 623 %arg1 = bitcast <4 x i64> %a1 to <16 x i16> 624 %cmp = icmp eq <16 x i16> %arg0, %arg1 625 %res = sext <16 x i1> %cmp to <16 x i16> 626 %bc = bitcast <16 x i16> %res to <4 x i64> 627 ret <4 x i64> %bc 628} 629 630define <4 x i64> @test_mm256_cmpeq_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind { 631; X32-LABEL: test_mm256_cmpeq_epi32: 632; X32: # BB#0: 633; X32-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 634; X32-NEXT: retl 635; 636; X64-LABEL: test_mm256_cmpeq_epi32: 637; X64: # BB#0: 638; X64-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 639; X64-NEXT: retq 640 %arg0 = bitcast <4 x i64> %a0 to <8 x i32> 641 %arg1 = bitcast <4 x i64> %a1 to <8 x i32> 642 %cmp = icmp eq <8 x i32> %arg0, %arg1 643 %res = sext <8 x i1> %cmp to <8 x i32> 644 %bc = bitcast <8 x i32> %res to <4 x i64> 645 ret <4 x i64> %bc 646} 647 648define <4 x i64> @test_mm256_cmpeq_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind { 649; X32-LABEL: test_mm256_cmpeq_epi64: 650; X32: # BB#0: 651; X32-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 652; X32-NEXT: retl 653; 654; X64-LABEL: test_mm256_cmpeq_epi64: 655; X64: # BB#0: 656; X64-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 657; X64-NEXT: retq 658 %cmp = icmp eq <4 x i64> %a0, %a1 659 %res = sext <4 x i1> %cmp to <4 x i64> 660 ret <4 x i64> %res 661} 662 663define <4 x i64> @test_mm256_cmpgt_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind { 664; X32-LABEL: test_mm256_cmpgt_epi8: 665; X32: # BB#0: 666; X32-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 667; X32-NEXT: retl 668; 669; X64-LABEL: test_mm256_cmpgt_epi8: 670; X64: # BB#0: 671; X64-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 672; X64-NEXT: retq 
673 %arg0 = bitcast <4 x i64> %a0 to <32 x i8> 674 %arg1 = bitcast <4 x i64> %a1 to <32 x i8> 675 %cmp = icmp sgt <32 x i8> %arg0, %arg1 676 %res = sext <32 x i1> %cmp to <32 x i8> 677 %bc = bitcast <32 x i8> %res to <4 x i64> 678 ret <4 x i64> %bc 679} 680 681define <4 x i64> @test_mm256_cmpgt_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind { 682; X32-LABEL: test_mm256_cmpgt_epi16: 683; X32: # BB#0: 684; X32-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 685; X32-NEXT: retl 686; 687; X64-LABEL: test_mm256_cmpgt_epi16: 688; X64: # BB#0: 689; X64-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 690; X64-NEXT: retq 691 %arg0 = bitcast <4 x i64> %a0 to <16 x i16> 692 %arg1 = bitcast <4 x i64> %a1 to <16 x i16> 693 %cmp = icmp sgt <16 x i16> %arg0, %arg1 694 %res = sext <16 x i1> %cmp to <16 x i16> 695 %bc = bitcast <16 x i16> %res to <4 x i64> 696 ret <4 x i64> %bc 697} 698 699define <4 x i64> @test_mm256_cmpgt_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind { 700; X32-LABEL: test_mm256_cmpgt_epi32: 701; X32: # BB#0: 702; X32-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 703; X32-NEXT: retl 704; 705; X64-LABEL: test_mm256_cmpgt_epi32: 706; X64: # BB#0: 707; X64-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 708; X64-NEXT: retq 709 %arg0 = bitcast <4 x i64> %a0 to <8 x i32> 710 %arg1 = bitcast <4 x i64> %a1 to <8 x i32> 711 %cmp = icmp sgt <8 x i32> %arg0, %arg1 712 %res = sext <8 x i1> %cmp to <8 x i32> 713 %bc = bitcast <8 x i32> %res to <4 x i64> 714 ret <4 x i64> %bc 715} 716 717define <4 x i64> @test_mm256_cmpgt_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind { 718; X32-LABEL: test_mm256_cmpgt_epi64: 719; X32: # BB#0: 720; X32-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 721; X32-NEXT: retl 722; 723; X64-LABEL: test_mm256_cmpgt_epi64: 724; X64: # BB#0: 725; X64-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 726; X64-NEXT: retq 727 %cmp = icmp sgt <4 x i64> %a0, %a1 728 %res = sext <4 x i1> %cmp to <4 x i64> 729 ret <4 x i64> %res 730} 731 732define <4 x i64> @test_mm256_cvtepi8_epi16(<2 x i64> %a0) { 733; X32-LABEL: test_mm256_cvtepi8_epi16: 734; X32: # BB#0: 735; X32-NEXT: vpmovsxbw %xmm0, %ymm0 736; X32-NEXT: retl 737; 738; X64-LABEL: test_mm256_cvtepi8_epi16: 739; X64: # BB#0: 740; X64-NEXT: vpmovsxbw %xmm0, %ymm0 741; X64-NEXT: retq 742 %arg0 = bitcast <2 x i64> %a0 to <16 x i8> 743 %ext = sext <16 x i8> %arg0 to <16 x i16> 744 %res = bitcast <16 x i16> %ext to <4 x i64> 745 ret <4 x i64> %res 746} 747 748define <4 x i64> @test_mm256_cvtepi8_epi32(<2 x i64> %a0) { 749; X32-LABEL: test_mm256_cvtepi8_epi32: 750; X32: # BB#0: 751; X32-NEXT: vpmovsxbd %xmm0, %ymm0 752; X32-NEXT: retl 753; 754; X64-LABEL: test_mm256_cvtepi8_epi32: 755; X64: # BB#0: 756; X64-NEXT: vpmovsxbd %xmm0, %ymm0 757; X64-NEXT: retq 758 %arg0 = bitcast <2 x i64> %a0 to <16 x i8> 759 %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 760 %ext = sext <8 x i8> %shuf to <8 x i32> 761 %res = bitcast <8 x i32> %ext to <4 x i64> 762 ret <4 x i64> %res 763} 764 765define <4 x i64> @test_mm256_cvtepi8_epi64(<2 x i64> %a0) { 766; X32-LABEL: test_mm256_cvtepi8_epi64: 767; X32: # BB#0: 768; X32-NEXT: vpmovsxbq %xmm0, %ymm0 769; X32-NEXT: retl 770; 771; X64-LABEL: test_mm256_cvtepi8_epi64: 772; X64: # BB#0: 773; X64-NEXT: vpmovsxbq %xmm0, %ymm0 774; X64-NEXT: retq 775 %arg0 = bitcast <2 x i64> %a0 to <16 x i8> 776 %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 777 %ext = sext <4 x i8> %shuf to <4 x i64> 778 ret <4 x i64> %ext 779} 780 781define <4 x i64> @test_mm256_cvtepi16_epi32(<2 x i64> 
%a0) { 782; X32-LABEL: test_mm256_cvtepi16_epi32: 783; X32: # BB#0: 784; X32-NEXT: vpmovsxwd %xmm0, %ymm0 785; X32-NEXT: retl 786; 787; X64-LABEL: test_mm256_cvtepi16_epi32: 788; X64: # BB#0: 789; X64-NEXT: vpmovsxwd %xmm0, %ymm0 790; X64-NEXT: retq 791 %arg0 = bitcast <2 x i64> %a0 to <8 x i16> 792 %ext = sext <8 x i16> %arg0 to <8 x i32> 793 %res = bitcast <8 x i32> %ext to <4 x i64> 794 ret <4 x i64> %res 795} 796 797define <4 x i64> @test_mm256_cvtepi16_epi64(<2 x i64> %a0) { 798; X32-LABEL: test_mm256_cvtepi16_epi64: 799; X32: # BB#0: 800; X32-NEXT: vpmovsxwq %xmm0, %ymm0 801; X32-NEXT: retl 802; 803; X64-LABEL: test_mm256_cvtepi16_epi64: 804; X64: # BB#0: 805; X64-NEXT: vpmovsxwq %xmm0, %ymm0 806; X64-NEXT: retq 807 %arg0 = bitcast <2 x i64> %a0 to <8 x i16> 808 %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 809 %ext = sext <4 x i16> %shuf to <4 x i64> 810 ret <4 x i64> %ext 811} 812 813define <4 x i64> @test_mm256_cvtepi32_epi64(<2 x i64> %a0) { 814; X32-LABEL: test_mm256_cvtepi32_epi64: 815; X32: # BB#0: 816; X32-NEXT: vpmovsxdq %xmm0, %ymm0 817; X32-NEXT: retl 818; 819; X64-LABEL: test_mm256_cvtepi32_epi64: 820; X64: # BB#0: 821; X64-NEXT: vpmovsxdq %xmm0, %ymm0 822; X64-NEXT: retq 823 %arg0 = bitcast <2 x i64> %a0 to <4 x i32> 824 %ext = sext <4 x i32> %arg0 to <4 x i64> 825 ret <4 x i64> %ext 826} 827 828define <4 x i64> @test_mm256_cvtepu8_epi16(<2 x i64> %a0) { 829; X32-LABEL: test_mm256_cvtepu8_epi16: 830; X32: # BB#0: 831; X32-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 832; X32-NEXT: retl 833; 834; X64-LABEL: test_mm256_cvtepu8_epi16: 835; X64: # BB#0: 836; X64-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 837; X64-NEXT: retq 838 %arg0 = bitcast <2 x i64> %a0 to <16 x i8> 839 %ext = zext <16 x i8> %arg0 to <16 x i16> 840 %res = bitcast <16 x i16> %ext to <4 x i64> 841 ret <4 x i64> %res 842} 843 844define <4 x i64> @test_mm256_cvtepu8_epi32(<2 x i64> %a0) { 845; X32-LABEL: test_mm256_cvtepu8_epi32: 846; X32: # BB#0: 847; X32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 848; X32-NEXT: retl 849; 850; X64-LABEL: test_mm256_cvtepu8_epi32: 851; X64: # BB#0: 852; X64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 853; X64-NEXT: retq 854 %arg0 = bitcast <2 x i64> %a0 to <16 x i8> 855 %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 856 %ext = zext <8 x i8> %shuf to <8 x i32> 857 %res = bitcast <8 x i32> %ext to <4 x i64> 858 ret <4 x i64> %res 859} 860 861define <4 x i64> @test_mm256_cvtepu8_epi64(<2 x i64> %a0) { 862; X32-LABEL: test_mm256_cvtepu8_epi64: 863; X32: # BB#0: 864; X32-NEXT: vpmovzxbq {{.*#+}} ymm0 = 
xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero 865; X32-NEXT: retl 866; 867; X64-LABEL: test_mm256_cvtepu8_epi64: 868; X64: # BB#0: 869; X64-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero 870; X64-NEXT: retq 871 %arg0 = bitcast <2 x i64> %a0 to <16 x i8> 872 %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 873 %ext = zext <4 x i8> %shuf to <4 x i64> 874 ret <4 x i64> %ext 875} 876 877define <4 x i64> @test_mm256_cvtepu16_epi32(<2 x i64> %a0) { 878; X32-LABEL: test_mm256_cvtepu16_epi32: 879; X32: # BB#0: 880; X32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 881; X32-NEXT: retl 882; 883; X64-LABEL: test_mm256_cvtepu16_epi32: 884; X64: # BB#0: 885; X64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 886; X64-NEXT: retq 887 %arg0 = bitcast <2 x i64> %a0 to <8 x i16> 888 %ext = zext <8 x i16> %arg0 to <8 x i32> 889 %res = bitcast <8 x i32> %ext to <4 x i64> 890 ret <4 x i64> %res 891} 892 893define <4 x i64> @test_mm256_cvtepu16_epi64(<2 x i64> %a0) { 894; X32-LABEL: test_mm256_cvtepu16_epi64: 895; X32: # BB#0: 896; X32-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 897; X32-NEXT: retl 898; 899; X64-LABEL: test_mm256_cvtepu16_epi64: 900; X64: # BB#0: 901; X64-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 902; X64-NEXT: retq 903 %arg0 = bitcast <2 x i64> %a0 to <8 x i16> 904 %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 905 %ext = zext <4 x i16> %shuf to <4 x i64> 906 ret <4 x i64> %ext 907} 908 909define <4 x i64> @test_mm256_cvtepu32_epi64(<2 x i64> %a0) { 910; X32-LABEL: test_mm256_cvtepu32_epi64: 911; X32: # BB#0: 912; X32-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 913; X32-NEXT: retl 914; 915; X64-LABEL: test_mm256_cvtepu32_epi64: 916; X64: # BB#0: 917; X64-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 918; X64-NEXT: retq 919 %arg0 = bitcast <2 x i64> %a0 to <4 x i32> 920 %ext = zext <4 x i32> %arg0 to <4 x i64> 921 ret <4 x i64> %ext 922} 923 924define <2 x i64> @test_mm256_extracti128_si256(<4 x i64> %a0) nounwind { 925; X32-LABEL: test_mm256_extracti128_si256: 926; X32: # BB#0: 927; X32-NEXT: vextractf128 $1, %ymm0, %xmm0 928; X32-NEXT: vzeroupper 929; X32-NEXT: retl 930; 931; X64-LABEL: test_mm256_extracti128_si256: 932; X64: # BB#0: 933; X64-NEXT: vextractf128 $1, %ymm0, %xmm0 934; X64-NEXT: vzeroupper 935; X64-NEXT: retq 936 %res = shufflevector <4 x i64> %a0, <4 x i64> %a0, <2 x i32> <i32 2, i32 3> 937 ret <2 x i64> %res 938} 939 940define <4 x i64> @test_mm256_hadd_epi16(<4 x i64> %a0, <4 x i64> %a1) { 941; X32-LABEL: test_mm256_hadd_epi16: 942; X32: # BB#0: 943; X32-NEXT: vphaddw %ymm1, %ymm0, %ymm0 944; X32-NEXT: retl 945; 946; X64-LABEL: test_mm256_hadd_epi16: 947; X64: # BB#0: 948; X64-NEXT: vphaddw %ymm1, %ymm0, %ymm0 949; X64-NEXT: retq 950 %arg0 = bitcast <4 x i64> %a0 to <16 x i16> 951 %arg1 = 
bitcast <4 x i64> %a1 to <16 x i16> 952 %res = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %arg0, <16 x i16> %arg1) 953 %bc = bitcast <16 x i16> %res to <4 x i64> 954 ret <4 x i64> %bc 955} 956declare <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>) nounwind readnone 957 958define <4 x i64> @test_mm256_hadd_epi32(<4 x i64> %a0, <4 x i64> %a1) { 959; X32-LABEL: test_mm256_hadd_epi32: 960; X32: # BB#0: 961; X32-NEXT: vphaddd %ymm1, %ymm0, %ymm0 962; X32-NEXT: retl 963; 964; X64-LABEL: test_mm256_hadd_epi32: 965; X64: # BB#0: 966; X64-NEXT: vphaddd %ymm1, %ymm0, %ymm0 967; X64-NEXT: retq 968 %arg0 = bitcast <4 x i64> %a0 to <8 x i32> 969 %arg1 = bitcast <4 x i64> %a1 to <8 x i32> 970 %res = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %arg0, <8 x i32> %arg1) 971 %bc = bitcast <8 x i32> %res to <4 x i64> 972 ret <4 x i64> %bc 973} 974declare <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32>, <8 x i32>) nounwind readnone 975 976define <4 x i64> @test_mm256_hadds_epi16(<4 x i64> %a0, <4 x i64> %a1) { 977; X32-LABEL: test_mm256_hadds_epi16: 978; X32: # BB#0: 979; X32-NEXT: vphaddsw %ymm1, %ymm0, %ymm0 980; X32-NEXT: retl 981; 982; X64-LABEL: test_mm256_hadds_epi16: 983; X64: # BB#0: 984; X64-NEXT: vphaddsw %ymm1, %ymm0, %ymm0 985; X64-NEXT: retq 986 %arg0 = bitcast <4 x i64> %a0 to <16 x i16> 987 %arg1 = bitcast <4 x i64> %a1 to <16 x i16> 988 %res = call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %arg0, <16 x i16> %arg1) 989 %bc = bitcast <16 x i16> %res to <4 x i64> 990 ret <4 x i64> %bc 991} 992declare <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16>, <16 x i16>) nounwind readnone 993 994define <4 x i64> @test_mm256_hsub_epi16(<4 x i64> %a0, <4 x i64> %a1) { 995; X32-LABEL: test_mm256_hsub_epi16: 996; X32: # BB#0: 997; X32-NEXT: vphsubw %ymm1, %ymm0, %ymm0 998; X32-NEXT: retl 999; 1000; X64-LABEL: test_mm256_hsub_epi16: 1001; X64: # BB#0: 1002; X64-NEXT: vphsubw %ymm1, %ymm0, %ymm0 1003; X64-NEXT: retq 1004 %arg0 = bitcast <4 x i64> %a0 to <16 x i16> 1005 %arg1 = bitcast <4 x i64> %a1 to <16 x i16> 1006 %res = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %arg0, <16 x i16> %arg1) 1007 %bc = bitcast <16 x i16> %res to <4 x i64> 1008 ret <4 x i64> %bc 1009} 1010declare <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16>, <16 x i16>) nounwind readnone 1011 1012define <4 x i64> @test_mm256_hsub_epi32(<4 x i64> %a0, <4 x i64> %a1) { 1013; X32-LABEL: test_mm256_hsub_epi32: 1014; X32: # BB#0: 1015; X32-NEXT: vphsubd %ymm1, %ymm0, %ymm0 1016; X32-NEXT: retl 1017; 1018; X64-LABEL: test_mm256_hsub_epi32: 1019; X64: # BB#0: 1020; X64-NEXT: vphsubd %ymm1, %ymm0, %ymm0 1021; X64-NEXT: retq 1022 %arg0 = bitcast <4 x i64> %a0 to <8 x i32> 1023 %arg1 = bitcast <4 x i64> %a1 to <8 x i32> 1024 %res = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %arg0, <8 x i32> %arg1) 1025 %bc = bitcast <8 x i32> %res to <4 x i64> 1026 ret <4 x i64> %bc 1027} 1028declare <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32>, <8 x i32>) nounwind readnone 1029 1030define <4 x i64> @test_mm256_hsubs_epi16(<4 x i64> %a0, <4 x i64> %a1) { 1031; X32-LABEL: test_mm256_hsubs_epi16: 1032; X32: # BB#0: 1033; X32-NEXT: vphsubsw %ymm1, %ymm0, %ymm0 1034; X32-NEXT: retl 1035; 1036; X64-LABEL: test_mm256_hsubs_epi16: 1037; X64: # BB#0: 1038; X64-NEXT: vphsubsw %ymm1, %ymm0, %ymm0 1039; X64-NEXT: retq 1040 %arg0 = bitcast <4 x i64> %a0 to <16 x i16> 1041 %arg1 = bitcast <4 x i64> %a1 to <16 x i16> 1042 %res = call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %arg0, <16 x i16> %arg1) 1043 %bc = bitcast <16 x i16> %res to <4 x i64> 1044 ret <4 
x i64> %bc 1045} 1046declare <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16>, <16 x i16>) nounwind readnone 1047 1048define <2 x i64> @test_mm_i32gather_epi32(i32 *%a0, <2 x i64> %a1) { 1049; X32-LABEL: test_mm_i32gather_epi32: 1050; X32: # BB#0: 1051; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 1052; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1053; X32-NEXT: vpgatherdd %xmm2, (%eax,%xmm0,2), %xmm1 1054; X32-NEXT: vmovdqa %xmm1, %xmm0 1055; X32-NEXT: retl 1056; 1057; X64-LABEL: test_mm_i32gather_epi32: 1058; X64: # BB#0: 1059; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1060; X64-NEXT: vpgatherdd %xmm2, (%rdi,%xmm0,2), %xmm1 1061; X64-NEXT: vmovdqa %xmm1, %xmm0 1062; X64-NEXT: retq 1063 %arg0 = bitcast i32 *%a0 to i8* 1064 %arg1 = bitcast <2 x i64> %a1 to <4 x i32> 1065 %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32> 1066 %call = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> undef, i8* %arg0, <4 x i32> %arg1, <4 x i32> %mask, i8 2) 1067 %bc = bitcast <4 x i32> %call to <2 x i64> 1068 ret <2 x i64> %bc 1069} 1070declare <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32>, i8*, <4 x i32>, <4 x i32>, i8) nounwind readonly 1071 1072define <2 x i64> @test_mm_mask_i32gather_epi32(<2 x i64> %a0, i32 *%a1, <2 x i64> %a2, <2 x i64> %a3) { 1073; X32-LABEL: test_mm_mask_i32gather_epi32: 1074; X32: # BB#0: 1075; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 1076; X32-NEXT: vpgatherdd %xmm2, (%eax,%xmm1,2), %xmm0 1077; X32-NEXT: retl 1078; 1079; X64-LABEL: test_mm_mask_i32gather_epi32: 1080; X64: # BB#0: 1081; X64-NEXT: vpgatherdd %xmm2, (%rdi,%xmm1,2), %xmm0 1082; X64-NEXT: retq 1083 %arg0 = bitcast <2 x i64> %a0 to <4 x i32> 1084 %arg1 = bitcast i32 *%a1 to i8* 1085 %arg2 = bitcast <2 x i64> %a2 to <4 x i32> 1086 %arg3 = bitcast <2 x i64> %a3 to <4 x i32> 1087 %call = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> %arg0, i8* %arg1, <4 x i32> %arg2, <4 x i32> %arg3, i8 2) 1088 %bc = bitcast <4 x i32> %call to <2 x i64> 1089 ret <2 x i64> %bc 1090} 1091 1092define <4 x i64> @test_mm256_i32gather_epi32(i32 *%a0, <4 x i64> %a1) { 1093; X32-LABEL: test_mm256_i32gather_epi32: 1094; X32: # BB#0: 1095; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 1096; X32-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 1097; X32-NEXT: vpgatherdd %ymm2, (%eax,%ymm0,2), %ymm1 1098; X32-NEXT: vmovdqa %ymm1, %ymm0 1099; X32-NEXT: retl 1100; 1101; X64-LABEL: test_mm256_i32gather_epi32: 1102; X64: # BB#0: 1103; X64-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 1104; X64-NEXT: vpgatherdd %ymm2, (%rdi,%ymm0,2), %ymm1 1105; X64-NEXT: vmovdqa %ymm1, %ymm0 1106; X64-NEXT: retq 1107 %arg0 = bitcast i32 *%a0 to i8* 1108 %arg1 = bitcast <4 x i64> %a1 to <8 x i32> 1109 %mask = bitcast <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1> to <8 x i32> 1110 %call = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8* %arg0, <8 x i32> %arg1, <8 x i32> %mask, i8 2) 1111 %bc = bitcast <8 x i32> %call to <4 x i64> 1112 ret <4 x i64> %bc 1113} 1114declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32>, i8*, <8 x i32>, <8 x i32>, i8) nounwind readonly 1115 1116define <4 x i64> @test_mm256_mask_i32gather_epi32(<4 x i64> %a0, i32 *%a1, <4 x i64> %a2, <4 x i64> %a3) { 1117; X32-LABEL: test_mm256_mask_i32gather_epi32: 1118; X32: # BB#0: 1119; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 1120; X32-NEXT: vpgatherdd %ymm2, (%eax,%ymm1,2), %ymm0 1121; X32-NEXT: retl 1122; 1123; X64-LABEL: test_mm256_mask_i32gather_epi32: 1124; X64: # BB#0: 1125; X64-NEXT: vpgatherdd %ymm2, (%rdi,%ymm1,2), %ymm0 1126; X64-NEXT: retq 1127 %arg0 = bitcast <4 x i64> %a0 to <8 x i32> 1128 %arg1 = bitcast i32 *%a1 to i8* 
1129 %arg2 = bitcast <4 x i64> %a2 to <8 x i32> 1130 %arg3 = bitcast <4 x i64> %a3 to <8 x i32> 1131 %call = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %arg0, i8* %arg1, <8 x i32> %arg2, <8 x i32> %arg3, i8 2) 1132 %bc = bitcast <8 x i32> %call to <4 x i64> 1133 ret <4 x i64> %bc 1134} 1135 1136define <2 x i64> @test_mm_i32gather_epi64(i64 *%a0, <2 x i64> %a1) { 1137; X32-LABEL: test_mm_i32gather_epi64: 1138; X32: # BB#0: 1139; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 1140; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1141; X32-NEXT: vpgatherdq %xmm2, (%eax,%xmm0,2), %xmm1 1142; X32-NEXT: vmovdqa %xmm1, %xmm0 1143; X32-NEXT: retl 1144; 1145; X64-LABEL: test_mm_i32gather_epi64: 1146; X64: # BB#0: 1147; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1148; X64-NEXT: vpgatherdq %xmm2, (%rdi,%xmm0,2), %xmm1 1149; X64-NEXT: vmovdqa %xmm1, %xmm0 1150; X64-NEXT: retq 1151 %arg0 = bitcast i64 *%a0 to i8* 1152 %arg1 = bitcast <2 x i64> %a1 to <4 x i32> 1153 %res = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> undef, i8* %arg0, <4 x i32> %arg1, <2 x i64> <i64 -1, i64 -1>, i8 2) 1154 ret <2 x i64> %res 1155} 1156declare <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64>, i8*, <4 x i32>, <2 x i64>, i8) nounwind readonly 1157 1158define <2 x i64> @test_mm_mask_i32gather_epi64(<2 x i64> %a0, i64 *%a1, <2 x i64> %a2, <2 x i64> %a3) { 1159; X32-LABEL: test_mm_mask_i32gather_epi64: 1160; X32: # BB#0: 1161; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 1162; X32-NEXT: vpgatherdq %xmm2, (%eax,%xmm1,2), %xmm0 1163; X32-NEXT: retl 1164; 1165; X64-LABEL: test_mm_mask_i32gather_epi64: 1166; X64: # BB#0: 1167; X64-NEXT: vpgatherdq %xmm2, (%rdi,%xmm1,2), %xmm0 1168; X64-NEXT: retq 1169 %arg1 = bitcast i64 *%a1 to i8* 1170 %arg2 = bitcast <2 x i64> %a2 to <4 x i32> 1171 %res = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> %a0, i8* %arg1, <4 x i32> %arg2, <2 x i64> %a3, i8 2) 1172 ret <2 x i64> %res 1173} 1174 1175define <4 x i64> @test_mm256_i32gather_epi64(i64 *%a0, <2 x i64> %a1) { 1176; X32-LABEL: test_mm256_i32gather_epi64: 1177; X32: # BB#0: 1178; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 1179; X32-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 1180; X32-NEXT: vpgatherdq %ymm2, (%eax,%xmm0,2), %ymm1 1181; X32-NEXT: vmovdqa %ymm1, %ymm0 1182; X32-NEXT: retl 1183; 1184; X64-LABEL: test_mm256_i32gather_epi64: 1185; X64: # BB#0: 1186; X64-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 1187; X64-NEXT: vpgatherdq %ymm2, (%rdi,%xmm0,2), %ymm1 1188; X64-NEXT: vmovdqa %ymm1, %ymm0 1189; X64-NEXT: retq 1190 %arg0 = bitcast i64 *%a0 to i8* 1191 %arg1 = bitcast <2 x i64> %a1 to <4 x i32> 1192 %res = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8* %arg0, <4 x i32> %arg1, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, i8 2) 1193 ret <4 x i64> %res 1194} 1195declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64>, i8*, <4 x i32>, <4 x i64>, i8) nounwind readonly 1196 1197define <4 x i64> @test_mm256_mask_i32gather_epi64(<4 x i64> %a0, i64 *%a1, <2 x i64> %a2, <4 x i64> %a3) { 1198; X32-LABEL: test_mm256_mask_i32gather_epi64: 1199; X32: # BB#0: 1200; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 1201; X32-NEXT: vpgatherdq %ymm2, (%eax,%xmm1,2), %ymm0 1202; X32-NEXT: retl 1203; 1204; X64-LABEL: test_mm256_mask_i32gather_epi64: 1205; X64: # BB#0: 1206; X64-NEXT: vpgatherdq %ymm2, (%rdi,%xmm1,2), %ymm0 1207; X64-NEXT: retq 1208 %arg1 = bitcast i64 *%a1 to i8* 1209 %arg2 = bitcast <2 x i64> %a2 to <4 x i32> 1210 %res = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %a0, i8* %arg1, <4 x i32> %arg2, <4 x i64> %a3, i8 2) 1211 ret <4 x i64> %res 1212} 
1213 1214define <2 x double> @test_mm_i32gather_pd(double *%a0, <2 x i64> %a1) { 1215; X32-LABEL: test_mm_i32gather_pd: 1216; X32: # BB#0: 1217; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 1218; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1219; X32-NEXT: vgatherdpd %xmm2, (%eax,%xmm0,2), %xmm1 1220; X32-NEXT: vmovapd %xmm1, %xmm0 1221; X32-NEXT: retl 1222; 1223; X64-LABEL: test_mm_i32gather_pd: 1224; X64: # BB#0: 1225; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1226; X64-NEXT: vgatherdpd %xmm2, (%rdi,%xmm0,2), %xmm1 1227; X64-NEXT: vmovapd %xmm1, %xmm0 1228; X64-NEXT: retq 1229 %arg0 = bitcast double *%a0 to i8* 1230 %arg1 = bitcast <2 x i64> %a1 to <4 x i32> 1231 %cmp = fcmp oeq <2 x double> zeroinitializer, zeroinitializer 1232 %sext = sext <2 x i1> %cmp to <2 x i64> 1233 %mask = bitcast <2 x i64> %sext to <2 x double> 1234 %res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> undef, i8* %arg0, <4 x i32> %arg1, <2 x double> %mask, i8 2) 1235 ret <2 x double> %res 1236} 1237declare <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double>, i8*, <4 x i32>, <2 x double>, i8) nounwind readonly 1238 1239define <2 x double> @test_mm_mask_i32gather_pd(<2 x double> %a0, double *%a1, <2 x i64> %a2, <2 x double> %a3) { 1240; X32-LABEL: test_mm_mask_i32gather_pd: 1241; X32: # BB#0: 1242; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 1243; X32-NEXT: vgatherdpd %xmm2, (%eax,%xmm1,2), %xmm0 1244; X32-NEXT: retl 1245; 1246; X64-LABEL: test_mm_mask_i32gather_pd: 1247; X64: # BB#0: 1248; X64-NEXT: vgatherdpd %xmm2, (%rdi,%xmm1,2), %xmm0 1249; X64-NEXT: retq 1250 %arg1 = bitcast double *%a1 to i8* 1251 %arg2 = bitcast <2 x i64> %a2 to <4 x i32> 1252 %res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> %a0, i8* %arg1, <4 x i32> %arg2, <2 x double> %a3, i8 2) 1253 ret <2 x double> %res 1254} 1255 1256define <4 x double> @test_mm256_i32gather_pd(double *%a0, <2 x i64> %a1) { 1257; X32-LABEL: test_mm256_i32gather_pd: 1258; X32: # BB#0: 1259; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 1260; X32-NEXT: vxorpd %ymm1, %ymm1, %ymm1 1261; X32-NEXT: vcmpeqpd %ymm1, %ymm1, %ymm2 1262; X32-NEXT: vgatherdpd %ymm2, (%eax,%xmm0,2), %ymm1 1263; X32-NEXT: vmovapd %ymm1, %ymm0 1264; X32-NEXT: retl 1265; 1266; X64-LABEL: test_mm256_i32gather_pd: 1267; X64: # BB#0: 1268; X64-NEXT: vxorpd %ymm1, %ymm1, %ymm1 1269; X64-NEXT: vcmpeqpd %ymm1, %ymm1, %ymm2 1270; X64-NEXT: vgatherdpd %ymm2, (%rdi,%xmm0,2), %ymm1 1271; X64-NEXT: vmovapd %ymm1, %ymm0 1272; X64-NEXT: retq 1273 %arg0 = bitcast double *%a0 to i8* 1274 %arg1 = bitcast <2 x i64> %a1 to <4 x i32> 1275 %mask = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i8 0) 1276 %res = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8* %arg0, <4 x i32> %arg1, <4 x double> %mask, i8 2) 1277 ret <4 x double> %res 1278} 1279declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double>, i8*, <4 x i32>, <4 x double>, i8) nounwind readonly 1280 1281define <4 x double> @test_mm256_mask_i32gather_pd(<4 x double> %a0, double *%a1, <2 x i64> %a2, <4 x double> %a3) { 1282; X32-LABEL: test_mm256_mask_i32gather_pd: 1283; X32: # BB#0: 1284; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 1285; X32-NEXT: vgatherdpd %ymm2, (%eax,%xmm1,2), %ymm0 1286; X32-NEXT: retl 1287; 1288; X64-LABEL: test_mm256_mask_i32gather_pd: 1289; X64: # BB#0: 1290; X64-NEXT: vgatherdpd %ymm2, (%rdi,%xmm1,2), %ymm0 1291; X64-NEXT: retq 1292 %arg1 = bitcast double *%a1 to i8* 1293 %arg2 = bitcast <2 x i64> %a2 to <4 x i32> 1294 %res = call <4 x double> 
@llvm.x86.avx2.gather.d.pd.256(<4 x double> %a0, i8* %arg1, <4 x i32> %arg2, <4 x double> %a3, i8 2) 1295 ret <4 x double> %res 1296} 1297 1298define <4 x float> @test_mm_i32gather_ps(float *%a0, <2 x i64> %a1) { 1299; X32-LABEL: test_mm_i32gather_ps: 1300; X32: # BB#0: 1301; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 1302; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1303; X32-NEXT: vgatherdps %xmm2, (%eax,%xmm0,2), %xmm1 1304; X32-NEXT: vmovaps %xmm1, %xmm0 1305; X32-NEXT: retl 1306; 1307; X64-LABEL: test_mm_i32gather_ps: 1308; X64: # BB#0: 1309; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1310; X64-NEXT: vgatherdps %xmm2, (%rdi,%xmm0,2), %xmm1 1311; X64-NEXT: vmovaps %xmm1, %xmm0 1312; X64-NEXT: retq 1313 %arg0 = bitcast float *%a0 to i8* 1314 %arg1 = bitcast <2 x i64> %a1 to <4 x i32> 1315 %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer 1316 %sext = sext <4 x i1> %cmp to <4 x i32> 1317 %mask = bitcast <4 x i32> %sext to <4 x float> 1318 %call = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> undef, i8* %arg0, <4 x i32> %arg1, <4 x float> %mask, i8 2) 1319 ret <4 x float> %call 1320} 1321declare <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float>, i8*, <4 x i32>, <4 x float>, i8) nounwind readonly 1322 1323define <4 x float> @test_mm_mask_i32gather_ps(<4 x float> %a0, float *%a1, <2 x i64> %a2, <4 x float> %a3) { 1324; X32-LABEL: test_mm_mask_i32gather_ps: 1325; X32: # BB#0: 1326; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 1327; X32-NEXT: vgatherdps %xmm2, (%eax,%xmm1,2), %xmm0 1328; X32-NEXT: retl 1329; 1330; X64-LABEL: test_mm_mask_i32gather_ps: 1331; X64: # BB#0: 1332; X64-NEXT: vgatherdps %xmm2, (%rdi,%xmm1,2), %xmm0 1333; X64-NEXT: retq 1334 %arg1 = bitcast float *%a1 to i8* 1335 %arg2 = bitcast <2 x i64> %a2 to <4 x i32> 1336 %call = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> %a0, i8* %arg1, <4 x i32> %arg2, <4 x float> %a3, i8 2) 1337 ret <4 x float> %call 1338} 1339 1340define <8 x float> @test_mm256_i32gather_ps(float *%a0, <4 x i64> %a1) { 1341; X32-LABEL: test_mm256_i32gather_ps: 1342; X32: # BB#0: 1343; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 1344; X32-NEXT: vxorps %ymm1, %ymm1, %ymm1 1345; X32-NEXT: vcmpeqps %ymm1, %ymm1, %ymm2 1346; X32-NEXT: vgatherdps %ymm2, (%eax,%ymm0,2), %ymm1 1347; X32-NEXT: vmovaps %ymm1, %ymm0 1348; X32-NEXT: retl 1349; 1350; X64-LABEL: test_mm256_i32gather_ps: 1351; X64: # BB#0: 1352; X64-NEXT: vxorps %ymm1, %ymm1, %ymm1 1353; X64-NEXT: vcmpeqps %ymm1, %ymm1, %ymm2 1354; X64-NEXT: vgatherdps %ymm2, (%rdi,%ymm0,2), %ymm1 1355; X64-NEXT: vmovaps %ymm1, %ymm0 1356; X64-NEXT: retq 1357 %arg0 = bitcast float *%a0 to i8* 1358 %arg1 = bitcast <4 x i64> %a1 to <8 x i32> 1359 %mask = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> zeroinitializer, <8 x float> zeroinitializer, i8 0) 1360 %call = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8* %arg0, <8 x i32> %arg1, <8 x float> %mask, i8 2) 1361 ret <8 x float> %call 1362} 1363declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float>, i8*, <8 x i32>, <8 x float>, i8) nounwind readonly 1364 1365define <8 x float> @test_mm256_mask_i32gather_ps(<8 x float> %a0, float *%a1, <4 x i64> %a2, <8 x float> %a3) { 1366; X32-LABEL: test_mm256_mask_i32gather_ps: 1367; X32: # BB#0: 1368; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 1369; X32-NEXT: vgatherdps %ymm2, (%eax,%ymm1,2), %ymm0 1370; X32-NEXT: retl 1371; 1372; X64-LABEL: test_mm256_mask_i32gather_ps: 1373; X64: # BB#0: 1374; X64-NEXT: vgatherdps %ymm2, (%rdi,%ymm1,2), %ymm0 1375; X64-NEXT: retq 1376 %arg1 = bitcast float *%a1 
to i8* 1377 %arg2 = bitcast <4 x i64> %a2 to <8 x i32> 1378 %call = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %a0, i8* %arg1, <8 x i32> %arg2, <8 x float> %a3, i8 2) 1379 ret <8 x float> %call 1380} 1381 1382define <2 x i64> @test_mm_i64gather_epi32(i32 *%a0, <2 x i64> %a1) { 1383; X32-LABEL: test_mm_i64gather_epi32: 1384; X32: # BB#0: 1385; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 1386; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1387; X32-NEXT: vpgatherqd %xmm2, (%eax,%xmm0,2), %xmm1 1388; X32-NEXT: vmovdqa %xmm1, %xmm0 1389; X32-NEXT: retl 1390; 1391; X64-LABEL: test_mm_i64gather_epi32: 1392; X64: # BB#0: 1393; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1394; X64-NEXT: vpgatherqd %xmm2, (%rdi,%xmm0,2), %xmm1 1395; X64-NEXT: vmovdqa %xmm1, %xmm0 1396; X64-NEXT: retq 1397 %arg0 = bitcast i32 *%a0 to i8* 1398 %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32> 1399 %call = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> undef, i8* %arg0, <2 x i64> %a1, <4 x i32> %mask, i8 2) 1400 %bc = bitcast <4 x i32> %call to <2 x i64> 1401 ret <2 x i64> %bc 1402} 1403declare <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32>, i8*, <2 x i64>, <4 x i32>, i8) nounwind readonly 1404 1405define <2 x i64> @test_mm_mask_i64gather_epi32(<2 x i64> %a0, i32 *%a1, <2 x i64> %a2, <2 x i64> %a3) { 1406; X32-LABEL: test_mm_mask_i64gather_epi32: 1407; X32: # BB#0: 1408; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 1409; X32-NEXT: vpgatherqd %xmm2, (%eax,%xmm1,2), %xmm0 1410; X32-NEXT: retl 1411; 1412; X64-LABEL: test_mm_mask_i64gather_epi32: 1413; X64: # BB#0: 1414; X64-NEXT: vpgatherqd %xmm2, (%rdi,%xmm1,2), %xmm0 1415; X64-NEXT: retq 1416 %arg0 = bitcast <2 x i64> %a0 to <4 x i32> 1417 %arg1 = bitcast i32 *%a1 to i8* 1418 %arg3 = bitcast <2 x i64> %a3 to <4 x i32> 1419 %call = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> %arg0, i8* %arg1, <2 x i64> %a2, <4 x i32> %arg3, i8 2) 1420 %bc = bitcast <4 x i32> %call to <2 x i64> 1421 ret <2 x i64> %bc 1422} 1423 1424define <2 x i64> @test_mm256_i64gather_epi32(i32 *%a0, <4 x i64> %a1) { 1425; X32-LABEL: test_mm256_i64gather_epi32: 1426; X32: # BB#0: 1427; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 1428; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1429; X32-NEXT: vpgatherqd %xmm2, (%eax,%ymm0,2), %xmm1 1430; X32-NEXT: vmovdqa %xmm1, %xmm0 1431; X32-NEXT: vzeroupper 1432; X32-NEXT: retl 1433; 1434; X64-LABEL: test_mm256_i64gather_epi32: 1435; X64: # BB#0: 1436; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1437; X64-NEXT: vpgatherqd %xmm2, (%rdi,%ymm0,2), %xmm1 1438; X64-NEXT: vmovdqa %xmm1, %xmm0 1439; X64-NEXT: vzeroupper 1440; X64-NEXT: retq 1441 %arg0 = bitcast i32 *%a0 to i8* 1442 %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32> 1443 %call = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8* %arg0, <4 x i64> %a1, <4 x i32> %mask, i8 2) 1444 %bc = bitcast <4 x i32> %call to <2 x i64> 1445 ret <2 x i64> %bc 1446} 1447declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32>, i8*, <4 x i64>, <4 x i32>, i8) nounwind readonly 1448 1449define <2 x i64> @test_mm256_mask_i64gather_epi32(<2 x i64> %a0, i32 *%a1, <4 x i64> %a2, <2 x i64> %a3) { 1450; X32-LABEL: test_mm256_mask_i64gather_epi32: 1451; X32: # BB#0: 1452; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 1453; X32-NEXT: vpgatherqd %xmm2, (%eax,%ymm1,2), %xmm0 1454; X32-NEXT: vzeroupper 1455; X32-NEXT: retl 1456; 1457; X64-LABEL: test_mm256_mask_i64gather_epi32: 1458; X64: # BB#0: 1459; X64-NEXT: vpgatherqd %xmm2, (%rdi,%ymm1,2), %xmm0 1460; X64-NEXT: vzeroupper 1461; X64-NEXT: retq 1462 %arg0 = bitcast <2 x i64> 
%a0 to <4 x i32> 1463 %arg1 = bitcast i32 *%a1 to i8* 1464 %arg3 = bitcast <2 x i64> %a3 to <4 x i32> 1465 %call = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %arg0, i8* %arg1, <4 x i64> %a2, <4 x i32> %arg3, i8 2) 1466 %bc = bitcast <4 x i32> %call to <2 x i64> 1467 ret <2 x i64> %bc 1468} 1469 1470define <2 x i64> @test_mm_i64gather_epi64(i64 *%a0, <2 x i64> %a1) { 1471; X32-LABEL: test_mm_i64gather_epi64: 1472; X32: # BB#0: 1473; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 1474; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1475; X32-NEXT: vpgatherqq %xmm2, (%eax,%xmm0,2), %xmm1 1476; X32-NEXT: vmovdqa %xmm1, %xmm0 1477; X32-NEXT: retl 1478; 1479; X64-LABEL: test_mm_i64gather_epi64: 1480; X64: # BB#0: 1481; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1482; X64-NEXT: vpgatherqq %xmm2, (%rdi,%xmm0,2), %xmm1 1483; X64-NEXT: vmovdqa %xmm1, %xmm0 1484; X64-NEXT: retq 1485 %arg0 = bitcast i64 *%a0 to i8* 1486 %call = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> undef, i8* %arg0, <2 x i64> %a1, <2 x i64> <i64 -1, i64 -1>, i8 2) 1487 ret <2 x i64> %call 1488} 1489declare <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64>, i8*, <2 x i64>, <2 x i64>, i8) nounwind readonly 1490 1491define <2 x i64> @test_mm_mask_i64gather_epi64(<2 x i64> %a0, i64 *%a1, <2 x i64> %a2, <2 x i64> %a3) { 1492; X32-LABEL: test_mm_mask_i64gather_epi64: 1493; X32: # BB#0: 1494; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 1495; X32-NEXT: vpgatherqq %xmm2, (%eax,%xmm1,2), %xmm0 1496; X32-NEXT: retl 1497; 1498; X64-LABEL: test_mm_mask_i64gather_epi64: 1499; X64: # BB#0: 1500; X64-NEXT: vpgatherqq %xmm2, (%rdi,%xmm1,2), %xmm0 1501; X64-NEXT: retq 1502 %arg1 = bitcast i64 *%a1 to i8* 1503 %call = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> %a0, i8* %arg1, <2 x i64> %a2, <2 x i64> %a3, i8 2) 1504 ret <2 x i64> %call 1505} 1506 1507define <4 x i64> @test_mm256_i64gather_epi64(i64 *%a0, <4 x i64> %a1) { 1508; X32-LABEL: test_mm256_i64gather_epi64: 1509; X32: # BB#0: 1510; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 1511; X32-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 1512; X32-NEXT: vpgatherqq %ymm2, (%eax,%ymm0,2), %ymm1 1513; X32-NEXT: vmovdqa %ymm1, %ymm0 1514; X32-NEXT: retl 1515; 1516; X64-LABEL: test_mm256_i64gather_epi64: 1517; X64: # BB#0: 1518; X64-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 1519; X64-NEXT: vpgatherqq %ymm2, (%rdi,%ymm0,2), %ymm1 1520; X64-NEXT: vmovdqa %ymm1, %ymm0 1521; X64-NEXT: retq 1522 %arg0 = bitcast i64 *%a0 to i8* 1523 %call = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8* %arg0, <4 x i64> %a1, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, i8 2) 1524 ret <4 x i64> %call 1525} 1526declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64>, i8*, <4 x i64>, <4 x i64>, i8) nounwind readonly 1527 1528define <4 x i64> @test_mm256_mask_i64gather_epi64(<4 x i64> %a0, i64 *%a1, <4 x i64> %a2, <4 x i64> %a3) { 1529; X32-LABEL: test_mm256_mask_i64gather_epi64: 1530; X32: # BB#0: 1531; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 1532; X32-NEXT: vpgatherqq %ymm2, (%eax,%ymm1,2), %ymm0 1533; X32-NEXT: retl 1534; 1535; X64-LABEL: test_mm256_mask_i64gather_epi64: 1536; X64: # BB#0: 1537; X64-NEXT: vpgatherqq %ymm2, (%rdi,%ymm1,2), %ymm0 1538; X64-NEXT: retq 1539 %arg1 = bitcast i64 *%a1 to i8* 1540 %call = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %a0, i8* %arg1, <4 x i64> %a2, <4 x i64> %a3, i8 2) 1541 ret <4 x i64> %call 1542} 1543 1544define <2 x double> @test_mm_i64gather_pd(double *%a0, <2 x i64> %a1) { 1545; X32-LABEL: test_mm_i64gather_pd: 1546; X32: # BB#0: 1547; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 1548; 
define <2 x double> @test_mm_i64gather_pd(double *%a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_i64gather_pd:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; X32-NEXT: vgatherqpd %xmm2, (%eax,%xmm0,2), %xmm1
; X32-NEXT: vmovapd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_i64gather_pd:
; X64: # BB#0:
; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT: vgatherqpd %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT: vmovapd %xmm1, %xmm0
; X64-NEXT: retq
  %arg0 = bitcast double *%a0 to i8*
  %cmp = fcmp oeq <2 x double> zeroinitializer, zeroinitializer
  %sext = sext <2 x i1> %cmp to <2 x i64>
  %mask = bitcast <2 x i64> %sext to <2 x double>
  %call = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> undef, i8* %arg0, <2 x i64> %a1, <2 x double> %mask, i8 2)
  ret <2 x double> %call
}
declare <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double>, i8*, <2 x i64>, <2 x double>, i8) nounwind readonly

define <2 x double> @test_mm_mask_i64gather_pd(<2 x double> %a0, double *%a1, <2 x i64> %a2, <2 x double> %a3) {
; X32-LABEL: test_mm_mask_i64gather_pd:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vgatherqpd %xmm2, (%eax,%xmm1,2), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mask_i64gather_pd:
; X64: # BB#0:
; X64-NEXT: vgatherqpd %xmm2, (%rdi,%xmm1,2), %xmm0
; X64-NEXT: retq
  %arg1 = bitcast double *%a1 to i8*
  %call = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> %a0, i8* %arg1, <2 x i64> %a2, <2 x double> %a3, i8 2)
  ret <2 x double> %call
}

define <4 x double> @test_mm256_i64gather_pd(double *%a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_i64gather_pd:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vxorpd %ymm1, %ymm1, %ymm1
; X32-NEXT: vcmpeqpd %ymm1, %ymm1, %ymm2
; X32-NEXT: vgatherqpd %ymm2, (%eax,%ymm0,2), %ymm1
; X32-NEXT: vmovapd %ymm1, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_i64gather_pd:
; X64: # BB#0:
; X64-NEXT: vxorpd %ymm1, %ymm1, %ymm1
; X64-NEXT: vcmpeqpd %ymm1, %ymm1, %ymm2
; X64-NEXT: vgatherqpd %ymm2, (%rdi,%ymm0,2), %ymm1
; X64-NEXT: vmovapd %ymm1, %ymm0
; X64-NEXT: retq
  %arg0 = bitcast double *%a0 to i8*
  %mask = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i8 0)
  %call = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8* %arg0, <4 x i64> %a1, <4 x double> %mask, i8 2)
  ret <4 x double> %call
}
declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double>, i8*, <4 x i64>, <4 x double>, i8) nounwind readonly

define <4 x double> @test_mm256_mask_i64gather_pd(<4 x double> %a0, i64 *%a1, <4 x i64> %a2, <4 x double> %a3) {
; X32-LABEL: test_mm256_mask_i64gather_pd:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vgatherqpd %ymm2, (%eax,%ymm1,2), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_mask_i64gather_pd:
; X64: # BB#0:
; X64-NEXT: vgatherqpd %ymm2, (%rdi,%ymm1,2), %ymm0
; X64-NEXT: retq
  %arg1 = bitcast i64 *%a1 to i8*
  %call = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %a0, i8* %arg1, <4 x i64> %a2, <4 x double> %a3, i8 2)
  ret <4 x double> %call
}

define <4 x float> @test_mm_i64gather_ps(float *%a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_i64gather_ps:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; X32-NEXT: vgatherqps %xmm2, (%eax,%xmm0,2), %xmm1
; X32-NEXT: vmovaps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_i64gather_ps:
; X64: # BB#0:
; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT: vgatherqps %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT: vmovaps %xmm1, %xmm0
; X64-NEXT: retq
  %arg0 = bitcast float *%a0 to i8*
  %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %mask = bitcast <4 x i32> %sext to <4 x float>
  %call = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> undef, i8* %arg0, <2 x i64> %a1, <4 x float> %mask, i8 2)
  ret <4 x float> %call
}
declare <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float>, i8*, <2 x i64>, <4 x float>, i8) nounwind readonly

define <4 x float> @test_mm_mask_i64gather_ps(<4 x float> %a0, float *%a1, <2 x i64> %a2, <4 x float> %a3) {
; X32-LABEL: test_mm_mask_i64gather_ps:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vgatherqps %xmm2, (%eax,%xmm1,2), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mask_i64gather_ps:
; X64: # BB#0:
; X64-NEXT: vgatherqps %xmm2, (%rdi,%xmm1,2), %xmm0
; X64-NEXT: retq
  %arg1 = bitcast float *%a1 to i8*
  %call = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> %a0, i8* %arg1, <2 x i64> %a2, <4 x float> %a3, i8 2)
  ret <4 x float> %call
}

define <4 x float> @test_mm256_i64gather_ps(float *%a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_i64gather_ps:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; X32-NEXT: vgatherqps %xmm2, (%eax,%ymm0,2), %xmm1
; X32-NEXT: vmovaps %xmm1, %xmm0
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_i64gather_ps:
; X64: # BB#0:
; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT: vgatherqps %xmm2, (%rdi,%ymm0,2), %xmm1
; X64-NEXT: vmovaps %xmm1, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
  %arg0 = bitcast float *%a0 to i8*
  %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %mask = bitcast <4 x i32> %sext to <4 x float>
  %call = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8* %arg0, <4 x i64> %a1, <4 x float> %mask, i8 2)
  ret <4 x float> %call
}
declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float>, i8*, <4 x i64>, <4 x float>, i8) nounwind readonly

define <4 x float> @test_mm256_mask_i64gather_ps(<4 x float> %a0, float *%a1, <4 x i64> %a2, <4 x float> %a3) {
; X32-LABEL: test_mm256_mask_i64gather_ps:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vgatherqps %xmm2, (%eax,%ymm1,2), %xmm0
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_mask_i64gather_ps:
; X64: # BB#0:
; X64-NEXT: vgatherqps %xmm2, (%rdi,%ymm1,2), %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
  %arg1 = bitcast float *%a1 to i8*
  %call = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %a0, i8* %arg1, <4 x i64> %a2, <4 x float> %a3, i8 2)
  ret <4 x float> %call
}

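; The inserti128 tests express the insertion as shufflevectors: inserting
; into the low 128-bit half folds to a vpblendd, while the high half uses
; vinsertf128.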
define <4 x i64> @test0_mm256_inserti128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test0_mm256_inserti128_si256:
; X32: # BB#0:
; X32-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
; X32-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; X32-NEXT: retl
;
; X64-LABEL: test0_mm256_inserti128_si256:
; X64: # BB#0:
; X64-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
; X64-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; X64-NEXT: retq
  %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %res = shufflevector <4 x i64> %a0, <4 x i64> %ext, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x i64> %res
}

define <4 x i64> @test1_mm256_inserti128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test1_mm256_inserti128_si256:
; X32: # BB#0:
; X32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test1_mm256_inserti128_si256:
; X64: # BB#0:
; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-NEXT: retq
  %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %res = shufflevector <4 x i64> %a0, <4 x i64> %ext, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_madd_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_madd_epi16:
; X32: # BB#0:
; X32-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_madd_epi16:
; X64: # BB#0:
; X64-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_maddubs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_maddubs_epi16:
; X32: # BB#0:
; X32-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_maddubs_epi16:
; X64: # BB#0:
; X64-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>) nounwind readnone

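; Masked loads/stores (vpmaskmovd/vpmaskmovq) use the sign bit of each mask
; element to decide whether the corresponding element is accessed.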
define <2 x i64> @test_mm_maskload_epi32(i32* %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_maskload_epi32:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpmaskmovd (%eax), %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maskload_epi32:
; X64: # BB#0:
; X64-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm0
; X64-NEXT: retq
  %arg0 = bitcast i32* %a0 to i8*
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %call = call <4 x i32> @llvm.x86.avx2.maskload.d(i8* %arg0, <4 x i32> %arg1)
  %bc = bitcast <4 x i32> %call to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.maskload.d(i8*, <4 x i32>) nounwind readonly

define <4 x i64> @test_mm256_maskload_epi32(i32* %a0, <4 x i64> %a1) nounwind {
; X32-LABEL: test_mm256_maskload_epi32:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpmaskmovd (%eax), %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_maskload_epi32:
; X64: # BB#0:
; X64-NEXT: vpmaskmovd (%rdi), %ymm0, %ymm0
; X64-NEXT: retq
  %arg0 = bitcast i32* %a0 to i8*
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %call = call <8 x i32> @llvm.x86.avx2.maskload.d.256(i8* %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %call to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.maskload.d.256(i8*, <8 x i32>) nounwind readonly

define <2 x i64> @test_mm_maskload_epi64(i64* %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_maskload_epi64:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpmaskmovq (%eax), %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maskload_epi64:
; X64: # BB#0:
; X64-NEXT: vpmaskmovq (%rdi), %xmm0, %xmm0
; X64-NEXT: retq
  %arg0 = bitcast i64* %a0 to i8*
  %res = call <2 x i64> @llvm.x86.avx2.maskload.q(i8* %arg0, <2 x i64> %a1)
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.avx2.maskload.q(i8*, <2 x i64>) nounwind readonly

define <4 x i64> @test_mm256_maskload_epi64(i64* %a0, <4 x i64> %a1) nounwind {
; X32-LABEL: test_mm256_maskload_epi64:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpmaskmovq (%eax), %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_maskload_epi64:
; X64: # BB#0:
; X64-NEXT: vpmaskmovq (%rdi), %ymm0, %ymm0
; X64-NEXT: retq
  %arg0 = bitcast i64* %a0 to i8*
  %res = call <4 x i64> @llvm.x86.avx2.maskload.q.256(i8* %arg0, <4 x i64> %a1)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.maskload.q.256(i8*, <4 x i64>) nounwind readonly

define void @test_mm_maskstore_epi32(float* %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
; X32-LABEL: test_mm_maskstore_epi32:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpmaskmovd %xmm1, %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maskstore_epi32:
; X64: # BB#0:
; X64-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi)
; X64-NEXT: retq
  %arg0 = bitcast float* %a0 to i8*
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  call void @llvm.x86.avx2.maskstore.d(i8* %arg0, <4 x i32> %arg1, <4 x i32> %arg2)
  ret void
}
declare void @llvm.x86.avx2.maskstore.d(i8*, <4 x i32>, <4 x i32>) nounwind readnone

define void @test_mm256_maskstore_epi32(float* %a0, <4 x i64> %a1, <4 x i64> %a2) nounwind {
; X32-LABEL: test_mm256_maskstore_epi32:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpmaskmovd %ymm1, %ymm0, (%eax)
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_maskstore_epi32:
; X64: # BB#0:
; X64-NEXT: vpmaskmovd %ymm1, %ymm0, (%rdi)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
  %arg0 = bitcast float* %a0 to i8*
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
  call void @llvm.x86.avx2.maskstore.d.256(i8* %arg0, <8 x i32> %arg1, <8 x i32> %arg2)
  ret void
}
declare void @llvm.x86.avx2.maskstore.d.256(i8*, <8 x i32>, <8 x i32>) nounwind readnone

define void @test_mm_maskstore_epi64(i64* %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
; X32-LABEL: test_mm_maskstore_epi64:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpmaskmovq %xmm1, %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maskstore_epi64:
; X64: # BB#0:
; X64-NEXT: vpmaskmovq %xmm1, %xmm0, (%rdi)
; X64-NEXT: retq
  %arg0 = bitcast i64* %a0 to i8*
  call void @llvm.x86.avx2.maskstore.q(i8* %arg0, <2 x i64> %a1, <2 x i64> %a2)
  ret void
}
declare void @llvm.x86.avx2.maskstore.q(i8*, <2 x i64>, <2 x i64>) nounwind readnone

define void @test_mm256_maskstore_epi64(i64* %a0, <4 x i64> %a1, <4 x i64> %a2) nounwind {
; X32-LABEL: test_mm256_maskstore_epi64:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpmaskmovq %ymm1, %ymm0, (%eax)
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_maskstore_epi64:
; X64: # BB#0:
; X64-NEXT: vpmaskmovq %ymm1, %ymm0, (%rdi)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
  %arg0 = bitcast i64* %a0 to i8*
  call void @llvm.x86.avx2.maskstore.q.256(i8* %arg0, <4 x i64> %a1, <4 x i64> %a2)
  ret void
}
declare void @llvm.x86.avx2.maskstore.q.256(i8*, <4 x i64>, <4 x i64>) nounwind readnone

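; The max/min tests use icmp+select idioms rather than intrinsics; the
; backend is expected to match them to the single vpmax*/vpmin* forms.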
define <4 x i64> @test_mm256_max_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_max_epi8:
; X32: # BB#0:
; X32-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_max_epi8:
; X64: # BB#0:
; X64-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %cmp = icmp sgt <32 x i8> %arg0, %arg1
  %sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1
  %bc = bitcast <32 x i8> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_max_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_max_epi16:
; X32: # BB#0:
; X32-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_max_epi16:
; X64: # BB#0:
; X64-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %cmp = icmp sgt <16 x i16> %arg0, %arg1
  %sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1
  %bc = bitcast <16 x i16> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_max_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_max_epi32:
; X32: # BB#0:
; X32-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_max_epi32:
; X64: # BB#0:
; X64-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %cmp = icmp sgt <8 x i32> %arg0, %arg1
  %sel = select <8 x i1> %cmp, <8 x i32> %arg0, <8 x i32> %arg1
  %bc = bitcast <8 x i32> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_max_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_max_epu8:
; X32: # BB#0:
; X32-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_max_epu8:
; X64: # BB#0:
; X64-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %cmp = icmp ugt <32 x i8> %arg0, %arg1
  %sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1
  %bc = bitcast <32 x i8> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_max_epu16(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_max_epu16:
; X32: # BB#0:
; X32-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_max_epu16:
; X64: # BB#0:
; X64-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %cmp = icmp ugt <16 x i16> %arg0, %arg1
  %sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1
  %bc = bitcast <16 x i16> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_max_epu32(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_max_epu32:
; X32: # BB#0:
; X32-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_max_epu32:
; X64: # BB#0:
; X64-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %cmp = icmp ugt <8 x i32> %arg0, %arg1
  %sel = select <8 x i1> %cmp, <8 x i32> %arg0, <8 x i32> %arg1
  %bc = bitcast <8 x i32> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_min_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_min_epi8:
; X32: # BB#0:
; X32-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_min_epi8:
; X64: # BB#0:
; X64-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %cmp = icmp slt <32 x i8> %arg0, %arg1
  %sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1
  %bc = bitcast <32 x i8> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_min_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_min_epi16:
; X32: # BB#0:
; X32-NEXT: vpminsw %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_min_epi16:
; X64: # BB#0:
; X64-NEXT: vpminsw %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %cmp = icmp slt <16 x i16> %arg0, %arg1
  %sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1
  %bc = bitcast <16 x i16> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_min_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_min_epi32:
; X32: # BB#0:
; X32-NEXT: vpminsd %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_min_epi32:
; X64: # BB#0:
; X64-NEXT: vpminsd %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %cmp = icmp slt <8 x i32> %arg0, %arg1
  %sel = select <8 x i1> %cmp, <8 x i32> %arg0, <8 x i32> %arg1
  %bc = bitcast <8 x i32> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_min_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_min_epu8:
; X32: # BB#0:
; X32-NEXT: vpminub %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_min_epu8:
; X64: # BB#0:
; X64-NEXT: vpminub %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %cmp = icmp ult <32 x i8> %arg0, %arg1
  %sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1
  %bc = bitcast <32 x i8> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_min_epu16(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_min_epu16:
; X32: # BB#0:
; X32-NEXT: vpminuw %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_min_epu16:
; X64: # BB#0:
; X64-NEXT: vpminuw %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %cmp = icmp ult <16 x i16> %arg0, %arg1
  %sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1
  %bc = bitcast <16 x i16> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_min_epu32(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_min_epu32:
; X32: # BB#0:
; X32-NEXT: vpminud %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_min_epu32:
; X64: # BB#0:
; X64-NEXT: vpminud %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %cmp = icmp ult <8 x i32> %arg0, %arg1
  %sel = select <8 x i1> %cmp, <8 x i32> %arg0, <8 x i32> %arg1
  %bc = bitcast <8 x i32> %sel to <4 x i64>
  ret <4 x i64> %bc
}

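; vpmovmskb moves the byte sign bits into a GPR; since the result leaves the
; YMM domain, a vzeroupper is emitted before returning.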
define i32 @test_mm256_movemask_epi8(<4 x i64> %a0) nounwind {
; X32-LABEL: test_mm256_movemask_epi8:
; X32: # BB#0:
; X32-NEXT: vpmovmskb %ymm0, %eax
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_movemask_epi8:
; X64: # BB#0:
; X64-NEXT: vpmovmskb %ymm0, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %res = call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> %arg0)
  ret i32 %res
}
declare i32 @llvm.x86.avx2.pmovmskb(<32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_mpsadbw_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_mpsadbw_epu8:
; X32: # BB#0:
; X32-NEXT: vmpsadbw $3, %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_mpsadbw_epu8:
; X64: # BB#0:
; X64-NEXT: vmpsadbw $3, %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %call = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %arg0, <32 x i8> %arg1, i8 3)
  %bc = bitcast <16 x i16> %call to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i8) nounwind readnone

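; vpmuldq/vpmuludq multiply the even-indexed 32-bit elements and produce
; full 64-bit products, which is why the result type stays <4 x i64>.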
define <4 x i64> @test_mm256_mul_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_mul_epi32:
; X32: # BB#0:
; X32-NEXT: vpmuldq %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_mul_epi32:
; X64: # BB#0:
; X64-NEXT: vpmuldq %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> %arg0, <8 x i32> %arg1)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_mul_epu32(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_mul_epu32:
; X32: # BB#0:
; X32-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_mul_epu32:
; X64: # BB#0:
; X64-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> %arg0, <8 x i32> %arg1)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_mulhi_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_mulhi_epi16:
; X32: # BB#0:
; X32-NEXT: vpmulhw %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_mulhi_epi16:
; X64: # BB#0:
; X64-NEXT: vpmulhw %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_mulhi_epu16(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_mulhi_epu16:
; X32: # BB#0:
; X32-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_mulhi_epu16:
; X64: # BB#0:
; X64-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_mulhrs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_mulhrs_epi16:
; X32: # BB#0:
; X32-NEXT: vpmulhrsw %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_mulhrs_epi16:
; X64: # BB#0:
; X64-NEXT: vpmulhrsw %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_mullo_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_mullo_epi16:
; X32: # BB#0:
; X32-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_mullo_epi16:
; X64: # BB#0:
; X64-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = mul <16 x i16> %arg0, %arg1
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_mullo_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_mullo_epi32:
; X32: # BB#0:
; X32-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_mullo_epi32:
; X64: # BB#0:
; X64-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = mul <8 x i32> %arg0, %arg1
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_or_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; X32-LABEL: test_mm256_or_si256:
; X32: # BB#0:
; X32-NEXT: vorps %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_or_si256:
; X64: # BB#0:
; X64-NEXT: vorps %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
  %res = or <4 x i64> %a0, %a1
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_packs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_packs_epi16:
; X32: # BB#0:
; X32-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_packs_epi16:
; X64: # BB#0:
; X64-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %call = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %arg0, <16 x i16> %arg1)
  %res = bitcast <32 x i8> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_packs_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_packs_epi32:
; X32: # BB#0:
; X32-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_packs_epi32:
; X64: # BB#0:
; X64-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %call = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %arg0, <8 x i32> %arg1)
  %res = bitcast <16 x i16> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_packus_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_packus_epi16:
; X32: # BB#0:
; X32-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_packus_epi16:
; X64: # BB#0:
; X64-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %call = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %arg0, <16 x i16> %arg1)
  %res = bitcast <32 x i8> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_packus_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_packus_epi32:
; X32: # BB#0:
; X32-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_packus_epi32:
; X64: # BB#0:
; X64-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %call = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %arg0, <8 x i32> %arg1)
  %res = bitcast <16 x i16> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone

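; For permute2x128, immediate 49 (0x31) selects the upper 128-bit lane of
; each source, printed here as ymm0[2,3],ymm1[2,3].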
define <4 x i64> @test_mm256_permute2x128_si256(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_permute2x128_si256:
; X32: # BB#0:
; X32-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_permute2x128_si256:
; X64: # BB#0:
; X64-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; X64-NEXT: retq
  %res = call <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64> %a0, <4 x i64> %a1, i8 49)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64>, <4 x i64>, i8) nounwind readonly

define <4 x i64> @test_mm256_permute4x64_epi64(<4 x i64> %a0) {
; X32-LABEL: test_mm256_permute4x64_epi64:
; X32: # BB#0:
; X32-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,0,2,0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_permute4x64_epi64:
; X64: # BB#0:
; X64-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,0,2,0]
; X64-NEXT: retq
  %res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 0>
  ret <4 x i64> %res
}

define <4 x double> @test_mm256_permute4x64_pd(<4 x double> %a0) {
; X32-LABEL: test_mm256_permute4x64_pd:
; X32: # BB#0:
; X32-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,2,1,0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_permute4x64_pd:
; X64: # BB#0:
; X64-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,2,1,0]
; X64-NEXT: retq
  %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 1, i32 2, i32 1, i32 0>
  ret <4 x double> %res
}

define <4 x i64> @test_mm256_permutevar8x32_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_permutevar8x32_epi32:
; X32: # BB#0:
; X32-NEXT: vpermd %ymm0, %ymm1, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_permutevar8x32_epi32:
; X64: # BB#0:
; X64-NEXT: vpermd %ymm0, %ymm1, %ymm0
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %call = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %arg0, <8 x i32> %arg1)
  %res = bitcast <8 x i32> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly

define <8 x float> @test_mm256_permutevar8x32_ps(<8 x float> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_permutevar8x32_ps:
; X32: # BB#0:
; X32-NEXT: vpermps %ymm0, %ymm1, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_permutevar8x32_ps:
; X64: # BB#0:
; X64-NEXT: vpermps %ymm0, %ymm1, %ymm0
; X64-NEXT: retq
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> %arg1)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind readonly

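; vpsadbw sums absolute byte differences into one 16-bit total per 64-bit
; group, giving a <4 x i64> result.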
define <4 x i64> @test_mm256_sad_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_sad_epu8:
; X32: # BB#0:
; X32-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_sad_epu8:
; X64: # BB#0:
; X64-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %arg0, <32 x i8> %arg1)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_shuffle_epi32(<4 x i64> %a0) {
; X32-LABEL: test_mm256_shuffle_epi32:
; X32: # BB#0:
; X32-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,0,0,7,7,4,4]
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_shuffle_epi32:
; X64: # BB#0:
; X64-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,0,0,7,7,4,4]
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %shuf = shufflevector <8 x i32> %arg0, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 0, i32 0, i32 7, i32 7, i32 4, i32 4>
  %res = bitcast <8 x i32> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_shuffle_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_shuffle_epi8:
; X32: # BB#0:
; X32-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_shuffle_epi8:
; X64: # BB#0:
; X64-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %shuf = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %arg0, <32 x i8> %arg1)
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_shufflehi_epi16(<4 x i64> %a0) {
; X32-LABEL: test_mm256_shufflehi_epi16:
; X32: # BB#0:
; X32-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,6,5,8,9,10,11,15,14,14,13]
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_shufflehi_epi16:
; X64: # BB#0:
; X64-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,6,5,8,9,10,11,15,14,14,13]
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 14, i32 13>
  %res = bitcast <16 x i16> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_shufflelo_epi16(<4 x i64> %a0) {
; X32-LABEL: test_mm256_shufflelo_epi16:
; X32: # BB#0:
; X32-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,0,1,1,4,5,6,7,11,8,9,9,12,13,14,15]
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_shufflelo_epi16:
; X64: # BB#0:
; X64-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,0,1,1,4,5,6,7,11,8,9,9,12,13,14,15]
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> <i32 3, i32 0, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 9, i32 9, i32 12, i32 13, i32 14, i32 15>
  %res = bitcast <16 x i16> %shuf to <4 x i64>
  ret <4 x i64> %res
}

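; vpsign* copies, negates, or zeroes each element of the first operand
; depending on whether the matching element of the second is positive,
; negative, or zero.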
define <4 x i64> @test_mm256_sign_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_sign_epi8:
; X32: # BB#0:
; X32-NEXT: vpsignb %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_sign_epi8:
; X64: # BB#0:
; X64-NEXT: vpsignb %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %call = call <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8> %arg0, <32 x i8> %arg1)
  %res = bitcast <32 x i8> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_sign_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_sign_epi16:
; X32: # BB#0:
; X32-NEXT: vpsignw %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_sign_epi16:
; X64: # BB#0:
; X64-NEXT: vpsignw %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %call = call <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %res = bitcast <16 x i16> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_sign_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_sign_epi32:
; X32: # BB#0:
; X32-NEXT: vpsignd %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_sign_epi32:
; X64: # BB#0:
; X64-NEXT: vpsignd %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %call = call <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32> %arg0, <8 x i32> %arg1)
  %res = bitcast <8 x i32> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32>, <8 x i32>) nounwind readnone

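; The shift tests come in three flavours: sll/srl/sra take the count from
; the low 64 bits of an XMM register, the *i forms take an immediate, and
; the *v forms shift each element by its own count.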
define <4 x i64> @test_mm256_sll_epi16(<4 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm256_sll_epi16:
; X32: # BB#0:
; X32-NEXT: vpsllw %xmm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_sll_epi16:
; X64: # BB#0:
; X64-NEXT: vpsllw %xmm1, %ymm0, %ymm0
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %arg0, <8 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) nounwind readnone

define <4 x i64> @test_mm256_sll_epi32(<4 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm256_sll_epi32:
; X32: # BB#0:
; X32-NEXT: vpslld %xmm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_sll_epi32:
; X64: # BB#0:
; X64-NEXT: vpslld %xmm1, %ymm0, %ymm0
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_sll_epi64(<4 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm256_sll_epi64:
; X32: # BB#0:
; X32-NEXT: vpsllq %xmm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_sll_epi64:
; X64: # BB#0:
; X64-NEXT: vpsllq %xmm1, %ymm0, %ymm0
; X64-NEXT: retq
  %res = call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %a0, <2 x i64> %a1)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) nounwind readnone

define <4 x i64> @test_mm256_slli_epi16(<4 x i64> %a0) {
; X32-LABEL: test_mm256_slli_epi16:
; X32: # BB#0:
; X32-NEXT: vpsllw $3, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_slli_epi16:
; X64: # BB#0:
; X64-NEXT: vpsllw $3, %ymm0, %ymm0
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %arg0, i32 3)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16>, i32) nounwind readnone

define <4 x i64> @test_mm256_slli_epi32(<4 x i64> %a0) {
; X32-LABEL: test_mm256_slli_epi32:
; X32: # BB#0:
; X32-NEXT: vpslld $3, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_slli_epi32:
; X64: # BB#0:
; X64-NEXT: vpslld $3, %ymm0, %ymm0
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %arg0, i32 3)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32>, i32) nounwind readnone

define <4 x i64> @test_mm256_slli_epi64(<4 x i64> %a0) {
; X32-LABEL: test_mm256_slli_epi64:
; X32: # BB#0:
; X32-NEXT: vpsllq $3, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_slli_epi64:
; X64: # BB#0:
; X64-NEXT: vpsllq $3, %ymm0, %ymm0
; X64-NEXT: retq
  %res = call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %a0, i32 3)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64>, i32) nounwind readnone

define <4 x i64> @test_mm256_slli_si256(<4 x i64> %a0) {
; X32-LABEL: test_mm256_slli_si256:
; X32: # BB#0:
; X32-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28]
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_slli_si256:
; X64: # BB#0:
; X64-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28]
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %shuf = shufflevector <32 x i8> zeroinitializer, <32 x i8> %arg0, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <2 x i64> @test_mm_sllv_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_sllv_epi32:
; X32: # BB#0:
; X32-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_sllv_epi32:
; X64: # BB#0:
; X64-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <4 x i32> %res to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_sllv_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_sllv_epi32:
; X32: # BB#0:
; X32-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_sllv_epi32:
; X64: # BB#0:
; X64-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32>, <8 x i32>) nounwind readnone

define <2 x i64> @test_mm_sllv_epi64(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_sllv_epi64:
; X32: # BB#0:
; X32-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_sllv_epi64:
; X64: # BB#0:
; X64-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
  %res = call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %a0, <2 x i64> %a1)
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64>, <2 x i64>) nounwind readnone

define <4 x i64> @test_mm256_sllv_epi64(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_sllv_epi64:
; X32: # BB#0:
; X32-NEXT: vpsllvq %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_sllv_epi64:
; X64: # BB#0:
; X64-NEXT: vpsllvq %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
  %res = call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %a0, <4 x i64> %a1)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64>, <4 x i64>) nounwind readnone

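; AVX2 has no 64-bit arithmetic shift (no vpsraq), so only the 16- and
; 32-bit sra variants are covered here.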
define <4 x i64> @test_mm256_sra_epi16(<4 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm256_sra_epi16:
; X32: # BB#0:
; X32-NEXT: vpsraw %xmm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_sra_epi16:
; X64: # BB#0:
; X64-NEXT: vpsraw %xmm1, %ymm0, %ymm0
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %arg0, <8 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>) nounwind readnone

define <4 x i64> @test_mm256_sra_epi32(<4 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm256_sra_epi32:
; X32: # BB#0:
; X32-NEXT: vpsrad %xmm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_sra_epi32:
; X64: # BB#0:
; X64-NEXT: vpsrad %xmm1, %ymm0, %ymm0
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_srai_epi16(<4 x i64> %a0) {
; X32-LABEL: test_mm256_srai_epi16:
; X32: # BB#0:
; X32-NEXT: vpsraw $3, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_srai_epi16:
; X64: # BB#0:
; X64-NEXT: vpsraw $3, %ymm0, %ymm0
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %arg0, i32 3)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16>, i32) nounwind readnone

define <4 x i64> @test_mm256_srai_epi32(<4 x i64> %a0) {
; X32-LABEL: test_mm256_srai_epi32:
; X32: # BB#0:
; X32-NEXT: vpsrad $3, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_srai_epi32:
; X64: # BB#0:
; X64-NEXT: vpsrad $3, %ymm0, %ymm0
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %arg0, i32 3)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32>, i32) nounwind readnone

define <2 x i64> @test_mm_srav_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_srav_epi32:
; X32: # BB#0:
; X32-NEXT: vpsravd %xmm1, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_srav_epi32:
; X64: # BB#0:
; X64-NEXT: vpsravd %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <4 x i32> %res to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_srav_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_srav_epi32:
; X32: # BB#0:
; X32-NEXT: vpsravd %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_srav_epi32:
; X64: # BB#0:
; X64-NEXT: vpsravd %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32>, <8 x i32>) nounwind readnone

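; Logical right shifts fill with zeros; the srli_si256 byte shift further
; down is modelled as a shufflevector against zeroinitializer within each
; 128-bit lane.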
define <4 x i64> @test_mm256_srl_epi16(<4 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm256_srl_epi16:
; X32: # BB#0:
; X32-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_srl_epi16:
; X64: # BB#0:
; X64-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %arg0, <8 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) nounwind readnone

define <4 x i64> @test_mm256_srl_epi32(<4 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm256_srl_epi32:
; X32: # BB#0:
; X32-NEXT: vpsrld %xmm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_srl_epi32:
; X64: # BB#0:
; X64-NEXT: vpsrld %xmm1, %ymm0, %ymm0
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_srl_epi64(<4 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm256_srl_epi64:
; X32: # BB#0:
; X32-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_srl_epi64:
; X64: # BB#0:
; X64-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
; X64-NEXT: retq
  %res = call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %a0, <2 x i64> %a1)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) nounwind readnone

define <4 x i64> @test_mm256_srli_epi16(<4 x i64> %a0) {
; X32-LABEL: test_mm256_srli_epi16:
; X32: # BB#0:
; X32-NEXT: vpsrlw $3, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_srli_epi16:
; X64: # BB#0:
; X64-NEXT: vpsrlw $3, %ymm0, %ymm0
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %arg0, i32 3)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16>, i32) nounwind readnone

define <4 x i64> @test_mm256_srli_epi32(<4 x i64> %a0) {
; X32-LABEL: test_mm256_srli_epi32:
; X32: # BB#0:
; X32-NEXT: vpsrld $3, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_srli_epi32:
; X64: # BB#0:
; X64-NEXT: vpsrld $3, %ymm0, %ymm0
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %arg0, i32 3)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32>, i32) nounwind readnone

define <4 x i64> @test_mm256_srli_epi64(<4 x i64> %a0) {
; X32-LABEL: test_mm256_srli_epi64:
; X32: # BB#0:
; X32-NEXT: vpsrlq $3, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_srli_epi64:
; X64: # BB#0:
; X64-NEXT: vpsrlq $3, %ymm0, %ymm0
; X64-NEXT: retq
  %res = call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %a0, i32 3)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64>, i32) nounwind readnone

define <4 x i64> @test_mm256_srli_si256(<4 x i64> %a0) {
; X32-LABEL: test_mm256_srli_si256:
; X32: # BB#0:
; X32-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_srli_si256:
; X64: # BB#0:
; X64-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> zeroinitializer, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <2 x i64> @test_mm_srlv_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_srlv_epi32:
; X32: # BB#0:
; X32-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_srlv_epi32:
; X64: # BB#0:
; X64-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <4 x i32> %res to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_srlv_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_srlv_epi32:
; X32: # BB#0:
; X32-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_srlv_epi32:
; X64: # BB#0:
; X64-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32>, <8 x i32>) nounwind readnone

define <2 x i64> @test_mm_srlv_epi64(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_srlv_epi64:
; X32: # BB#0:
; X32-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_srlv_epi64:
; X64: # BB#0:
; X64-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
  %res = call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %a0, <2 x i64> %a1)
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64>, <2 x i64>) nounwind readnone

define <4 x i64> @test_mm256_srlv_epi64(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_srlv_epi64:
; X32: # BB#0:
; X32-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_srlv_epi64:
; X64: # BB#0:
; X64-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
  %res = call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %a0, <4 x i64> %a1)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64>, <4 x i64>) nounwind readnone

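; mm256_stream_load maps to vmovntdqa, a non-temporal load intended to
; bypass the cache hierarchy.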
define <4 x i64> @test_mm256_stream_load_si256(<4 x i64> *%a0) {
; X32-LABEL: test_mm256_stream_load_si256:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovntdqa (%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_stream_load_si256:
; X64: # BB#0:
; X64-NEXT: vmovntdqa (%rdi), %ymm0
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> *%a0 to i8*
  %res = call <4 x i64> @llvm.x86.avx2.movntdqa(i8* %arg0)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.movntdqa(i8*) nounwind readonly

define <4 x i64> @test_mm256_sub_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; X32-LABEL: test_mm256_sub_epi8:
; X32: # BB#0:
; X32-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_sub_epi8:
; X64: # BB#0:
; X64-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = sub <32 x i8> %arg0, %arg1
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_sub_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; X32-LABEL: test_mm256_sub_epi16:
; X32: # BB#0:
; X32-NEXT: vpsubw %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_sub_epi16:
; X64: # BB#0:
; X64-NEXT: vpsubw %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = sub <16 x i16> %arg0, %arg1
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_sub_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; X32-LABEL: test_mm256_sub_epi32:
; X32: # BB#0:
; X32-NEXT: vpsubd %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_sub_epi32:
; X64: # BB#0:
; X64-NEXT: vpsubd %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = sub <8 x i32> %arg0, %arg1
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_sub_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; X32-LABEL: test_mm256_sub_epi64:
; X32: # BB#0:
; X32-NEXT: vpsubq %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_sub_epi64:
; X64: # BB#0:
; X64-NEXT: vpsubq %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
  %res = sub <4 x i64> %a0, %a1
  ret <4 x i64> %res
}

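; The subs/subus tests cover saturating subtraction: vpsubs* saturates
; signed results, vpsubus* clamps unsigned results at zero.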

define <4 x i64> @test_mm256_subs_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_subs_epi8:
; X32:       # BB#0:
; X32-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_subs_epi8:
; X64:       # BB#0:
; X64-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_subs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_subs_epi16:
; X32:       # BB#0:
; X32-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_subs_epi16:
; X64:       # BB#0:
; X64-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_subs_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_subs_epu8:
; X32:       # BB#0:
; X32-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_subs_epu8:
; X64:       # BB#0:
; X64-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_subs_epu16(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_subs_epu16:
; X32:       # BB#0:
; X32-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_subs_epu16:
; X64:       # BB#0:
; X64-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16>, <16 x i16>) nounwind readnone
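
; Unlike the wrapping forms, the saturating subtracts clamp at the element
; bounds; a minimal C sketch (assuming <immintrin.h>; wrapper names are
; illustrative):
;   #include <immintrin.h>
;   __m256i subs_s8(__m256i a, __m256i b) { return _mm256_subs_epi8(a, b); } // VPSUBSB, clamps to [-128, 127]
;   __m256i subs_u8(__m256i a, __m256i b) { return _mm256_subs_epu8(a, b); } // VPSUBUSB, clamps to [0, 255]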

define <4 x i64> @test_mm256_unpackhi_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; X32-LABEL: test_mm256_unpackhi_epi8:
; X32:       # BB#0:
; X32-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_unpackhi_epi8:
; X64:       # BB#0:
; X64-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; X64-NEXT:    retq
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_unpackhi_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; X32-LABEL: test_mm256_unpackhi_epi16:
; X32:       # BB#0:
; X32-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_unpackhi_epi16:
; X64:       # BB#0:
; X64-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
; X64-NEXT:    retq
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_unpackhi_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; X32-LABEL: test_mm256_unpackhi_epi32:
; X32:       # BB#0:
; X32-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_unpackhi_epi32:
; X64:       # BB#0:
; X64-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; X64-NEXT:    retq
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_unpackhi_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; X32-LABEL: test_mm256_unpackhi_epi64:
; X32:       # BB#0:
; X32-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_unpackhi_epi64:
; X64:       # BB#0:
; X64-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; X64-NEXT:    retq
  %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  ret <4 x i64> %res
}
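
; The shuffle masks above make the AVX2 lane split visible: VPUNPCKH* interleaves
; the high half of each 128-bit lane separately, not the high 128 bits of the
; whole register. A hedged C sketch (assuming <immintrin.h>; wrapper name is
; illustrative):
;   #include <immintrin.h>
;   __m256i hi8(__m256i a, __m256i b) { return _mm256_unpackhi_epi8(a, b); } // VPUNPCKHBW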

define <4 x i64> @test_mm256_unpacklo_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; X32-LABEL: test_mm256_unpacklo_epi8:
; X32:       # BB#0:
; X32-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_unpacklo_epi8:
; X64:       # BB#0:
; X64-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; X64-NEXT:    retq
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_unpacklo_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; X32-LABEL: test_mm256_unpacklo_epi16:
; X32:       # BB#0:
; X32-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_unpacklo_epi16:
; X64:       # BB#0:
; X64-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
; X64-NEXT:    retq
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_unpacklo_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; X32-LABEL: test_mm256_unpacklo_epi32:
; X32:       # BB#0:
; X32-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_unpacklo_epi32:
; X64:       # BB#0:
; X64-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; X64-NEXT:    retq
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_unpacklo_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; X32-LABEL: test_mm256_unpacklo_epi64:
; X32:       # BB#0:
; X32-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_unpacklo_epi64:
; X64:       # BB#0:
; X64-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; X64-NEXT:    retq
  %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_xor_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; X32-LABEL: test_mm256_xor_si256:
; X32:       # BB#0:
; X32-NEXT:    vxorps %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_xor_si256:
; X64:       # BB#0:
; X64-NEXT:    vxorps %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %res = xor <4 x i64> %a0, %a1
  ret <4 x i64> %res
}
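
; The same lane-wise rule applies to the low-half interleaves. Note also that
; the integer XOR is checked as VXORPS here: a float-domain encoding of the same
; bitwise operation. A minimal C sketch (assuming <immintrin.h>; wrapper names
; are illustrative):
;   #include <immintrin.h>
;   __m256i lo64(__m256i a, __m256i b)   { return _mm256_unpacklo_epi64(a, b); } // VPUNPCKLQDQ
;   __m256i xor256(__m256i a, __m256i b) { return _mm256_xor_si256(a, b); }      // VXORPS/VPXOR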

declare <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone

declare <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
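
; These trailing declarations presumably back _mm256_cmp_ps/_mm256_cmp_pd tests
; earlier in this file; a hedged C sketch of the corresponding intrinsic
; (assuming <immintrin.h>; wrapper name is illustrative):
;   #include <immintrin.h>
;   __m256 cmp_eq(__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_EQ_OQ); } // VCMPPS with imm8 = 0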