; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=X86
; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=X64

; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx2-builtins.c

;; _mm256_abs_*: abs expressed as select(icmp sgt x, 0, x, 0 - x); expected to fold to vpabs*.

define <4 x i64> @test_mm256_abs_epi8(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_abs_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpabsb %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg = bitcast <4 x i64> %a0 to <32 x i8>
  %sub = sub <32 x i8> zeroinitializer, %arg
  %cmp = icmp sgt <32 x i8> %arg, zeroinitializer
  %sel = select <32 x i1> %cmp, <32 x i8> %arg, <32 x i8> %sub
  %res = bitcast <32 x i8> %sel to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_abs_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_abs_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpabsw %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg = bitcast <4 x i64> %a0 to <16 x i16>
  %sub = sub <16 x i16> zeroinitializer, %arg
  %cmp = icmp sgt <16 x i16> %arg, zeroinitializer
  %sel = select <16 x i1> %cmp, <16 x i16> %arg, <16 x i16> %sub
  %res = bitcast <16 x i16> %sel to <4 x i64>
  ret <4 x i64> %res
}
declare <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_abs_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_abs_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpabsd %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg = bitcast <4 x i64> %a0 to <8 x i32>
  %sub = sub <8 x i32> zeroinitializer, %arg
  %cmp = icmp sgt <8 x i32> %arg, zeroinitializer
  %sel = select <8 x i1> %cmp, <8 x i32> %arg, <8 x i32> %sub
  %res = bitcast <8 x i32> %sel to <4 x i64>
  ret <4 x i64> %res
}
declare <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32>) nounwind readnone

;; _mm256_add_*: plain vector add on each element width.

define <4 x i64> @test_mm256_add_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_add_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = add <32 x i8> %arg0, %arg1
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_add_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_add_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = add <16 x i16> %arg0, %arg1
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_add_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_add_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = add <8 x i32> %arg0, %arg1
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_add_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_add_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = add <4 x i64> %a0, %a1
  ret <4 x i64> %res
}

;; _mm256_adds_*: saturating adds via the target intrinsics.

define <4 x i64> @test_mm256_adds_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_adds_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_adds_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_adds_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_adds_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_adds_epu8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_adds_epu16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_adds_epu16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16>, <16 x i16>) nounwind readnone

;; _mm256_alignr_epi8: per-128-bit-lane byte alignment shuffles; should select to vpalignr.

define <4 x i64> @test_mm256_alignr_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_alignr_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1],ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test2_mm256_alignr_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test2_mm256_alignr_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0],ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

;; _mm256_and_si256 / _mm256_andnot_si256: bitwise logic on the full 256-bit value.

define <4 x i64> @test_mm256_and_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_and_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vandps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = and <4 x i64> %a0, %a1
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_andnot_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_andnot_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; CHECK-NEXT:    vpxor %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    vpand %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %not = xor <4 x i64> %a0, <i64 -1, i64 -1, i64 -1, i64 -1>
  %res = and <4 x i64> %not, %a1
  ret <4 x i64> %res
}

;; _mm256_avg_epu*: rounding average expanded as zext/add/+1/lshr/trunc; should re-fold to vpavg*.

define <4 x i64> @test_mm256_avg_epu8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_avg_epu8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %zext0 = zext <32 x i8> %arg0 to <32 x i16>
  %zext1 = zext <32 x i8> %arg1 to <32 x i16>
  %add = add <32 x i16> %zext0, %zext1
  %add1 = add <32 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %lshr = lshr <32 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %res = trunc <32 x i16> %lshr to <32 x i8>
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_avg_epu16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_avg_epu16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpavgw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %zext0 = zext <16 x i16> %arg0 to <16 x i32>
  %zext1 = zext <16 x i16> %arg1 to <16 x i32>
  %add = add <16 x i32> %zext0, %zext1
  %add1 = add <16 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %lshr = lshr <16 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %res = trunc <16 x i32> %lshr to <16 x i16>
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

;; blend intrinsics: immediate blends expressed as two-source shuffles.

define <4 x i64> @test_mm256_blend_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_blend_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %shuf = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 25, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %res = bitcast <16 x i16> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <2 x i64> @test_mm_blend_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_blend_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %shuf = shufflevector <4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %res = bitcast <4 x i32> %shuf to <2 x i64>
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_blend_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_blend_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6,7]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %shuf = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 13, i32 6, i32 7>
  %res = bitcast <8 x i32> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_blendv_epi8(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) {
; CHECK-LABEL: test_mm256_blendv_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %arg2 = bitcast <4 x i64> %a2 to <32 x i8>
  %call = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %arg0, <32 x i8> %arg1, <32 x i8> %arg2)
  %res = bitcast <32 x i8> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>) nounwind readnone

;; broadcast intrinsics: splat shuffles (zero mask) at each element width.

define <2 x i64> @test_mm_broadcastb_epi8(<2 x i64> %a0) {
; CHECK-LABEL: test_mm_broadcastb_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastb %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <16 x i32> zeroinitializer
  %res = bitcast <16 x i8> %shuf to <2 x i64>
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_broadcastb_epi8(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastb_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastb %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> undef, <32 x i32> zeroinitializer
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <2 x i64> @test_mm_broadcastd_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm_broadcastd_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %shuf = shufflevector <4 x i32> %arg0, <4 x i32> undef, <4 x i32> zeroinitializer
  %res = bitcast <4 x i32> %shuf to <2 x i64>
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_broadcastd_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastd_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %shuf = shufflevector <8 x i32> %arg0, <8 x i32> undef, <8 x i32> zeroinitializer
  %res = bitcast <8 x i32> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <2 x i64> @test_mm_broadcastq_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm_broadcastq_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastq %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> zeroinitializer
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_broadcastq_epi64(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastq_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> zeroinitializer
  ret <4 x i64> %res
}

define <2 x double> @test_mm_broadcastsd_pd(<2 x double> %a0) {
; CHECK-LABEL: test_mm_broadcastsd_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer
  ret <2 x double> %res
}

define <4 x double> @test_mm256_broadcastsd_pd(<4 x double> %a0) {
; CHECK-LABEL: test_mm256_broadcastsd_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> zeroinitializer
  ret <4 x double> %res
}

define <4 x i64> @test_mm256_broadcastsi128_si256(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastsi128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_broadcastsi128_si256_mem(<2 x i64>* %p0) {
; X86-LABEL: test_mm256_broadcastsi128_si256_mem:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_broadcastsi128_si256_mem:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    retq
  %a0 = load <2 x i64>, <2 x i64>* %p0
  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x i64> %res
}

define <4 x float> @test_mm_broadcastss_ps(<4 x float> %a0) {
; CHECK-LABEL: test_mm_broadcastss_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer
  ret <4 x float> %res
}

define <8 x float> @test_mm256_broadcastss_ps(<8 x float> %a0) {
; CHECK-LABEL: test_mm256_broadcastss_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> zeroinitializer
  ret <8 x float> %res
}

define <2 x i64> @test_mm_broadcastw_epi16(<2 x i64> %a0) {
; CHECK-LABEL: test_mm_broadcastw_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastw %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <8 x i32> zeroinitializer
  %res = bitcast <8 x i16> %shuf to <2 x i64>
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_broadcastw_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastw_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastw %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> zeroinitializer
  %res = bitcast <16 x i16> %shuf to <4 x i64>
  ret <4 x i64> %res
}

;; _mm256_bslli/bsrli_epi128: per-lane byte shifts written as shuffles against zero.

define <4 x i64> @test_mm256_bslli_epi128(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_bslli_epi128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %shuf = shufflevector <32 x i8> zeroinitializer, <32 x i8> %arg0, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_bsrli_epi128(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_bsrli_epi128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> zeroinitializer, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

;; compare intrinsics: icmp + sext to the all-ones/all-zeros mask representation.

define <4 x i64> @test_mm256_cmpeq_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpeq_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %cmp = icmp eq <32 x i8> %arg0, %arg1
  %res = sext <32 x i1> %cmp to <32 x i8>
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpeq_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpeq_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %cmp = icmp eq <16 x i16> %arg0, %arg1
  %res = sext <16 x i1> %cmp to <16 x i16>
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpeq_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpeq_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %cmp = icmp eq <8 x i32> %arg0, %arg1
  %res = sext <8 x i1> %cmp to <8 x i32>
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpeq_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpeq_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %cmp = icmp eq <4 x i64> %a0, %a1
  %res = sext <4 x i1> %cmp to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cmpgt_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpgt_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %cmp = icmp sgt <32 x i8> %arg0, %arg1
  %res = sext <32 x i1> %cmp to <32 x i8>
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpgt_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpgt_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %cmp = icmp sgt <16 x i16> %arg0, %arg1
  %res = sext <16 x i1> %cmp to <16 x i16>
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpgt_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpgt_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %cmp = icmp sgt <8 x i32> %arg0, %arg1
  %res = sext <8 x i1> %cmp to <8 x i32>
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpgt_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpgt_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %cmp = icmp sgt <4 x i64> %a0, %a1
  %res = sext <4 x i1> %cmp to <4 x i64>
  ret <4 x i64> %res
}

;; sign-extension conversions (vpmovsx*).

define <4 x i64> @test_mm256_cvtepi8_epi16(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi8_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbw %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %ext = sext <16 x i8> %arg0 to <16 x i16>
  %res = bitcast <16 x i16> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepi8_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi8_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %ext = sext <8 x i8> %shuf to <8 x i32>
  %res = bitcast <8 x i32> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepi8_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi8_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbq %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %ext = sext <4 x i8> %shuf to <4 x i64>
  ret <4 x i64> %ext
}

define <4 x i64> @test_mm256_cvtepi16_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi16_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxwd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %ext = sext <8 x i16> %arg0 to <8 x i32>
  %res = bitcast <8 x i32> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepi16_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi16_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxwq %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %ext = sext <4 x i16> %shuf to <4 x i64>
  ret <4 x i64> %ext
}

define <4 x i64> @test_mm256_cvtepi32_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi32_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxdq %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %ext = sext <4 x i32> %arg0 to <4 x i64>
  ret <4 x i64> %ext
}

;; zero-extension conversions (vpmovzx*).

define <4 x i64> @test_mm256_cvtepu8_epi16(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu8_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %ext = zext <16 x i8> %arg0 to <16 x i16>
  %res = bitcast <16 x i16> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepu8_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu8_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %ext = zext <8 x i8> %shuf to <8 x i32>
  %res = bitcast <8 x i32> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepu8_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu8_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %ext = zext <4 x i8> %shuf to <4 x i64>
  ret <4 x i64> %ext
}

define <4 x i64> @test_mm256_cvtepu16_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu16_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %ext = zext <8 x i16> %arg0 to <8 x i32>
  %res = bitcast <8 x i32> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepu16_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu16_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %ext = zext <4 x i16> %shuf to <4 x i64>
  ret <4 x i64> %ext
}

define <4 x i64> @test_mm256_cvtepu32_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu32_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %ext = zext <4 x i32> %arg0 to <4 x i64>
  ret <4 x i64> %ext
}

define <2 x i64> @test_mm256_extracti128_si256(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_extracti128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> %a0, <2 x i32> <i32 2, i32 3>
  ret <2 x i64> %res
}

;; horizontal add/sub intrinsics.

define <4 x i64> @test_mm256_hadd_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hadd_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_hadd_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hadd_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_hadds_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hadds_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphaddsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_hsub_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hsub_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphsubw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_hsub_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hsub_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphsubd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_hsubs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hsubs_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphsubsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16>, <16 x i16>) nounwind readnone

;; gather intrinsics.

define <2 x i64> @test_mm_i32gather_epi32(i32 *%a0, <2 x i64> %a1) {
; X86-LABEL: test_mm_i32gather_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpgatherdd %xmm2, (%eax,%xmm0,2), %xmm1
; X86-NEXT:    vmovdqa %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_i32gather_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpgatherdd %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT:    vmovdqa %xmm1, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast i32 *%a0 to i8*
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32>
  %call = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> undef, i8* %arg0, <4 x i32> %arg1, <4 x i32> %mask, i8 2)
  %bc = bitcast <4 x i32> %call to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32>, i8*, <4 x i32>, <4 x i32>, i8) nounwind readonly

define <2 x i64> @test_mm_mask_i32gather_epi32(<2 x i64> %a0, i32 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
; X86-LABEL: test_mm_mask_i32gather_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpgatherdd %xmm2, (%eax,%xmm1,2), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_i32gather_epi32:
; X64:       # %bb.0:
; X64-NEXT:
vpgatherdd %xmm2, (%rdi,%xmm1,2), %xmm0 797; X64-NEXT: retq 798 %arg0 = bitcast <2 x i64> %a0 to <4 x i32> 799 %arg1 = bitcast i32 *%a1 to i8* 800 %arg2 = bitcast <2 x i64> %a2 to <4 x i32> 801 %arg3 = bitcast <2 x i64> %a3 to <4 x i32> 802 %call = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> %arg0, i8* %arg1, <4 x i32> %arg2, <4 x i32> %arg3, i8 2) 803 %bc = bitcast <4 x i32> %call to <2 x i64> 804 ret <2 x i64> %bc 805} 806 807define <4 x i64> @test_mm256_i32gather_epi32(i32 *%a0, <4 x i64> %a1) { 808; X86-LABEL: test_mm256_i32gather_epi32: 809; X86: # %bb.0: 810; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 811; X86-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 812; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 813; X86-NEXT: vpgatherdd %ymm2, (%eax,%ymm0,2), %ymm1 814; X86-NEXT: vmovdqa %ymm1, %ymm0 815; X86-NEXT: retl 816; 817; X64-LABEL: test_mm256_i32gather_epi32: 818; X64: # %bb.0: 819; X64-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 820; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 821; X64-NEXT: vpgatherdd %ymm2, (%rdi,%ymm0,2), %ymm1 822; X64-NEXT: vmovdqa %ymm1, %ymm0 823; X64-NEXT: retq 824 %arg0 = bitcast i32 *%a0 to i8* 825 %arg1 = bitcast <4 x i64> %a1 to <8 x i32> 826 %mask = bitcast <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1> to <8 x i32> 827 %call = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8* %arg0, <8 x i32> %arg1, <8 x i32> %mask, i8 2) 828 %bc = bitcast <8 x i32> %call to <4 x i64> 829 ret <4 x i64> %bc 830} 831declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32>, i8*, <8 x i32>, <8 x i32>, i8) nounwind readonly 832 833define <4 x i64> @test_mm256_mask_i32gather_epi32(<4 x i64> %a0, i32 *%a1, <4 x i64> %a2, <4 x i64> %a3) { 834; X86-LABEL: test_mm256_mask_i32gather_epi32: 835; X86: # %bb.0: 836; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 837; X86-NEXT: vpgatherdd %ymm2, (%eax,%ymm1,2), %ymm0 838; X86-NEXT: retl 839; 840; X64-LABEL: test_mm256_mask_i32gather_epi32: 841; X64: # %bb.0: 842; X64-NEXT: vpgatherdd %ymm2, (%rdi,%ymm1,2), %ymm0 843; X64-NEXT: retq 844 
%arg0 = bitcast <4 x i64> %a0 to <8 x i32> 845 %arg1 = bitcast i32 *%a1 to i8* 846 %arg2 = bitcast <4 x i64> %a2 to <8 x i32> 847 %arg3 = bitcast <4 x i64> %a3 to <8 x i32> 848 %call = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %arg0, i8* %arg1, <8 x i32> %arg2, <8 x i32> %arg3, i8 2) 849 %bc = bitcast <8 x i32> %call to <4 x i64> 850 ret <4 x i64> %bc 851} 852 853define <2 x i64> @test_mm_i32gather_epi64(i64 *%a0, <2 x i64> %a1) { 854; X86-LABEL: test_mm_i32gather_epi64: 855; X86: # %bb.0: 856; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 857; X86-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 858; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 859; X86-NEXT: vpgatherdq %xmm2, (%eax,%xmm0,2), %xmm1 860; X86-NEXT: vmovdqa %xmm1, %xmm0 861; X86-NEXT: retl 862; 863; X64-LABEL: test_mm_i32gather_epi64: 864; X64: # %bb.0: 865; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 866; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 867; X64-NEXT: vpgatherdq %xmm2, (%rdi,%xmm0,2), %xmm1 868; X64-NEXT: vmovdqa %xmm1, %xmm0 869; X64-NEXT: retq 870 %arg0 = bitcast i64 *%a0 to i8* 871 %arg1 = bitcast <2 x i64> %a1 to <4 x i32> 872 %res = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> undef, i8* %arg0, <4 x i32> %arg1, <2 x i64> <i64 -1, i64 -1>, i8 2) 873 ret <2 x i64> %res 874} 875declare <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64>, i8*, <4 x i32>, <2 x i64>, i8) nounwind readonly 876 877define <2 x i64> @test_mm_mask_i32gather_epi64(<2 x i64> %a0, i64 *%a1, <2 x i64> %a2, <2 x i64> %a3) { 878; X86-LABEL: test_mm_mask_i32gather_epi64: 879; X86: # %bb.0: 880; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 881; X86-NEXT: vpgatherdq %xmm2, (%eax,%xmm1,2), %xmm0 882; X86-NEXT: retl 883; 884; X64-LABEL: test_mm_mask_i32gather_epi64: 885; X64: # %bb.0: 886; X64-NEXT: vpgatherdq %xmm2, (%rdi,%xmm1,2), %xmm0 887; X64-NEXT: retq 888 %arg1 = bitcast i64 *%a1 to i8* 889 %arg2 = bitcast <2 x i64> %a2 to <4 x i32> 890 %res = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> %a0, i8* %arg1, <4 x i32> %arg2, <2 x i64> %a3, i8 2) 891 
ret <2 x i64> %res 892} 893 894define <4 x i64> @test_mm256_i32gather_epi64(i64 *%a0, <2 x i64> %a1) { 895; X86-LABEL: test_mm256_i32gather_epi64: 896; X86: # %bb.0: 897; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 898; X86-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 899; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 900; X86-NEXT: vpgatherdq %ymm2, (%eax,%xmm0,2), %ymm1 901; X86-NEXT: vmovdqa %ymm1, %ymm0 902; X86-NEXT: retl 903; 904; X64-LABEL: test_mm256_i32gather_epi64: 905; X64: # %bb.0: 906; X64-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 907; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 908; X64-NEXT: vpgatherdq %ymm2, (%rdi,%xmm0,2), %ymm1 909; X64-NEXT: vmovdqa %ymm1, %ymm0 910; X64-NEXT: retq 911 %arg0 = bitcast i64 *%a0 to i8* 912 %arg1 = bitcast <2 x i64> %a1 to <4 x i32> 913 %res = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8* %arg0, <4 x i32> %arg1, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, i8 2) 914 ret <4 x i64> %res 915} 916declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64>, i8*, <4 x i32>, <4 x i64>, i8) nounwind readonly 917 918define <4 x i64> @test_mm256_mask_i32gather_epi64(<4 x i64> %a0, i64 *%a1, <2 x i64> %a2, <4 x i64> %a3) { 919; X86-LABEL: test_mm256_mask_i32gather_epi64: 920; X86: # %bb.0: 921; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 922; X86-NEXT: vpgatherdq %ymm2, (%eax,%xmm1,2), %ymm0 923; X86-NEXT: retl 924; 925; X64-LABEL: test_mm256_mask_i32gather_epi64: 926; X64: # %bb.0: 927; X64-NEXT: vpgatherdq %ymm2, (%rdi,%xmm1,2), %ymm0 928; X64-NEXT: retq 929 %arg1 = bitcast i64 *%a1 to i8* 930 %arg2 = bitcast <2 x i64> %a2 to <4 x i32> 931 %res = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %a0, i8* %arg1, <4 x i32> %arg2, <4 x i64> %a3, i8 2) 932 ret <4 x i64> %res 933} 934 935define <2 x double> @test_mm_i32gather_pd(double *%a0, <2 x i64> %a1) { 936; X86-LABEL: test_mm_i32gather_pd: 937; X86: # %bb.0: 938; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 939; X86-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 940; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1 941; 
X86-NEXT: vgatherdpd %xmm2, (%eax,%xmm0,2), %xmm1 942; X86-NEXT: vmovapd %xmm1, %xmm0 943; X86-NEXT: retl 944; 945; X64-LABEL: test_mm_i32gather_pd: 946; X64: # %bb.0: 947; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 948; X64-NEXT: vxorpd %xmm1, %xmm1, %xmm1 949; X64-NEXT: vgatherdpd %xmm2, (%rdi,%xmm0,2), %xmm1 950; X64-NEXT: vmovapd %xmm1, %xmm0 951; X64-NEXT: retq 952 %arg0 = bitcast double *%a0 to i8* 953 %arg1 = bitcast <2 x i64> %a1 to <4 x i32> 954 %cmp = fcmp oeq <2 x double> zeroinitializer, zeroinitializer 955 %sext = sext <2 x i1> %cmp to <2 x i64> 956 %mask = bitcast <2 x i64> %sext to <2 x double> 957 %res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> undef, i8* %arg0, <4 x i32> %arg1, <2 x double> %mask, i8 2) 958 ret <2 x double> %res 959} 960declare <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double>, i8*, <4 x i32>, <2 x double>, i8) nounwind readonly 961 962define <2 x double> @test_mm_mask_i32gather_pd(<2 x double> %a0, double *%a1, <2 x i64> %a2, <2 x double> %a3) { 963; X86-LABEL: test_mm_mask_i32gather_pd: 964; X86: # %bb.0: 965; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 966; X86-NEXT: vgatherdpd %xmm2, (%eax,%xmm1,2), %xmm0 967; X86-NEXT: retl 968; 969; X64-LABEL: test_mm_mask_i32gather_pd: 970; X64: # %bb.0: 971; X64-NEXT: vgatherdpd %xmm2, (%rdi,%xmm1,2), %xmm0 972; X64-NEXT: retq 973 %arg1 = bitcast double *%a1 to i8* 974 %arg2 = bitcast <2 x i64> %a2 to <4 x i32> 975 %res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> %a0, i8* %arg1, <4 x i32> %arg2, <2 x double> %a3, i8 2) 976 ret <2 x double> %res 977} 978 979define <4 x double> @test_mm256_i32gather_pd(double *%a0, <2 x i64> %a1) { 980; X86-LABEL: test_mm256_i32gather_pd: 981; X86: # %bb.0: 982; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 983; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1 984; X86-NEXT: vcmpeqpd %ymm1, %ymm1, %ymm2 985; X86-NEXT: vgatherdpd %ymm2, (%eax,%xmm0,2), %ymm1 986; X86-NEXT: vmovapd %ymm1, %ymm0 987; X86-NEXT: retl 988; 989; X64-LABEL: 
test_mm256_i32gather_pd: 990; X64: # %bb.0: 991; X64-NEXT: vxorpd %xmm1, %xmm1, %xmm1 992; X64-NEXT: vcmpeqpd %ymm1, %ymm1, %ymm2 993; X64-NEXT: vgatherdpd %ymm2, (%rdi,%xmm0,2), %ymm1 994; X64-NEXT: vmovapd %ymm1, %ymm0 995; X64-NEXT: retq 996 %arg0 = bitcast double *%a0 to i8* 997 %arg1 = bitcast <2 x i64> %a1 to <4 x i32> 998 %mask = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i8 0) 999 %res = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8* %arg0, <4 x i32> %arg1, <4 x double> %mask, i8 2) 1000 ret <4 x double> %res 1001} 1002declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double>, i8*, <4 x i32>, <4 x double>, i8) nounwind readonly 1003 1004define <4 x double> @test_mm256_mask_i32gather_pd(<4 x double> %a0, double *%a1, <2 x i64> %a2, <4 x double> %a3) { 1005; X86-LABEL: test_mm256_mask_i32gather_pd: 1006; X86: # %bb.0: 1007; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1008; X86-NEXT: vgatherdpd %ymm2, (%eax,%xmm1,2), %ymm0 1009; X86-NEXT: retl 1010; 1011; X64-LABEL: test_mm256_mask_i32gather_pd: 1012; X64: # %bb.0: 1013; X64-NEXT: vgatherdpd %ymm2, (%rdi,%xmm1,2), %ymm0 1014; X64-NEXT: retq 1015 %arg1 = bitcast double *%a1 to i8* 1016 %arg2 = bitcast <2 x i64> %a2 to <4 x i32> 1017 %res = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %a0, i8* %arg1, <4 x i32> %arg2, <4 x double> %a3, i8 2) 1018 ret <4 x double> %res 1019} 1020 1021define <4 x float> @test_mm_i32gather_ps(float *%a0, <2 x i64> %a1) { 1022; X86-LABEL: test_mm_i32gather_ps: 1023; X86: # %bb.0: 1024; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1025; X86-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1026; X86-NEXT: vxorps %xmm1, %xmm1, %xmm1 1027; X86-NEXT: vgatherdps %xmm2, (%eax,%xmm0,2), %xmm1 1028; X86-NEXT: vmovaps %xmm1, %xmm0 1029; X86-NEXT: retl 1030; 1031; X64-LABEL: test_mm_i32gather_ps: 1032; X64: # %bb.0: 1033; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1034; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1 1035; 
X64-NEXT: vgatherdps %xmm2, (%rdi,%xmm0,2), %xmm1 1036; X64-NEXT: vmovaps %xmm1, %xmm0 1037; X64-NEXT: retq 1038 %arg0 = bitcast float *%a0 to i8* 1039 %arg1 = bitcast <2 x i64> %a1 to <4 x i32> 1040 %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer 1041 %sext = sext <4 x i1> %cmp to <4 x i32> 1042 %mask = bitcast <4 x i32> %sext to <4 x float> 1043 %call = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> undef, i8* %arg0, <4 x i32> %arg1, <4 x float> %mask, i8 2) 1044 ret <4 x float> %call 1045} 1046declare <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float>, i8*, <4 x i32>, <4 x float>, i8) nounwind readonly 1047 1048define <4 x float> @test_mm_mask_i32gather_ps(<4 x float> %a0, float *%a1, <2 x i64> %a2, <4 x float> %a3) { 1049; X86-LABEL: test_mm_mask_i32gather_ps: 1050; X86: # %bb.0: 1051; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1052; X86-NEXT: vgatherdps %xmm2, (%eax,%xmm1,2), %xmm0 1053; X86-NEXT: retl 1054; 1055; X64-LABEL: test_mm_mask_i32gather_ps: 1056; X64: # %bb.0: 1057; X64-NEXT: vgatherdps %xmm2, (%rdi,%xmm1,2), %xmm0 1058; X64-NEXT: retq 1059 %arg1 = bitcast float *%a1 to i8* 1060 %arg2 = bitcast <2 x i64> %a2 to <4 x i32> 1061 %call = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> %a0, i8* %arg1, <4 x i32> %arg2, <4 x float> %a3, i8 2) 1062 ret <4 x float> %call 1063} 1064 1065define <8 x float> @test_mm256_i32gather_ps(float *%a0, <4 x i64> %a1) { 1066; X86-LABEL: test_mm256_i32gather_ps: 1067; X86: # %bb.0: 1068; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1069; X86-NEXT: vxorps %xmm1, %xmm1, %xmm1 1070; X86-NEXT: vcmpeqps %ymm1, %ymm1, %ymm2 1071; X86-NEXT: vgatherdps %ymm2, (%eax,%ymm0,2), %ymm1 1072; X86-NEXT: vmovaps %ymm1, %ymm0 1073; X86-NEXT: retl 1074; 1075; X64-LABEL: test_mm256_i32gather_ps: 1076; X64: # %bb.0: 1077; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1 1078; X64-NEXT: vcmpeqps %ymm1, %ymm1, %ymm2 1079; X64-NEXT: vgatherdps %ymm2, (%rdi,%ymm0,2), %ymm1 1080; X64-NEXT: vmovaps %ymm1, %ymm0 1081; X64-NEXT: retq 1082 
%arg0 = bitcast float *%a0 to i8* 1083 %arg1 = bitcast <4 x i64> %a1 to <8 x i32> 1084 %mask = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> zeroinitializer, <8 x float> zeroinitializer, i8 0) 1085 %call = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8* %arg0, <8 x i32> %arg1, <8 x float> %mask, i8 2) 1086 ret <8 x float> %call 1087} 1088declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float>, i8*, <8 x i32>, <8 x float>, i8) nounwind readonly 1089 1090define <8 x float> @test_mm256_mask_i32gather_ps(<8 x float> %a0, float *%a1, <4 x i64> %a2, <8 x float> %a3) { 1091; X86-LABEL: test_mm256_mask_i32gather_ps: 1092; X86: # %bb.0: 1093; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1094; X86-NEXT: vgatherdps %ymm2, (%eax,%ymm1,2), %ymm0 1095; X86-NEXT: retl 1096; 1097; X64-LABEL: test_mm256_mask_i32gather_ps: 1098; X64: # %bb.0: 1099; X64-NEXT: vgatherdps %ymm2, (%rdi,%ymm1,2), %ymm0 1100; X64-NEXT: retq 1101 %arg1 = bitcast float *%a1 to i8* 1102 %arg2 = bitcast <4 x i64> %a2 to <8 x i32> 1103 %call = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %a0, i8* %arg1, <8 x i32> %arg2, <8 x float> %a3, i8 2) 1104 ret <8 x float> %call 1105} 1106 1107define <2 x i64> @test_mm_i64gather_epi32(i32 *%a0, <2 x i64> %a1) { 1108; X86-LABEL: test_mm_i64gather_epi32: 1109; X86: # %bb.0: 1110; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1111; X86-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1112; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 1113; X86-NEXT: vpgatherqd %xmm2, (%eax,%xmm0,2), %xmm1 1114; X86-NEXT: vmovdqa %xmm1, %xmm0 1115; X86-NEXT: retl 1116; 1117; X64-LABEL: test_mm_i64gather_epi32: 1118; X64: # %bb.0: 1119; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1120; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 1121; X64-NEXT: vpgatherqd %xmm2, (%rdi,%xmm0,2), %xmm1 1122; X64-NEXT: vmovdqa %xmm1, %xmm0 1123; X64-NEXT: retq 1124 %arg0 = bitcast i32 *%a0 to i8* 1125 %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32> 1126 %call = call <4 x i32> 
@llvm.x86.avx2.gather.q.d(<4 x i32> undef, i8* %arg0, <2 x i64> %a1, <4 x i32> %mask, i8 2) 1127 %bc = bitcast <4 x i32> %call to <2 x i64> 1128 ret <2 x i64> %bc 1129} 1130declare <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32>, i8*, <2 x i64>, <4 x i32>, i8) nounwind readonly 1131 1132define <2 x i64> @test_mm_mask_i64gather_epi32(<2 x i64> %a0, i32 *%a1, <2 x i64> %a2, <2 x i64> %a3) { 1133; X86-LABEL: test_mm_mask_i64gather_epi32: 1134; X86: # %bb.0: 1135; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1136; X86-NEXT: vpgatherqd %xmm2, (%eax,%xmm1,2), %xmm0 1137; X86-NEXT: retl 1138; 1139; X64-LABEL: test_mm_mask_i64gather_epi32: 1140; X64: # %bb.0: 1141; X64-NEXT: vpgatherqd %xmm2, (%rdi,%xmm1,2), %xmm0 1142; X64-NEXT: retq 1143 %arg0 = bitcast <2 x i64> %a0 to <4 x i32> 1144 %arg1 = bitcast i32 *%a1 to i8* 1145 %arg3 = bitcast <2 x i64> %a3 to <4 x i32> 1146 %call = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> %arg0, i8* %arg1, <2 x i64> %a2, <4 x i32> %arg3, i8 2) 1147 %bc = bitcast <4 x i32> %call to <2 x i64> 1148 ret <2 x i64> %bc 1149} 1150 1151define <2 x i64> @test_mm256_i64gather_epi32(i32 *%a0, <4 x i64> %a1) { 1152; X86-LABEL: test_mm256_i64gather_epi32: 1153; X86: # %bb.0: 1154; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1155; X86-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1156; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 1157; X86-NEXT: vpgatherqd %xmm2, (%eax,%ymm0,2), %xmm1 1158; X86-NEXT: vmovdqa %xmm1, %xmm0 1159; X86-NEXT: vzeroupper 1160; X86-NEXT: retl 1161; 1162; X64-LABEL: test_mm256_i64gather_epi32: 1163; X64: # %bb.0: 1164; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1165; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 1166; X64-NEXT: vpgatherqd %xmm2, (%rdi,%ymm0,2), %xmm1 1167; X64-NEXT: vmovdqa %xmm1, %xmm0 1168; X64-NEXT: vzeroupper 1169; X64-NEXT: retq 1170 %arg0 = bitcast i32 *%a0 to i8* 1171 %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32> 1172 %call = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8* %arg0, <4 x i64> %a1, <4 x i32> %mask, i8 2) 
1173 %bc = bitcast <4 x i32> %call to <2 x i64> 1174 ret <2 x i64> %bc 1175} 1176declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32>, i8*, <4 x i64>, <4 x i32>, i8) nounwind readonly 1177 1178define <2 x i64> @test_mm256_mask_i64gather_epi32(<2 x i64> %a0, i32 *%a1, <4 x i64> %a2, <2 x i64> %a3) { 1179; X86-LABEL: test_mm256_mask_i64gather_epi32: 1180; X86: # %bb.0: 1181; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1182; X86-NEXT: vpgatherqd %xmm2, (%eax,%ymm1,2), %xmm0 1183; X86-NEXT: vzeroupper 1184; X86-NEXT: retl 1185; 1186; X64-LABEL: test_mm256_mask_i64gather_epi32: 1187; X64: # %bb.0: 1188; X64-NEXT: vpgatherqd %xmm2, (%rdi,%ymm1,2), %xmm0 1189; X64-NEXT: vzeroupper 1190; X64-NEXT: retq 1191 %arg0 = bitcast <2 x i64> %a0 to <4 x i32> 1192 %arg1 = bitcast i32 *%a1 to i8* 1193 %arg3 = bitcast <2 x i64> %a3 to <4 x i32> 1194 %call = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %arg0, i8* %arg1, <4 x i64> %a2, <4 x i32> %arg3, i8 2) 1195 %bc = bitcast <4 x i32> %call to <2 x i64> 1196 ret <2 x i64> %bc 1197} 1198 1199define <2 x i64> @test_mm_i64gather_epi64(i64 *%a0, <2 x i64> %a1) { 1200; X86-LABEL: test_mm_i64gather_epi64: 1201; X86: # %bb.0: 1202; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1203; X86-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1204; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 1205; X86-NEXT: vpgatherqq %xmm2, (%eax,%xmm0,2), %xmm1 1206; X86-NEXT: vmovdqa %xmm1, %xmm0 1207; X86-NEXT: retl 1208; 1209; X64-LABEL: test_mm_i64gather_epi64: 1210; X64: # %bb.0: 1211; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1212; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 1213; X64-NEXT: vpgatherqq %xmm2, (%rdi,%xmm0,2), %xmm1 1214; X64-NEXT: vmovdqa %xmm1, %xmm0 1215; X64-NEXT: retq 1216 %arg0 = bitcast i64 *%a0 to i8* 1217 %call = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> undef, i8* %arg0, <2 x i64> %a1, <2 x i64> <i64 -1, i64 -1>, i8 2) 1218 ret <2 x i64> %call 1219} 1220declare <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64>, i8*, <2 x i64>, <2 x i64>, i8) nounwind readonly 
1221 1222define <2 x i64> @test_mm_mask_i64gather_epi64(<2 x i64> %a0, i64 *%a1, <2 x i64> %a2, <2 x i64> %a3) { 1223; X86-LABEL: test_mm_mask_i64gather_epi64: 1224; X86: # %bb.0: 1225; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1226; X86-NEXT: vpgatherqq %xmm2, (%eax,%xmm1,2), %xmm0 1227; X86-NEXT: retl 1228; 1229; X64-LABEL: test_mm_mask_i64gather_epi64: 1230; X64: # %bb.0: 1231; X64-NEXT: vpgatherqq %xmm2, (%rdi,%xmm1,2), %xmm0 1232; X64-NEXT: retq 1233 %arg1 = bitcast i64 *%a1 to i8* 1234 %call = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> %a0, i8* %arg1, <2 x i64> %a2, <2 x i64> %a3, i8 2) 1235 ret <2 x i64> %call 1236} 1237 1238define <4 x i64> @test_mm256_i64gather_epi64(i64 *%a0, <4 x i64> %a1) { 1239; X86-LABEL: test_mm256_i64gather_epi64: 1240; X86: # %bb.0: 1241; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1242; X86-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 1243; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 1244; X86-NEXT: vpgatherqq %ymm2, (%eax,%ymm0,2), %ymm1 1245; X86-NEXT: vmovdqa %ymm1, %ymm0 1246; X86-NEXT: retl 1247; 1248; X64-LABEL: test_mm256_i64gather_epi64: 1249; X64: # %bb.0: 1250; X64-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 1251; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 1252; X64-NEXT: vpgatherqq %ymm2, (%rdi,%ymm0,2), %ymm1 1253; X64-NEXT: vmovdqa %ymm1, %ymm0 1254; X64-NEXT: retq 1255 %arg0 = bitcast i64 *%a0 to i8* 1256 %call = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8* %arg0, <4 x i64> %a1, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, i8 2) 1257 ret <4 x i64> %call 1258} 1259declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64>, i8*, <4 x i64>, <4 x i64>, i8) nounwind readonly 1260 1261define <4 x i64> @test_mm256_mask_i64gather_epi64(<4 x i64> %a0, i64 *%a1, <4 x i64> %a2, <4 x i64> %a3) { 1262; X86-LABEL: test_mm256_mask_i64gather_epi64: 1263; X86: # %bb.0: 1264; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1265; X86-NEXT: vpgatherqq %ymm2, (%eax,%ymm1,2), %ymm0 1266; X86-NEXT: retl 1267; 1268; X64-LABEL: test_mm256_mask_i64gather_epi64: 1269; 
X64: # %bb.0: 1270; X64-NEXT: vpgatherqq %ymm2, (%rdi,%ymm1,2), %ymm0 1271; X64-NEXT: retq 1272 %arg1 = bitcast i64 *%a1 to i8* 1273 %call = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %a0, i8* %arg1, <4 x i64> %a2, <4 x i64> %a3, i8 2) 1274 ret <4 x i64> %call 1275} 1276 1277define <2 x double> @test_mm_i64gather_pd(double *%a0, <2 x i64> %a1) { 1278; X86-LABEL: test_mm_i64gather_pd: 1279; X86: # %bb.0: 1280; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1281; X86-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1282; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1 1283; X86-NEXT: vgatherqpd %xmm2, (%eax,%xmm0,2), %xmm1 1284; X86-NEXT: vmovapd %xmm1, %xmm0 1285; X86-NEXT: retl 1286; 1287; X64-LABEL: test_mm_i64gather_pd: 1288; X64: # %bb.0: 1289; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1290; X64-NEXT: vxorpd %xmm1, %xmm1, %xmm1 1291; X64-NEXT: vgatherqpd %xmm2, (%rdi,%xmm0,2), %xmm1 1292; X64-NEXT: vmovapd %xmm1, %xmm0 1293; X64-NEXT: retq 1294 %arg0 = bitcast double *%a0 to i8* 1295 %cmp = fcmp oeq <2 x double> zeroinitializer, zeroinitializer 1296 %sext = sext <2 x i1> %cmp to <2 x i64> 1297 %mask = bitcast <2 x i64> %sext to <2 x double> 1298 %call = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> undef, i8* %arg0, <2 x i64> %a1, <2 x double> %mask, i8 2) 1299 ret <2 x double> %call 1300} 1301declare <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double>, i8*, <2 x i64>, <2 x double>, i8) nounwind readonly 1302 1303define <2 x double> @test_mm_mask_i64gather_pd(<2 x double> %a0, double *%a1, <2 x i64> %a2, <2 x double> %a3) { 1304; X86-LABEL: test_mm_mask_i64gather_pd: 1305; X86: # %bb.0: 1306; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1307; X86-NEXT: vgatherqpd %xmm2, (%eax,%xmm1,2), %xmm0 1308; X86-NEXT: retl 1309; 1310; X64-LABEL: test_mm_mask_i64gather_pd: 1311; X64: # %bb.0: 1312; X64-NEXT: vgatherqpd %xmm2, (%rdi,%xmm1,2), %xmm0 1313; X64-NEXT: retq 1314 %arg1 = bitcast double *%a1 to i8* 1315 %call = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> %a0, i8* %arg1, 
<2 x i64> %a2, <2 x double> %a3, i8 2) 1316 ret <2 x double> %call 1317} 1318 1319define <4 x double> @test_mm256_i64gather_pd(double *%a0, <4 x i64> %a1) { 1320; X86-LABEL: test_mm256_i64gather_pd: 1321; X86: # %bb.0: 1322; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1323; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1 1324; X86-NEXT: vcmpeqpd %ymm1, %ymm1, %ymm2 1325; X86-NEXT: vgatherqpd %ymm2, (%eax,%ymm0,2), %ymm1 1326; X86-NEXT: vmovapd %ymm1, %ymm0 1327; X86-NEXT: retl 1328; 1329; X64-LABEL: test_mm256_i64gather_pd: 1330; X64: # %bb.0: 1331; X64-NEXT: vxorpd %xmm1, %xmm1, %xmm1 1332; X64-NEXT: vcmpeqpd %ymm1, %ymm1, %ymm2 1333; X64-NEXT: vgatherqpd %ymm2, (%rdi,%ymm0,2), %ymm1 1334; X64-NEXT: vmovapd %ymm1, %ymm0 1335; X64-NEXT: retq 1336 %arg0 = bitcast double *%a0 to i8* 1337 %mask = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i8 0) 1338 %call = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8* %arg0, <4 x i64> %a1, <4 x double> %mask, i8 2) 1339 ret <4 x double> %call 1340} 1341declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double>, i8*, <4 x i64>, <4 x double>, i8) nounwind readonly 1342 1343define <4 x double> @test_mm256_mask_i64gather_pd(<4 x double> %a0, i64 *%a1, <4 x i64> %a2, <4 x double> %a3) { 1344; X86-LABEL: test_mm256_mask_i64gather_pd: 1345; X86: # %bb.0: 1346; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1347; X86-NEXT: vgatherqpd %ymm2, (%eax,%ymm1,2), %ymm0 1348; X86-NEXT: retl 1349; 1350; X64-LABEL: test_mm256_mask_i64gather_pd: 1351; X64: # %bb.0: 1352; X64-NEXT: vgatherqpd %ymm2, (%rdi,%ymm1,2), %ymm0 1353; X64-NEXT: retq 1354 %arg1 = bitcast i64 *%a1 to i8* 1355 %call = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %a0, i8* %arg1, <4 x i64> %a2, <4 x double> %a3, i8 2) 1356 ret <4 x double> %call 1357} 1358 1359define <4 x float> @test_mm_i64gather_ps(float *%a0, <2 x i64> %a1) { 1360; X86-LABEL: test_mm_i64gather_ps: 1361; X86: # %bb.0: 1362; 
X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1363; X86-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1364; X86-NEXT: vxorps %xmm1, %xmm1, %xmm1 1365; X86-NEXT: vgatherqps %xmm2, (%eax,%xmm0,2), %xmm1 1366; X86-NEXT: vmovaps %xmm1, %xmm0 1367; X86-NEXT: retl 1368; 1369; X64-LABEL: test_mm_i64gather_ps: 1370; X64: # %bb.0: 1371; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1372; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1 1373; X64-NEXT: vgatherqps %xmm2, (%rdi,%xmm0,2), %xmm1 1374; X64-NEXT: vmovaps %xmm1, %xmm0 1375; X64-NEXT: retq 1376 %arg0 = bitcast float *%a0 to i8* 1377 %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer 1378 %sext = sext <4 x i1> %cmp to <4 x i32> 1379 %mask = bitcast <4 x i32> %sext to <4 x float> 1380 %call = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> undef, i8* %arg0, <2 x i64> %a1, <4 x float> %mask, i8 2) 1381 ret <4 x float> %call 1382} 1383declare <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float>, i8*, <2 x i64>, <4 x float>, i8) nounwind readonly 1384 1385define <4 x float> @test_mm_mask_i64gather_ps(<4 x float> %a0, float *%a1, <2 x i64> %a2, <4 x float> %a3) { 1386; X86-LABEL: test_mm_mask_i64gather_ps: 1387; X86: # %bb.0: 1388; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1389; X86-NEXT: vgatherqps %xmm2, (%eax,%xmm1,2), %xmm0 1390; X86-NEXT: retl 1391; 1392; X64-LABEL: test_mm_mask_i64gather_ps: 1393; X64: # %bb.0: 1394; X64-NEXT: vgatherqps %xmm2, (%rdi,%xmm1,2), %xmm0 1395; X64-NEXT: retq 1396 %arg1 = bitcast float *%a1 to i8* 1397 %call = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> %a0, i8* %arg1, <2 x i64> %a2, <4 x float> %a3, i8 2) 1398 ret <4 x float> %call 1399} 1400 1401define <4 x float> @test_mm256_i64gather_ps(float *%a0, <4 x i64> %a1) { 1402; X86-LABEL: test_mm256_i64gather_ps: 1403; X86: # %bb.0: 1404; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1405; X86-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1406; X86-NEXT: vxorps %xmm1, %xmm1, %xmm1 1407; X86-NEXT: vgatherqps %xmm2, (%eax,%ymm0,2), %xmm1 1408; X86-NEXT: vmovaps %xmm1, %xmm0 
1409; X86-NEXT: vzeroupper 1410; X86-NEXT: retl 1411; 1412; X64-LABEL: test_mm256_i64gather_ps: 1413; X64: # %bb.0: 1414; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1415; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1 1416; X64-NEXT: vgatherqps %xmm2, (%rdi,%ymm0,2), %xmm1 1417; X64-NEXT: vmovaps %xmm1, %xmm0 1418; X64-NEXT: vzeroupper 1419; X64-NEXT: retq 1420 %arg0 = bitcast float *%a0 to i8* 1421 %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer 1422 %sext = sext <4 x i1> %cmp to <4 x i32> 1423 %mask = bitcast <4 x i32> %sext to <4 x float> 1424 %call = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8* %arg0, <4 x i64> %a1, <4 x float> %mask, i8 2) 1425 ret <4 x float> %call 1426} 1427declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float>, i8*, <4 x i64>, <4 x float>, i8) nounwind readonly 1428 1429define <4 x float> @test_mm256_mask_i64gather_ps(<4 x float> %a0, float *%a1, <4 x i64> %a2, <4 x float> %a3) { 1430; X86-LABEL: test_mm256_mask_i64gather_ps: 1431; X86: # %bb.0: 1432; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1433; X86-NEXT: vgatherqps %xmm2, (%eax,%ymm1,2), %xmm0 1434; X86-NEXT: vzeroupper 1435; X86-NEXT: retl 1436; 1437; X64-LABEL: test_mm256_mask_i64gather_ps: 1438; X64: # %bb.0: 1439; X64-NEXT: vgatherqps %xmm2, (%rdi,%ymm1,2), %xmm0 1440; X64-NEXT: vzeroupper 1441; X64-NEXT: retq 1442 %arg1 = bitcast float *%a1 to i8* 1443 %call = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %a0, i8* %arg1, <4 x i64> %a2, <4 x float> %a3, i8 2) 1444 ret <4 x float> %call 1445} 1446 1447define <4 x i64> @test0_mm256_inserti128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind { 1448; CHECK-LABEL: test0_mm256_inserti128_si256: 1449; CHECK: # %bb.0: 1450; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 1451; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 1452; CHECK-NEXT: ret{{[l|q]}} 1453 %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> 1454 %res = shufflevector 
<4 x i64> %a0, <4 x i64> %ext, <4 x i32> <i32 4, i32 5, i32 2, i32 3> 1455 ret <4 x i64> %res 1456} 1457 1458define <4 x i64> @test1_mm256_inserti128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind { 1459; CHECK-LABEL: test1_mm256_inserti128_si256: 1460; CHECK: # %bb.0: 1461; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1462; CHECK-NEXT: ret{{[l|q]}} 1463 %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> 1464 %res = shufflevector <4 x i64> %a0, <4 x i64> %ext, <4 x i32> <i32 0, i32 1, i32 4, i32 5> 1465 ret <4 x i64> %res 1466} 1467 1468define <4 x i64> @test_mm256_madd_epi16(<4 x i64> %a0, <4 x i64> %a1) { 1469; CHECK-LABEL: test_mm256_madd_epi16: 1470; CHECK: # %bb.0: 1471; CHECK-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 1472; CHECK-NEXT: ret{{[l|q]}} 1473 %arg0 = bitcast <4 x i64> %a0 to <16 x i16> 1474 %arg1 = bitcast <4 x i64> %a1 to <16 x i16> 1475 %res = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %arg0, <16 x i16> %arg1) 1476 %bc = bitcast <8 x i32> %res to <4 x i64> 1477 ret <4 x i64> %bc 1478} 1479declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>) nounwind readnone 1480 1481define <4 x i64> @test_mm256_maddubs_epi16(<4 x i64> %a0, <4 x i64> %a1) { 1482; CHECK-LABEL: test_mm256_maddubs_epi16: 1483; CHECK: # %bb.0: 1484; CHECK-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0 1485; CHECK-NEXT: ret{{[l|q]}} 1486 %arg0 = bitcast <4 x i64> %a0 to <32 x i8> 1487 %arg1 = bitcast <4 x i64> %a1 to <32 x i8> 1488 %res = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %arg0, <32 x i8> %arg1) 1489 %bc = bitcast <16 x i16> %res to <4 x i64> 1490 ret <4 x i64> %bc 1491} 1492declare <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>) nounwind readnone 1493 1494define <2 x i64> @test_mm_maskload_epi32(i32* %a0, <2 x i64> %a1) nounwind { 1495; X86-LABEL: test_mm_maskload_epi32: 1496; X86: # %bb.0: 1497; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1498; X86-NEXT: vpmaskmovd (%eax), %xmm0, %xmm0 1499; X86-NEXT: retl 
;
; X64-LABEL: test_mm_maskload_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovd (%rdi), %xmm0, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast i32* %a0 to i8*
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %call = call <4 x i32> @llvm.x86.avx2.maskload.d(i8* %arg0, <4 x i32> %arg1)
  %bc = bitcast <4 x i32> %call to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.maskload.d(i8*, <4 x i32>) nounwind readonly

define <4 x i64> @test_mm256_maskload_epi32(i32* %a0, <4 x i64> %a1) nounwind {
; X86-LABEL: test_mm256_maskload_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaskmovd (%eax), %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskload_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovd (%rdi), %ymm0, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast i32* %a0 to i8*
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %call = call <8 x i32> @llvm.x86.avx2.maskload.d.256(i8* %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %call to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.maskload.d.256(i8*, <8 x i32>) nounwind readonly

define <2 x i64> @test_mm_maskload_epi64(i64* %a0, <2 x i64> %a1) nounwind {
; X86-LABEL: test_mm_maskload_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaskmovq (%eax), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskload_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovq (%rdi), %xmm0, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast i64* %a0 to i8*
  %res = call <2 x i64> @llvm.x86.avx2.maskload.q(i8* %arg0, <2 x i64> %a1)
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.avx2.maskload.q(i8*, <2 x i64>) nounwind readonly

define <4 x i64> @test_mm256_maskload_epi64(i64* %a0, <4 x i64> %a1) nounwind {
; X86-LABEL: test_mm256_maskload_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaskmovq (%eax), %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskload_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovq (%rdi), %ymm0, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast i64* %a0 to i8*
  %res = call <4 x i64> @llvm.x86.avx2.maskload.q.256(i8* %arg0, <4 x i64> %a1)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.maskload.q.256(i8*, <4 x i64>) nounwind readonly

define void @test_mm_maskstore_epi32(float* %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
; X86-LABEL: test_mm_maskstore_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaskmovd %xmm1, %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskstore_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovd %xmm1, %xmm0, (%rdi)
; X64-NEXT:    retq
  %arg0 = bitcast float* %a0 to i8*
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  call void @llvm.x86.avx2.maskstore.d(i8* %arg0, <4 x i32> %arg1, <4 x i32> %arg2)
  ret void
}
declare void @llvm.x86.avx2.maskstore.d(i8*, <4 x i32>, <4 x i32>) nounwind readnone

define void @test_mm256_maskstore_epi32(float* %a0, <4 x i64> %a1, <4 x i64> %a2) nounwind {
; X86-LABEL: test_mm256_maskstore_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaskmovd %ymm1, %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskstore_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovd %ymm1, %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast float* %a0 to i8*
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
  call void @llvm.x86.avx2.maskstore.d.256(i8* %arg0, <8 x i32> %arg1, <8 x i32> %arg2)
  ret void
}
declare void @llvm.x86.avx2.maskstore.d.256(i8*, <8 x i32>, <8 x i32>) nounwind readnone

define void @test_mm_maskstore_epi64(i64* %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
; X86-LABEL: test_mm_maskstore_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaskmovq %xmm1, %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskstore_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovq %xmm1, %xmm0, (%rdi)
; X64-NEXT:    retq
  %arg0 = bitcast i64* %a0 to i8*
  call void @llvm.x86.avx2.maskstore.q(i8* %arg0, <2 x i64> %a1, <2 x i64> %a2)
  ret void
}
declare void @llvm.x86.avx2.maskstore.q(i8*, <2 x i64>, <2 x i64>) nounwind readnone

define void @test_mm256_maskstore_epi64(i64* %a0, <4 x i64> %a1, <4 x i64> %a2) nounwind {
; X86-LABEL: test_mm256_maskstore_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaskmovq %ymm1, %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskstore_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovq %ymm1, %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast i64* %a0 to i8*
  call void @llvm.x86.avx2.maskstore.q.256(i8* %arg0, <4 x i64> %a1, <4 x i64> %a2)
  ret void
}
declare void @llvm.x86.avx2.maskstore.q.256(i8*, <4 x i64>, <4 x i64>) nounwind readnone

define <4 x i64> @test_mm256_max_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %cmp = icmp sgt <32 x i8> %arg0, %arg1
  %sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1
  %bc = bitcast <32 x i8> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_max_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %cmp = icmp sgt <16 x i16> %arg0, %arg1
  %sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1
  %bc = bitcast <16 x i16> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_max_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %cmp = icmp sgt <8 x i32> %arg0, %arg1
  %sel = select <8 x i1> %cmp, <8 x i32> %arg0, <8 x i32> %arg1
  %bc = bitcast <8 x i32> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_max_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epu8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %cmp = icmp ugt <32 x i8> %arg0, %arg1
  %sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1
  %bc = bitcast <32 x i8> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_max_epu16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epu16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %cmp = icmp ugt <16 x i16> %arg0, %arg1
  %sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1
  %bc = bitcast <16 x i16> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_max_epu32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epu32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %cmp = icmp ugt <8 x i32> %arg0, %arg1
  %sel = select <8 x i1> %cmp, <8 x i32> %arg0, <8 x i32> %arg1
  %bc = bitcast <8 x i32> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_min_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %cmp = icmp slt <32 x i8> %arg0, %arg1
  %sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1
  %bc = bitcast <32 x i8> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_min_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %cmp = icmp slt <16 x i16> %arg0, %arg1
  %sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1
  %bc = bitcast <16 x i16> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_min_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %cmp = icmp slt <8 x i32> %arg0, %arg1
  %sel = select <8 x i1> %cmp, <8 x i32> %arg0, <8 x i32> %arg1
  %bc = bitcast <8 x i32> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_min_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epu8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpminub %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %cmp = icmp ult <32 x i8> %arg0, %arg1
  %sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1
  %bc = bitcast <32 x i8> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_min_epu16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epu16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %cmp = icmp ult <16 x i16> %arg0, %arg1
  %sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1
  %bc = bitcast <16 x i16> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_min_epu32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epu32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpminud %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %cmp = icmp ult <8 x i32> %arg0, %arg1
  %sel = select <8 x i1> %cmp, <8 x i32> %arg0, <8 x i32> %arg1
  %bc = bitcast <8 x i32> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define i32 @test_mm256_movemask_epi8(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_movemask_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovmskb %ymm0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %res = call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> %arg0)
  ret i32 %res
}
declare i32 @llvm.x86.avx2.pmovmskb(<32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_mpsadbw_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mpsadbw_epu8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmpsadbw $3, %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %call = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %arg0, <32 x i8> %arg1, i8 3)
  %bc = bitcast <16 x i16> %call to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i8) nounwind readnone

define <4 x i64> @test_mm256_mul_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mul_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllq $32, %ymm0, %ymm0
; CHECK-NEXT:    vpsrad $31, %ymm0, %ymm2
; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
; CHECK-NEXT:    vpsllq $32, %ymm1, %ymm1
; CHECK-NEXT:    vpsrad $31, %ymm1, %ymm2
; CHECK-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
; CHECK-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %A = shl <4 x i64> %a0, <i64 32, i64 32, i64 32, i64 32>
  %A1 = ashr exact <4 x i64> %A, <i64 32, i64 32, i64 32, i64 32>
  %B = shl <4 x i64> %a1, <i64 32, i64 32, i64 32, i64 32>
  %B1 = ashr exact <4 x i64> %B, <i64 32, i64 32, i64 32, i64 32>
  %res = mul nsw <4 x i64> %A1, %B1
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_mul_epu32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mul_epu32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
; CHECK-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %A = and <4 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %B = and <4 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %res = mul nuw <4 x i64> %A, %B
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_mulhi_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mulhi_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_mulhi_epu16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mulhi_epu16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_mulhrs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mulhrs_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmulhrsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_mullo_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mullo_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = mul <16 x i16> %arg0, %arg1
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_mullo_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mullo_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = mul <8 x i32> %arg0, %arg1
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_or_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_or_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vorps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = or <4 x i64> %a0, %a1
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_packs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_packs_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %call = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %arg0, <16 x i16> %arg1)
  %res = bitcast <32 x i8> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_packs_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_packs_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %call = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %arg0, <8 x i32> %arg1)
  %res = bitcast <16 x i16> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_packus_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_packus_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %call = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %arg0, <16 x i16> %arg1)
  %res = bitcast <32 x i8> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_packus_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_packus_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %call = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %arg0, <8 x i32> %arg1)
  %res = bitcast <16 x i16> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_permute2x128_si256(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_permute2x128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64>, <4 x i64>, i8) nounwind readonly

define <4 x i64> @test_mm256_permute4x64_epi64(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_permute4x64_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,0,2,0]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 0>
  ret <4 x i64> %res
}

define <4 x double> @test_mm256_permute4x64_pd(<4 x double> %a0) {
; CHECK-LABEL: test_mm256_permute4x64_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,2,1,0]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 1, i32 2, i32 1, i32 0>
  ret <4 x double> %res
}

define <4 x i64> @test_mm256_permutevar8x32_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_permutevar8x32_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %call = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %arg0, <8 x i32> %arg1)
  %res = bitcast <8 x i32> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly

define <8 x float> @test_mm256_permutevar8x32_ps(<8 x float> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_permutevar8x32_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> %arg1)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind readonly

define <4 x i64> @test_mm256_sad_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sad_epu8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %arg0, <32 x i8> %arg1)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_shuffle_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_shuffle_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,3,0,0,7,7,4,4]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %shuf = shufflevector <8 x i32> %arg0, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 0, i32 0, i32 7, i32 7, i32 4, i32 4>
  %res = bitcast <8 x i32> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_shuffle_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_shuffle_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %shuf = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %arg0, <32 x i8> %arg1)
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_shufflehi_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_shufflehi_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,6,5,8,9,10,11,15,14,14,13]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 14, i32 13>
  %res = bitcast <16 x i16> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_shufflelo_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_shufflelo_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,0,1,1,4,5,6,7,11,8,9,9,12,13,14,15]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> <i32 3, i32 0, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 9, i32 9, i32 12, i32 13, i32 14, i32 15>
  %res = bitcast <16 x i16> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_sign_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sign_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsignb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %call = call <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8> %arg0, <32 x i8> %arg1)
  %res = bitcast <32 x i8> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_sign_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sign_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsignw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %call = call <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %res = bitcast <16 x i16> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_sign_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sign_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsignd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %call = call <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32> %arg0, <8 x i32> %arg1)
  %res = bitcast <8 x i32> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_sll_epi16(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_sll_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %arg0, <8 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) nounwind readnone

define <4 x i64> @test_mm256_sll_epi32(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_sll_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpslld %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_sll_epi64(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_sll_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllq %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %a0, <2 x i64> %a1)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) nounwind readnone

define <4 x i64> @test_mm256_slli_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_slli_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllw $3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %arg0, i32 3)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16>, i32) nounwind readnone

define <4 x i64> @test_mm256_slli_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_slli_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpslld $3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %arg0, i32 3)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32>, i32) nounwind readnone

define <4 x i64> @test_mm256_slli_epi64(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_slli_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllq $3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %a0, i32 3)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64>, i32) nounwind readnone

define <4 x i64> @test_mm256_slli_si256(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_slli_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %shuf = shufflevector <32 x i8> zeroinitializer, <32 x i8> %arg0, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <2 x i64> @test_mm_sllv_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_sllv_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <4 x i32> %res to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_sllv_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sllv_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32>, <8 x i32>) nounwind readnone

define <2 x i64> @test_mm_sllv_epi64(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_sllv_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %a0, <2 x i64> %a1)
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64>, <2 x i64>) nounwind readnone

define <4 x i64> @test_mm256_sllv_epi64(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sllv_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllvq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %a0, <4 x i64> %a1)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64>, <4 x i64>) nounwind readnone

define <4 x i64> @test_mm256_sra_epi16(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_sra_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsraw %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %arg0, <8 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>) nounwind readnone

define <4 x i64> @test_mm256_sra_epi32(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_sra_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrad %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_srai_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srai_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsraw $3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %arg0, i32 3)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16>, i32) nounwind readnone

define <4 x i64> @test_mm256_srai_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srai_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrad $3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %arg0, i32 3)
2305 %bc = bitcast <8 x i32> %res to <4 x i64> 2306 ret <4 x i64> %bc 2307} 2308declare <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32>, i32) nounwind readnone 2309 2310define <2 x i64> @test_mm_srav_epi32(<2 x i64> %a0, <2 x i64> %a1) { 2311; CHECK-LABEL: test_mm_srav_epi32: 2312; CHECK: # %bb.0: 2313; CHECK-NEXT: vpsravd %xmm1, %xmm0, %xmm0 2314; CHECK-NEXT: ret{{[l|q]}} 2315 %arg0 = bitcast <2 x i64> %a0 to <4 x i32> 2316 %arg1 = bitcast <2 x i64> %a1 to <4 x i32> 2317 %res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %arg0, <4 x i32> %arg1) 2318 %bc = bitcast <4 x i32> %res to <2 x i64> 2319 ret <2 x i64> %bc 2320} 2321declare <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32>, <4 x i32>) nounwind readnone 2322 2323define <4 x i64> @test_mm256_srav_epi32(<4 x i64> %a0, <4 x i64> %a1) { 2324; CHECK-LABEL: test_mm256_srav_epi32: 2325; CHECK: # %bb.0: 2326; CHECK-NEXT: vpsravd %ymm1, %ymm0, %ymm0 2327; CHECK-NEXT: ret{{[l|q]}} 2328 %arg0 = bitcast <4 x i64> %a0 to <8 x i32> 2329 %arg1 = bitcast <4 x i64> %a1 to <8 x i32> 2330 %res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %arg0, <8 x i32> %arg1) 2331 %bc = bitcast <8 x i32> %res to <4 x i64> 2332 ret <4 x i64> %bc 2333} 2334declare <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32>, <8 x i32>) nounwind readnone 2335 2336define <4 x i64> @test_mm256_srl_epi16(<4 x i64> %a0, <2 x i64> %a1) { 2337; CHECK-LABEL: test_mm256_srl_epi16: 2338; CHECK: # %bb.0: 2339; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 2340; CHECK-NEXT: ret{{[l|q]}} 2341 %arg0 = bitcast <4 x i64> %a0 to <16 x i16> 2342 %arg1 = bitcast <2 x i64> %a1 to <8 x i16> 2343 %res = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %arg0, <8 x i16> %arg1) 2344 %bc = bitcast <16 x i16> %res to <4 x i64> 2345 ret <4 x i64> %bc 2346} 2347declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) nounwind readnone 2348 2349define <4 x i64> @test_mm256_srl_epi32(<4 x i64> %a0, <2 x i64> %a1) { 2350; CHECK-LABEL: test_mm256_srl_epi32: 2351; CHECK: # %bb.0: 2352; 
CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm0 2353; CHECK-NEXT: ret{{[l|q]}} 2354 %arg0 = bitcast <4 x i64> %a0 to <8 x i32> 2355 %arg1 = bitcast <2 x i64> %a1 to <4 x i32> 2356 %res = call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %arg0, <4 x i32> %arg1) 2357 %bc = bitcast <8 x i32> %res to <4 x i64> 2358 ret <4 x i64> %bc 2359} 2360declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind readnone 2361 2362define <4 x i64> @test_mm256_srl_epi64(<4 x i64> %a0, <2 x i64> %a1) { 2363; CHECK-LABEL: test_mm256_srl_epi64: 2364; CHECK: # %bb.0: 2365; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 2366; CHECK-NEXT: ret{{[l|q]}} 2367 %res = call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %a0, <2 x i64> %a1) 2368 ret <4 x i64> %res 2369} 2370declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) nounwind readnone 2371 2372define <4 x i64> @test_mm256_srli_epi16(<4 x i64> %a0) { 2373; CHECK-LABEL: test_mm256_srli_epi16: 2374; CHECK: # %bb.0: 2375; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm0 2376; CHECK-NEXT: ret{{[l|q]}} 2377 %arg0 = bitcast <4 x i64> %a0 to <16 x i16> 2378 %res = call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %arg0, i32 3) 2379 %bc = bitcast <16 x i16> %res to <4 x i64> 2380 ret <4 x i64> %bc 2381} 2382declare <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16>, i32) nounwind readnone 2383 2384define <4 x i64> @test_mm256_srli_epi32(<4 x i64> %a0) { 2385; CHECK-LABEL: test_mm256_srli_epi32: 2386; CHECK: # %bb.0: 2387; CHECK-NEXT: vpsrld $3, %ymm0, %ymm0 2388; CHECK-NEXT: ret{{[l|q]}} 2389 %arg0 = bitcast <4 x i64> %a0 to <8 x i32> 2390 %res = call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %arg0, i32 3) 2391 %bc = bitcast <8 x i32> %res to <4 x i64> 2392 ret <4 x i64> %bc 2393} 2394declare <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32>, i32) nounwind readnone 2395 2396define <4 x i64> @test_mm256_srli_epi64(<4 x i64> %a0) { 2397; CHECK-LABEL: test_mm256_srli_epi64: 2398; CHECK: # %bb.0: 2399; CHECK-NEXT: vpsrlq $3, %ymm0, %ymm0 2400; CHECK-NEXT: ret{{[l|q]}} 2401 %res 
= call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %a0, i32 3) 2402 ret <4 x i64> %res 2403} 2404declare <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64>, i32) nounwind readnone 2405 2406define <4 x i64> @test_mm256_srli_si256(<4 x i64> %a0) { 2407; CHECK-LABEL: test_mm256_srli_si256: 2408; CHECK: # %bb.0: 2409; CHECK-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero 2410; CHECK-NEXT: ret{{[l|q]}} 2411 %arg0 = bitcast <4 x i64> %a0 to <32 x i8> 2412 %shuf = shufflevector <32 x i8> %arg0, <32 x i8> zeroinitializer, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50> 2413 %res = bitcast <32 x i8> %shuf to <4 x i64> 2414 ret <4 x i64> %res 2415} 2416 2417define <2 x i64> @test_mm_srlv_epi32(<2 x i64> %a0, <2 x i64> %a1) { 2418; CHECK-LABEL: test_mm_srlv_epi32: 2419; CHECK: # %bb.0: 2420; CHECK-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 2421; CHECK-NEXT: ret{{[l|q]}} 2422 %arg0 = bitcast <2 x i64> %a0 to <4 x i32> 2423 %arg1 = bitcast <2 x i64> %a1 to <4 x i32> 2424 %res = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %arg0, <4 x i32> %arg1) 2425 %bc = bitcast <4 x i32> %res to <2 x i64> 2426 ret <2 x i64> %bc 2427} 2428declare <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32>, <4 x i32>) nounwind readnone 2429 2430define <4 x i64> @test_mm256_srlv_epi32(<4 x i64> %a0, <4 x i64> %a1) { 2431; CHECK-LABEL: test_mm256_srlv_epi32: 2432; CHECK: # %bb.0: 2433; CHECK-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 2434; CHECK-NEXT: ret{{[l|q]}} 2435 %arg0 = bitcast <4 x i64> %a0 to <8 x i32> 2436 %arg1 = bitcast <4 x i64> %a1 to <8 x i32> 2437 %res = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %arg0, <8 x i32> %arg1) 2438 %bc = bitcast <8 x i32> %res to <4 x i64> 2439 ret <4 x i64> %bc 2440} 2441declare <8 
x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32>, <8 x i32>) nounwind readnone 2442 2443define <2 x i64> @test_mm_srlv_epi64(<2 x i64> %a0, <2 x i64> %a1) { 2444; CHECK-LABEL: test_mm_srlv_epi64: 2445; CHECK: # %bb.0: 2446; CHECK-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 2447; CHECK-NEXT: ret{{[l|q]}} 2448 %res = call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %a0, <2 x i64> %a1) 2449 ret <2 x i64> %res 2450} 2451declare <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64>, <2 x i64>) nounwind readnone 2452 2453define <4 x i64> @test_mm256_srlv_epi64(<4 x i64> %a0, <4 x i64> %a1) { 2454; CHECK-LABEL: test_mm256_srlv_epi64: 2455; CHECK: # %bb.0: 2456; CHECK-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 2457; CHECK-NEXT: ret{{[l|q]}} 2458 %res = call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %a0, <4 x i64> %a1) 2459 ret <4 x i64> %res 2460} 2461declare <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64>, <4 x i64>) nounwind readnone 2462 2463define <4 x i64> @test_mm256_stream_load_si256(<4 x i64> *%a0) { 2464; X86-LABEL: test_mm256_stream_load_si256: 2465; X86: # %bb.0: 2466; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 2467; X86-NEXT: vmovntdqa (%eax), %ymm0 2468; X86-NEXT: retl 2469; 2470; X64-LABEL: test_mm256_stream_load_si256: 2471; X64: # %bb.0: 2472; X64-NEXT: vmovntdqa (%rdi), %ymm0 2473; X64-NEXT: retq 2474 %arg0 = bitcast <4 x i64> *%a0 to i8* 2475 %res = call <4 x i64> @llvm.x86.avx2.movntdqa(i8* %arg0) 2476 ret <4 x i64> %res 2477} 2478declare <4 x i64> @llvm.x86.avx2.movntdqa(i8*) nounwind readonly 2479 2480define <4 x i64> @test_mm256_sub_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind { 2481; CHECK-LABEL: test_mm256_sub_epi8: 2482; CHECK: # %bb.0: 2483; CHECK-NEXT: vpsubb %ymm1, %ymm0, %ymm0 2484; CHECK-NEXT: ret{{[l|q]}} 2485 %arg0 = bitcast <4 x i64> %a0 to <32 x i8> 2486 %arg1 = bitcast <4 x i64> %a1 to <32 x i8> 2487 %res = sub <32 x i8> %arg0, %arg1 2488 %bc = bitcast <32 x i8> %res to <4 x i64> 2489 ret <4 x i64> %bc 2490} 2491 2492define <4 x i64> @test_mm256_sub_epi16(<4 x i64> %a0, <4 x 
i64> %a1) nounwind { 2493; CHECK-LABEL: test_mm256_sub_epi16: 2494; CHECK: # %bb.0: 2495; CHECK-NEXT: vpsubw %ymm1, %ymm0, %ymm0 2496; CHECK-NEXT: ret{{[l|q]}} 2497 %arg0 = bitcast <4 x i64> %a0 to <16 x i16> 2498 %arg1 = bitcast <4 x i64> %a1 to <16 x i16> 2499 %res = sub <16 x i16> %arg0, %arg1 2500 %bc = bitcast <16 x i16> %res to <4 x i64> 2501 ret <4 x i64> %bc 2502} 2503 2504define <4 x i64> @test_mm256_sub_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind { 2505; CHECK-LABEL: test_mm256_sub_epi32: 2506; CHECK: # %bb.0: 2507; CHECK-NEXT: vpsubd %ymm1, %ymm0, %ymm0 2508; CHECK-NEXT: ret{{[l|q]}} 2509 %arg0 = bitcast <4 x i64> %a0 to <8 x i32> 2510 %arg1 = bitcast <4 x i64> %a1 to <8 x i32> 2511 %res = sub <8 x i32> %arg0, %arg1 2512 %bc = bitcast <8 x i32> %res to <4 x i64> 2513 ret <4 x i64> %bc 2514} 2515 2516define <4 x i64> @test_mm256_sub_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind { 2517; CHECK-LABEL: test_mm256_sub_epi64: 2518; CHECK: # %bb.0: 2519; CHECK-NEXT: vpsubq %ymm1, %ymm0, %ymm0 2520; CHECK-NEXT: ret{{[l|q]}} 2521 %res = sub <4 x i64> %a0, %a1 2522 ret <4 x i64> %res 2523} 2524 2525define <4 x i64> @test_mm256_subs_epi8(<4 x i64> %a0, <4 x i64> %a1) { 2526; CHECK-LABEL: test_mm256_subs_epi8: 2527; CHECK: # %bb.0: 2528; CHECK-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 2529; CHECK-NEXT: ret{{[l|q]}} 2530 %arg0 = bitcast <4 x i64> %a0 to <32 x i8> 2531 %arg1 = bitcast <4 x i64> %a1 to <32 x i8> 2532 %res = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %arg0, <32 x i8> %arg1) 2533 %bc = bitcast <32 x i8> %res to <4 x i64> 2534 ret <4 x i64> %bc 2535} 2536declare <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8>, <32 x i8>) nounwind readnone 2537 2538define <4 x i64> @test_mm256_subs_epi16(<4 x i64> %a0, <4 x i64> %a1) { 2539; CHECK-LABEL: test_mm256_subs_epi16: 2540; CHECK: # %bb.0: 2541; CHECK-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 2542; CHECK-NEXT: ret{{[l|q]}} 2543 %arg0 = bitcast <4 x i64> %a0 to <16 x i16> 2544 %arg1 = bitcast <4 x i64> %a1 to <16 x i16> 2545 %res = 
call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %arg0, <16 x i16> %arg1) 2546 %bc = bitcast <16 x i16> %res to <4 x i64> 2547 ret <4 x i64> %bc 2548} 2549declare <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16>, <16 x i16>) nounwind readnone 2550 2551define <4 x i64> @test_mm256_subs_epu8(<4 x i64> %a0, <4 x i64> %a1) { 2552; CHECK-LABEL: test_mm256_subs_epu8: 2553; CHECK: # %bb.0: 2554; CHECK-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 2555; CHECK-NEXT: ret{{[l|q]}} 2556 %arg0 = bitcast <4 x i64> %a0 to <32 x i8> 2557 %arg1 = bitcast <4 x i64> %a1 to <32 x i8> 2558 %res = call <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8> %arg0, <32 x i8> %arg1) 2559 %bc = bitcast <32 x i8> %res to <4 x i64> 2560 ret <4 x i64> %bc 2561} 2562declare <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8>, <32 x i8>) nounwind readnone 2563 2564define <4 x i64> @test_mm256_subs_epu16(<4 x i64> %a0, <4 x i64> %a1) { 2565; CHECK-LABEL: test_mm256_subs_epu16: 2566; CHECK: # %bb.0: 2567; CHECK-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 2568; CHECK-NEXT: ret{{[l|q]}} 2569 %arg0 = bitcast <4 x i64> %a0 to <16 x i16> 2570 %arg1 = bitcast <4 x i64> %a1 to <16 x i16> 2571 %res = call <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16> %arg0, <16 x i16> %arg1) 2572 %bc = bitcast <16 x i16> %res to <4 x i64> 2573 ret <4 x i64> %bc 2574} 2575declare <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16>, <16 x i16>) nounwind readnone 2576 2577define <4 x i64> @test_mm256_unpackhi_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind { 2578; CHECK-LABEL: test_mm256_unpackhi_epi8: 2579; CHECK: # %bb.0: 2580; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] 2581; CHECK-NEXT: ret{{[l|q]}} 2582 %arg0 = bitcast <4 x i64> %a0 to <32 x i8> 2583 %arg1 = bitcast <4 x i64> %a1 to <32 x i8> 2584 
%res = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63> 2585 %bc = bitcast <32 x i8> %res to <4 x i64> 2586 ret <4 x i64> %bc 2587} 2588 2589define <4 x i64> @test_mm256_unpackhi_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind { 2590; CHECK-LABEL: test_mm256_unpackhi_epi16: 2591; CHECK: # %bb.0: 2592; CHECK-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] 2593; CHECK-NEXT: ret{{[l|q]}} 2594 %arg0 = bitcast <4 x i64> %a0 to <16 x i16> 2595 %arg1 = bitcast <4 x i64> %a1 to <16 x i16> 2596 %res = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31> 2597 %bc = bitcast <16 x i16> %res to <4 x i64> 2598 ret <4 x i64> %bc 2599} 2600 2601define <4 x i64> @test_mm256_unpackhi_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind { 2602; CHECK-LABEL: test_mm256_unpackhi_epi32: 2603; CHECK: # %bb.0: 2604; CHECK-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] 2605; CHECK-NEXT: ret{{[l|q]}} 2606 %arg0 = bitcast <4 x i64> %a0 to <8 x i32> 2607 %arg1 = bitcast <4 x i64> %a1 to <8 x i32> 2608 %res = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15> 2609 %bc = bitcast <8 x i32> %res to <4 x i64> 2610 ret <4 x i64> %bc 2611} 2612 2613define <4 x i64> @test_mm256_unpackhi_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind { 2614; CHECK-LABEL: test_mm256_unpackhi_epi64: 2615; CHECK: # %bb.0: 2616; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] 2617; 
CHECK-NEXT: ret{{[l|q]}} 2618 %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7> 2619 ret <4 x i64> %res 2620} 2621 2622define <4 x i64> @test_mm256_unpacklo_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind { 2623; CHECK-LABEL: test_mm256_unpacklo_epi8: 2624; CHECK: # %bb.0: 2625; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] 2626; CHECK-NEXT: ret{{[l|q]}} 2627 %arg0 = bitcast <4 x i64> %a0 to <32 x i8> 2628 %arg1 = bitcast <4 x i64> %a1 to <32 x i8> 2629 %res = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55> 2630 %bc = bitcast <32 x i8> %res to <4 x i64> 2631 ret <4 x i64> %bc 2632} 2633 2634define <4 x i64> @test_mm256_unpacklo_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind { 2635; CHECK-LABEL: test_mm256_unpacklo_epi16: 2636; CHECK: # %bb.0: 2637; CHECK-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] 2638; CHECK-NEXT: ret{{[l|q]}} 2639 %arg0 = bitcast <4 x i64> %a0 to <16 x i16> 2640 %arg1 = bitcast <4 x i64> %a1 to <16 x i16> 2641 %res = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27> 2642 %bc = bitcast <16 x i16> %res to <4 x i64> 2643 ret <4 x i64> %bc 2644} 2645 2646define <4 x i64> @test_mm256_unpacklo_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind { 2647; 
CHECK-LABEL: test_mm256_unpacklo_epi32: 2648; CHECK: # %bb.0: 2649; CHECK-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] 2650; CHECK-NEXT: ret{{[l|q]}} 2651 %arg0 = bitcast <4 x i64> %a0 to <8 x i32> 2652 %arg1 = bitcast <4 x i64> %a1 to <8 x i32> 2653 %res = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13> 2654 %bc = bitcast <8 x i32> %res to <4 x i64> 2655 ret <4 x i64> %bc 2656} 2657 2658define <4 x i64> @test_mm256_unpacklo_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind { 2659; CHECK-LABEL: test_mm256_unpacklo_epi64: 2660; CHECK: # %bb.0: 2661; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 2662; CHECK-NEXT: ret{{[l|q]}} 2663 %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6> 2664 ret <4 x i64> %res 2665} 2666 2667define <4 x i64> @test_mm256_xor_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind { 2668; CHECK-LABEL: test_mm256_xor_si256: 2669; CHECK: # %bb.0: 2670; CHECK-NEXT: vxorps %ymm1, %ymm0, %ymm0 2671; CHECK-NEXT: ret{{[l|q]}} 2672 %res = xor <4 x i64> %a0, %a1 2673 ret <4 x i64> %res 2674} 2675 2676declare <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone 2677 2678declare <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone 2679