1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX2 3; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512f | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX512 4; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX2 5; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX512 6 7declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) 8declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) 9declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>) 10declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>) 11 12define <32 x i8> @combine_pshufb_pslldq(<32 x i8> %a0) { 13; X32-LABEL: combine_pshufb_pslldq: 14; X32: # %bb.0: 15; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0 16; X32-NEXT: retl 17; 18; X64-LABEL: combine_pshufb_pslldq: 19; X64: # %bb.0: 20; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 21; X64-NEXT: retq 22 %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>) 23 %2 = shufflevector <32 x i8> %1, <32 x i8> zeroinitializer, <32 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23> 24 ret <32 x i8> %2 25} 26 27define <32 x i8> @combine_pshufb_psrldq(<32 x i8> %a0) { 28; X32-LABEL: combine_pshufb_psrldq: 29; X32: # %bb.0: 30; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0 31; X32-NEXT: retl 32; 33; X64-LABEL: combine_pshufb_psrldq: 34; X64: # %bb.0: 35; X64-NEXT: 
vxorps %xmm0, %xmm0, %xmm0 36; X64-NEXT: retq 37 %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>) 38 %2 = shufflevector <32 x i8> %1, <32 x i8> zeroinitializer, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32> 39 ret <32 x i8> %2 40} 41 42define <32 x i8> @combine_pshufb_vpermd(<8 x i32> %a) { 43; X32-LABEL: combine_pshufb_vpermd: 44; X32: # %bb.0: 45; X32-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,16,17,18,18] 46; X32-NEXT: retl 47; 48; X64-LABEL: combine_pshufb_vpermd: 49; X64: # %bb.0: 50; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,16,17,18,18] 51; X64-NEXT: retq 52 %tmp0 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4>) 53 %tmp1 = bitcast <8 x i32> %tmp0 to <32 x i8> 54 %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 30> 55 ret <32 x i8> %tmp2 56} 57 58define <32 x i8> @combine_pshufb_vpermps(<8 x float> %a) { 59; X32-LABEL: combine_pshufb_vpermps: 60; X32: # %bb.0: 61; X32-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,16,17,18,18] 62; X32-NEXT: retl 63; 64; X64-LABEL: 
combine_pshufb_vpermps: 65; X64: # %bb.0: 66; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,16,17,18,18] 67; X64-NEXT: retq 68 %tmp0 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4>) 69 %tmp1 = bitcast <8 x float> %tmp0 to <32 x i8> 70 %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 30> 71 ret <32 x i8> %tmp2 72} 73 74define <32 x i8> @combine_and_pshufb(<32 x i8> %a0) { 75; X32-LABEL: combine_and_pshufb: 76; X32: # %bb.0: 77; X32-NEXT: vpxor %xmm1, %xmm1, %xmm1 78; X32-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] 79; X32-NEXT: retl 80; 81; X64-LABEL: combine_and_pshufb: 82; X64: # %bb.0: 83; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 84; X64-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] 85; X64-NEXT: retq 86 %1 = shufflevector <32 x i8> %a0, <32 x i8> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 32, i32 32, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 87 %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> <i8 0, i8 1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>) 88 ret <32 x i8> %2 89} 90 91define <32 x i8> @combine_pshufb_and(<32 x i8> %a0) { 92; X32-LABEL: combine_pshufb_and: 93; X32: # 
%bb.0: 94; X32-NEXT: vpxor %xmm1, %xmm1, %xmm1 95; X32-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] 96; X32-NEXT: retl 97; 98; X64-LABEL: combine_pshufb_and: 99; X64: # %bb.0: 100; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 101; X64-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] 102; X64-NEXT: retq 103 %1 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>) 104 %2 = shufflevector <32 x i8> %1, <32 x i8> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 32, i32 32, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 105 ret <32 x i8> %2 106} 107 108define <4 x i64> @combine_permq_pshufb_as_vperm2i128(<4 x i64> %a0) { 109; X32-LABEL: combine_permq_pshufb_as_vperm2i128: 110; X32: # %bb.0: 111; X32-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero 112; X32-NEXT: vpaddq {{\.LCPI.*}}, %ymm0, %ymm0 113; X32-NEXT: retl 114; 115; X64-LABEL: combine_permq_pshufb_as_vperm2i128: 116; X64: # %bb.0: 117; X64-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero 118; X64-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0 119; X64-NEXT: retq 120 %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> 121 %2 = bitcast <4 x i64> %1 to <32 x i8> 122 %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 
255>) 123 %4 = bitcast <32 x i8> %3 to <4 x i64> 124 %5 = add <4 x i64> %4, <i64 1, i64 1, i64 3, i64 3> 125 ret <4 x i64> %5 126} 127 128define <8 x i32> @combine_as_vpermd(<8 x i32> %a0) { 129; X32-LABEL: combine_as_vpermd: 130; X32: # %bb.0: 131; X32-NEXT: vmovaps {{.*#+}} ymm1 = [4,5,4,5,6,7,0,7] 132; X32-NEXT: vpermps %ymm0, %ymm1, %ymm0 133; X32-NEXT: retl 134; 135; X64-LABEL: combine_as_vpermd: 136; X64: # %bb.0: 137; X64-NEXT: vmovaps {{.*#+}} ymm1 = [4,5,4,5,6,7,0,7] 138; X64-NEXT: vpermps %ymm0, %ymm1, %ymm0 139; X64-NEXT: retq 140 %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3> 141 %2 = tail call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> <i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6>) 142 %3 = shufflevector <8 x i32> %1, <8 x i32> %2, <8 x i32> <i32 0, i32 8, i32 9, i32 1, i32 15, i32 14, i32 4, i32 3> 143 ret <8 x i32> %3 144} 145 146define <8 x float> @combine_as_vpermps(<8 x float> %a0) { 147; X32-LABEL: combine_as_vpermps: 148; X32: # %bb.0: 149; X32-NEXT: vmovaps {{.*#+}} ymm1 = <6,4,7,5,1,u,4,7> 150; X32-NEXT: vpermps %ymm0, %ymm1, %ymm0 151; X32-NEXT: retl 152; 153; X64-LABEL: combine_as_vpermps: 154; X64: # %bb.0: 155; X64-NEXT: vmovaps {{.*#+}} ymm1 = <6,4,7,5,1,u,4,7> 156; X64-NEXT: vpermps %ymm0, %ymm1, %ymm0 157; X64-NEXT: retq 158 %1 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7> 159 %2 = tail call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> <i32 1, i32 undef, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>) 160 %3 = shufflevector <8 x float> %1, <8 x float> %2, <8 x i32> <i32 15, i32 0, i32 14, i32 1, i32 8, i32 9, i32 4, i32 3> 161 ret <8 x float> %3 162} 163 164define <32 x i8> @combine_permq_pshufb_as_vpblendd(<4 x i64> %a0) { 165; X32-LABEL: combine_permq_pshufb_as_vpblendd: 166; X32: # %bb.0: 167; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1 168; X32-NEXT: vblendps 
{{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 169; X32-NEXT: retl 170; 171; X64-LABEL: combine_permq_pshufb_as_vpblendd: 172; X64: # %bb.0: 173; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1 174; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 175; X64-NEXT: retq 176 %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2> 177 %2 = bitcast <4 x i64> %1 to <32 x i8> 178 %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255>) 179 ret <32 x i8> %3 180} 181 182define <16 x i8> @combine_pshufb_as_vpbroadcastb128(<16 x i8> %a) { 183; X32-LABEL: combine_pshufb_as_vpbroadcastb128: 184; X32: # %bb.0: 185; X32-NEXT: vpbroadcastb %xmm0, %xmm0 186; X32-NEXT: retl 187; 188; X64-LABEL: combine_pshufb_as_vpbroadcastb128: 189; X64: # %bb.0: 190; X64-NEXT: vpbroadcastb %xmm0, %xmm0 191; X64-NEXT: retq 192 %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> zeroinitializer) 193 ret <16 x i8> %1 194} 195 196define <32 x i8> @combine_pshufb_as_vpbroadcastb256(<2 x i64> %a) { 197; X32-LABEL: combine_pshufb_as_vpbroadcastb256: 198; X32: # %bb.0: 199; X32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 200; X32-NEXT: vpbroadcastb %xmm0, %ymm0 201; X32-NEXT: retl 202; 203; X64-LABEL: combine_pshufb_as_vpbroadcastb256: 204; X64: # %bb.0: 205; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 206; X64-NEXT: vpbroadcastb %xmm0, %ymm0 207; X64-NEXT: retq 208 %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef> 209 %2 = bitcast <4 x i64> %1 to <32 x i8> 210 %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> zeroinitializer) 211 %4 = bitcast <32 x i8> %3 to <8 x i32> 212 %5 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %4, <8 x i32> 
zeroinitializer) 213 %6 = bitcast <8 x i32> %5 to <32 x i8> 214 ret <32 x i8> %6 215} 216 217define <16 x i8> @combine_pshufb_as_vpbroadcastw128(<16 x i8> %a) { 218; X32-LABEL: combine_pshufb_as_vpbroadcastw128: 219; X32: # %bb.0: 220; X32-NEXT: vpbroadcastw %xmm0, %xmm0 221; X32-NEXT: retl 222; 223; X64-LABEL: combine_pshufb_as_vpbroadcastw128: 224; X64: # %bb.0: 225; X64-NEXT: vpbroadcastw %xmm0, %xmm0 226; X64-NEXT: retq 227 %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>) 228 ret <16 x i8> %1 229} 230 231define <32 x i8> @combine_pshufb_as_vpbroadcastw256(<2 x i64> %a) { 232; X32-LABEL: combine_pshufb_as_vpbroadcastw256: 233; X32: # %bb.0: 234; X32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 235; X32-NEXT: vpbroadcastw %xmm0, %ymm0 236; X32-NEXT: retl 237; 238; X64-LABEL: combine_pshufb_as_vpbroadcastw256: 239; X64: # %bb.0: 240; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 241; X64-NEXT: vpbroadcastw %xmm0, %ymm0 242; X64-NEXT: retq 243 %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef> 244 %2 = bitcast <4 x i64> %1 to <32 x i8> 245 %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>) 246 %4 = bitcast <32 x i8> %3 to <8 x i32> 247 %5 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %4, <8 x i32> zeroinitializer) 248 %6 = bitcast <8 x i32> %5 to <32 x i8> 249 ret <32 x i8> %6 250} 251 252define <16 x i8> @combine_pshufb_as_vpbroadcastd128(<16 x i8> %a) { 253; X32-LABEL: combine_pshufb_as_vpbroadcastd128: 254; X32: # %bb.0: 255; X32-NEXT: vpbroadcastd %xmm0, %xmm0 256; X32-NEXT: vpaddb {{\.LCPI.*}}, %xmm0, %xmm0 257; X32-NEXT: retl 258; 259; X64-LABEL: 
combine_pshufb_as_vpbroadcastd128: 260; X64: # %bb.0: 261; X64-NEXT: vpbroadcastd %xmm0, %xmm0 262; X64-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 263; X64-NEXT: retq 264 %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>) 265 %2 = add <16 x i8> %1, <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3> 266 ret <16 x i8> %2 267} 268 269define <8 x i32> @combine_permd_as_vpbroadcastd256(<4 x i32> %a) { 270; X32-LABEL: combine_permd_as_vpbroadcastd256: 271; X32: # %bb.0: 272; X32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 273; X32-NEXT: vpbroadcastd %xmm0, %ymm0 274; X32-NEXT: vpaddd {{\.LCPI.*}}, %ymm0, %ymm0 275; X32-NEXT: retl 276; 277; X64-LABEL: combine_permd_as_vpbroadcastd256: 278; X64: # %bb.0: 279; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 280; X64-NEXT: vpbroadcastd %xmm0, %ymm0 281; X64-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0 282; X64-NEXT: retq 283 %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 284 %2 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %1, <8 x i32> zeroinitializer) 285 %3 = add <8 x i32> %2, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> 286 ret <8 x i32> %3 287} 288 289define <16 x i8> @combine_pshufb_as_vpbroadcastq128(<16 x i8> %a) { 290; X32-LABEL: combine_pshufb_as_vpbroadcastq128: 291; X32: # %bb.0: 292; X32-NEXT: vpbroadcastq %xmm0, %xmm0 293; X32-NEXT: retl 294; 295; X64-LABEL: combine_pshufb_as_vpbroadcastq128: 296; X64: # %bb.0: 297; X64-NEXT: vpbroadcastq %xmm0, %xmm0 298; X64-NEXT: retq 299 %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>) 300 ret <16 x i8> %1 301} 302 303define <8 x i32> @combine_permd_as_vpbroadcastq256(<4 x i32> %a) { 
304; X32-LABEL: combine_permd_as_vpbroadcastq256: 305; X32: # %bb.0: 306; X32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 307; X32-NEXT: vpbroadcastq %xmm0, %ymm0 308; X32-NEXT: vpaddd {{\.LCPI.*}}, %ymm0, %ymm0 309; X32-NEXT: retl 310; 311; X64-LABEL: combine_permd_as_vpbroadcastq256: 312; X64: # %bb.0: 313; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 314; X64-NEXT: vpbroadcastq %xmm0, %ymm0 315; X64-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0 316; X64-NEXT: retq 317 %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 318 %2 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %1, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>) 319 %3 = add <8 x i32> %2, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> 320 ret <8 x i32> %3 321} 322 323define <4 x float> @combine_pshufb_as_vpbroadcastss128(<4 x float> %a) { 324; X32-LABEL: combine_pshufb_as_vpbroadcastss128: 325; X32: # %bb.0: 326; X32-NEXT: vbroadcastss %xmm0, %xmm0 327; X32-NEXT: retl 328; 329; X64-LABEL: combine_pshufb_as_vpbroadcastss128: 330; X64: # %bb.0: 331; X64-NEXT: vbroadcastss %xmm0, %xmm0 332; X64-NEXT: retq 333 %1 = bitcast <4 x float> %a to <16 x i8> 334 %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>) 335 %3 = bitcast <16 x i8> %2 to <4 x float> 336 ret <4 x float> %3 337} 338 339define <8 x float> @combine_permps_as_vpbroadcastss256(<4 x float> %a) { 340; X32-LABEL: combine_permps_as_vpbroadcastss256: 341; X32: # %bb.0: 342; X32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 343; X32-NEXT: vbroadcastss %xmm0, %ymm0 344; X32-NEXT: retl 345; 346; X64-LABEL: combine_permps_as_vpbroadcastss256: 347; X64: # %bb.0: 348; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 349; X64-NEXT: vbroadcastss %xmm0, %ymm0 350; X64-NEXT: retq 351 %1 = shufflevector <4 x float> 
%a, <4 x float> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 352 %2 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %1, <8 x i32> zeroinitializer) 353 ret <8 x float> %2 354} 355 356define <4 x double> @combine_permps_as_vpbroadcastsd256(<2 x double> %a) { 357; X32-LABEL: combine_permps_as_vpbroadcastsd256: 358; X32: # %bb.0: 359; X32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 360; X32-NEXT: vbroadcastsd %xmm0, %ymm0 361; X32-NEXT: retl 362; 363; X64-LABEL: combine_permps_as_vpbroadcastsd256: 364; X64: # %bb.0: 365; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 366; X64-NEXT: vbroadcastsd %xmm0, %ymm0 367; X64-NEXT: retq 368 %1 = shufflevector <2 x double> %a, <2 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef> 369 %2 = bitcast <4 x double> %1 to <8 x float> 370 %3 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %2, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>) 371 %4 = bitcast <8 x float> %3 to <4 x double> 372 ret <4 x double> %4 373} 374 375define <16 x i8> @combine_vpbroadcast_pshufb_as_vpbroadcastb128(<16 x i8> %a) { 376; X32-LABEL: combine_vpbroadcast_pshufb_as_vpbroadcastb128: 377; X32: # %bb.0: 378; X32-NEXT: vpbroadcastb %xmm0, %xmm0 379; X32-NEXT: retl 380; 381; X64-LABEL: combine_vpbroadcast_pshufb_as_vpbroadcastb128: 382; X64: # %bb.0: 383; X64-NEXT: vpbroadcastb %xmm0, %xmm0 384; X64-NEXT: retq 385 %1 = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> zeroinitializer 386 %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> zeroinitializer) 387 ret <16 x i8> %2 388} 389 390define <32 x i8> @combine_vpbroadcast_pshufb_as_vpbroadcastb256(<32 x i8> %a) { 391; X32-LABEL: combine_vpbroadcast_pshufb_as_vpbroadcastb256: 392; X32: # %bb.0: 393; X32-NEXT: vpbroadcastb %xmm0, %ymm0 394; X32-NEXT: retl 395; 396; X64-LABEL: combine_vpbroadcast_pshufb_as_vpbroadcastb256: 397; X64: # %bb.0: 398; X64-NEXT: vpbroadcastb %xmm0, 
%ymm0 399; X64-NEXT: retq 400 %1 = shufflevector <32 x i8> %a, <32 x i8> undef, <32 x i32> zeroinitializer 401 %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> zeroinitializer) 402 ret <32 x i8> %2 403} 404 405define <4 x float> @combine_vpbroadcast_pshufb_as_vpbroadcastss128(<4 x float> %a) { 406; X32-LABEL: combine_vpbroadcast_pshufb_as_vpbroadcastss128: 407; X32: # %bb.0: 408; X32-NEXT: vbroadcastss %xmm0, %xmm0 409; X32-NEXT: retl 410; 411; X64-LABEL: combine_vpbroadcast_pshufb_as_vpbroadcastss128: 412; X64: # %bb.0: 413; X64-NEXT: vbroadcastss %xmm0, %xmm0 414; X64-NEXT: retq 415 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> zeroinitializer 416 %2 = bitcast <4 x float> %1 to <16 x i8> 417 %3 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>) 418 %4 = bitcast <16 x i8> %3 to <4 x float> 419 ret <4 x float> %4 420} 421 422define <8 x float> @combine_vpbroadcast_permd_as_vpbroadcastss256(<4 x float> %a) { 423; X32-LABEL: combine_vpbroadcast_permd_as_vpbroadcastss256: 424; X32: # %bb.0: 425; X32-NEXT: vbroadcastss %xmm0, %ymm0 426; X32-NEXT: vbroadcastss %xmm0, %ymm0 427; X32-NEXT: retl 428; 429; X64-LABEL: combine_vpbroadcast_permd_as_vpbroadcastss256: 430; X64: # %bb.0: 431; X64-NEXT: vbroadcastss %xmm0, %ymm0 432; X64-NEXT: vbroadcastss %xmm0, %ymm0 433; X64-NEXT: retq 434 %1 = shufflevector <4 x float> %a, <4 x float> undef, <8 x i32> zeroinitializer 435 %2 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %1, <8 x i32> zeroinitializer) 436 ret <8 x float> %2 437} 438 439define <4 x double> @combine_vpbroadcast_permd_as_vpbroadcastsd256(<2 x double> %a) { 440; X32-LABEL: combine_vpbroadcast_permd_as_vpbroadcastsd256: 441; X32: # %bb.0: 442; X32-NEXT: vbroadcastsd %xmm0, %ymm0 443; X32-NEXT: vbroadcastsd %xmm0, %ymm0 444; X32-NEXT: retl 445; 446; X64-LABEL: combine_vpbroadcast_permd_as_vpbroadcastsd256: 447; 
X64: # %bb.0: 448; X64-NEXT: vbroadcastsd %xmm0, %ymm0 449; X64-NEXT: vbroadcastsd %xmm0, %ymm0 450; X64-NEXT: retq 451 %1 = shufflevector <2 x double> %a, <2 x double> undef, <4 x i32> zeroinitializer 452 %2 = bitcast <4 x double> %1 to <8 x float> 453 %3 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %2, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>) 454 %4 = bitcast <8 x float> %3 to <4 x double> 455 ret <4 x double> %4 456} 457 458define <8 x i32> @combine_permd_as_permq(<8 x i32> %a) { 459; X32-LABEL: combine_permd_as_permq: 460; X32: # %bb.0: 461; X32-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1] 462; X32-NEXT: retl 463; 464; X64-LABEL: combine_permd_as_permq: 465; X64: # %bb.0: 466; X64-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1] 467; X64-NEXT: retq 468 %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 2, i32 3>) 469 ret <8 x i32> %1 470} 471 472define <8 x float> @combine_permps_as_permpd(<8 x float> %a) { 473; X32-LABEL: combine_permps_as_permpd: 474; X32: # %bb.0: 475; X32-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,0,1] 476; X32-NEXT: retl 477; 478; X64-LABEL: combine_permps_as_permpd: 479; X64: # %bb.0: 480; X64-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,0,1] 481; X64-NEXT: retq 482 %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 0, i32 1, i32 2, i32 3>) 483 ret <8 x float> %1 484} 485 486define <4 x i64> @combine_pshufb_as_zext(<32 x i8> %a0) { 487; X32-LABEL: combine_pshufb_as_zext: 488; X32: # %bb.0: 489; X32-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 490; X32-NEXT: retl 491; 492; X64-LABEL: combine_pshufb_as_zext: 493; X64: # %bb.0: 494; X64-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 495; X64-NEXT: retq 496 %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, 
<32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 497 %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> <i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 10, i8 11, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 4, i8 5, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 6, i8 7, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>) 498 %3 = bitcast <32 x i8> %2 to <4 x i64> 499 ret <4 x i64> %3 500} 501 502define <4 x i64> @combine_pshufb_as_zext128(<32 x i8> %a0) { 503; X32-LABEL: combine_pshufb_as_zext128: 504; X32: # %bb.0: 505; X32-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] 506; X32-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 507; X32-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[15,14],zero,zero,zero,zero,zero,zero,ymm0[13,12],zero,zero,zero,zero,zero,zero,ymm0[31,30],zero,zero,zero,zero,zero,zero,ymm0[29,28],zero,zero,zero,zero,zero,zero 508; X32-NEXT: retl 509; 510; X64-LABEL: combine_pshufb_as_zext128: 511; X64: # %bb.0: 512; X64-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] 513; X64-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 514; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[15,14],zero,zero,zero,zero,zero,zero,ymm0[13,12],zero,zero,zero,zero,zero,zero,ymm0[31,30],zero,zero,zero,zero,zero,zero,ymm0[29,28],zero,zero,zero,zero,zero,zero 515; X64-NEXT: retq 516 %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> 517 %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> <i8 15, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 13, i8 12, i8 -1, i8 -1, i8 -1, i8 
-1, i8 -1, i8 -1, i8 15, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 13, i8 12, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>) 518 %3 = bitcast <32 x i8> %2 to <4 x i64> 519 ret <4 x i64> %3 520} 521 522define <4 x double> @combine_pshufb_as_vzmovl_64(<4 x double> %a0) { 523; X32-LABEL: combine_pshufb_as_vzmovl_64: 524; X32: # %bb.0: 525; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1 526; X32-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 527; X32-NEXT: retl 528; 529; X64-LABEL: combine_pshufb_as_vzmovl_64: 530; X64: # %bb.0: 531; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1 532; X64-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 533; X64-NEXT: retq 534 %1 = bitcast <4 x double> %a0 to <32 x i8> 535 %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>) 536 %3 = bitcast <32 x i8> %2 to <4 x double> 537 ret <4 x double> %3 538} 539 540define <8 x float> @combine_pshufb_as_vzmovl_32(<8 x float> %a0) { 541; X32-LABEL: combine_pshufb_as_vzmovl_32: 542; X32: # %bb.0: 543; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1 544; X32-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 545; X32-NEXT: retl 546; 547; X64-LABEL: combine_pshufb_as_vzmovl_32: 548; X64: # %bb.0: 549; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1 550; X64-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 551; X64-NEXT: retq 552 %1 = bitcast <8 x float> %a0 to <32 x i8> 553 %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>) 554 %3 = bitcast <32 x i8> %2 to <8 x float> 555 ret <8 x float> %3 556} 557 558define <32 x i8> @combine_pshufb_as_pslldq(<32 x i8> %a0) { 559; 
X32-LABEL: combine_pshufb_as_pslldq: 560; X32: # %bb.0: 561; X32-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21] 562; X32-NEXT: retl 563; 564; X64-LABEL: combine_pshufb_as_pslldq: 565; X64: # %bb.0: 566; X64-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21] 567; X64-NEXT: retq 568 %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5>) 569 ret <32 x i8> %res0 570} 571 572define <32 x i8> @combine_pshufb_as_psrldq(<32 x i8> %a0) { 573; X32-LABEL: combine_pshufb_as_psrldq: 574; X32: # %bb.0: 575; X32-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 576; X32-NEXT: retl 577; 578; X64-LABEL: combine_pshufb_as_psrldq: 579; X64: # %bb.0: 580; X64-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 581; X64-NEXT: retq 582 %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>) 583 ret <32 x i8> %res0 584} 585 586define <32 x i8> @combine_pshufb_as_psrlw(<32 x i8> %a0) { 587; X32-LABEL: combine_pshufb_as_psrlw: 588; X32: # %bb.0: 
; NOTE(review): Per the file header, the assertions in this file are autogenerated by
; utils/update_llc_test_checks.py. The X32/X64 CHECK lines below are the expected llc
; output for the -mattr=+avx2 / +avx512f RUN lines; do not edit them by hand —
; regenerate them with the script after any IR change.
; NOTE(review): In @combine_psrlq_pshufb the pshufb mask runs ...i8 18, i8 17, i8 31...
; (index 16 is skipped) and ends with a duplicate i8 23; this matches the asymmetric
; expected output ymm0[23,22,21],zero in its own CHECK lines, so IR and CHECKs are
; consistent — presumably intentional, but confirm against upstream before "fixing" it.
589; X32-NEXT: vpsrlw $8, %ymm0, %ymm0 590; X32-NEXT: retl 591; 592; X64-LABEL: combine_pshufb_as_psrlw: 593; X64: # %bb.0: 594; X64-NEXT: vpsrlw $8, %ymm0, %ymm0 595; X64-NEXT: retq 596 %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 1, i8 128, i8 3, i8 128, i8 5, i8 128, i8 7, i8 128, i8 9, i8 128, i8 11, i8 128, i8 13, i8 128, i8 15, i8 128, i8 17, i8 128, i8 19, i8 128, i8 21, i8 128, i8 23, i8 128, i8 25, i8 128, i8 27, i8 128, i8 29, i8 128, i8 31, i8 128>) 597 ret <32 x i8> %res0 598} 599 600define <32 x i8> @combine_pshufb_as_pslld(<32 x i8> %a0) { 601; X32-LABEL: combine_pshufb_as_pslld: 602; X32: # %bb.0: 603; X32-NEXT: vpslld $24, %ymm0, %ymm0 604; X32-NEXT: retl 605; 606; X64-LABEL: combine_pshufb_as_pslld: 607; X64: # %bb.0: 608; X64-NEXT: vpslld $24, %ymm0, %ymm0 609; X64-NEXT: retq 610 %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 128, i8 128, i8 128, i8 0, i8 128, i8 128, i8 128, i8 4, i8 128, i8 128, i8 128, i8 8, i8 128, i8 128, i8 128, i8 12, i8 128, i8 128, i8 128, i8 16, i8 128, i8 128, i8 128, i8 20, i8 128, i8 128, i8 128, i8 24, i8 128, i8 128, i8 128, i8 28>) 611 ret <32 x i8> %res0 612} 613 614define <32 x i8> @combine_pshufb_as_psrlq(<32 x i8> %a0) { 615; X32-LABEL: combine_pshufb_as_psrlq: 616; X32: # %bb.0: 617; X32-NEXT: vpsrlq $40, %ymm0, %ymm0 618; X32-NEXT: retl 619; 620; X64-LABEL: combine_pshufb_as_psrlq: 621; X64: # %bb.0: 622; X64-NEXT: vpsrlq $40, %ymm0, %ymm0 623; X64-NEXT: retq 624 %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 5, i8 6, i8 7, i8 128, i8 128, i8 128, i8 128, i8 128, i8 13, i8 14, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 21, i8 22, i8 23, i8 128, i8 128, i8 128, i8 128, i8 128, i8 29, i8 30, i8 31, i8 128, i8 128, i8 128, i8 128, i8 128>) 625 ret <32 x i8> %res0 626} 627 628define <32 x i8> @combine_pshufb_as_pshuflw(<32 x i8> %a0) { 629; X32-LABEL: combine_pshufb_as_pshuflw: 630; X32: # %bb.0: 631; X32-NEXT: vpshuflw 
{{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] 632; X32-NEXT: retl 633; 634; X64-LABEL: combine_pshufb_as_pshuflw: 635; X64: # %bb.0: 636; X64-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] 637; X64-NEXT: retq 638 %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>) 639 ret <32 x i8> %res0 640} 641 642define <32 x i8> @combine_pshufb_as_pshufhw(<32 x i8> %a0) { 643; X32-LABEL: combine_pshufb_as_pshufhw: 644; X32: # %bb.0: 645; X32-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14] 646; X32-NEXT: retl 647; 648; X64-LABEL: combine_pshufb_as_pshufhw: 649; X64: # %bb.0: 650; X64-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14] 651; X64-NEXT: retq 652 %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13>) 653 ret <32 x i8> %res0 654} 655 656define <32 x i8> @combine_pshufb_not_as_pshufw(<32 x i8> %a0) { 657; X32-LABEL: combine_pshufb_not_as_pshufw: 658; X32: # %bb.0: 659; X32-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13,18,19,16,17,22,23,20,21,26,27,24,25,30,31,28,29] 660; X32-NEXT: retl 661; 662; X64-LABEL: combine_pshufb_not_as_pshufw: 663; X64: # %bb.0: 664; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13,18,19,16,17,22,23,20,21,26,27,24,25,30,31,28,29] 665; X64-NEXT: retq 666 %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, 
i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>) 667 %res1 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %res0, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13>) 668 ret <32 x i8> %res1 669} 670 671define <32 x i8> @combine_pshufb_as_unpacklo_undef(<32 x i8> %a0) { 672; X32-LABEL: combine_pshufb_as_unpacklo_undef: 673; X32: # %bb.0: 674; X32-NEXT: retl 675; 676; X64-LABEL: combine_pshufb_as_unpacklo_undef: 677; X64: # %bb.0: 678; X64-NEXT: retq 679 %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 undef, i8 0, i8 undef, i8 1, i8 undef, i8 2, i8 undef, i8 3, i8 undef, i8 4, i8 undef, i8 5, i8 undef, i8 6, i8 undef, i8 7, i8 undef, i8 16, i8 undef, i8 17, i8 undef, i8 18, i8 undef, i8 19, i8 undef, i8 20, i8 undef, i8 21, i8 undef, i8 22, i8 undef, i8 23>) 680 %2 = shufflevector <32 x i8> %1, <32 x i8> undef, <32 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14, i32 16, i32 16, i32 18, i32 18, i32 20, i32 20, i32 22, i32 22, i32 24, i32 24, i32 26, i32 26, i32 28, i32 28, i32 30, i32 30> 681 ret <32 x i8> %2 682} 683 684define <32 x i8> @combine_pshufb_as_unpacklo_zero(<32 x i8> %a0) { 685; X32-LABEL: combine_pshufb_as_unpacklo_zero: 686; X32: # %bb.0: 687; X32-NEXT: vpxor %xmm1, %xmm1, %xmm1 688; X32-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] 689; X32-NEXT: retl 690; 691; X64-LABEL: combine_pshufb_as_unpacklo_zero: 692; X64: # %bb.0: 693; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 694; X64-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] 695; X64-NEXT: 
retq 696 %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 1, i8 -1, i8 -1, i8 2, i8 3, i8 -1, i8 -1, i8 4, i8 5, i8 -1, i8 -1, i8 6, i8 7, i8 -1, i8 -1, i8 16, i8 17, i8 -1, i8 -1, i8 18, i8 19, i8 -1, i8 -1, i8 20, i8 21, i8 -1, i8 -1, i8 22, i8 23, i8 -1, i8 -1>) 697 ret <32 x i8> %1 698} 699 700define <32 x i8> @combine_pshufb_as_unpackhi_zero(<32 x i8> %a0) { 701; X32-LABEL: combine_pshufb_as_unpackhi_zero: 702; X32: # %bb.0: 703; X32-NEXT: vpxor %xmm1, %xmm1, %xmm1 704; X32-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] 705; X32-NEXT: retl 706; 707; X64-LABEL: combine_pshufb_as_unpackhi_zero: 708; X64: # %bb.0: 709; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 710; X64-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] 711; X64-NEXT: retq 712 %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 -1, i8 8, i8 -1, i8 9, i8 -1, i8 10, i8 -1, i8 11, i8 -1, i8 12, i8 -1, i8 13, i8 -1, i8 14, i8 -1, i8 15, i8 -1, i8 24, i8 -1, i8 25, i8 -1, i8 26, i8 -1, i8 27, i8 -1, i8 28, i8 -1, i8 29, i8 -1, i8 30, i8 -1, i8 31>) 713 ret <32 x i8> %1 714} 715 716define <32 x i8> @combine_psrlw_pshufb(<16 x i16> %a0) { 717; X32-LABEL: combine_psrlw_pshufb: 718; X32: # %bb.0: 719; X32-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0 720; X32-NEXT: retl 721; 722; X64-LABEL: combine_psrlw_pshufb: 723; X64: # %bb.0: 724; X64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 725; X64-NEXT: retq 726 %1 = lshr <16 x i16> %a0, <i16 
8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> 727 %2 = bitcast <16 x i16> %1 to <32 x i8> 728 %3 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 1, i8 0, i8 3, i8 2, i8 5, i8 4, i8 7, i8 6, i8 9, i8 8, i8 11, i8 10, i8 13, i8 12, i8 15, i8 14, i8 17, i8 16, i8 19, i8 18, i8 21, i8 20, i8 23, i8 22, i8 25, i8 24, i8 27, i8 26, i8 29, i8 28, i8 31, i8 30>) 729 ret <32 x i8> %3 730} 731 732define <32 x i8> @combine_pslld_pshufb(<8 x i32> %a0) { 733; X32-LABEL: combine_pslld_pshufb: 734; X32: # %bb.0: 735; X32-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0 736; X32-NEXT: retl 737; 738; X64-LABEL: combine_pslld_pshufb: 739; X64: # %bb.0: 740; X64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 741; X64-NEXT: retq 742 %1 = shl <8 x i32> %a0, <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24> 743 %2 = bitcast <8 x i32> %1 to <32 x i8> 744 %3 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 11, i8 10, i8 9, i8 8, i8 15, i8 14, i8 13, i8 12, i8 19, i8 18, i8 17, i8 16, i8 23, i8 22, i8 21, i8 20, i8 27, i8 26, i8 25, i8 24, i8 31, i8 30, i8 29, i8 28>) 745 ret <32 x i8> %3 746} 747 748define <32 x i8> @combine_psrlq_pshufb(<4 x i64> %a0) { 749; X32-LABEL: combine_psrlq_pshufb: 750; X32: # %bb.0: 751; X32-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[7,6,5,4],zero,zero,zero,zero,ymm0[15,14,13,12],zero,zero,zero,zero,ymm0[23,22,21],zero,zero,zero,zero,ymm0[31,30,29,28],zero 752; X32-NEXT: retl 753; 754; X64-LABEL: combine_psrlq_pshufb: 755; X64: # %bb.0: 756; X64-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[7,6,5,4],zero,zero,zero,zero,ymm0[15,14,13,12],zero,zero,zero,zero,ymm0[23,22,21],zero,zero,zero,zero,ymm0[31,30,29,28],zero 757; X64-NEXT: retq 758 %1 = lshr <4 x i64> %a0, <i64 32, i64 32, i64 32, i64 32> 759 %2 = bitcast <4 x i64> %1 to <32 x i8> 760 %3 = tail call <32 x i8> 
@llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 23, i8 22, i8 21, i8 20, i8 19, i8 18, i8 17, i8 31, i8 30, i8 29, i8 28, i8 27, i8 26, i8 25, i8 24, i8 23>) 761 ret <32 x i8> %3 762} 763 764define <32 x i8> @combine_unpack_unpack_pshufb(<32 x i8> %a0) { 765; X32-LABEL: combine_unpack_unpack_pshufb: 766; X32: # %bb.0: 767; X32-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,4,8,1,1,5,9,2,2,6,10,3,3,7,11,16,16,20,24,17,17,21,25,18,18,22,26,19,19,23,27] 768; X32-NEXT: retl 769; 770; X64-LABEL: combine_unpack_unpack_pshufb: 771; X64: # %bb.0: 772; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,4,8,1,1,5,9,2,2,6,10,3,3,7,11,16,16,20,24,17,17,21,25,18,18,22,26,19,19,23,27] 773; X64-NEXT: retq 774 %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19> 775 %2 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 20, i32 21, i32 22, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 776 %3 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 24, i32 25, i32 26, i32 27, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 777 %4 = shufflevector <32 x i8> %1, <32 x i8> %2, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 
35, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 778 %5 = shufflevector <32 x i8> %1, <32 x i8> %3, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 779 %6 = shufflevector <32 x i8> %4, <32 x i8> %5, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55> 780 ret <32 x i8> %6 781} 782 783define <16 x i16> @shuffle_combine_packssdw_pshufb(<8 x i32> %a0) { 784; X32-LABEL: shuffle_combine_packssdw_pshufb: 785; X32: # %bb.0: 786; X32-NEXT: vpsrad $31, %ymm0, %ymm0 787; X32-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[12,13,8,9,4,5,0,1,12,13,8,9,4,5,0,1,16,17,20,21,24,25,28,29,28,29,24,25,20,21,16,17] 788; X32-NEXT: retl 789; 790; X64-LABEL: shuffle_combine_packssdw_pshufb: 791; X64: # %bb.0: 792; X64-NEXT: vpsrad $31, %ymm0, %ymm0 793; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[12,13,8,9,4,5,0,1,12,13,8,9,4,5,0,1,16,17,20,21,24,25,28,29,28,29,24,25,20,21,16,17] 794; X64-NEXT: retq 795 %1 = ashr <8 x i32> %a0, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31> 796 %2 = tail call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %1, <8 x i32> %1) 797 %3 = shufflevector <16 x i16> %2, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0, i32 8, i32 9, i32 10, i32 11, i32 11, i32 10, i32 9, i32 8> 798 ret <16 x i16> %3 799} 800declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) 
nounwind readnone 801 802define <32 x i8> @shuffle_combine_packsswb_pshufb(<16 x i16> %a0, <16 x i16> %a1) { 803; X32-LABEL: shuffle_combine_packsswb_pshufb: 804; X32: # %bb.0: 805; X32-NEXT: vpsraw $15, %ymm0, %ymm0 806; X32-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,12,10,8,6,4,2,0,14,12,10,8,6,4,2,0,30,28,26,24,22,20,18,16,30,28,26,24,22,20,18,16] 807; X32-NEXT: retl 808; 809; X64-LABEL: shuffle_combine_packsswb_pshufb: 810; X64: # %bb.0: 811; X64-NEXT: vpsraw $15, %ymm0, %ymm0 812; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,12,10,8,6,4,2,0,14,12,10,8,6,4,2,0,30,28,26,24,22,20,18,16,30,28,26,24,22,20,18,16] 813; X64-NEXT: retq 814 %1 = ashr <16 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15> 815 %2 = ashr <16 x i16> %a1, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15> 816 %3 = tail call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %1, <16 x i16> %2) 817 %4 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %3, <32 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>) 818 ret <32 x i8> %4 819} 820declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone 821 822define <16 x i16> @shuffle_combine_packusdw_pshufb(<8 x i32> %a0, <8 x i32> %a1) { 823; X32-LABEL: shuffle_combine_packusdw_pshufb: 824; X32: # %bb.0: 825; X32-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,10,11,6,7,2,3,14,15,10,11,6,7,2,3,18,19,22,23,26,27,30,31,30,31,26,27,22,23,18,19] 826; X32-NEXT: retl 827; 828; X64-LABEL: shuffle_combine_packusdw_pshufb: 829; X64: # %bb.0: 830; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,10,11,6,7,2,3,14,15,10,11,6,7,2,3,18,19,22,23,26,27,30,31,30,31,26,27,22,23,18,19] 831; X64-NEXT: retq 832 %1 = lshr <8 x i32> %a0, <i32 16, i32 16, 
i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> 833 %2 = tail call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %1, <8 x i32> %1) 834 %3 = shufflevector <16 x i16> %2, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0, i32 8, i32 9, i32 10, i32 11, i32 11, i32 10, i32 9, i32 8> 835 ret <16 x i16> %3 836} 837declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone 838 839define <32 x i8> @shuffle_combine_packuswb_pshufb(<16 x i16> %a0, <16 x i16> %a1) { 840; X32-LABEL: shuffle_combine_packuswb_pshufb: 841; X32: # %bb.0: 842; X32-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[15,13,11,9,7,5,3,1,15,13,11,9,7,5,3,1,31,29,27,25,23,21,19,17,31,29,27,25,23,21,19,17] 843; X32-NEXT: retl 844; 845; X64-LABEL: shuffle_combine_packuswb_pshufb: 846; X64: # %bb.0: 847; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[15,13,11,9,7,5,3,1,15,13,11,9,7,5,3,1,31,29,27,25,23,21,19,17,31,29,27,25,23,21,19,17] 848; X64-NEXT: retq 849 %1 = lshr <16 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> 850 %2 = lshr <16 x i16> %a1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> 851 %3 = tail call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %1, <16 x i16> %2) 852 %4 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %3, <32 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>) 853 ret <32 x i8> %4 854} 855declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone 856 857define <16 x i8> @combine_pshufb_insertion_as_broadcast_v2i64(i64 %a0) { 858; X32-LABEL: combine_pshufb_insertion_as_broadcast_v2i64: 859; X32: # %bb.0: 860; X32-NEXT: vpbroadcastq {{[0-9]+}}(%esp), %xmm0 861; X32-NEXT: retl 862; 863; X64-LABEL: 
combine_pshufb_insertion_as_broadcast_v2i64: 864; X64: # %bb.0: 865; X64-NEXT: vmovq %rdi, %xmm0 866; X64-NEXT: vpbroadcastq %xmm0, %xmm0 867; X64-NEXT: retq 868 %1 = insertelement <2 x i64> undef, i64 %a0, i32 0 869 %2 = bitcast <2 x i64> %1 to <16 x i8> 870 %3 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>) 871 ret <16 x i8> %3 872} 873 874define <8 x i32> @combine_permd_insertion_as_broadcast_v4i64(i64 %a0) { 875; X32-LABEL: combine_permd_insertion_as_broadcast_v4i64: 876; X32: # %bb.0: 877; X32-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %ymm0 878; X32-NEXT: retl 879; 880; X64-LABEL: combine_permd_insertion_as_broadcast_v4i64: 881; X64: # %bb.0: 882; X64-NEXT: vmovq %rdi, %xmm0 883; X64-NEXT: vpbroadcastq %xmm0, %ymm0 884; X64-NEXT: retq 885 %1 = insertelement <4 x i64> undef, i64 %a0, i32 0 886 %2 = bitcast <4 x i64> %1 to <8 x i32> 887 %3 = tail call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %2, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>) 888 ret <8 x i32> %3 889} 890 891define <8 x i32> @constant_fold_permd() { 892; X32-LABEL: constant_fold_permd: 893; X32: # %bb.0: 894; X32-NEXT: vmovaps {{.*#+}} ymm0 = [5,7,3,2,8,2,6,1] 895; X32-NEXT: retl 896; 897; X64-LABEL: constant_fold_permd: 898; X64: # %bb.0: 899; X64-NEXT: vmovaps {{.*#+}} ymm0 = [5,7,3,2,8,2,6,1] 900; X64-NEXT: retq 901 %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>, <8 x i32> <i32 4, i32 6, i32 2, i32 1, i32 7, i32 1, i32 5, i32 0>) 902 ret <8 x i32> %1 903} 904 905define <8 x float> @constant_fold_permps() { 906; X32-LABEL: constant_fold_permps: 907; X32: # %bb.0: 908; X32-NEXT: vmovaps {{.*#+}} ymm0 = [5.000000e+00,7.000000e+00,3.000000e+00,2.000000e+00,8.000000e+00,2.000000e+00,6.000000e+00,1.000000e+00] 909; X32-NEXT: retl 910; 911; X64-LABEL: constant_fold_permps: 912; X64: # %bb.0: 913; X64-NEXT: 
vmovaps {{.*#+}} ymm0 = [5.000000e+00,7.000000e+00,3.000000e+00,2.000000e+00,8.000000e+00,2.000000e+00,6.000000e+00,1.000000e+00] 914; X64-NEXT: retq 915 %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, <8 x i32> <i32 4, i32 6, i32 2, i32 1, i32 7, i32 1, i32 5, i32 0>) 916 ret <8 x float> %1 917} 918 919define <32 x i8> @constant_fold_pshufb_256() { 920; X32-LABEL: constant_fold_pshufb_256: 921; X32: # %bb.0: 922; X32-NEXT: vmovaps {{.*#+}} ymm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9,255,0,0,0,u,u,0,0,241,0,0,0,0,0,249,250> 923; X32-NEXT: retl 924; 925; X64-LABEL: constant_fold_pshufb_256: 926; X64: # %bb.0: 927; X64-NEXT: vmovaps {{.*#+}} ymm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9,255,0,0,0,u,u,0,0,241,0,0,0,0,0,249,250> 928; X64-NEXT: retq 929 %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 -1, i8 -2, i8 -3, i8 -4, i8 -5, i8 -6, i8 -7, i8 -8, i8 -9, i8 -10, i8 -11, i8 -12, i8 -13, i8 -14, i8 -15>, <32 x i8> <i8 1, i8 -1, i8 -1, i8 -1, i8 undef, i8 undef, i8 -1, i8 -1, i8 15, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 7, i8 6, i8 1, i8 -1, i8 -1, i8 -1, i8 undef, i8 undef, i8 -1, i8 -1, i8 15, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 7, i8 6>) 930 ret <32 x i8> %1 931} 932 933define <32 x i8> @PR27320(<8 x i32> %a0) { 934; X32-LABEL: PR27320: 935; X32: # %bb.0: 936; X32-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] 937; X32-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,1,2,3,4,4,5,6,7,7,8,9,10,10,11,28,29,29,30,31,16,16,17,18,19,19,20,21,22,22,23] 938; X32-NEXT: retl 939; 940; X64-LABEL: PR27320: 941; X64: # %bb.0: 942; X64-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] 943; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,1,2,3,4,4,5,6,7,7,8,9,10,10,11,28,29,29,30,31,16,16,17,18,19,19,20,21,22,22,23] 944; X64-NEXT: retq 945 %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x 
i32> <i32 0, i32 1, i32 2, i32 undef, i32 3, i32 4, i32 5, i32 undef> 946 %2 = bitcast <8 x i32> %1 to <32 x i8> 947 %3 = shufflevector <32 x i8> %2, <32 x i8> undef, <32 x i32> <i32 0, i32 1, i32 1, i32 2, i32 3, i32 4, i32 4, i32 5, i32 6, i32 7, i32 7, i32 8, i32 9, i32 10, i32 10, i32 11, i32 16, i32 17, i32 17, i32 18, i32 19, i32 20, i32 20, i32 21, i32 22, i32 23, i32 23, i32 24, i32 25, i32 26, i32 26, i32 27> 948 ret <32 x i8> %3 949} 950 951define internal fastcc <8 x float> @PR34577(<8 x float> %inp0, <8 x float> %inp1, <8 x float> %inp2) { 952; X32-LABEL: PR34577: 953; X32: # %bb.0: # %entry 954; X32-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] 955; X32-NEXT: vxorps %xmm2, %xmm2, %xmm2 956; X32-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] 957; X32-NEXT: vmovaps {{.*#+}} ymm2 = <u,u,7,2,u,u,3,2> 958; X32-NEXT: vpermps %ymm1, %ymm2, %ymm1 959; X32-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] 960; X32-NEXT: retl 961; 962; X64-LABEL: PR34577: 963; X64: # %bb.0: # %entry 964; X64-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] 965; X64-NEXT: vxorps %xmm2, %xmm2, %xmm2 966; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] 967; X64-NEXT: vmovaps {{.*#+}} ymm2 = <u,u,7,2,u,u,3,2> 968; X64-NEXT: vpermps %ymm1, %ymm2, %ymm1 969; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] 970; X64-NEXT: retq 971entry: 972 %shuf0 = shufflevector <8 x float> %inp0, <8 x float> %inp2, <8 x i32> <i32 1, i32 10, i32 11, i32 13, i32 2, i32 13, i32 5, i32 0> 973 %sel = select <8 x i1> <i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <8 x float> %shuf0, <8 x float> zeroinitializer 974 %shuf1 = shufflevector <8 x float> zeroinitializer, <8 x float> %sel, <8 x i32> <i32 6, i32 11, i32 6, i32 15, i32 12, i32 11, i32 1, i32 3> 975 %shuf2 = shufflevector <8 x float> %inp1, <8 x float> %shuf1, <8 x i32> <i32 15, i32 10, i32 7, i32 2, i32 12, i32 undef, i32 3, i32 2> 976 ret <8 x float> 
%shuf2 977} 978