; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512F
;
; Combine tests involving SSE3/SSSE3 target shuffles (MOVDDUP, MOVSHDUP, MOVSLDUP, PSHUFB)

declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>)

define <16 x i8> @combine_vpshufb_as_zero(<16 x i8> %a0) {
; SSE-LABEL: combine_vpshufb_as_zero:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vpshufb_as_zero:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 128, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>)
  %res1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %res0, <16 x i8> <i8 0, i8 128, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>)
  %res2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %res1, <16 x i8> <i8 0, i8 1, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>)
  ret <16 x i8> %res2
}

define <16 x i8> @combine_vpshufb_as_movq(<16 x i8> %a0) {
; SSE-LABEL: combine_vpshufb_as_movq:
; SSE: # %bb.0:
; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vpshufb_as_movq:
; AVX: # %bb.0:
; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX-NEXT: retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 128, i8 1, i8 128, i8 2, i8 128, i8 3, i8 128, i8 4, i8 128, i8 5, i8 128, i8 6, i8 128, i8 7, i8 128>)
  %res1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %res0, <16 x i8> <i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 1, i8 3, i8 5, i8 7, i8 9, i8 11, i8 13, i8 15>)
  ret <16 x i8> %res1
}

define <2 x double> @combine_pshufb_as_movsd(<2 x double> %a0, <2 x double> %a1) {
; SSSE3-LABEL: combine_pshufb_as_movsd:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSSE3-NEXT: movapd %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_pshufb_as_movsd:
; SSE41: # %bb.0:
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_pshufb_as_movsd:
; AVX: # %bb.0:
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX-NEXT: retq
  %1 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 3, i32 0>
  %2 = bitcast <2 x double> %1 to <16 x i8>
  %3 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
  %4 = bitcast <16 x i8> %3 to <2 x double>
  ret <2 x double> %4
}

define <4 x float> @combine_pshufb_as_movss(<4 x float> %a0, <4 x float> %a1) {
; SSSE3-LABEL: combine_pshufb_as_movss:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_pshufb_as_movss:
; SSE41: # %bb.0:
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_pshufb_as_movss:
; AVX: # %bb.0:
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT: retq
  %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 4, i32 3, i32 2, i32 1>
  %2 = bitcast <4 x float> %1 to <16 x i8>
  %3 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 12, i8 13, i8 14, i8 15, i8 8, i8 9, i8 10, i8 11, i8 4, i8 5, i8 6, i8 7>)
  %4 = bitcast <16 x i8> %3 to <4 x float>
  ret <4 x float> %4
}

define <4 x i32> @combine_pshufb_as_zext(<16 x i8> %a0) {
; SSSE3-LABEL: combine_pshufb_as_zext:
; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_pshufb_as_zext:
; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_pshufb_as_zext:
; AVX: # %bb.0:
; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX-NEXT: retq
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 -1, i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 -1, i8 2, i8 -1, i8 -1, i8 -1, i8 3, i8 -1, i8 -1, i8 -1>)
  %2 = bitcast <16 x i8> %1 to <4 x i32>
  ret <4 x i32> %2
}

define <2 x double> @combine_pshufb_as_vzmovl_64(<2 x double> %a0) {
; SSE-LABEL: combine_pshufb_as_vzmovl_64:
; SSE: # %bb.0:
; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_as_vzmovl_64:
; AVX: # %bb.0:
; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX-NEXT: retq
  %1 = bitcast <2 x double> %a0 to <16 x i8>
  %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  %3 = bitcast <16 x i8> %2 to <2 x double>
  ret <2 x double> %3
}

define <4 x float> @combine_pshufb_as_vzmovl_32(<4 x float> %a0) {
; SSSE3-LABEL: combine_pshufb_as_vzmovl_32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: xorps %xmm1, %xmm1
; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSSE3-NEXT: movaps %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_pshufb_as_vzmovl_32:
; SSE41: # %bb.0:
; SSE41-NEXT: xorps %xmm1, %xmm1
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_pshufb_as_vzmovl_32:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: retq
  %1 = bitcast <4 x float> %a0 to <16 x i8>
  %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  %3 = bitcast <16 x i8> %2 to <4 x float>
  ret <4 x float> %3
}

define <4 x float> @combine_pshufb_movddup(<4 x float> %a0) {
; SSE-LABEL: combine_pshufb_movddup:
; SSE: # %bb.0:
; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[5,5,5,5,7,7,7,7,5,5,5,5,7,7,7,7]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_movddup:
; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,5,5,5,7,7,7,7,5,5,5,5,7,7,7,7]
; AVX-NEXT: retq
  %1 = bitcast <4 x float> %a0 to <16 x i8>
  %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 5, i8 5, i8 5, i8 5, i8 7, i8 7, i8 7, i8 7, i8 1, i8 1, i8 1, i8 1, i8 3, i8 3, i8 3, i8 3>)
  %3 = bitcast <16 x i8> %2 to <4 x float>
  %4 = shufflevector <4 x float> %3, <4 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x float> %4
}

define <4 x float> @combine_pshufb_movshdup(<4 x float> %a0) {
; SSE-LABEL: combine_pshufb_movshdup:
; SSE: # %bb.0:
; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[7,7,7,7,7,7,7,7,3,3,3,3,3,3,3,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_movshdup:
; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,7,7,7,7,7,7,7,3,3,3,3,3,3,3,3]
; AVX-NEXT: retq
  %1 = bitcast <4 x float> %a0 to <16 x i8>
  %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 5, i8 5, i8 5, i8 5, i8 7, i8 7, i8 7, i8 7, i8 1, i8 1, i8 1, i8 1, i8 3, i8 3, i8 3, i8 3>)
  %3 = bitcast <16 x i8> %2 to <4 x float>
  %4 = shufflevector <4 x float> %3, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  ret <4 x float> %4
}

define <4 x float> @combine_pshufb_movsldup(<4 x float> %a0) {
; SSE-LABEL: combine_pshufb_movsldup:
; SSE: # %bb.0:
; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[5,5,5,5,5,5,5,5,1,1,1,1,1,1,1,1]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_movsldup:
; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,5,5,5,5,5,5,5,1,1,1,1,1,1,1,1]
; AVX-NEXT: retq
  %1 = bitcast <4 x float> %a0 to <16 x i8>
  %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 5, i8 5, i8 5, i8 5, i8 7, i8 7, i8 7, i8 7, i8 1, i8 1, i8 1, i8 1, i8 3, i8 3, i8 3, i8 3>)
  %3 = bitcast <16 x i8> %2 to <4 x float>
  %4 = shufflevector <4 x float> %3, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x float> %4
}

define <16 x i8> @combine_pshufb_palignr(<16 x i8> %a0, <16 x i8> %a1) {
; SSE-LABEL: combine_pshufb_palignr:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_palignr:
; AVX: # %bb.0:
; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX-NEXT: retq
  %1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
  %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
  ret <16 x i8> %2
}

define <16 x i8> @combine_pshufb_pslldq(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_pslldq:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_pslldq:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
  %2 = shufflevector <16 x i8> %1, <16 x i8> zeroinitializer, <16 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i8> %2
}

define <16 x i8> @combine_pshufb_psrldq(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_psrldq:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_psrldq:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>)
  %2 = shufflevector <16 x i8> %1, <16 x i8> zeroinitializer, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  ret <16 x i8> %2
}

define <16 x i8> @combine_and_pshufb(<16 x i8> %a0) {
; SSSE3-LABEL: combine_and_pshufb:
; SSSE3: # %bb.0:
; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_and_pshufb:
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_and_pshufb:
; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7]
; AVX-NEXT: retq
  %1 = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 16, i32 16, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 0, i8 1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  ret <16 x i8> %2
}

define <16 x i8> @combine_pshufb_and(<16 x i8> %a0) {
; SSSE3-LABEL: combine_pshufb_and:
; SSSE3: # %bb.0:
; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_pshufb_and:
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_pshufb_and:
; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7]
; AVX-NEXT: retq
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  %2 = shufflevector <16 x i8> %1, <16 x i8> zeroinitializer, <16 x i32> <i32 16, i32 16, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %2
}

define <16 x i8> @combine_pshufb_as_palignr(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_palignr:
; SSE: # %bb.0:
; SSE-NEXT: palignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_as_palignr:
; AVX: # %bb.0:
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
; AVX-NEXT: retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 undef, i8 undef, i8 0>)
  ret <16 x i8> %res0
}

define <16 x i8> @combine_pshufb_as_pslldq(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_pslldq:
; SSE: # %bb.0:
; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_as_pslldq:
; AVX: # %bb.0:
; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
; AVX-NEXT: retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5>)
  ret <16 x i8> %res0
}

define <16 x i8> @combine_pshufb_as_psrldq(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_psrldq:
; SSE: # %bb.0:
; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_as_psrldq:
; AVX: # %bb.0:
; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>)
  ret <16 x i8> %res0
}

define <16 x i8> @combine_pshufb_as_psrlw(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_psrlw:
; SSE: # %bb.0:
; SSE-NEXT: psrlw $8, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_as_psrlw:
; AVX: # %bb.0:
; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX-NEXT: retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 1, i8 128, i8 3, i8 128, i8 5, i8 128, i8 7, i8 128, i8 9, i8 128, i8 11, i8 128, i8 13, i8 128, i8 15, i8 128>)
  ret <16 x i8> %res0
}

define <16 x i8> @combine_pshufb_as_pslld(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_pslld:
; SSE: # %bb.0:
; SSE-NEXT: pslld $24, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_as_pslld:
; AVX: # %bb.0:
; AVX-NEXT: vpslld $24, %xmm0, %xmm0
; AVX-NEXT: retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 128, i8 128, i8 128, i8 0, i8 128, i8 128, i8 128, i8 4, i8 128, i8 128, i8 128, i8 8, i8 128, i8 128, i8 128, i8 12>)
  ret <16 x i8> %res0
}

define <16 x i8> @combine_pshufb_as_psrlq(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_psrlq:
; SSE: # %bb.0:
; SSE-NEXT: psrlq $40, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_as_psrlq:
; AVX: # %bb.0:
; AVX-NEXT: vpsrlq $40, %xmm0, %xmm0
; AVX-NEXT: retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 5, i8 6, i8 7, i8 128, i8 128, i8 128, i8 128, i8 128, i8 13, i8 14, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128>)
  ret <16 x i8> %res0
}

define <16 x i8> @combine_pshufb_as_pshuflw(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_pshuflw:
; SSE: # %bb.0:
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_as_pshuflw:
; AVX: # %bb.0:
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; AVX-NEXT: retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
  ret <16 x i8> %res0
}

define <16 x i8> @combine_pshufb_as_pshufhw(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_pshufhw:
; SSE: # %bb.0:
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_as_pshufhw:
; AVX: # %bb.0:
; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
; AVX-NEXT: retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13>)
  ret <16 x i8> %res0
}

define <16 x i8> @combine_pshufb_not_as_pshufw(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_not_as_pshufw:
; SSE: # %bb.0:
; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_not_as_pshufw:
; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; AVX-NEXT: retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
  %res1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %res0, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13>)
  ret <16 x i8> %res1
}

define <16 x i8> @combine_vpshufb_as_pshuflw_not_pslld(<16 x i8> *%a0) {
; SSE-LABEL: combine_vpshufb_as_pshuflw_not_pslld:
; SSE: # %bb.0:
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,0,2,2,4,5,6,7]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vpshufb_as_pshuflw_not_pslld:
; AVX: # %bb.0:
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,2,2,4,5,6,7]
; AVX-NEXT: retq
  %res0 = load <16 x i8>, <16 x i8> *%a0, align 16
  %res1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %res0, <16 x i8> <i8 undef, i8 undef, i8 0, i8 1, i8 undef, i8 undef, i8 4, i8 5, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>)
  ret <16 x i8> %res1
}

define <16 x i8> @combine_pshufb_as_unary_unpcklbw(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_unary_unpcklbw:
; SSE: # %bb.0:
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_as_unary_unpcklbw:
; AVX: # %bb.0:
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX-NEXT: retq
  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 undef, i8 undef, i8 1, i8 2, i8 2, i8 3, i8 3, i8 4, i8 4, i8 5, i8 5, i8 6, i8 6, i8 7, i8 7>)
  ret <16 x i8> %1
}

define <16 x i8> @combine_pshufb_as_unary_unpckhwd(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_unary_unpckhwd:
; SSE: # %bb.0:
; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_as_unary_unpckhwd:
; AVX: # %bb.0:
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; AVX-NEXT: retq
  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 8, i8 9, i8 8, i8 9, i8 10, i8 11, i8 10, i8 11, i8 12, i8 13, i8 12, i8 13, i8 14, i8 15, i8 undef, i8 undef>)
  ret <16 x i8> %1
}

define <8 x i16> @combine_pshufb_as_unpacklo_undef(<16 x i8> %a0) {
; ALL-LABEL: combine_pshufb_as_unpacklo_undef:
; ALL: # %bb.0:
; ALL-NEXT: retq
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 undef, i8 undef, i8 0, i8 1, i8 undef, i8 undef, i8 2, i8 3, i8 undef, i8 undef, i8 4, i8 5, i8 undef, i8 undef, i8 6, i8 7>)
  %2 = bitcast <16 x i8> %1 to <8 x i16>
  %3 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  ret <8 x i16> %3
}

define <16 x i8> @combine_pshufb_as_unpackhi_undef(<16 x i8> %a0) {
; ALL-LABEL: combine_pshufb_as_unpackhi_undef:
; ALL: # %bb.0:
; ALL-NEXT: retq
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 8, i8 undef, i8 9, i8 undef, i8 10, i8 undef, i8 11, i8 undef, i8 12, i8 undef, i8 13, i8 undef, i8 14, i8 undef, i8 15, i8 undef>)
  %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
  ret <16 x i8> %2
}

define <16 x i8> @combine_pshufb_as_unpacklo_zero(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_unpacklo_zero:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_as_unpacklo_zero:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX-NEXT: retq
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 4, i8 5, i8 6, i8 7>)
  ret <16 x i8> %1
}

define <16 x i8> @combine_pshufb_as_unpackhi_zero(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_unpackhi_zero:
; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_as_unpackhi_zero:
; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX-NEXT: retq
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 8, i8 -1, i8 9, i8 -1, i8 10, i8 -1, i8 11, i8 -1, i8 12, i8 -1, i8 13, i8 -1, i8 14, i8 -1, i8 15, i8 -1>)
  ret <16 x i8> %1
}

define <16 x i8> @combine_psrlw_pshufb(<8 x i16> %a0) {
; SSE-LABEL: combine_psrlw_pshufb:
; SSE: # %bb.0:
; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero
; SSE-NEXT: retq
;
; AVX-LABEL: combine_psrlw_pshufb:
; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX-NEXT: retq
  %1 = lshr <8 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %2 = bitcast <8 x i16> %1 to <16 x i8>
  %3 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> <i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1>)
  ret <16 x i8> %3
}

define <16 x i8> @combine_pslld_pshufb(<4 x i32> %a0) {
; SSE-LABEL: combine_pslld_pshufb:
; SSE: # %bb.0:
; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,1,0],zero,xmm0[6,5,4],zero,xmm0[10,9,8],zero,xmm0[14,13,12],zero
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pslld_pshufb:
; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,1,0],zero,xmm0[6,5,4],zero,xmm0[10,9,8],zero,xmm0[14,13,12],zero
; AVX-NEXT: retq
  %1 = shl <4 x i32> %a0, <i32 8, i32 8, i32 8, i32 8>
  %2 = bitcast <4 x i32> %1 to <16 x i8>
  %3 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> <i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 11, i8 10, i8 9, i8 8, i8 15, i8 14, i8 13, i8 12>)
  ret <16 x i8> %3
}

define <16 x i8> @combine_psrlq_pshufb(<2 x i64> %a0) {
; SSE-LABEL: combine_psrlq_pshufb:
; SSE: # %bb.0:
; SSE-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[7,6],zero,zero,zero,zero,zero,zero,xmm0[15,14]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_psrlq_pshufb:
; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[7,6],zero,zero,zero,zero,zero,zero,xmm0[15,14]
; AVX-NEXT: retq
  %1 = lshr <2 x i64> %a0, <i64 48, i64 48>
  %2 = bitcast <2 x i64> %1 to <16 x i8>
  %3 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8>)
  ret <16 x i8> %3
}

define <16 x i8> @combine_unpckl_arg0_pshufb(<16 x i8> %a0, <16 x i8> %a1) {
; SSE-LABEL: combine_unpckl_arg0_pshufb:
; SSE: # %bb.0:
; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero
; SSE-NEXT: retq
;
; AVX-LABEL: combine_unpckl_arg0_pshufb:
; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero
; AVX-NEXT: retq
  %1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
  %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1>)
  ret <16 x i8> %2
}

define <16 x i8> @combine_unpckl_arg1_pshufb(<16 x i8> %a0, <16 x i8> %a1) {
; SSE-LABEL: combine_unpckl_arg1_pshufb:
; SSE: # %bb.0:
; SSE-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_unpckl_arg1_pshufb:
; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
; AVX-NEXT: retq
  %1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
  %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 1, i8 -1, i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 -1>)
  ret <16 x i8> %2
}

define <8 x i16> @shuffle_combine_unpack_insert(<8 x i16> %a0) {
; SSE-LABEL: shuffle_combine_unpack_insert:
; SSE: # %bb.0:
; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,8,9,8,9,8,9,10,11,10,11]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_combine_unpack_insert:
; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,8,9,8,9,8,9,10,11,10,11]
; AVX-NEXT: retq
  %1 = extractelement <8 x i16> %a0, i32 2
  %2 = extractelement <8 x i16> %a0, i32 4
  %3 = insertelement <8 x i16> %a0, i16 %1, i32 4
  %4 = insertelement <8 x i16> %a0, i16 %2, i32 2
  %5 = shufflevector <8 x i16> %3, <8 x i16> %4, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
  %6 = shufflevector <8 x i16> %5, <8 x i16> %3, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 undef, i32 undef, i32 undef, i32 undef>
  %7 = shufflevector <8 x i16> %5, <8 x i16> %a0, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 undef, i32 undef, i32 undef, i32 undef>
  %8 = shufflevector <8 x i16> %6, <8 x i16> %7, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
  ret <8 x i16> %8
}

define <16 x i8> @shuffle_combine_packssdw_pshufb(<4 x i32> %a0) {
; SSE-LABEL: shuffle_combine_packssdw_pshufb:
; SSE: # %bb.0:
; SSE-NEXT: psrad $31, %xmm0
; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[13,12,9,8,5,4,1,0,13,12,9,8,5,4,1,0]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_combine_packssdw_pshufb:
; AVX: # %bb.0:
; AVX-NEXT: vpsrad $31, %xmm0, %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[13,12,9,8,5,4,1,0,13,12,9,8,5,4,1,0]
; AVX-NEXT: retq
  %1 = ashr <4 x i32> %a0, <i32 31, i32 31, i32 31, i32 31>
  %2 = tail call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %1, <4 x i32> %1)
  %3 = bitcast <8 x i16> %2 to <16 x i8>
  %4 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %3, <16 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8>)
  ret <16 x i8> %4
}
declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) nounwind readnone

define <16 x i8> @shuffle_combine_packsswb_pshufb(<8 x i16> %a0, <8 x i16> %a1) {
; SSE-LABEL: shuffle_combine_packsswb_pshufb:
; SSE: # %bb.0:
; SSE-NEXT: psraw $15, %xmm0
; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[14,12,10,8,6,4,2,0,14,12,10,8,6,4,2,0]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_combine_packsswb_pshufb:
; AVX: # %bb.0:
; AVX-NEXT: vpsraw $15, %xmm0, %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,12,10,8,6,4,2,0,14,12,10,8,6,4,2,0]
; AVX-NEXT: retq
  %1 = ashr <8 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %2 = ashr <8 x i16> %a1, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %3 = tail call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %1, <8 x i16> %2)
  %4 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %3, <16 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
  ret <16 x i8> %4
}
declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) nounwind readnone

define <16 x i8> @shuffle_combine_packuswb_pshufb(<8 x i16> %a0, <8 x i16> %a1) {
; SSE-LABEL: shuffle_combine_packuswb_pshufb:
; SSE: # %bb.0:
; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[15,13,11,9,7,5,3,1,15,13,11,9,7,5,3,1]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_combine_packuswb_pshufb:
; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,13,11,9,7,5,3,1,15,13,11,9,7,5,3,1]
; AVX-NEXT: retq
  %1 = lshr <8 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %2 = lshr <8 x i16> %a1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %3 = tail call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %1, <8 x i16> %2)
  %4 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %3, <16 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
  ret <16 x i8> %4
}
declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) nounwind readnone

define <16 x i8> @constant_fold_pshufb() {
; SSE-LABEL: constant_fold_pshufb:
; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9>
; SSE-NEXT: retq
;
; AVX-LABEL: constant_fold_pshufb:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9>
; AVX-NEXT: retq
  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <16 x i8> <i8 1, i8 -1, i8 -1, i8 -1, i8 undef, i8 undef, i8 -1, i8 -1, i8 15, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 7, i8 6>)
  ret <16 x i8> %1
}

; FIXME - unnecessary pshufb/broadcast being used - pshufb mask only needs lowest byte.
define <16 x i8> @constant_fold_pshufb_2() {
; SSE-LABEL: constant_fold_pshufb_2:
; SSE:       # %bb.0:
; SSE-NEXT:    movl $2, %eax
; SSE-NEXT:    movd %eax, %xmm0
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    pshufb %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: constant_fold_pshufb_2:
; AVX1:       # %bb.0:
; AVX1-NEXT:    movl $2, %eax
; AVX1-NEXT:    vmovd %eax, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: constant_fold_pshufb_2:
; AVX2:       # %bb.0:
; AVX2-NEXT:    movl $2, %eax
; AVX2-NEXT:    vmovd %eax, %xmm0
; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: constant_fold_pshufb_2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    movl $2, %eax
; AVX512F-NEXT:    vmovd %eax, %xmm0
; AVX512F-NEXT:    vpbroadcastb %xmm0, %xmm0
; AVX512F-NEXT:    retq
  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> <i8 2, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i8> <i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>)
  ret <16 x i8> %1
}

define i32 @mask_zzz3_v16i8(<16 x i8> %a0) {
; SSSE3-LABEL: mask_zzz3_v16i8:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[14,u,u,u,u,u,u,u,u,u,u,u,u]
; SSSE3-NEXT:    movd %xmm0, %eax
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: mask_zzz3_v16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[14]
; SSE41-NEXT:    pextrd $3, %xmm0, %eax
; SSE41-NEXT:    retq
;
; AVX-LABEL: mask_zzz3_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[14]
; AVX-NEXT:    vpextrd $3, %xmm0, %eax
; AVX-NEXT:    retq
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14>)
  %2 = bitcast <16 x i8> %1 to <4 x i32>
  %3 = extractelement <4 x i32> %2, i32 3
  %4 = and i32 %3, 4278190080
  ret i32 %4
}

define i32 @mask_z1z3_v16i8(<16 x i8> %a0) {
; SSSE3-LABEL: mask_z1z3_v16i8:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = zero,xmm0[10],zero,xmm0[14,u,u,u,u,u,u,u,u,u,u,u,u]
; SSSE3-NEXT:    movd %xmm0, %eax
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: mask_z1z3_v16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u],zero,xmm0[10],zero,xmm0[14]
; SSE41-NEXT:    pextrd $3, %xmm0, %eax
; SSE41-NEXT:    retq
;
; AVX-LABEL: mask_z1z3_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u],zero,xmm0[10],zero,xmm0[14]
; AVX-NEXT:    vpextrd $3, %xmm0, %eax
; AVX-NEXT:    retq
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14>)
  %2 = bitcast <16 x i8> %1 to <4 x i32>
  %3 = extractelement <4 x i32> %2, i32 3
  %4 = and i32 %3, 4278255360
  ret i32 %4
}

define i32 @PR22415(double %a0) {
; SSE-LABEL: PR22415:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    retq
;
; AVX-LABEL: PR22415:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    retq
  %1 = bitcast double %a0 to <8 x i8>
  %2 = shufflevector <8 x i8> %1, <8 x i8> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 undef>
  %3 = shufflevector <4 x i8> %2, <4 x i8> undef, <3 x i32> <i32 0, i32 1, i32 2>
  %4 = bitcast <3 x i8> %3 to i24
  %5 = zext i24 %4 to i32
  ret i32 %5
}