; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512F
;
; Combine tests involving SSE3/SSSE3 target shuffles (MOVDDUP, MOVSHDUP, MOVSLDUP, PSHUFB)

declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>)

define <16 x i8> @combine_vpshufb_zero(<16 x i8> %a0) {
; SSE-LABEL: combine_vpshufb_zero:
; SSE:       # BB#0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vpshufb_zero:
; AVX:       # BB#0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 128, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>)
  %res1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %res0, <16 x i8> <i8 0, i8 128, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>)
  %res2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %res1, <16 x i8> <i8 0, i8 1, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>)
  ret <16 x i8> %res2
}

define <16 x i8> @combine_vpshufb_movq(<16 x i8> %a0) {
; SSE-LABEL: combine_vpshufb_movq:
; SSE:       # BB#0:
; SSE-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vpshufb_movq:
; AVX:       # BB#0:
; AVX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX-NEXT:    retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 128, i8 1, i8 128, i8 2, i8 128, i8 3, i8 128, i8 4, i8 128, i8 5, i8 128, i8 6, i8 128, i8 7, i8 128>)
  %res1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %res0, <16 x i8> <i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 1, i8 3, i8 5, i8 7, i8 9, i8 11, i8 13, i8 15>)
  ret <16 x i8> %res1
}

define <4 x float> @combine_pshufb_movddup(<4 x float> %a0) {
; SSE-LABEL: combine_pshufb_movddup:
; SSE:       # BB#0:
; SSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[5,5,5,5,7,7,7,7,5,5,5,5,7,7,7,7]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_movddup:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,5,5,5,7,7,7,7,5,5,5,5,7,7,7,7]
; AVX-NEXT:    retq
  %1 = bitcast <4 x float> %a0 to <16 x i8>
  %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 5, i8 5, i8 5, i8 5, i8 7, i8 7, i8 7, i8 7, i8 1, i8 1, i8 1, i8 1, i8 3, i8 3, i8 3, i8 3>)
  %3 = bitcast <16 x i8> %2 to <4 x float>
  %4 = shufflevector <4 x float> %3, <4 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x float> %4
}

define <4 x float> @combine_pshufb_movshdup(<4 x float> %a0) {
; SSE-LABEL: combine_pshufb_movshdup:
; SSE:       # BB#0:
; SSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[7,7,7,7,7,7,7,7,3,3,3,3,3,3,3,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_movshdup:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,7,7,7,7,7,7,7,3,3,3,3,3,3,3,3]
; AVX-NEXT:    retq
  %1 = bitcast <4 x float> %a0 to <16 x i8>
  %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 5, i8 5, i8 5, i8 5, i8 7, i8 7, i8 7, i8 7, i8 1, i8 1, i8 1, i8 1, i8 3, i8 3, i8 3, i8 3>)
  %3 = bitcast <16 x i8> %2 to <4 x float>
  %4 = shufflevector <4 x float> %3, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  ret <4 x float> %4
}

define <4 x float> @combine_pshufb_movsldup(<4 x float> %a0) {
; SSE-LABEL: combine_pshufb_movsldup:
; SSE:       # BB#0:
; SSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[5,5,5,5,5,5,5,5,1,1,1,1,1,1,1,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_movsldup:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,5,5,5,5,5,5,5,1,1,1,1,1,1,1,1]
; AVX-NEXT:    retq
  %1 = bitcast <4 x float> %a0 to <16 x i8>
  %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 5, i8 5, i8 5, i8 5, i8 7, i8 7, i8 7, i8 7, i8 1, i8 1, i8 1, i8 1, i8 3, i8 3, i8 3, i8 3>)
  %3 = bitcast <16 x i8> %2 to <4 x float>
  %4 = shufflevector <4 x float> %3, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x float> %4
}

define <16 x i8> @combine_pshufb_palignr(<16 x i8> %a0, <16 x i8> %a1) {
; SSE-LABEL: combine_pshufb_palignr:
; SSE:       # BB#0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_palignr:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
  %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
  ret <16 x i8> %2
}

define <16 x i8> @combine_pshufb_pslldq(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_pslldq:
; SSE:       # BB#0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_pslldq:
; AVX:       # BB#0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
  %2 = shufflevector <16 x i8> %1, <16 x i8> zeroinitializer, <16 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i8> %2
}

define <16 x i8> @combine_pshufb_psrldq(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_psrldq:
; SSE:       # BB#0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_psrldq:
; AVX:       # BB#0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>)
  %2 = shufflevector <16 x i8> %1, <16 x i8> zeroinitializer, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  ret <16 x i8> %2
}

define <16 x i8> @combine_pshufb_as_pslldq(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_pslldq:
; SSE:       # BB#0:
; SSE-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_as_pslldq:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
; AVX-NEXT:    retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5>)
  ret <16 x i8> %res0
}

define <16 x i8> @combine_pshufb_as_psrldq(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_psrldq:
; SSE:       # BB#0:
; SSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_as_psrldq:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT:    retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>)
  ret <16 x i8> %res0
}

define <16 x i8> @combine_pshufb_as_pshuflw(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_pshuflw:
; SSE:       # BB#0:
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_as_pshuflw:
; AVX:       # BB#0:
; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; AVX-NEXT:    retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
  ret <16 x i8> %res0
}

define <16 x i8> @combine_pshufb_as_pshufhw(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_pshufhw:
; SSE:       # BB#0:
; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_as_pshufhw:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
; AVX-NEXT:    retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13>)
  ret <16 x i8> %res0
}

define <16 x i8> @combine_pshufb_not_as_pshufw(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_not_as_pshufw:
; SSE:       # BB#0:
; SSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_not_as_pshufw:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; AVX-NEXT:    retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
  %res1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %res0, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13>)
  ret <16 x i8> %res1
}

define <16 x i8> @combine_pshufb_as_unary_unpcklbw(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_unary_unpcklbw:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_as_unary_unpcklbw:
; AVX:       # BB#0:
; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX-NEXT:    retq
  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 undef, i8 undef, i8 1, i8 2, i8 2, i8 3, i8 3, i8 4, i8 4, i8 5, i8 5, i8 6, i8 6, i8 7, i8 7>)
  ret <16 x i8> %1
}

define <16 x i8> @combine_pshufb_as_unary_unpckhwd(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_unary_unpckhwd:
; SSE:       # BB#0:
; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_as_unary_unpckhwd:
; AVX:       # BB#0:
; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; AVX-NEXT:    retq
  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 8, i8 9, i8 8, i8 9, i8 10, i8 11, i8 10, i8 11, i8 12, i8 13, i8 12, i8 13, i8 14, i8 15, i8 undef, i8 undef>)
  ret <16 x i8> %1
}

define <16 x i8> @combine_unpckl_arg0_pshufb(<16 x i8> %a0, <16 x i8> %a1) {
; SSE-LABEL: combine_unpckl_arg0_pshufb:
; SSE:       # BB#0:
; SSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_unpckl_arg0_pshufb:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero
; AVX-NEXT:    retq
  %1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
  %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1>)
  ret <16 x i8> %2
}

define <16 x i8> @combine_unpckl_arg1_pshufb(<16 x i8> %a0, <16 x i8> %a1) {
; SSE-LABEL: combine_unpckl_arg1_pshufb:
; SSE:       # BB#0:
; SSE-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_unpckl_arg1_pshufb:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
; AVX-NEXT:    retq
  %1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
  %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 1, i8 -1, i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 -1>)
  ret <16 x i8> %2
}