; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,+sse4a | FileCheck %s --check-prefixes=CHECK,SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.2,+sse4a | FileCheck %s --check-prefixes=CHECK,SSE42
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,+sse4a | FileCheck %s --check-prefixes=CHECK,AVX
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,+sse4a | FileCheck %s --check-prefixes=CHECK,AVX
;
; Combine tests involving SSE4A target shuffles (EXTRQI,INSERTQI)

declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>)

define <16 x i8> @combine_extrqi_pshufb_16i8(<16 x i8> %a0) {
; CHECK-LABEL: combine_extrqi_pshufb_16i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    extrq {{.*#+}} xmm0 = xmm0[1,2],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; CHECK-NEXT:    retq
  %1 = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 1, i32 2, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 255, i8 255, i8 255, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>)
  ret <16 x i8> %2
}

define <8 x i16> @combine_extrqi_pshufb_8i16(<8 x i16> %a0) {
; CHECK-LABEL: combine_extrqi_pshufb_8i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    extrq {{.*#+}} xmm0 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; CHECK-NEXT:    retq
  %1 = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 1, i32 2, i32 8, i32 8, i32 undef, i32 undef, i32 undef, i32 undef>
  %2 = bitcast <8 x i16> %1 to <16 x i8>
  %3 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> <i8 0, i8 1, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>)
  %4 = bitcast <16 x i8> %3 to <8 x i16>
  ret <8 x i16> %4
}

define <16 x i8> @combine_insertqi_pshufb_16i8(<16 x i8> %a0, <16 x i8> %a1) {
; SSSE3-LABEL: combine_insertqi_pshufb_16i8:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    extrq {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; SSSE3-NEXT:    retq
;
; SSE42-LABEL: combine_insertqi_pshufb_16i8:
; SSE42:       # %bb.0:
; SSE42-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; SSE42-NEXT:    retq
;
; AVX-LABEL: combine_insertqi_pshufb_16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX-NEXT:    retq
  %1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 16, i32 17, i32 18, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 0, i8 1, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>)
  ret <16 x i8> %2
}

define <8 x i16> @combine_insertqi_pshufb_8i16(<8 x i16> %a0, <8 x i16> %a1) {
; SSSE3-LABEL: combine_insertqi_pshufb_8i16:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    extrq {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; SSSE3-NEXT:    retq
;
; SSE42-LABEL: combine_insertqi_pshufb_8i16:
; SSE42:       # %bb.0:
; SSE42-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; SSE42-NEXT:    retq
;
; AVX-LABEL: combine_insertqi_pshufb_8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX-NEXT:    retq
  %1 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 8, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %2 = bitcast <8 x i16> %1 to <16 x i8>
  %3 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> <i8 0, i8 1, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>)
  %4 = bitcast <16 x i8> %3 to <8 x i16>
  ret <8 x i16> %4
}

define <16 x i8> @combine_pshufb_insertqi_pshufb(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: combine_pshufb_insertqi_pshufb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    insertq {{.*#+}} xmm0 = xmm0[0],xmm1[0,1],xmm0[3,4,5,6,7,u,u,u,u,u,u,u,u]
; CHECK-NEXT:    retq
  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>)
  %2 = shufflevector <16 x i8> %1, <16 x i8> %a1, <16 x i32> <i32 0, i32 16, i32 17, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %3 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> <i8 7, i8 1, i8 2, i8 4, i8 3, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>)
  ret <16 x i8> %3
}