; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=ALL,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX2OR512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX2OR512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=ALL,AVX2OR512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX2OR512VL

; Each test below splits one 256-bit source vector into its low and high
; 128-bit halves and then interleaves elements of the two halves with an
; unpack-style shuffle mask.

; Unpack-high of the two halves of a <4 x i64> (mask <1, 3>).
; The avx-only run expects an extract plus vunpckhpd; the other runs expect
; a single vpermpd on the full ymm register.
define <2 x i64> @unpckh_unary_extracted_v4i64(<4 x i64> %x) {
; AVX1-LABEL: unpckh_unary_extracted_v4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2OR512VL-LABEL: unpckh_unary_extracted_v4i64:
; AVX2OR512VL:       # %bb.0:
; AVX2OR512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,3,2,3]
; AVX2OR512VL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2OR512VL-NEXT:    vzeroupper
; AVX2OR512VL-NEXT:    retq
  %lo = shufflevector <4 x i64> %x, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %hi = shufflevector <4 x i64> %x, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %unpck = shufflevector <2 x i64> %lo, <2 x i64> %hi, <2 x i32> <i32 1, i32 3>
  ret <2 x i64> %unpck
}

; Same unpack-high pattern for a <4 x double> source.
; NOTE(review): the function name says v8f64 but the operand type is
; <4 x double>; the name is kept because the LABEL checks depend on it.
define <2 x double> @unpckh_unary_extracted_v8f64(<4 x double> %x) {
; AVX1-LABEL: unpckh_unary_extracted_v8f64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2OR512VL-LABEL: unpckh_unary_extracted_v8f64:
; AVX2OR512VL:       # %bb.0:
; AVX2OR512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,3,2,3]
; AVX2OR512VL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2OR512VL-NEXT:    vzeroupper
; AVX2OR512VL-NEXT:    retq
  %lo = shufflevector <4 x double> %x, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  %hi = shufflevector <4 x double> %x, <4 x double> undef, <2 x i32> <i32 2, i32 3>
  %unpck = shufflevector <2 x double> %lo, <2 x double> %hi, <2 x i32> <i32 1, i32 3>
  ret <2 x double> %unpck
}

; vpermps requires a constant load for the index op. It's unlikely to be profitable.

; Unpack-high of the two halves of a <8 x i32> (mask <2, 6, 3, 7>).
; Every run expects the same extract plus vunpckhps sequence.
define <4 x i32> @unpckh_unary_extracted_v8i32(<8 x i32> %x) {
; ALL-LABEL: unpckh_unary_extracted_v8i32:
; ALL:       # %bb.0:
; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm1
; ALL-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; ALL-NEXT:    vzeroupper
; ALL-NEXT:    retq
  %lo = shufflevector <8 x i32> %x, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %hi = shufflevector <8 x i32> %x, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %unpck = shufflevector <4 x i32> %lo, <4 x i32> %hi, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  ret <4 x i32> %unpck
}

; Float variant of the <8 x i32> unpack-high test; same expected sequence.
define <4 x float> @unpckh_unary_extracted_v8f32(<8 x float> %x) {
; ALL-LABEL: unpckh_unary_extracted_v8f32:
; ALL:       # %bb.0:
; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm1
; ALL-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; ALL-NEXT:    vzeroupper
; ALL-NEXT:    retq
  %lo = shufflevector <8 x float> %x, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %hi = shufflevector <8 x float> %x, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %unpck = shufflevector <4 x float> %lo, <4 x float> %hi, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  ret <4 x float> %unpck
}

; Unpack-high of the two halves of a <16 x i16>.
; Both check groups expect extract plus vpunpckhwd; they differ only in the
; extract mnemonic (vextractf128 vs. vextracti128).
define <8 x i16> @unpckh_unary_extracted_v16i16(<16 x i16> %x) {
; AVX1-LABEL: unpckh_unary_extracted_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2OR512VL-LABEL: unpckh_unary_extracted_v16i16:
; AVX2OR512VL:       # %bb.0:
; AVX2OR512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2OR512VL-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX2OR512VL-NEXT:    vzeroupper
; AVX2OR512VL-NEXT:    retq
  %lo = shufflevector <16 x i16> %x, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %hi = shufflevector <16 x i16> %x, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %unpck = shufflevector <8 x i16> %lo, <8 x i16> %hi, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  ret <8 x i16> %unpck
}

; Unpack-high of the two halves of a <32 x i8>.
; Both check groups expect extract plus vpunpckhbw; they differ only in the
; extract mnemonic (vextractf128 vs. vextracti128).
define <16 x i8> @unpckh_unary_extracted_v32i8(<32 x i8> %x) {
; AVX1-LABEL: unpckh_unary_extracted_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2OR512VL-LABEL: unpckh_unary_extracted_v32i8:
; AVX2OR512VL:       # %bb.0:
; AVX2OR512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2OR512VL-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX2OR512VL-NEXT:    vzeroupper
; AVX2OR512VL-NEXT:    retq
  %lo = shufflevector <32 x i8> %x, <32 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %hi = shufflevector <32 x i8> %x, <32 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %unpck = shufflevector <16 x i8> %lo, <16 x i8> %hi, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
  ret <16 x i8> %unpck
}

; Unpack-low of the two halves of a <4 x i64> (mask <0, 2>).
; The avx-only run expects an extract plus vmovlhps; the other runs expect
; a single vpermpd on the full ymm register.
define <2 x i64> @unpckl_unary_extracted_v4i64(<4 x i64> %x) {
; AVX1-LABEL: unpckl_unary_extracted_v4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2OR512VL-LABEL: unpckl_unary_extracted_v4i64:
; AVX2OR512VL:       # %bb.0:
; AVX2OR512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2OR512VL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2OR512VL-NEXT:    vzeroupper
; AVX2OR512VL-NEXT:    retq
  %lo = shufflevector <4 x i64> %x, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %hi = shufflevector <4 x i64> %x, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %unpck = shufflevector <2 x i64> %lo, <2 x i64> %hi, <2 x i32> <i32 0, i32 2>
  ret <2 x i64> %unpck
}

; Same unpack-low pattern for a <4 x double> source.
; NOTE(review): the function name says v8f64 but the operand type is
; <4 x double>; the name is kept because the LABEL checks depend on it.
define <2 x double> @unpckl_unary_extracted_v8f64(<4 x double> %x) {
; AVX1-LABEL: unpckl_unary_extracted_v8f64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2OR512VL-LABEL: unpckl_unary_extracted_v8f64:
; AVX2OR512VL:       # %bb.0:
; AVX2OR512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2OR512VL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2OR512VL-NEXT:    vzeroupper
; AVX2OR512VL-NEXT:    retq
  %lo = shufflevector <4 x double> %x, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  %hi = shufflevector <4 x double> %x, <4 x double> undef, <2 x i32> <i32 2, i32 3>
  %unpck = shufflevector <2 x double> %lo, <2 x double> %hi, <2 x i32> <i32 0, i32 2>
  ret <2 x double> %unpck
}

; vpermps requires a constant load for the index op. It's unlikely to be profitable.

; Unpack-low of the two 128-bit halves of a <8 x i32> (mask <0, 4, 1, 5>).
; Every run expects the same extract plus vunpcklps sequence.
define <4 x i32> @unpckl_unary_extracted_v8i32(<8 x i32> %x) {
; ALL-LABEL: unpckl_unary_extracted_v8i32:
; ALL:       # %bb.0:
; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm1
; ALL-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; ALL-NEXT:    vzeroupper
; ALL-NEXT:    retq
  %lo = shufflevector <8 x i32> %x, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %hi = shufflevector <8 x i32> %x, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %unpck = shufflevector <4 x i32> %lo, <4 x i32> %hi, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  ret <4 x i32> %unpck
}

; Float variant of the <8 x i32> unpack-low test; same expected sequence.
define <4 x float> @unpckl_unary_extracted_v8f32(<8 x float> %x) {
; ALL-LABEL: unpckl_unary_extracted_v8f32:
; ALL:       # %bb.0:
; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm1
; ALL-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; ALL-NEXT:    vzeroupper
; ALL-NEXT:    retq
  %lo = shufflevector <8 x float> %x, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %hi = shufflevector <8 x float> %x, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %unpck = shufflevector <4 x float> %lo, <4 x float> %hi, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  ret <4 x float> %unpck
}

; Unpack-low of the two halves of a <16 x i16>.
; Both check groups expect extract plus vpunpcklwd; they differ only in the
; extract mnemonic (vextractf128 vs. vextracti128).
define <8 x i16> @unpckl_unary_extracted_v16i16(<16 x i16> %x) {
; AVX1-LABEL: unpckl_unary_extracted_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2OR512VL-LABEL: unpckl_unary_extracted_v16i16:
; AVX2OR512VL:       # %bb.0:
; AVX2OR512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2OR512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2OR512VL-NEXT:    vzeroupper
; AVX2OR512VL-NEXT:    retq
  %lo = shufflevector <16 x i16> %x, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %hi = shufflevector <16 x i16> %x, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %unpck = shufflevector <8 x i16> %lo, <8 x i16> %hi, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
  ret <8 x i16> %unpck
}

; Unpack-low of the two halves of a <32 x i8>.
; Both check groups expect extract plus vpunpcklbw; they differ only in the
; extract mnemonic (vextractf128 vs. vextracti128).
define <16 x i8> @unpckl_unary_extracted_v32i8(<32 x i8> %x) {
; AVX1-LABEL: unpckl_unary_extracted_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2OR512VL-LABEL: unpckl_unary_extracted_v32i8:
; AVX2OR512VL:       # %bb.0:
; AVX2OR512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2OR512VL-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX2OR512VL-NEXT:    vzeroupper
; AVX2OR512VL-NEXT:    retq
  %lo = shufflevector <32 x i8> %x, <32 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %hi = shufflevector <32 x i8> %x, <32 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %unpck = shufflevector <16 x i8> %lo, <16 x i8> %hi, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
  ret <16 x i8> %unpck
}

; This used to loop infinitely because the unpack shuffle mask was not recognized in commuted form.

; A single 256-bit shuffle whose defined lanes <4, undef, 5, 1> draw from the
; high 128-bit half first. The expected lowering is an extract of the high
; half followed by vunpcklps with that half as the first operand
; (xmm1 before xmm0). The result is returned as a full <8 x i32>.
define <8 x i32> @extract_unpckl_v8i32(<8 x i32> %a) {
; ALL-LABEL: extract_unpckl_v8i32:
; ALL:       # %bb.0:
; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm1
; ALL-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; ALL-NEXT:    retq
  %commuted = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 4, i32 undef, i32 5, i32 1, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x i32> %commuted
}
