; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; These are tests for SSE3 codegen.

; RUN: llc < %s -mtriple=x86_64-apple-darwin9 --mattr=+sse3 | FileCheck %s --check-prefix=X64

; Test for v8i16 lowering where we extract the first element of the vector and
; place it in the second element of the result.

define void @t0(<8 x i16>* %dest, <8 x i16>* %old) nounwind {
; X64-LABEL: t0:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    movl $1, %eax
; X64-NEXT:    movd %eax, %xmm0
; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; X64-NEXT:    movdqa %xmm0, (%rdi)
; X64-NEXT:    retq
entry:
  %tmp3 = load <8 x i16>, <8 x i16>* %old
  %tmp6 = shufflevector <8 x i16> %tmp3,
                        <8 x i16> < i16 1, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef >,
                        <8 x i32> < i32 8, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef >
  store <8 x i16> %tmp6, <8 x i16>* %dest
  ret void
}

; Insert element 0 of %B into element 0 of %A; lowered as an and/andn/or blend.
define <8 x i16> @t1(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; X64-LABEL: t1:
; X64:       ## BB#0:
; X64-NEXT:    movaps {{.*#+}} xmm0 = [0,65535,65535,65535,65535,65535,65535,65535]
; X64-NEXT:    movaps %xmm0, %xmm1
; X64-NEXT:    andnps (%rsi), %xmm1
; X64-NEXT:    andps (%rdi), %xmm0
; X64-NEXT:    orps %xmm1, %xmm0
; X64-NEXT:    retq
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> < i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7 >
  ret <8 x i16> %tmp3
}

; Lanes 0 and 3 take element 1 of %B; the rest come from %A.
define <8 x i16> @t2(<8 x i16> %A, <8 x i16> %B) nounwind {
; X64-LABEL: t2:
; X64:       ## BB#0:
; X64-NEXT:    movdqa {{.*#+}} xmm2 = [0,65535,65535,0,65535,65535,65535,65535]
; X64-NEXT:    pand %xmm2, %xmm0
; X64-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[1,1,2,1,4,5,6,7]
; X64-NEXT:    pandn %xmm1, %xmm2
; X64-NEXT:    por %xmm2, %xmm0
; X64-NEXT:    retq
  %tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 9, i32 1, i32 2, i32 9, i32 4, i32 5, i32 6, i32 7 >
  ret <8 x i16> %tmp
}

; Both shuffle operands are %A, so indices 8 and 13 wrap around to A[0] and A[5].
define <8 x i16> @t3(<8 x i16> %A, <8 x i16> %B) nounwind {
; X64-LABEL: t3:
; X64:       ## BB#0:
; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5]
; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; X64-NEXT:    retq
  %tmp = shufflevector <8 x i16> %A, <8 x i16> %A, <8 x i32> < i32 8, i32 3, i32 2, i32 13, i32 7, i32 6, i32 5, i32 4 >
  ret <8 x i16> %tmp
}

; Every mask index selects from %A, so %B is dead.
define <8 x i16> @t4(<8 x i16> %A, <8 x i16> %B) nounwind {
; X64-LABEL: t4:
; X64:       ## BB#0:
; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,4,7]
; X64-NEXT:    retq
  %tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 0, i32 7, i32 2, i32 3, i32 1, i32 5, i32 6, i32 5 >
  ret <8 x i16> %tmp
}

; Interleave the low 32-bit chunks of %B and %A: a single punpckldq.
define <8 x i16> @t5(<8 x i16> %A, <8 x i16> %B) nounwind {
; X64-LABEL: t5:
; X64:       ## BB#0:
; X64-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X64-NEXT:    movdqa %xmm1, %xmm0
; X64-NEXT:    retq
  %tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 8, i32 9, i32 0, i32 1, i32 10, i32 11, i32 2, i32 3 >
  ret <8 x i16> %tmp
}

; Replace the low 32 bits of %A with the low 32 bits of %B: a single movss.
define <8 x i16> @t6(<8 x i16> %A, <8 x i16> %B) nounwind {
; X64-LABEL: t6:
; X64:       ## BB#0:
; X64-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT:    retq
  %tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7 >
  ret <8 x i16> %tmp
}

; Single-input word shuffle lowered to pshuflw plus pshufhw.
define <8 x i16> @t7(<8 x i16> %A, <8 x i16> %B) nounwind {
; X64-LABEL: t7:
; X64:       ## BB#0:
; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7]
; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
; X64-NEXT:    retq
  %tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 0, i32 0, i32 3, i32 2, i32 4, i32 6, i32 4, i32 7 >
  ret <8 x i16> %tmp
}

; A scalarized extract/insert chain that should refold into pshuflw+pshufhw.
define void @t8(<2 x i64>* %res, <2 x i64>* %A) nounwind {
; X64-LABEL: t8:
; X64:       ## BB#0:
; X64-NEXT:    pshuflw {{.*#+}} xmm0 = mem[2,1,0,3,4,5,6,7]
; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
; X64-NEXT:    movdqa %xmm0, (%rdi)
; X64-NEXT:    retq
  %tmp = load <2 x i64>, <2 x i64>* %A
  %tmp.upgrd.1 = bitcast <2 x i64> %tmp to <8 x i16>
  %tmp0 = extractelement <8 x i16> %tmp.upgrd.1, i32 0
  %tmp1 = extractelement <8 x i16> %tmp.upgrd.1, i32 1
  %tmp2 = extractelement <8 x i16> %tmp.upgrd.1, i32 2
  %tmp3 = extractelement <8 x i16> %tmp.upgrd.1, i32 3
  %tmp4 = extractelement <8 x i16> %tmp.upgrd.1, i32 4
  %tmp5 = extractelement <8 x i16> %tmp.upgrd.1, i32 5
  %tmp6 = extractelement <8 x i16> %tmp.upgrd.1, i32 6
  %tmp7 = extractelement <8 x i16> %tmp.upgrd.1, i32 7
  %tmp8 = insertelement <8 x i16> undef, i16 %tmp2, i32 0
  %tmp9 = insertelement <8 x i16> %tmp8, i16 %tmp1, i32 1
  %tmp10 = insertelement <8 x i16> %tmp9, i16 %tmp0, i32 2
  %tmp11 = insertelement <8 x i16> %tmp10, i16 %tmp3, i32 3
  %tmp12 = insertelement <8 x i16> %tmp11, i16 %tmp6, i32 4
  %tmp13 = insertelement <8 x i16> %tmp12, i16 %tmp5, i32 5
  %tmp14 = insertelement <8 x i16> %tmp13, i16 %tmp4, i32 6
  %tmp15 = insertelement <8 x i16> %tmp14, i16 %tmp7, i32 7
  %tmp15.upgrd.2 = bitcast <8 x i16> %tmp15 to <2 x i64>
  store <2 x i64> %tmp15.upgrd.2, <2 x i64>* %res
  ret void
}

; The 64-bit load through %A becomes the high half of the result: movhpd.
define void @t9(<4 x float>* %r, <2 x i32>* %A) nounwind {
; X64-LABEL: t9:
; X64:       ## BB#0:
; X64-NEXT:    movapd (%rdi), %xmm0
; X64-NEXT:    movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; X64-NEXT:    movapd %xmm0, (%rdi)
; X64-NEXT:    retq
  %tmp = load <4 x float>, <4 x float>* %r
  %tmp.upgrd.3 = bitcast <2 x i32>* %A to double*
  %tmp.upgrd.4 = load double, double* %tmp.upgrd.3
  %tmp.upgrd.5 = insertelement <2 x double> undef, double %tmp.upgrd.4, i32 0
  %tmp5 = insertelement <2 x double> %tmp.upgrd.5, double undef, i32 1
  %tmp6 = bitcast <2 x double> %tmp5 to <4 x float>
  %tmp.upgrd.6 = extractelement <4 x float> %tmp, i32 0
  %tmp7 = extractelement <4 x float> %tmp, i32 1
  %tmp8 = extractelement <4 x float> %tmp6, i32 0
  %tmp9 = extractelement <4 x float> %tmp6, i32 1
  %tmp10 = insertelement <4 x float> undef, float %tmp.upgrd.6, i32 0
  %tmp11 = insertelement <4 x float> %tmp10, float %tmp7, i32 1
  %tmp12 = insertelement <4 x float> %tmp11, float %tmp8, i32 2
  %tmp13 = insertelement <4 x float> %tmp12, float %tmp9, i32 3
  store <4 x float> %tmp13, <4 x float>* %r
  ret void
}

; FIXME: This testcase produces icky code. It can be made much better!
; PR2585

@g1 = external constant <4 x i32>
@g2 = external constant <4 x i16>

define void @t10() nounwind {
; X64-LABEL: t10:
; X64:       ## BB#0:
; X64-NEXT:    movq _g1@{{.*}}(%rip), %rax
; X64-NEXT:    pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7]
; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-NEXT:    movq _g2@{{.*}}(%rip), %rax
; X64-NEXT:    movq %xmm0, (%rax)
; X64-NEXT:    retq
  load <4 x i32>, <4 x i32>* @g1, align 16
  bitcast <4 x i32> %1 to <8 x i16>
  shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> < i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef >
  bitcast <8 x i16> %3 to <2 x i64>
  extractelement <2 x i64> %4, i32 0
  bitcast i64 %5 to <4 x i16>
  store <4 x i16> %6, <4 x i16>* @g2, align 8
  ret void
}

; Pack various elements via shuffles.
define <8 x i16> @t11(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
; X64-LABEL: t11:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    psrld $16, %xmm0
; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-NEXT:    retq
entry:
  %tmp7 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 1, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef >
  ret <8 x i16> %tmp7
}

define <8 x i16> @t12(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
; X64-LABEL: t12:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,3,3]
; X64-NEXT:    retq
entry:
  %tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 0, i32 1, i32 undef, i32 undef, i32 3, i32 11, i32 undef, i32 undef >
  ret <8 x i16> %tmp9
}

define <8 x i16> @t13(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
; X64-LABEL: t13:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,3,3]
; X64-NEXT:    retq
entry:
  %tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 8, i32 9, i32 undef, i32 undef, i32 11, i32 3, i32 undef, i32 undef >
  ret <8 x i16> %tmp9
}

define <8 x i16> @t14(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
; X64-LABEL: t14:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    psrlq $16, %xmm0
; X64-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; X64-NEXT:    movdqa %xmm1, %xmm0
; X64-NEXT:    retq
entry:
  %tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 2, i32 undef, i32 undef >
  ret <8 x i16> %tmp9
}

; FIXME: t15 got worse after the scheduler's two-address hack was disabled.
define <8 x i16> @t15(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
; X64-LABEL: t15:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7]
; X64-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT:    retq
entry:
  %tmp8 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 undef, i32 undef, i32 7, i32 2, i32 8, i32 undef, i32 undef, i32 undef >
  ret <8 x i16> %tmp8
}

; Test (Yonah) where we convert a shuffle to pextrw and pinsrw.
define <16 x i8> @t16(<16 x i8> %T0) nounwind readnone {
; X64-LABEL: t16:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    movdqa {{.*#+}} xmm1 = [0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0]
; X64-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X64-NEXT:    movdqa %xmm1, %xmm0
; X64-NEXT:    retq
entry:
  %tmp8 = shufflevector <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 1, i8 1, i8 1, i8 1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i8> %T0, <16 x i32> < i32 0, i32 1, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef >
  %tmp9 = shufflevector <16 x i8> %tmp8, <16 x i8> %T0, <16 x i32> < i32 0, i32 1, i32 2, i32 17, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef >
  ret <16 x i8> %tmp9
}

; rdar://8520311
define <4 x i32> @t17() nounwind {
; X64-LABEL: t17:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    movaps (%rax), %xmm0
; X64-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0,0,1,1]
; X64-NEXT:    pxor %xmm1, %xmm1
; X64-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT:    retq
entry:
  %tmp1 = load <4 x float>, <4 x float>* undef, align 16
  %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  %tmp3 = load <4 x float>, <4 x float>* undef, align 16
  %tmp4 = shufflevector <4 x float> %tmp2, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
  %tmp5 = bitcast <4 x float> %tmp3 to <4 x i32>
  %tmp6 = shufflevector <4 x i32> %tmp5, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
  %tmp7 = and <4 x i32> %tmp6, <i32 undef, i32 undef, i32 -1, i32 0>
  ret <4 x i32> %tmp7
}