1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86-SSE 3; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=X86-AVX,X86-AVX1 4; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X86-AVX,X86-AVX2 5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64-SSE 6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=X64-AVX,X64-AVX1 7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64-AVX,X64-AVX2 8 9@c = external dso_local global i32*, align 8 10 11; %val1 = load <2 x i8> 12; %op1 = zext<2 x i32> %val1 13; %val2 = load <2 x i8> 14; %op2 = zext<2 x i32> %val2 15; %rst = mul <2 x i32> %op1, %op2 16; 17define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind { 18; X86-SSE-LABEL: mul_2xi8: 19; X86-SSE: # %bb.0: # %entry 20; X86-SSE-NEXT: pushl %esi 21; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax 22; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx 23; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx 24; X86-SSE-NEXT: movl c, %esi 25; X86-SSE-NEXT: movzwl (%edx,%ecx), %edx 26; X86-SSE-NEXT: movd %edx, %xmm0 27; X86-SSE-NEXT: movzwl (%eax,%ecx), %eax 28; X86-SSE-NEXT: movd %eax, %xmm1 29; X86-SSE-NEXT: pxor %xmm2, %xmm2 30; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 31; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 32; X86-SSE-NEXT: pmullw %xmm0, %xmm1 33; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] 34; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4) 35; X86-SSE-NEXT: popl %esi 36; X86-SSE-NEXT: retl 37; 38; X86-AVX-LABEL: mul_2xi8: 39; X86-AVX: # %bb.0: # %entry 40; X86-AVX-NEXT: pushl %esi 41; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 42; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx 43; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx 44; X86-AVX-NEXT: movl c, %esi 45; X86-AVX-NEXT: movzwl (%edx,%ecx), %edx 46; X86-AVX-NEXT: vmovd %edx, %xmm0 47; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 48; X86-AVX-NEXT: movzwl (%eax,%ecx), %eax 49; X86-AVX-NEXT: vmovd %eax, %xmm1 50; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero 51; X86-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0 52; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) 53; X86-AVX-NEXT: popl %esi 54; X86-AVX-NEXT: retl 55; 56; X64-SSE-LABEL: mul_2xi8: 57; X64-SSE: # %bb.0: # %entry 58; X64-SSE-NEXT: movq {{.*}}(%rip), %rax 59; X64-SSE-NEXT: movzwl (%rdi,%rdx), %ecx 60; X64-SSE-NEXT: movd %ecx, %xmm0 61; X64-SSE-NEXT: movzwl (%rsi,%rdx), %ecx 62; X64-SSE-NEXT: movd %ecx, %xmm1 63; X64-SSE-NEXT: pxor %xmm2, %xmm2 64; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 65; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 66; X64-SSE-NEXT: 
pmullw %xmm0, %xmm1 67; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] 68; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4) 69; X64-SSE-NEXT: retq 70; 71; X64-AVX-LABEL: mul_2xi8: 72; X64-AVX: # %bb.0: # %entry 73; X64-AVX-NEXT: movq {{.*}}(%rip), %rax 74; X64-AVX-NEXT: movzwl (%rdi,%rdx), %ecx 75; X64-AVX-NEXT: vmovd %ecx, %xmm0 76; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 77; X64-AVX-NEXT: movzwl (%rsi,%rdx), %ecx 78; X64-AVX-NEXT: vmovd %ecx, %xmm1 79; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero 80; X64-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0 81; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) 82; X64-AVX-NEXT: retq 83entry: 84 %pre = load i32*, i32** @c 85 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index 86 %tmp7 = bitcast i8* %tmp6 to <2 x i8>* 87 %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1 88 %tmp8 = zext <2 x i8> %wide.load to <2 x i32> 89 %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index 90 %tmp11 = bitcast i8* %tmp10 to <2 x i8>* 91 %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1 92 %tmp12 = zext <2 x i8> %wide.load17 to <2 x i32> 93 %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8 94 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index 95 %tmp15 = bitcast i32* %tmp14 to <2 x i32>* 96 store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 97 ret void 98} 99 100; %val1 = load <4 x i8> 101; %op1 = zext<4 x i32> %val1 102; %val2 = load <4 x i8> 103; %op2 = zext<4 x i32> %val2 104; %rst = mul <4 x i32> %op1, %op2 105; 106define void @mul_4xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind { 107; X86-SSE-LABEL: mul_4xi8: 108; X86-SSE: # %bb.0: # %entry 109; X86-SSE-NEXT: pushl %esi 110; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax 111; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx 112; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx 113; X86-SSE-NEXT: movl c, %esi 114; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 115; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 116; X86-SSE-NEXT: pxor %xmm2, %xmm2 117; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 118; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 119; X86-SSE-NEXT: pmullw %xmm0, %xmm1 120; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] 121; X86-SSE-NEXT: movdqu %xmm1, (%esi,%ecx,4) 122; X86-SSE-NEXT: popl %esi 123; X86-SSE-NEXT: retl 124; 125; X86-AVX-LABEL: mul_4xi8: 126; X86-AVX: # %bb.0: # %entry 127; X86-AVX-NEXT: pushl %esi 128; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 129; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx 130; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx 131; X86-AVX-NEXT: movl c, %esi 132; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 133; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 134; X86-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0 135; X86-AVX-NEXT: vmovdqu %xmm0, (%esi,%ecx,4) 136; X86-AVX-NEXT: popl %esi 137; X86-AVX-NEXT: retl 138; 139; X64-SSE-LABEL: mul_4xi8: 140; X64-SSE: # 
%bb.0: # %entry 141; X64-SSE-NEXT: movq {{.*}}(%rip), %rax 142; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 143; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 144; X64-SSE-NEXT: pxor %xmm2, %xmm2 145; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 146; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 147; X64-SSE-NEXT: pmullw %xmm0, %xmm1 148; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] 149; X64-SSE-NEXT: movdqu %xmm1, (%rax,%rdx,4) 150; X64-SSE-NEXT: retq 151; 152; X64-AVX-LABEL: mul_4xi8: 153; X64-AVX: # %bb.0: # %entry 154; X64-AVX-NEXT: movq {{.*}}(%rip), %rax 155; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 156; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 157; X64-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0 158; X64-AVX-NEXT: vmovdqu %xmm0, (%rax,%rdx,4) 159; X64-AVX-NEXT: retq 160entry: 161 %pre = load i32*, i32** @c 162 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index 163 %tmp7 = bitcast i8* %tmp6 to <4 x i8>* 164 %wide.load = load <4 x i8>, <4 x i8>* %tmp7, align 1 165 %tmp8 = zext <4 x i8> %wide.load to <4 x i32> 166 %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index 167 %tmp11 = bitcast i8* %tmp10 to <4 x i8>* 168 %wide.load17 = load <4 x i8>, <4 x i8>* %tmp11, align 1 169 %tmp12 = zext <4 x i8> %wide.load17 to <4 x i32> 170 %tmp13 = mul nuw nsw <4 x i32> %tmp12, %tmp8 171 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index 172 %tmp15 = bitcast i32* %tmp14 to <4 x i32>* 173 store <4 x i32> %tmp13, <4 x i32>* %tmp15, align 4 174 ret void 175} 176 177; %val1 = load <8 x i8> 178; %op1 = zext<8 x i32> %val1 179; %val2 = load <8 x i8> 180; %op2 = zext<8 x i32> %val2 181; %rst = mul <8 x i32> %op1, %op2 182; 183define void @mul_8xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind { 184; X86-SSE-LABEL: mul_8xi8: 185; X86-SSE: # %bb.0: # %entry 186; X86-SSE-NEXT: pushl %esi 187; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax 188; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx 189; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx 190; X86-SSE-NEXT: movl c, %esi 191; X86-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 192; X86-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero 193; X86-SSE-NEXT: pxor %xmm2, %xmm2 194; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 195; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 196; X86-SSE-NEXT: pmullw %xmm0, %xmm1 197; X86-SSE-NEXT: movdqa %xmm1, %xmm0 198; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 199; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 200; X86-SSE-NEXT: movdqu %xmm1, 16(%esi,%ecx,4) 201; X86-SSE-NEXT: movdqu %xmm0, (%esi,%ecx,4) 202; X86-SSE-NEXT: popl %esi 203; X86-SSE-NEXT: retl 204; 205; X86-AVX1-LABEL: mul_8xi8: 206; X86-AVX1: # %bb.0: # %entry 207; X86-AVX1-NEXT: pushl %esi 208; 
X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax 209; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx 210; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx 211; X86-AVX1-NEXT: movl c, %esi 212; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 213; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 214; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 215; X86-AVX1-NEXT: vpmaddwd %xmm0, %xmm2, %xmm0 216; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 217; X86-AVX1-NEXT: vpmaddwd %xmm1, %xmm2, %xmm1 218; X86-AVX1-NEXT: vmovdqu %xmm0, 16(%esi,%ecx,4) 219; X86-AVX1-NEXT: vmovdqu %xmm1, (%esi,%ecx,4) 220; X86-AVX1-NEXT: popl %esi 221; X86-AVX1-NEXT: retl 222; 223; X86-AVX2-LABEL: mul_8xi8: 224; X86-AVX2: # %bb.0: # %entry 225; X86-AVX2-NEXT: pushl %esi 226; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax 227; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx 228; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx 229; X86-AVX2-NEXT: movl c, %esi 230; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero 231; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero 232; X86-AVX2-NEXT: vpmaddwd %ymm0, %ymm1, %ymm0 233; X86-AVX2-NEXT: vmovdqu %ymm0, (%esi,%ecx,4) 234; X86-AVX2-NEXT: popl %esi 235; X86-AVX2-NEXT: vzeroupper 236; X86-AVX2-NEXT: retl 237; 238; X64-SSE-LABEL: mul_8xi8: 239; X64-SSE: # %bb.0: # %entry 240; X64-SSE-NEXT: movq {{.*}}(%rip), %rax 241; X64-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 242; X64-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero 243; X64-SSE-NEXT: pxor %xmm2, %xmm2 244; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 245; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 246; X64-SSE-NEXT: pmullw %xmm0, %xmm1 247; X64-SSE-NEXT: movdqa %xmm1, %xmm0 248; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 249; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 250; X64-SSE-NEXT: movdqu %xmm1, 16(%rax,%rdx,4) 251; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4) 252; X64-SSE-NEXT: retq 253; 254; X64-AVX1-LABEL: mul_8xi8: 255; X64-AVX1: # %bb.0: # %entry 256; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax 257; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 258; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 259; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 260; X64-AVX1-NEXT: vpmaddwd %xmm0, %xmm2, %xmm0 261; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 262; X64-AVX1-NEXT: vpmaddwd %xmm1, %xmm2, %xmm1 263; X64-AVX1-NEXT: vmovdqu %xmm0, 16(%rax,%rdx,4) 264; X64-AVX1-NEXT: vmovdqu %xmm1, (%rax,%rdx,4) 265; X64-AVX1-NEXT: retq 266; 267; X64-AVX2-LABEL: mul_8xi8: 268; X64-AVX2: # %bb.0: # %entry 269; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax 270; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero 271; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero 272; X64-AVX2-NEXT: vpmaddwd %ymm0, %ymm1, %ymm0 273; X64-AVX2-NEXT: vmovdqu %ymm0, (%rax,%rdx,4) 274; X64-AVX2-NEXT: vzeroupper 275; X64-AVX2-NEXT: retq 276entry: 277 %pre = load i32*, i32** @c 278 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index 279 %tmp7 = bitcast i8* %tmp6 to <8 x i8>* 280 %wide.load = load <8 x i8>, <8 x i8>* %tmp7, align 1 281 %tmp8 = zext <8 x i8> %wide.load to <8 x i32> 282 %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index 283 %tmp11 = bitcast i8* %tmp10 to <8 x i8>* 284 %wide.load17 = load <8 x i8>, <8 x i8>* %tmp11, align 1 285 %tmp12 = zext <8 x i8> %wide.load17 to <8 x i32> 286 %tmp13 = mul nuw nsw <8 x i32> %tmp12, %tmp8 287 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index 288 %tmp15 = bitcast i32* %tmp14 to <8 x i32>* 289 store <8 x i32> %tmp13, <8 x i32>* %tmp15, align 4 290 ret void 291} 292 293; %val1 = load <16 x i8> 294; %op1 = zext<16 x i32> %val1 295; %val2 = load <16 x i8> 296; %op2 = zext<16 x i32> %val2 297; %rst = mul <16 x i32> %op1, %op2 298; 299define void @mul_16xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind { 300; X86-SSE-LABEL: mul_16xi8: 301; X86-SSE: # %bb.0: # %entry 302; X86-SSE-NEXT: pushl %esi 303; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax 304; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx 305; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx 306; X86-SSE-NEXT: movl c, %esi 307; X86-SSE-NEXT: movdqu (%edx,%ecx), %xmm0 308; X86-SSE-NEXT: movdqu (%eax,%ecx), %xmm1 309; X86-SSE-NEXT: pxor %xmm2, %xmm2 310; X86-SSE-NEXT: movdqa %xmm0, %xmm3 311; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] 312; X86-SSE-NEXT: movdqa %xmm1, %xmm4 313; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] 314; X86-SSE-NEXT: pmullw %xmm3, %xmm4 315; X86-SSE-NEXT: movdqa %xmm4, %xmm3 316; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] 317; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] 318; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] 319; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] 320; X86-SSE-NEXT: pmullw %xmm0, %xmm1 321; X86-SSE-NEXT: movdqa %xmm1, %xmm0 322; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 323; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 324; X86-SSE-NEXT: movdqu %xmm1, 48(%esi,%ecx,4) 325; X86-SSE-NEXT: movdqu %xmm0, 32(%esi,%ecx,4) 326; X86-SSE-NEXT: movdqu %xmm4, 16(%esi,%ecx,4) 327; X86-SSE-NEXT: movdqu %xmm3, (%esi,%ecx,4) 328; X86-SSE-NEXT: popl %esi 329; X86-SSE-NEXT: retl 330; 331; X86-AVX1-LABEL: mul_16xi8: 332; X86-AVX1: # %bb.0: # %entry 333; X86-AVX1-NEXT: pushl %esi 334; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax 335; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx 336; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx 337; X86-AVX1-NEXT: movl c, %esi 338; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 339; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 340; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 341; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 342; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 343; X86-AVX1-NEXT: vpmaddwd %xmm0, %xmm4, %xmm0 344; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 345; X86-AVX1-NEXT: vpmaddwd %xmm1, %xmm4, %xmm1 346; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 347; X86-AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2 348; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 349; X86-AVX1-NEXT: vpmaddwd %xmm3, %xmm4, %xmm3 350; X86-AVX1-NEXT: vmovdqu %xmm0, 48(%esi,%ecx,4) 351; X86-AVX1-NEXT: vmovdqu %xmm1, 32(%esi,%ecx,4) 352; X86-AVX1-NEXT: vmovdqu %xmm2, 16(%esi,%ecx,4) 353; X86-AVX1-NEXT: vmovdqu %xmm3, (%esi,%ecx,4) 354; X86-AVX1-NEXT: popl %esi 355; X86-AVX1-NEXT: retl 356; 357; X86-AVX2-LABEL: mul_16xi8: 358; X86-AVX2: # %bb.0: # %entry 359; X86-AVX2-NEXT: pushl %esi 360; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax 361; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx 362; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx 363; X86-AVX2-NEXT: movl c, %esi 364; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero 365; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero 366; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero 367; X86-AVX2-NEXT: vpmaddwd %ymm0, %ymm2, %ymm0 368; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero 369; X86-AVX2-NEXT: vpmaddwd %ymm1, %ymm2, %ymm1 370; X86-AVX2-NEXT: vmovdqu %ymm0, 32(%esi,%ecx,4) 371; 
X86-AVX2-NEXT: vmovdqu %ymm1, (%esi,%ecx,4) 372; X86-AVX2-NEXT: popl %esi 373; X86-AVX2-NEXT: vzeroupper 374; X86-AVX2-NEXT: retl 375; 376; X64-SSE-LABEL: mul_16xi8: 377; X64-SSE: # %bb.0: # %entry 378; X64-SSE-NEXT: movq {{.*}}(%rip), %rax 379; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0 380; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm1 381; X64-SSE-NEXT: pxor %xmm2, %xmm2 382; X64-SSE-NEXT: movdqa %xmm0, %xmm3 383; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] 384; X64-SSE-NEXT: movdqa %xmm1, %xmm4 385; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] 386; X64-SSE-NEXT: pmullw %xmm3, %xmm4 387; X64-SSE-NEXT: movdqa %xmm4, %xmm3 388; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] 389; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] 390; X64-SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] 391; X64-SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] 392; X64-SSE-NEXT: pmullw %xmm0, %xmm1 393; X64-SSE-NEXT: movdqa %xmm1, %xmm0 394; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 395; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 396; X64-SSE-NEXT: movdqu %xmm1, 48(%rax,%rdx,4) 397; X64-SSE-NEXT: movdqu %xmm0, 32(%rax,%rdx,4) 398; X64-SSE-NEXT: movdqu %xmm4, 16(%rax,%rdx,4) 399; X64-SSE-NEXT: movdqu %xmm3, (%rax,%rdx,4) 400; X64-SSE-NEXT: retq 401; 402; X64-AVX1-LABEL: mul_16xi8: 403; X64-AVX1: # %bb.0: # %entry 404; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax 405; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 406; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 407; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 408; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 409; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 410; X64-AVX1-NEXT: vpmaddwd %xmm0, %xmm4, %xmm0 411; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 412; X64-AVX1-NEXT: vpmaddwd %xmm1, %xmm4, %xmm1 413; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 414; X64-AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2 415; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 416; X64-AVX1-NEXT: vpmaddwd %xmm3, %xmm4, %xmm3 417; X64-AVX1-NEXT: vmovdqu %xmm0, 48(%rax,%rdx,4) 418; X64-AVX1-NEXT: vmovdqu %xmm1, 32(%rax,%rdx,4) 419; X64-AVX1-NEXT: vmovdqu %xmm2, 16(%rax,%rdx,4) 420; X64-AVX1-NEXT: vmovdqu %xmm3, 
(%rax,%rdx,4) 421; X64-AVX1-NEXT: retq 422; 423; X64-AVX2-LABEL: mul_16xi8: 424; X64-AVX2: # %bb.0: # %entry 425; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax 426; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero 427; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero 428; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero 429; X64-AVX2-NEXT: vpmaddwd %ymm0, %ymm2, %ymm0 430; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero 431; X64-AVX2-NEXT: vpmaddwd %ymm1, %ymm2, %ymm1 432; X64-AVX2-NEXT: vmovdqu %ymm0, 32(%rax,%rdx,4) 433; X64-AVX2-NEXT: vmovdqu %ymm1, (%rax,%rdx,4) 434; X64-AVX2-NEXT: vzeroupper 435; X64-AVX2-NEXT: retq 436entry: 437 %pre = load i32*, i32** @c 438 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index 439 %tmp7 = bitcast i8* %tmp6 to <16 x i8>* 440 %wide.load = load <16 x i8>, <16 x i8>* %tmp7, align 1 441 %tmp8 = zext <16 x i8> %wide.load to <16 x i32> 442 %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index 443 %tmp11 = bitcast i8* %tmp10 to <16 x i8>* 444 %wide.load17 = load <16 x i8>, <16 x i8>* %tmp11, align 1 445 %tmp12 = zext <16 x i8> %wide.load17 to <16 x i32> 446 %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8 447 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index 448 %tmp15 = bitcast i32* %tmp14 to <16 x i32>* 449 store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4 450 ret void 451} 452 453; %val1 = load <2 x i16> 454; %op1 = zext<2 x i32> %val1 455; %val2 = load <2 x i16> 456; %op2 = zext<2 x i32> %val2 457; %rst = mul <2 x i32> %op1, %op2 458; 459define void @mul_2xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind { 460; X86-SSE-LABEL: mul_2xi16: 461; X86-SSE: # %bb.0: # %entry 462; X86-SSE-NEXT: pushl %esi 463; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax 464; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx 465; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx 466; X86-SSE-NEXT: movl c, %esi 467; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 468; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 469; X86-SSE-NEXT: movdqa %xmm1, %xmm2 470; X86-SSE-NEXT: pmulhuw %xmm0, %xmm2 471; X86-SSE-NEXT: pmullw %xmm0, %xmm1 472; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] 473; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4) 474; X86-SSE-NEXT: popl %esi 475; X86-SSE-NEXT: retl 476; 477; X86-AVX-LABEL: mul_2xi16: 478; X86-AVX: # %bb.0: # %entry 479; X86-AVX-NEXT: pushl %esi 480; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 481; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx 482; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx 483; X86-AVX-NEXT: movl c, %esi 484; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 485; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 486; X86-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero 487; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = 
xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 488; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 489; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) 490; X86-AVX-NEXT: popl %esi 491; X86-AVX-NEXT: retl 492; 493; X64-SSE-LABEL: mul_2xi16: 494; X64-SSE: # %bb.0: # %entry 495; X64-SSE-NEXT: movq {{.*}}(%rip), %rax 496; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 497; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 498; X64-SSE-NEXT: movdqa %xmm1, %xmm2 499; X64-SSE-NEXT: pmulhuw %xmm0, %xmm2 500; X64-SSE-NEXT: pmullw %xmm0, %xmm1 501; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] 502; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4) 503; X64-SSE-NEXT: retq 504; 505; X64-AVX-LABEL: mul_2xi16: 506; X64-AVX: # %bb.0: # %entry 507; X64-AVX-NEXT: movq {{.*}}(%rip), %rax 508; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 509; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 510; X64-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero 511; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 512; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 513; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) 514; X64-AVX-NEXT: retq 515entry: 516 %pre = load i32*, i32** @c 517 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index 518 %tmp7 = bitcast i8* %tmp6 to <2 x i16>* 519 %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1 520 %tmp8 = zext <2 x i16> %wide.load to <2 x i32> 521 %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index 522 %tmp11 = bitcast i8* %tmp10 to <2 x i16>* 523 %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1 524 %tmp12 = zext <2 x i16> %wide.load17 to <2 x i32> 525 %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8 526 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index 527 %tmp15 = bitcast i32* %tmp14 to <2 x i32>* 528 store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 529 ret void 530} 531 532; %val1 = load <4 x i16> 533; %op1 = zext<4 x i32> %val1 534; %val2 = load <4 x i16> 535; %op2 = zext<4 x i32> %val2 536; %rst = mul <4 x i32> %op1, %op2 537; 538define void @mul_4xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind { 539; X86-SSE-LABEL: mul_4xi16: 540; X86-SSE: # %bb.0: # %entry 541; X86-SSE-NEXT: pushl %esi 542; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax 543; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx 544; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx 545; X86-SSE-NEXT: movl c, %esi 546; X86-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 547; X86-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero 548; X86-SSE-NEXT: movdqa %xmm1, %xmm2 549; X86-SSE-NEXT: pmulhuw %xmm0, %xmm2 550; X86-SSE-NEXT: pmullw %xmm0, %xmm1 551; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] 552; X86-SSE-NEXT: movdqu %xmm1, (%esi,%ecx,4) 553; X86-SSE-NEXT: popl %esi 554; X86-SSE-NEXT: retl 555; 556; X86-AVX-LABEL: mul_4xi16: 557; X86-AVX: # %bb.0: # %entry 558; X86-AVX-NEXT: pushl %esi 559; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 560; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx 561; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx 562; X86-AVX-NEXT: movl c, %esi 563; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 564; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 565; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 566; X86-AVX-NEXT: vmovdqu %xmm0, (%esi,%ecx,4) 567; X86-AVX-NEXT: popl %esi 568; X86-AVX-NEXT: retl 
569; 570; X64-SSE-LABEL: mul_4xi16: 571; X64-SSE: # %bb.0: # %entry 572; X64-SSE-NEXT: movq {{.*}}(%rip), %rax 573; X64-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 574; X64-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero 575; X64-SSE-NEXT: movdqa %xmm1, %xmm2 576; X64-SSE-NEXT: pmulhuw %xmm0, %xmm2 577; X64-SSE-NEXT: pmullw %xmm0, %xmm1 578; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] 579; X64-SSE-NEXT: movdqu %xmm1, (%rax,%rdx,4) 580; X64-SSE-NEXT: retq 581; 582; X64-AVX-LABEL: mul_4xi16: 583; X64-AVX: # %bb.0: # %entry 584; X64-AVX-NEXT: movq {{.*}}(%rip), %rax 585; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 586; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 587; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 588; X64-AVX-NEXT: vmovdqu %xmm0, (%rax,%rdx,4) 589; X64-AVX-NEXT: retq 590entry: 591 %pre = load i32*, i32** @c 592 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index 593 %tmp7 = bitcast i8* %tmp6 to <4 x i16>* 594 %wide.load = load <4 x i16>, <4 x i16>* %tmp7, align 1 595 %tmp8 = zext <4 x i16> %wide.load to <4 x i32> 596 %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index 597 %tmp11 = bitcast i8* %tmp10 to <4 x i16>* 598 %wide.load17 = load <4 x i16>, <4 x i16>* %tmp11, align 1 599 %tmp12 = zext <4 x i16> %wide.load17 to <4 x i32> 600 %tmp13 = mul nuw nsw <4 x i32> %tmp12, %tmp8 601 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index 602 %tmp15 = bitcast i32* %tmp14 to <4 x i32>* 603 store <4 x i32> %tmp13, <4 x i32>* %tmp15, align 4 604 ret void 605} 606 607; %val1 = load <8 x i16> 608; %op1 = zext<8 x i32> %val1 609; %val2 = load <8 x i16> 610; %op2 = zext<8 x i32> %val2 611; %rst = mul <8 x i32> %op1, %op2 612; 613define void @mul_8xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind { 614; X86-SSE-LABEL: mul_8xi16: 615; X86-SSE: # %bb.0: # %entry 616; X86-SSE-NEXT: pushl %esi 617; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax 618; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx 619; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx 620; X86-SSE-NEXT: movl c, %esi 621; X86-SSE-NEXT: movdqu (%edx,%ecx), %xmm0 622; X86-SSE-NEXT: movdqu (%eax,%ecx), %xmm1 623; X86-SSE-NEXT: movdqa %xmm1, %xmm2 624; X86-SSE-NEXT: pmulhuw %xmm0, %xmm2 625; X86-SSE-NEXT: pmullw %xmm0, %xmm1 626; X86-SSE-NEXT: movdqa %xmm1, %xmm0 627; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 628; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 629; X86-SSE-NEXT: movdqu %xmm1, 16(%esi,%ecx,4) 630; X86-SSE-NEXT: movdqu %xmm0, (%esi,%ecx,4) 631; X86-SSE-NEXT: popl %esi 632; X86-SSE-NEXT: retl 633; 634; X86-AVX1-LABEL: mul_8xi16: 635; X86-AVX1: # %bb.0: # %entry 636; X86-AVX1-NEXT: pushl %esi 637; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax 638; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx 639; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx 640; X86-AVX1-NEXT: movl c, %esi 641; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 642; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 643; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 644; X86-AVX1-NEXT: vpmulld %xmm0, %xmm2, %xmm0 645; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 646; X86-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1 647; X86-AVX1-NEXT: 
vmovdqu %xmm0, 16(%esi,%ecx,4) 648; X86-AVX1-NEXT: vmovdqu %xmm1, (%esi,%ecx,4) 649; X86-AVX1-NEXT: popl %esi 650; X86-AVX1-NEXT: retl 651; 652; X86-AVX2-LABEL: mul_8xi16: 653; X86-AVX2: # %bb.0: # %entry 654; X86-AVX2-NEXT: pushl %esi 655; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax 656; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx 657; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx 658; X86-AVX2-NEXT: movl c, %esi 659; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 660; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 661; X86-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0 662; X86-AVX2-NEXT: vmovdqu %ymm0, (%esi,%ecx,4) 663; X86-AVX2-NEXT: popl %esi 664; X86-AVX2-NEXT: vzeroupper 665; X86-AVX2-NEXT: retl 666; 667; X64-SSE-LABEL: mul_8xi16: 668; X64-SSE: # %bb.0: # %entry 669; X64-SSE-NEXT: movq {{.*}}(%rip), %rax 670; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0 671; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm1 672; X64-SSE-NEXT: movdqa %xmm1, %xmm2 673; X64-SSE-NEXT: pmulhuw %xmm0, %xmm2 674; X64-SSE-NEXT: pmullw %xmm0, %xmm1 675; X64-SSE-NEXT: movdqa %xmm1, %xmm0 676; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 677; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 678; X64-SSE-NEXT: movdqu %xmm1, 16(%rax,%rdx,4) 679; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4) 680; X64-SSE-NEXT: retq 681; 682; X64-AVX1-LABEL: mul_8xi16: 683; X64-AVX1: # %bb.0: # %entry 684; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax 685; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 686; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 687; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 688; X64-AVX1-NEXT: vpmulld %xmm0, %xmm2, %xmm0 689; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 690; X64-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1 691; X64-AVX1-NEXT: vmovdqu %xmm0, 16(%rax,%rdx,4) 692; X64-AVX1-NEXT: vmovdqu %xmm1, (%rax,%rdx,4) 693; X64-AVX1-NEXT: retq 694; 695; X64-AVX2-LABEL: mul_8xi16: 696; X64-AVX2: # %bb.0: # %entry 697; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax 698; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 699; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 700; X64-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0 701; X64-AVX2-NEXT: vmovdqu %ymm0, (%rax,%rdx,4) 702; X64-AVX2-NEXT: vzeroupper 703; X64-AVX2-NEXT: retq 704entry: 705 %pre = load i32*, i32** @c 706 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index 707 %tmp7 = bitcast i8* %tmp6 to <8 x i16>* 708 %wide.load = load <8 x i16>, <8 x i16>* %tmp7, align 1 709 %tmp8 = zext <8 x i16> %wide.load to <8 x i32> 710 %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index 711 %tmp11 = bitcast i8* %tmp10 to <8 x i16>* 712 %wide.load17 = load <8 x i16>, <8 x i16>* %tmp11, align 1 713 %tmp12 = zext <8 x i16> %wide.load17 to <8 x i32> 714 %tmp13 = mul nuw nsw <8 x i32> %tmp12, %tmp8 715 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index 716 %tmp15 = bitcast i32* %tmp14 to <8 x i32>* 717 store <8 x i32> %tmp13, <8 x i32>* %tmp15, align 4 718 ret void 719} 720 721; 
%val1 = load <16 x i16> 722; %op1 = zext<16 x i32> %val1 723; %val2 = load <16 x i16> 724; %op2 = zext<16 x i32> %val2 725; %rst = mul <16 x i32> %op1, %op2 726; 727define void @mul_16xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind { 728; X86-SSE-LABEL: mul_16xi16: 729; X86-SSE: # %bb.0: # %entry 730; X86-SSE-NEXT: pushl %esi 731; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax 732; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx 733; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx 734; X86-SSE-NEXT: movl c, %esi 735; X86-SSE-NEXT: movdqu (%edx,%ecx), %xmm0 736; X86-SSE-NEXT: movdqu 16(%edx,%ecx), %xmm1 737; X86-SSE-NEXT: movdqu (%eax,%ecx), %xmm2 738; X86-SSE-NEXT: movdqu 16(%eax,%ecx), %xmm3 739; X86-SSE-NEXT: movdqa %xmm2, %xmm4 740; X86-SSE-NEXT: pmulhuw %xmm0, %xmm4 741; X86-SSE-NEXT: pmullw %xmm0, %xmm2 742; X86-SSE-NEXT: movdqa %xmm2, %xmm0 743; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] 744; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] 745; X86-SSE-NEXT: movdqa %xmm3, %xmm4 746; X86-SSE-NEXT: pmulhuw %xmm1, %xmm4 747; X86-SSE-NEXT: pmullw %xmm1, %xmm3 748; X86-SSE-NEXT: movdqa %xmm3, %xmm1 749; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] 750; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] 751; X86-SSE-NEXT: movdqu %xmm3, 32(%esi,%ecx,4) 752; X86-SSE-NEXT: movdqu %xmm1, 48(%esi,%ecx,4) 753; X86-SSE-NEXT: movdqu %xmm2, (%esi,%ecx,4) 754; X86-SSE-NEXT: movdqu %xmm0, 16(%esi,%ecx,4) 755; X86-SSE-NEXT: popl %esi 756; X86-SSE-NEXT: retl 757; 758; X86-AVX1-LABEL: mul_16xi16: 759; X86-AVX1: # %bb.0: # %entry 760; X86-AVX1-NEXT: pushl %esi 761; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax 762; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx 763; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx 764; X86-AVX1-NEXT: movl c, %esi 765; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 766; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 767; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 768; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 769; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 770; X86-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0 771; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 772; X86-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1 773; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 774; X86-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2 775; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 776; X86-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3 777; X86-AVX1-NEXT: vmovdqu %xmm0, 48(%esi,%ecx,4) 778; X86-AVX1-NEXT: vmovdqu %xmm1, 32(%esi,%ecx,4) 779; X86-AVX1-NEXT: vmovdqu %xmm2, 16(%esi,%ecx,4) 780; X86-AVX1-NEXT: vmovdqu %xmm3, (%esi,%ecx,4) 781; X86-AVX1-NEXT: popl %esi 782; X86-AVX1-NEXT: retl 783; 784; X86-AVX2-LABEL: mul_16xi16: 785; X86-AVX2: # %bb.0: # %entry 786; X86-AVX2-NEXT: pushl %esi 787; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax 788; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx 789; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx 790; X86-AVX2-NEXT: movl c, %esi 791; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = 
mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 792; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 793; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 794; X86-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0 795; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 796; X86-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1 797; X86-AVX2-NEXT: vmovdqu %ymm0, 32(%esi,%ecx,4) 798; X86-AVX2-NEXT: vmovdqu %ymm1, (%esi,%ecx,4) 799; X86-AVX2-NEXT: popl %esi 800; X86-AVX2-NEXT: vzeroupper 801; X86-AVX2-NEXT: retl 802; 803; X64-SSE-LABEL: mul_16xi16: 804; X64-SSE: # %bb.0: # %entry 805; X64-SSE-NEXT: movq {{.*}}(%rip), %rax 806; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0 807; X64-SSE-NEXT: movdqu 16(%rdi,%rdx), %xmm1 808; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm2 809; X64-SSE-NEXT: movdqu 16(%rsi,%rdx), %xmm3 810; X64-SSE-NEXT: movdqa %xmm2, %xmm4 811; X64-SSE-NEXT: pmulhuw %xmm0, %xmm4 812; X64-SSE-NEXT: pmullw %xmm0, %xmm2 813; X64-SSE-NEXT: movdqa %xmm2, %xmm0 814; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] 815; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] 816; X64-SSE-NEXT: movdqa %xmm3, %xmm4 817; X64-SSE-NEXT: pmulhuw %xmm1, %xmm4 818; X64-SSE-NEXT: pmullw %xmm1, %xmm3 819; X64-SSE-NEXT: movdqa %xmm3, %xmm1 820; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] 821; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] 822; X64-SSE-NEXT: movdqu %xmm3, 32(%rax,%rdx,4) 823; X64-SSE-NEXT: movdqu %xmm1, 48(%rax,%rdx,4) 824; X64-SSE-NEXT: movdqu %xmm2, (%rax,%rdx,4) 825; X64-SSE-NEXT: movdqu %xmm0, 16(%rax,%rdx,4) 826; X64-SSE-NEXT: retq 827; 828; X64-AVX1-LABEL: mul_16xi16: 829; X64-AVX1: # %bb.0: # %entry 830; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax 831; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 832; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 833; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 834; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 835; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 836; X64-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0 837; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 838; X64-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1 839; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 840; X64-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2 841; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 842; X64-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3 843; X64-AVX1-NEXT: vmovdqu %xmm0, 48(%rax,%rdx,4) 844; X64-AVX1-NEXT: vmovdqu %xmm1, 32(%rax,%rdx,4) 845; X64-AVX1-NEXT: vmovdqu %xmm2, 16(%rax,%rdx,4) 846; X64-AVX1-NEXT: vmovdqu %xmm3, (%rax,%rdx,4) 847; X64-AVX1-NEXT: retq 848; 849; X64-AVX2-LABEL: mul_16xi16: 850; X64-AVX2: # %bb.0: # %entry 851; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax 852; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = 
mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 853; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 854; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 855; X64-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0 856; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 857; X64-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1 858; X64-AVX2-NEXT: vmovdqu %ymm0, 32(%rax,%rdx,4) 859; X64-AVX2-NEXT: vmovdqu %ymm1, (%rax,%rdx,4) 860; X64-AVX2-NEXT: vzeroupper 861; X64-AVX2-NEXT: retq 862entry: 863 %pre = load i32*, i32** @c 864 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index 865 %tmp7 = bitcast i8* %tmp6 to <16 x i16>* 866 %wide.load = load <16 x i16>, <16 x i16>* %tmp7, align 1 867 %tmp8 = zext <16 x i16> %wide.load to <16 x i32> 868 %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index 869 %tmp11 = bitcast i8* %tmp10 to <16 x i16>* 870 %wide.load17 = load <16 x i16>, <16 x i16>* %tmp11, align 1 871 %tmp12 = zext <16 x i16> %wide.load17 to <16 x i32> 872 %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8 873 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index 874 %tmp15 = bitcast i32* %tmp14 to <16 x i32>* 875 store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4 876 ret void 877} 878 879; %val1 = load <2 x i8> 880; %op1 = sext<2 x i32> %val1 881; %val2 = load <2 x i8> 882; %op2 = sext<2 x i32> %val2 883; %rst = mul <2 x i32> %op1, %op2 884; 885define void @mul_2xi8_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind { 886; X86-SSE-LABEL: mul_2xi8_sext: 887; X86-SSE: # %bb.0: # %entry 888; X86-SSE-NEXT: pushl %esi 889; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax 890; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx 891; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx 892; X86-SSE-NEXT: movl c, %esi 893; X86-SSE-NEXT: movzwl (%edx,%ecx), %edx 894; X86-SSE-NEXT: movd %edx, %xmm0 895; X86-SSE-NEXT: movzwl (%eax,%ecx), %eax 896; X86-SSE-NEXT: movd %eax, %xmm1 897; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 898; X86-SSE-NEXT: psraw $8, %xmm0 899; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 900; X86-SSE-NEXT: psraw $8, %xmm1 901; X86-SSE-NEXT: pmullw %xmm0, %xmm1 902; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7] 903; X86-SSE-NEXT: psrad $16, %xmm0 904; X86-SSE-NEXT: movq %xmm0, (%esi,%ecx,4) 905; X86-SSE-NEXT: popl %esi 906; X86-SSE-NEXT: retl 907; 908; X86-AVX-LABEL: mul_2xi8_sext: 909; X86-AVX: # %bb.0: # %entry 910; X86-AVX-NEXT: pushl %esi 911; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 912; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx 913; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx 914; X86-AVX-NEXT: movl c, %esi 915; X86-AVX-NEXT: movzwl (%edx,%ecx), %edx 916; X86-AVX-NEXT: vmovd %edx, %xmm0 917; X86-AVX-NEXT: vpmovsxbd %xmm0, %xmm0 918; X86-AVX-NEXT: movzwl (%eax,%ecx), %eax 919; X86-AVX-NEXT: vmovd %eax, %xmm1 920; X86-AVX-NEXT: vpmovsxbd %xmm1, %xmm1 921; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 922; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) 923; X86-AVX-NEXT: popl %esi 924; X86-AVX-NEXT: retl 925; 926; X64-SSE-LABEL: mul_2xi8_sext: 927; X64-SSE: # %bb.0: # %entry 928; X64-SSE-NEXT: movq {{.*}}(%rip), %rax 929; X64-SSE-NEXT: movzwl (%rdi,%rdx), %ecx 930; X64-SSE-NEXT: movd %ecx, %xmm0 931; X64-SSE-NEXT: 
movzwl (%rsi,%rdx), %ecx 932; X64-SSE-NEXT: movd %ecx, %xmm1 933; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 934; X64-SSE-NEXT: psraw $8, %xmm0 935; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 936; X64-SSE-NEXT: psraw $8, %xmm1 937; X64-SSE-NEXT: pmullw %xmm0, %xmm1 938; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7] 939; X64-SSE-NEXT: psrad $16, %xmm0 940; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4) 941; X64-SSE-NEXT: retq 942; 943; X64-AVX-LABEL: mul_2xi8_sext: 944; X64-AVX: # %bb.0: # %entry 945; X64-AVX-NEXT: movq {{.*}}(%rip), %rax 946; X64-AVX-NEXT: movzwl (%rdi,%rdx), %ecx 947; X64-AVX-NEXT: vmovd %ecx, %xmm0 948; X64-AVX-NEXT: vpmovsxbd %xmm0, %xmm0 949; X64-AVX-NEXT: movzwl (%rsi,%rdx), %ecx 950; X64-AVX-NEXT: vmovd %ecx, %xmm1 951; X64-AVX-NEXT: vpmovsxbd %xmm1, %xmm1 952; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 953; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) 954; X64-AVX-NEXT: retq 955entry: 956 %pre = load i32*, i32** @c 957 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index 958 %tmp7 = bitcast i8* %tmp6 to <2 x i8>* 959 %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1 960 %tmp8 = sext <2 x i8> %wide.load to <2 x i32> 961 %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index 962 %tmp11 = bitcast i8* %tmp10 to <2 x i8>* 963 %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1 964 %tmp12 = sext <2 x i8> %wide.load17 to <2 x i32> 965 %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8 966 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index 967 %tmp15 = bitcast i32* %tmp14 to <2 x i32>* 968 store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 969 ret void 970} 971 972; %val1 = load <2 x i8> 973; %op1 = sext<2 x i32> %val1 974; %val2 = load <2 x i8> 975; %op2 = zext<2 x i32> %val2 976; %rst = mul <2 x i32> %op1, %op2 977; 978define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind { 979; X86-SSE-LABEL: mul_2xi8_sext_zext: 980; X86-SSE: # %bb.0: # %entry 981; X86-SSE-NEXT: pushl %esi 982; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax 983; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx 984; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx 985; X86-SSE-NEXT: movl c, %esi 986; X86-SSE-NEXT: movzwl (%edx,%ecx), %edx 987; X86-SSE-NEXT: movd %edx, %xmm0 988; X86-SSE-NEXT: movzwl (%eax,%ecx), %eax 989; X86-SSE-NEXT: movd %eax, %xmm1 990; X86-SSE-NEXT: pxor %xmm2, %xmm2 991; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 992; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 993; X86-SSE-NEXT: psraw $8, %xmm0 994; X86-SSE-NEXT: movdqa %xmm1, %xmm2 995; X86-SSE-NEXT: pmulhw %xmm0, %xmm2 996; X86-SSE-NEXT: pmullw %xmm1, %xmm0 997; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 998; X86-SSE-NEXT: movq %xmm0, (%esi,%ecx,4) 999; X86-SSE-NEXT: popl %esi 1000; X86-SSE-NEXT: retl 1001; 1002; X86-AVX-LABEL: mul_2xi8_sext_zext: 1003; X86-AVX: # %bb.0: # %entry 1004; X86-AVX-NEXT: pushl %esi 1005; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 1006; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx 1007; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx 1008; X86-AVX-NEXT: movl c, %esi 1009; X86-AVX-NEXT: movzwl (%edx,%ecx), %edx 1010; X86-AVX-NEXT: vmovd %edx, %xmm0 1011; X86-AVX-NEXT: vpmovsxbd %xmm0, %xmm0 1012; X86-AVX-NEXT: movzwl (%eax,%ecx), %eax 1013; X86-AVX-NEXT: vmovd %eax, %xmm1 1014; 
X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero 1015; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 1016; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) 1017; X86-AVX-NEXT: popl %esi 1018; X86-AVX-NEXT: retl 1019; 1020; X64-SSE-LABEL: mul_2xi8_sext_zext: 1021; X64-SSE: # %bb.0: # %entry 1022; X64-SSE-NEXT: movq {{.*}}(%rip), %rax 1023; X64-SSE-NEXT: movzwl (%rdi,%rdx), %ecx 1024; X64-SSE-NEXT: movd %ecx, %xmm0 1025; X64-SSE-NEXT: movzwl (%rsi,%rdx), %ecx 1026; X64-SSE-NEXT: movd %ecx, %xmm1 1027; X64-SSE-NEXT: pxor %xmm2, %xmm2 1028; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 1029; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1030; X64-SSE-NEXT: psraw $8, %xmm0 1031; X64-SSE-NEXT: movdqa %xmm1, %xmm2 1032; X64-SSE-NEXT: pmulhw %xmm0, %xmm2 1033; X64-SSE-NEXT: pmullw %xmm1, %xmm0 1034; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 1035; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4) 1036; X64-SSE-NEXT: retq 1037; 1038; X64-AVX-LABEL: mul_2xi8_sext_zext: 1039; X64-AVX: # %bb.0: # %entry 1040; X64-AVX-NEXT: movq {{.*}}(%rip), %rax 1041; X64-AVX-NEXT: movzwl (%rdi,%rdx), %ecx 1042; X64-AVX-NEXT: vmovd %ecx, %xmm0 1043; X64-AVX-NEXT: vpmovsxbd %xmm0, %xmm0 1044; X64-AVX-NEXT: movzwl (%rsi,%rdx), %ecx 1045; X64-AVX-NEXT: vmovd %ecx, %xmm1 1046; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero 1047; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 1048; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) 1049; X64-AVX-NEXT: retq 1050entry: 1051 %pre = load i32*, i32** @c 1052 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index 1053 %tmp7 = bitcast i8* %tmp6 to <2 x i8>* 1054 %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1 1055 %tmp8 = sext <2 x i8> %wide.load to <2 x i32> 1056 %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index 1057 %tmp11 = bitcast i8* %tmp10 to <2 x i8>* 1058 %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1 1059 %tmp12 = zext <2 x i8> %wide.load17 to <2 x i32> 1060 %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8 1061 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index 1062 %tmp15 = bitcast i32* %tmp14 to <2 x i32>* 1063 store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 1064 ret void 1065} 1066 1067; %val1 = load <2 x i16> 1068; %op1 = sext<2 x i32> %val1 1069; %val2 = load <2 x i16> 1070; %op2 = sext<2 x i32> %val2 1071; %rst = mul <2 x i32> %op1, %op2 1072; 1073define void @mul_2xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind { 1074; X86-SSE-LABEL: mul_2xi16_sext: 1075; X86-SSE: # %bb.0: # %entry 1076; X86-SSE-NEXT: pushl %esi 1077; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax 1078; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx 1079; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx 1080; X86-SSE-NEXT: movl c, %esi 1081; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1082; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1083; X86-SSE-NEXT: movdqa %xmm1, %xmm2 1084; X86-SSE-NEXT: pmulhw %xmm0, %xmm2 1085; X86-SSE-NEXT: pmullw %xmm0, %xmm1 1086; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] 1087; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4) 1088; X86-SSE-NEXT: popl %esi 1089; X86-SSE-NEXT: retl 1090; 1091; X86-AVX-LABEL: 
mul_2xi16_sext: 1092; X86-AVX: # %bb.0: # %entry 1093; X86-AVX-NEXT: pushl %esi 1094; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 1095; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx 1096; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx 1097; X86-AVX-NEXT: movl c, %esi 1098; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1099; X86-AVX-NEXT: vpmovsxwd %xmm0, %xmm0 1100; X86-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1101; X86-AVX-NEXT: vpmovsxwd %xmm1, %xmm1 1102; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 1103; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) 1104; X86-AVX-NEXT: popl %esi 1105; X86-AVX-NEXT: retl 1106; 1107; X64-SSE-LABEL: mul_2xi16_sext: 1108; X64-SSE: # %bb.0: # %entry 1109; X64-SSE-NEXT: movq {{.*}}(%rip), %rax 1110; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1111; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1112; X64-SSE-NEXT: movdqa %xmm1, %xmm2 1113; X64-SSE-NEXT: pmulhw %xmm0, %xmm2 1114; X64-SSE-NEXT: pmullw %xmm0, %xmm1 1115; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] 1116; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4) 1117; X64-SSE-NEXT: retq 1118; 1119; X64-AVX-LABEL: mul_2xi16_sext: 1120; X64-AVX: # %bb.0: # %entry 1121; X64-AVX-NEXT: movq {{.*}}(%rip), %rax 1122; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1123; X64-AVX-NEXT: vpmovsxwd %xmm0, %xmm0 1124; X64-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1125; X64-AVX-NEXT: vpmovsxwd %xmm1, %xmm1 1126; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 1127; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) 1128; X64-AVX-NEXT: retq 1129entry: 1130 %pre = load i32*, i32** @c 1131 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index 1132 %tmp7 = bitcast i8* %tmp6 to <2 x i16>* 1133 %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1 1134 %tmp8 = sext <2 x i16> %wide.load to <2 x i32> 1135 %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index 1136 %tmp11 = bitcast i8* %tmp10 to <2 x i16>* 1137 %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1 1138 %tmp12 = sext <2 x i16> %wide.load17 to <2 x i32> 1139 %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8 1140 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index 1141 %tmp15 = bitcast i32* %tmp14 to <2 x i32>* 1142 store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 1143 ret void 1144} 1145 1146; %val1 = load <2 x i16> 1147; %op1 = sext<2 x i32> %val1 1148; %val2 = load <2 x i16> 1149; %op2 = zext<2 x i32> %val2 1150; %rst = mul <2 x i32> %op1, %op2 1151; 1152define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind { 1153; X86-SSE-LABEL: mul_2xi16_sext_zext: 1154; X86-SSE: # %bb.0: # %entry 1155; X86-SSE-NEXT: pushl %esi 1156; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax 1157; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx 1158; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx 1159; X86-SSE-NEXT: movl c, %esi 1160; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1161; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] 1162; X86-SSE-NEXT: psrad $16, %xmm0 1163; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1164; X86-SSE-NEXT: pxor %xmm2, %xmm2 1165; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] 1166; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 1167; X86-SSE-NEXT: pmuludq %xmm0, %xmm1 1168; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 1169; X86-SSE-NEXT: pmuludq %xmm2, %xmm0 1170; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1171; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4) 1172; X86-SSE-NEXT: popl %esi 1173; X86-SSE-NEXT: retl 1174; 1175; X86-AVX-LABEL: mul_2xi16_sext_zext: 1176; X86-AVX: # %bb.0: # %entry 1177; X86-AVX-NEXT: pushl %esi 1178; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 1179; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx 1180; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx 1181; X86-AVX-NEXT: movl c, %esi 1182; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1183; X86-AVX-NEXT: vpmovsxwd %xmm0, %xmm0 1184; X86-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1185; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 1186; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 1187; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) 1188; X86-AVX-NEXT: popl %esi 1189; X86-AVX-NEXT: retl 1190; 1191; X64-SSE-LABEL: mul_2xi16_sext_zext: 1192; X64-SSE: # %bb.0: # %entry 1193; X64-SSE-NEXT: movq {{.*}}(%rip), %rax 1194; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1195; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] 1196; X64-SSE-NEXT: psrad $16, %xmm0 1197; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1198; X64-SSE-NEXT: pxor %xmm2, %xmm2 1199; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] 1200; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 1201; X64-SSE-NEXT: pmuludq %xmm0, %xmm1 1202; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 1203; X64-SSE-NEXT: pmuludq %xmm2, %xmm0 1204; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1205; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4) 1206; X64-SSE-NEXT: retq 1207; 1208; X64-AVX-LABEL: mul_2xi16_sext_zext: 1209; X64-AVX: # %bb.0: # %entry 1210; X64-AVX-NEXT: movq {{.*}}(%rip), %rax 1211; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1212; X64-AVX-NEXT: vpmovsxwd %xmm0, %xmm0 1213; X64-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1214; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 1215; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 1216; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) 1217; X64-AVX-NEXT: retq 1218entry: 1219 %pre = load i32*, i32** @c 1220 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index 1221 %tmp7 = bitcast i8* %tmp6 to <2 x i16>* 1222 %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1 1223 %tmp8 = sext <2 x i16> %wide.load to <2 x i32> 1224 %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index 1225 %tmp11 = bitcast i8* %tmp10 to <2 x i16>* 1226 %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1 1227 %tmp12 = zext <2 x i16> %wide.load17 to <2 x i32> 1228 %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8 1229 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index 1230 %tmp15 = bitcast i32* %tmp14 to <2 x i32>* 1231 store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 1232 ret void 1233} 1234 1235; %val1 = load <16 x i16> 1236; %op1 = sext<16 x i32> %val1 1237; %val2 = load <16 x i16> 1238; %op2 = sext<16 x i32> %val2 1239; %rst = mul <16 x i32> %op1, %op2 1240; 1241define void @mul_16xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind { 1242; X86-SSE-LABEL: mul_16xi16_sext: 1243; X86-SSE: # %bb.0: # %entry 1244; X86-SSE-NEXT: pushl %esi 1245; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax 1246; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx 1247; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx 1248; X86-SSE-NEXT: movl c, %esi 1249; X86-SSE-NEXT: movdqu (%edx,%ecx), %xmm0 1250; 
X86-SSE-NEXT: movdqu 16(%edx,%ecx), %xmm1 1251; X86-SSE-NEXT: movdqu (%eax,%ecx), %xmm2 1252; X86-SSE-NEXT: movdqu 16(%eax,%ecx), %xmm3 1253; X86-SSE-NEXT: movdqa %xmm2, %xmm4 1254; X86-SSE-NEXT: pmulhw %xmm0, %xmm4 1255; X86-SSE-NEXT: pmullw %xmm0, %xmm2 1256; X86-SSE-NEXT: movdqa %xmm2, %xmm0 1257; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] 1258; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] 1259; X86-SSE-NEXT: movdqa %xmm3, %xmm4 1260; X86-SSE-NEXT: pmulhw %xmm1, %xmm4 1261; X86-SSE-NEXT: pmullw %xmm1, %xmm3 1262; X86-SSE-NEXT: movdqa %xmm3, %xmm1 1263; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] 1264; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] 1265; X86-SSE-NEXT: movdqu %xmm3, 32(%esi,%ecx,4) 1266; X86-SSE-NEXT: movdqu %xmm1, 48(%esi,%ecx,4) 1267; X86-SSE-NEXT: movdqu %xmm2, (%esi,%ecx,4) 1268; X86-SSE-NEXT: movdqu %xmm0, 16(%esi,%ecx,4) 1269; X86-SSE-NEXT: popl %esi 1270; X86-SSE-NEXT: retl 1271; 1272; X86-AVX1-LABEL: mul_16xi16_sext: 1273; X86-AVX1: # %bb.0: # %entry 1274; X86-AVX1-NEXT: pushl %esi 1275; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax 1276; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx 1277; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx 1278; X86-AVX1-NEXT: movl c, %esi 1279; X86-AVX1-NEXT: vpmovsxwd 24(%edx,%ecx), %xmm0 1280; X86-AVX1-NEXT: vpmovsxwd 16(%edx,%ecx), %xmm1 1281; X86-AVX1-NEXT: vpmovsxwd 8(%edx,%ecx), %xmm2 1282; X86-AVX1-NEXT: vpmovsxwd (%edx,%ecx), %xmm3 1283; X86-AVX1-NEXT: vpmovsxwd 24(%eax,%ecx), %xmm4 1284; X86-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0 1285; X86-AVX1-NEXT: vpmovsxwd 16(%eax,%ecx), %xmm4 1286; X86-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1 1287; X86-AVX1-NEXT: vpmovsxwd 8(%eax,%ecx), %xmm4 1288; X86-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2 1289; X86-AVX1-NEXT: vpmovsxwd (%eax,%ecx), %xmm4 1290; X86-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3 1291; X86-AVX1-NEXT: vmovdqu %xmm0, 48(%esi,%ecx,4) 1292; X86-AVX1-NEXT: vmovdqu %xmm1, 32(%esi,%ecx,4) 1293; X86-AVX1-NEXT: vmovdqu %xmm2, 16(%esi,%ecx,4) 1294; X86-AVX1-NEXT: vmovdqu %xmm3, (%esi,%ecx,4) 1295; X86-AVX1-NEXT: popl %esi 1296; X86-AVX1-NEXT: retl 1297; 1298; X86-AVX2-LABEL: mul_16xi16_sext: 1299; X86-AVX2: # %bb.0: # %entry 1300; X86-AVX2-NEXT: pushl %esi 1301; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax 1302; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx 1303; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx 1304; X86-AVX2-NEXT: movl c, %esi 1305; X86-AVX2-NEXT: vpmovsxwd 16(%edx,%ecx), %ymm0 1306; X86-AVX2-NEXT: vpmovsxwd (%edx,%ecx), %ymm1 1307; X86-AVX2-NEXT: vpmovsxwd 16(%eax,%ecx), %ymm2 1308; X86-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0 1309; X86-AVX2-NEXT: vpmovsxwd (%eax,%ecx), %ymm2 1310; X86-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1 1311; X86-AVX2-NEXT: vmovdqu %ymm0, 32(%esi,%ecx,4) 1312; X86-AVX2-NEXT: vmovdqu %ymm1, (%esi,%ecx,4) 1313; X86-AVX2-NEXT: popl %esi 1314; X86-AVX2-NEXT: vzeroupper 1315; X86-AVX2-NEXT: retl 1316; 1317; X64-SSE-LABEL: mul_16xi16_sext: 1318; X64-SSE: # %bb.0: # %entry 1319; X64-SSE-NEXT: movq {{.*}}(%rip), %rax 1320; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0 1321; X64-SSE-NEXT: movdqu 16(%rdi,%rdx), %xmm1 1322; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm2 1323; X64-SSE-NEXT: movdqu 16(%rsi,%rdx), %xmm3 1324; X64-SSE-NEXT: movdqa %xmm2, %xmm4 1325; X64-SSE-NEXT: pmulhw %xmm0, %xmm4 1326; X64-SSE-NEXT: pmullw %xmm0, %xmm2 1327; X64-SSE-NEXT: movdqa 
%xmm2, %xmm0 1328; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] 1329; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] 1330; X64-SSE-NEXT: movdqa %xmm3, %xmm4 1331; X64-SSE-NEXT: pmulhw %xmm1, %xmm4 1332; X64-SSE-NEXT: pmullw %xmm1, %xmm3 1333; X64-SSE-NEXT: movdqa %xmm3, %xmm1 1334; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] 1335; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] 1336; X64-SSE-NEXT: movdqu %xmm3, 32(%rax,%rdx,4) 1337; X64-SSE-NEXT: movdqu %xmm1, 48(%rax,%rdx,4) 1338; X64-SSE-NEXT: movdqu %xmm2, (%rax,%rdx,4) 1339; X64-SSE-NEXT: movdqu %xmm0, 16(%rax,%rdx,4) 1340; X64-SSE-NEXT: retq 1341; 1342; X64-AVX1-LABEL: mul_16xi16_sext: 1343; X64-AVX1: # %bb.0: # %entry 1344; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax 1345; X64-AVX1-NEXT: vpmovsxwd 24(%rdi,%rdx), %xmm0 1346; X64-AVX1-NEXT: vpmovsxwd 16(%rdi,%rdx), %xmm1 1347; X64-AVX1-NEXT: vpmovsxwd 8(%rdi,%rdx), %xmm2 1348; X64-AVX1-NEXT: vpmovsxwd (%rdi,%rdx), %xmm3 1349; X64-AVX1-NEXT: vpmovsxwd 24(%rsi,%rdx), %xmm4 1350; X64-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0 1351; X64-AVX1-NEXT: vpmovsxwd 16(%rsi,%rdx), %xmm4 1352; X64-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1 1353; X64-AVX1-NEXT: vpmovsxwd 8(%rsi,%rdx), %xmm4 1354; X64-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2 1355; X64-AVX1-NEXT: vpmovsxwd (%rsi,%rdx), %xmm4 1356; X64-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3 1357; X64-AVX1-NEXT: vmovdqu %xmm0, 48(%rax,%rdx,4) 1358; X64-AVX1-NEXT: vmovdqu %xmm1, 32(%rax,%rdx,4) 1359; X64-AVX1-NEXT: vmovdqu %xmm2, 16(%rax,%rdx,4) 1360; X64-AVX1-NEXT: vmovdqu %xmm3, (%rax,%rdx,4) 1361; X64-AVX1-NEXT: retq 1362; 1363; X64-AVX2-LABEL: mul_16xi16_sext: 1364; X64-AVX2: # %bb.0: # %entry 1365; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax 1366; X64-AVX2-NEXT: vpmovsxwd 16(%rdi,%rdx), %ymm0 1367; X64-AVX2-NEXT: vpmovsxwd (%rdi,%rdx), %ymm1 1368; X64-AVX2-NEXT: vpmovsxwd 16(%rsi,%rdx), %ymm2 1369; X64-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0 1370; X64-AVX2-NEXT: vpmovsxwd (%rsi,%rdx), %ymm2 1371; X64-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1 1372; X64-AVX2-NEXT: vmovdqu %ymm0, 32(%rax,%rdx,4) 1373; X64-AVX2-NEXT: vmovdqu %ymm1, (%rax,%rdx,4) 1374; X64-AVX2-NEXT: vzeroupper 1375; X64-AVX2-NEXT: retq 1376entry: 1377 %pre = load i32*, i32** @c 1378 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index 1379 %tmp7 = bitcast i8* %tmp6 to <16 x i16>* 1380 %wide.load = load <16 x i16>, <16 x i16>* %tmp7, align 1 1381 %tmp8 = sext <16 x i16> %wide.load to <16 x i32> 1382 %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index 1383 %tmp11 = bitcast i8* %tmp10 to <16 x i16>* 1384 %wide.load17 = load <16 x i16>, <16 x i16>* %tmp11, align 1 1385 %tmp12 = sext <16 x i16> %wide.load17 to <16 x i32> 1386 %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8 1387 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index 1388 %tmp15 = bitcast i32* %tmp14 to <16 x i32>* 1389 store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4 1390 ret void 1391} 1392 1393; %val = load <2 x i8> 1394; %op1 = zext<2 x i32> %val 1395; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 255) 1396; %rst = mul <2 x i32> %op1, %op2 1397; 1398define void @mul_2xi8_varconst1(i8* nocapture readonly %a, i64 %index) { 1399; X86-SSE-LABEL: mul_2xi8_varconst1: 1400; X86-SSE: # %bb.0: # %entry 1401; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax 1402; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx 1403; 
X86-SSE-NEXT: movl c, %edx 1404; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx 1405; X86-SSE-NEXT: movd %ecx, %xmm0 1406; X86-SSE-NEXT: pxor %xmm1, %xmm1 1407; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1408; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1409; X86-SSE-NEXT: pmaddwd {{\.LCPI.*}}, %xmm0 1410; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) 1411; X86-SSE-NEXT: retl 1412; 1413; X86-AVX-LABEL: mul_2xi8_varconst1: 1414; X86-AVX: # %bb.0: # %entry 1415; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 1416; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx 1417; X86-AVX-NEXT: movl c, %edx 1418; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx 1419; X86-AVX-NEXT: vmovd %ecx, %xmm0 1420; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 1421; X86-AVX-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0 1422; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) 1423; X86-AVX-NEXT: retl 1424; 1425; X64-SSE-LABEL: mul_2xi8_varconst1: 1426; X64-SSE: # %bb.0: # %entry 1427; X64-SSE-NEXT: movq {{.*}}(%rip), %rax 1428; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx 1429; X64-SSE-NEXT: movd %ecx, %xmm0 1430; X64-SSE-NEXT: pxor %xmm1, %xmm1 1431; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1432; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1433; X64-SSE-NEXT: pmaddwd {{.*}}(%rip), %xmm0 1434; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) 1435; X64-SSE-NEXT: retq 1436; 1437; X64-AVX-LABEL: mul_2xi8_varconst1: 1438; X64-AVX: # %bb.0: # %entry 1439; X64-AVX-NEXT: movq {{.*}}(%rip), %rax 1440; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx 1441; X64-AVX-NEXT: vmovd %ecx, %xmm0 1442; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 1443; X64-AVX-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0 1444; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) 1445; X64-AVX-NEXT: retq 1446entry: 1447 %pre = load i32*, i32** @c 1448 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index 1449 %tmp7 = bitcast i8* %tmp6 to <2 x i8>* 1450 %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1 1451 %tmp8 = zext <2 x i8> %wide.load to <2 x i32> 1452 %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 255> 1453 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index 1454 %tmp15 = bitcast i32* %tmp14 to <2 x i32>* 1455 store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 1456 ret void 1457} 1458 1459; %val = load <2 x i8> 1460; %op1 = sext<2 x i32> %val 1461; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-128 ~ 127) 1462; %rst = mul <2 x i32> %op1, %op2 1463; 1464define void @mul_2xi8_varconst2(i8* nocapture readonly %a, i64 %index) { 1465; X86-SSE-LABEL: mul_2xi8_varconst2: 1466; X86-SSE: # %bb.0: # %entry 1467; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax 1468; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx 1469; X86-SSE-NEXT: movl c, %edx 1470; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx 1471; X86-SSE-NEXT: movd %ecx, %xmm0 1472; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1473; X86-SSE-NEXT: psraw $8, %xmm0 1474; X86-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0 1475; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] 1476; X86-SSE-NEXT: psrad $16, %xmm0 1477; 
X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) 1478; X86-SSE-NEXT: retl 1479; 1480; X86-AVX-LABEL: mul_2xi8_varconst2: 1481; X86-AVX: # %bb.0: # %entry 1482; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 1483; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx 1484; X86-AVX-NEXT: movl c, %edx 1485; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx 1486; X86-AVX-NEXT: vmovd %ecx, %xmm0 1487; X86-AVX-NEXT: vpmovsxbd %xmm0, %xmm0 1488; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 1489; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) 1490; X86-AVX-NEXT: retl 1491; 1492; X64-SSE-LABEL: mul_2xi8_varconst2: 1493; X64-SSE: # %bb.0: # %entry 1494; X64-SSE-NEXT: movq {{.*}}(%rip), %rax 1495; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx 1496; X64-SSE-NEXT: movd %ecx, %xmm0 1497; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1498; X64-SSE-NEXT: psraw $8, %xmm0 1499; X64-SSE-NEXT: pmullw {{.*}}(%rip), %xmm0 1500; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] 1501; X64-SSE-NEXT: psrad $16, %xmm0 1502; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) 1503; X64-SSE-NEXT: retq 1504; 1505; X64-AVX-LABEL: mul_2xi8_varconst2: 1506; X64-AVX: # %bb.0: # %entry 1507; X64-AVX-NEXT: movq {{.*}}(%rip), %rax 1508; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx 1509; X64-AVX-NEXT: vmovd %ecx, %xmm0 1510; X64-AVX-NEXT: vpmovsxbd %xmm0, %xmm0 1511; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 1512; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) 1513; X64-AVX-NEXT: retq 1514entry: 1515 %pre = load i32*, i32** @c 1516 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index 1517 %tmp7 = bitcast i8* %tmp6 to <2 x i8>* 1518 %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1 1519 %tmp8 = sext <2 x i8> %wide.load to <2 x i32> 1520 %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -128, i32 127> 1521 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index 1522 %tmp15 = bitcast i32* %tmp14 to <2 x i32>* 1523 store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 1524 ret void 1525} 1526 1527; %val = load <2 x i8> 1528; %op1 = zext<2 x i32> %val 1529; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 256) 1530; %rst = mul <2 x i32> %op1, %op2 1531; 1532define void @mul_2xi8_varconst3(i8* nocapture readonly %a, i64 %index) { 1533; X86-SSE-LABEL: mul_2xi8_varconst3: 1534; X86-SSE: # %bb.0: # %entry 1535; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax 1536; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx 1537; X86-SSE-NEXT: movl c, %edx 1538; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx 1539; X86-SSE-NEXT: movd %ecx, %xmm0 1540; X86-SSE-NEXT: pxor %xmm1, %xmm1 1541; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1542; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1543; X86-SSE-NEXT: pmaddwd {{\.LCPI.*}}, %xmm0 1544; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) 1545; X86-SSE-NEXT: retl 1546; 1547; X86-AVX-LABEL: mul_2xi8_varconst3: 1548; X86-AVX: # %bb.0: # %entry 1549; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 1550; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx 1551; X86-AVX-NEXT: movl c, %edx 1552; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx 1553; X86-AVX-NEXT: vmovd %ecx, %xmm0 1554; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 1555; X86-AVX-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0 1556; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) 1557; X86-AVX-NEXT: retl 1558; 1559; X64-SSE-LABEL: mul_2xi8_varconst3: 1560; 
X64-SSE: # %bb.0: # %entry 1561; X64-SSE-NEXT: movq {{.*}}(%rip), %rax 1562; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx 1563; X64-SSE-NEXT: movd %ecx, %xmm0 1564; X64-SSE-NEXT: pxor %xmm1, %xmm1 1565; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1566; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1567; X64-SSE-NEXT: pmaddwd {{.*}}(%rip), %xmm0 1568; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) 1569; X64-SSE-NEXT: retq 1570; 1571; X64-AVX-LABEL: mul_2xi8_varconst3: 1572; X64-AVX: # %bb.0: # %entry 1573; X64-AVX-NEXT: movq {{.*}}(%rip), %rax 1574; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx 1575; X64-AVX-NEXT: vmovd %ecx, %xmm0 1576; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 1577; X64-AVX-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0 1578; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) 1579; X64-AVX-NEXT: retq 1580entry: 1581 %pre = load i32*, i32** @c 1582 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index 1583 %tmp7 = bitcast i8* %tmp6 to <2 x i8>* 1584 %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1 1585 %tmp8 = zext <2 x i8> %wide.load to <2 x i32> 1586 %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 256> 1587 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index 1588 %tmp15 = bitcast i32* %tmp14 to <2 x i32>* 1589 store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 1590 ret void 1591} 1592 1593; %val = load <2 x i8> 1594; %op1 = zext<2 x i32> %val 1595; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-1 ~ 255) 1596; %rst = mul <2 x i32> %op1, %op2 1597; 1598define void @mul_2xi8_varconst4(i8* nocapture readonly %a, i64 %index) { 1599; X86-SSE-LABEL: mul_2xi8_varconst4: 1600; X86-SSE: # %bb.0: # %entry 1601; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax 1602; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx 1603; X86-SSE-NEXT: movl c, %edx 1604; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx 1605; X86-SSE-NEXT: movd %ecx, %xmm0 1606; X86-SSE-NEXT: pxor %xmm1, %xmm1 1607; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1608; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u> 1609; X86-SSE-NEXT: movdqa %xmm0, %xmm2 1610; X86-SSE-NEXT: pmulhw %xmm1, %xmm2 1611; X86-SSE-NEXT: pmullw %xmm1, %xmm0 1612; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 1613; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) 1614; X86-SSE-NEXT: retl 1615; 1616; X86-AVX-LABEL: mul_2xi8_varconst4: 1617; X86-AVX: # %bb.0: # %entry 1618; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 1619; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx 1620; X86-AVX-NEXT: movl c, %edx 1621; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx 1622; X86-AVX-NEXT: vmovd %ecx, %xmm0 1623; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 1624; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 1625; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) 1626; X86-AVX-NEXT: retl 1627; 1628; X64-SSE-LABEL: mul_2xi8_varconst4: 1629; X64-SSE: # %bb.0: # %entry 1630; X64-SSE-NEXT: movq {{.*}}(%rip), %rax 1631; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx 1632; X64-SSE-NEXT: movd %ecx, %xmm0 1633; X64-SSE-NEXT: pxor %xmm1, %xmm1 1634; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1635; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u> 1636; X64-SSE-NEXT: movdqa %xmm0, %xmm2 1637; X64-SSE-NEXT: pmulhw %xmm1, %xmm2 1638; X64-SSE-NEXT: pmullw %xmm1, %xmm0 1639; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 1640; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) 1641; X64-SSE-NEXT: retq 1642; 1643; X64-AVX-LABEL: mul_2xi8_varconst4: 1644; X64-AVX: # %bb.0: # %entry 1645; X64-AVX-NEXT: movq {{.*}}(%rip), %rax 1646; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx 1647; X64-AVX-NEXT: vmovd %ecx, %xmm0 1648; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 1649; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 1650; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) 1651; X64-AVX-NEXT: retq 1652entry: 1653 %pre = load i32*, i32** @c 1654 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index 1655 %tmp7 = bitcast i8* %tmp6 to <2 x i8>* 1656 %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1 1657 %tmp8 = zext <2 x i8> %wide.load to <2 x i32> 1658 %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -1, i32 255> 1659 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index 1660 %tmp15 = bitcast i32* %tmp14 to <2 x i32>* 1661 store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 1662 ret void 1663} 1664 1665; %val = load <2 x i8> 1666; %op1 = sext<2 x i32> %val 1667; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-129 ~ 127) 1668; %rst = mul <2 x i32> %op1, %op2 1669; 1670define void @mul_2xi8_varconst5(i8* nocapture readonly %a, i64 %index) { 1671; X86-SSE-LABEL: mul_2xi8_varconst5: 1672; X86-SSE: # %bb.0: # %entry 1673; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax 1674; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx 1675; X86-SSE-NEXT: movl c, %edx 1676; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx 1677; X86-SSE-NEXT: movd %ecx, %xmm0 1678; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1679; X86-SSE-NEXT: psraw $8, %xmm0 1680; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65407,127,u,u,u,u,u,u> 1681; X86-SSE-NEXT: movdqa %xmm0, %xmm2 1682; X86-SSE-NEXT: pmulhw %xmm1, %xmm2 1683; X86-SSE-NEXT: pmullw %xmm1, %xmm0 1684; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 1685; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) 1686; X86-SSE-NEXT: retl 1687; 1688; X86-AVX-LABEL: mul_2xi8_varconst5: 1689; X86-AVX: # %bb.0: # %entry 1690; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 1691; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx 1692; X86-AVX-NEXT: movl c, %edx 1693; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx 1694; X86-AVX-NEXT: vmovd %ecx, %xmm0 1695; X86-AVX-NEXT: vpmovsxbd %xmm0, %xmm0 1696; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 1697; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) 1698; X86-AVX-NEXT: retl 1699; 1700; X64-SSE-LABEL: mul_2xi8_varconst5: 1701; X64-SSE: # %bb.0: # %entry 1702; X64-SSE-NEXT: movq {{.*}}(%rip), %rax 1703; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx 1704; X64-SSE-NEXT: movd %ecx, %xmm0 1705; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1706; X64-SSE-NEXT: psraw $8, %xmm0 1707; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65407,127,u,u,u,u,u,u> 1708; X64-SSE-NEXT: movdqa %xmm0, %xmm2 1709; X64-SSE-NEXT: pmulhw %xmm1, %xmm2 1710; X64-SSE-NEXT: pmullw %xmm1, %xmm0 1711; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 1712; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) 1713; X64-SSE-NEXT: retq 1714; 1715; X64-AVX-LABEL: mul_2xi8_varconst5: 1716; X64-AVX: # %bb.0: # %entry 1717; X64-AVX-NEXT: movq {{.*}}(%rip), %rax 1718; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx 1719; X64-AVX-NEXT: vmovd %ecx, %xmm0 1720; X64-AVX-NEXT: vpmovsxbd %xmm0, %xmm0 1721; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 1722; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) 1723; X64-AVX-NEXT: retq 1724entry: 1725 %pre = load i32*, i32** @c 1726 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index 1727 %tmp7 = bitcast i8* %tmp6 to <2 x i8>* 1728 %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1 1729 %tmp8 = sext <2 x i8> %wide.load to <2 x i32> 1730 %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -129, i32 127> 1731 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index 1732 %tmp15 = bitcast i32* %tmp14 to <2 x i32>* 1733 store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 1734 ret void 1735} 1736 1737; %val = load <2 x i8> 1738; %op1 = sext<2 x i32> %val 1739; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-128 ~ 128) 1740; %rst = mul <2 x i32> %op1, %op2 1741; 1742define void @mul_2xi8_varconst6(i8* nocapture readonly %a, i64 %index) { 1743; X86-SSE-LABEL: mul_2xi8_varconst6: 1744; X86-SSE: # %bb.0: # %entry 1745; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax 1746; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx 1747; X86-SSE-NEXT: movl c, %edx 1748; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx 1749; X86-SSE-NEXT: movd %ecx, %xmm0 1750; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1751; X86-SSE-NEXT: psraw $8, %xmm0 1752; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u> 1753; X86-SSE-NEXT: movdqa %xmm0, %xmm2 1754; X86-SSE-NEXT: pmulhw %xmm1, %xmm2 1755; X86-SSE-NEXT: pmullw %xmm1, %xmm0 1756; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 1757; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) 1758; X86-SSE-NEXT: retl 1759; 1760; X86-AVX-LABEL: mul_2xi8_varconst6: 1761; X86-AVX: # %bb.0: # %entry 1762; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 1763; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx 1764; X86-AVX-NEXT: movl c, %edx 1765; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx 1766; X86-AVX-NEXT: vmovd %ecx, %xmm0 1767; X86-AVX-NEXT: vpmovsxbd %xmm0, %xmm0 1768; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 1769; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) 1770; X86-AVX-NEXT: retl 1771; 1772; X64-SSE-LABEL: mul_2xi8_varconst6: 1773; X64-SSE: # %bb.0: # %entry 1774; X64-SSE-NEXT: movq {{.*}}(%rip), %rax 1775; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx 1776; X64-SSE-NEXT: movd %ecx, %xmm0 1777; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1778; X64-SSE-NEXT: psraw $8, %xmm0 1779; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u> 1780; X64-SSE-NEXT: movdqa %xmm0, %xmm2 1781; X64-SSE-NEXT: pmulhw %xmm1, %xmm2 1782; X64-SSE-NEXT: pmullw %xmm1, %xmm0 1783; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 1784; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) 1785; X64-SSE-NEXT: retq 1786; 1787; X64-AVX-LABEL: mul_2xi8_varconst6: 1788; X64-AVX: # %bb.0: # %entry 1789; X64-AVX-NEXT: movq {{.*}}(%rip), %rax 1790; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx 1791; X64-AVX-NEXT: vmovd %ecx, %xmm0 1792; X64-AVX-NEXT: vpmovsxbd %xmm0, %xmm0 1793; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 1794; X64-AVX-NEXT: vmovq %xmm0, 
(%rax,%rsi,4) 1795; X64-AVX-NEXT: retq 1796entry: 1797 %pre = load i32*, i32** @c 1798 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index 1799 %tmp7 = bitcast i8* %tmp6 to <2 x i8>* 1800 %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1 1801 %tmp8 = sext <2 x i8> %wide.load to <2 x i32> 1802 %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -128, i32 128> 1803 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index 1804 %tmp15 = bitcast i32* %tmp14 to <2 x i32>* 1805 store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 1806 ret void 1807} 1808 1809; %val = load <2 x i16> 1810; %op1 = zext<2 x i32> %val 1811; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 65535) 1812; %rst = mul <2 x i32> %op1, %op2 1813; 1814define void @mul_2xi16_varconst1(i8* nocapture readonly %a, i64 %index) { 1815; X86-SSE-LABEL: mul_2xi16_varconst1: 1816; X86-SSE: # %bb.0: # %entry 1817; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax 1818; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx 1819; X86-SSE-NEXT: movl c, %edx 1820; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1821; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u> 1822; X86-SSE-NEXT: movdqa %xmm0, %xmm2 1823; X86-SSE-NEXT: pmulhuw %xmm1, %xmm2 1824; X86-SSE-NEXT: pmullw %xmm1, %xmm0 1825; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 1826; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) 1827; X86-SSE-NEXT: retl 1828; 1829; X86-AVX-LABEL: mul_2xi16_varconst1: 1830; X86-AVX: # %bb.0: # %entry 1831; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 1832; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx 1833; X86-AVX-NEXT: movl c, %edx 1834; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1835; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 1836; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 1837; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) 1838; X86-AVX-NEXT: retl 1839; 1840; X64-SSE-LABEL: mul_2xi16_varconst1: 1841; X64-SSE: # %bb.0: # %entry 1842; X64-SSE-NEXT: movq {{.*}}(%rip), %rax 1843; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1844; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u> 1845; X64-SSE-NEXT: movdqa %xmm0, %xmm2 1846; X64-SSE-NEXT: pmulhuw %xmm1, %xmm2 1847; X64-SSE-NEXT: pmullw %xmm1, %xmm0 1848; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 1849; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) 1850; X64-SSE-NEXT: retq 1851; 1852; X64-AVX-LABEL: mul_2xi16_varconst1: 1853; X64-AVX: # %bb.0: # %entry 1854; X64-AVX-NEXT: movq {{.*}}(%rip), %rax 1855; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1856; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 1857; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 1858; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) 1859; X64-AVX-NEXT: retq 1860entry: 1861 %pre = load i32*, i32** @c 1862 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index 1863 %tmp7 = bitcast i8* %tmp6 to <2 x i16>* 1864 %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1 1865 %tmp8 = zext <2 x i16> %wide.load to <2 x i32> 1866 %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 65535> 1867 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index 1868 %tmp15 = bitcast i32* %tmp14 to <2 x i32>* 1869 store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 1870 ret void 1871} 1872 1873; %val = load <2 x i16> 1874; %op1 = sext<2 x i32> %val 1875; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-32768 ~ 
32767) 1876; %rst = mul <2 x i32> %op1, %op2 1877; 1878define void @mul_2xi16_varconst2(i8* nocapture readonly %a, i64 %index) { 1879; X86-SSE-LABEL: mul_2xi16_varconst2: 1880; X86-SSE: # %bb.0: # %entry 1881; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax 1882; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx 1883; X86-SSE-NEXT: movl c, %edx 1884; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1885; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <32768,32767,u,u,u,u,u,u> 1886; X86-SSE-NEXT: movdqa %xmm0, %xmm2 1887; X86-SSE-NEXT: pmulhw %xmm1, %xmm2 1888; X86-SSE-NEXT: pmullw %xmm1, %xmm0 1889; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 1890; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) 1891; X86-SSE-NEXT: retl 1892; 1893; X86-AVX-LABEL: mul_2xi16_varconst2: 1894; X86-AVX: # %bb.0: # %entry 1895; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 1896; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx 1897; X86-AVX-NEXT: movl c, %edx 1898; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1899; X86-AVX-NEXT: vpmovsxwd %xmm0, %xmm0 1900; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 1901; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) 1902; X86-AVX-NEXT: retl 1903; 1904; X64-SSE-LABEL: mul_2xi16_varconst2: 1905; X64-SSE: # %bb.0: # %entry 1906; X64-SSE-NEXT: movq {{.*}}(%rip), %rax 1907; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1908; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <32768,32767,u,u,u,u,u,u> 1909; X64-SSE-NEXT: movdqa %xmm0, %xmm2 1910; X64-SSE-NEXT: pmulhw %xmm1, %xmm2 1911; X64-SSE-NEXT: pmullw %xmm1, %xmm0 1912; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 1913; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) 1914; X64-SSE-NEXT: retq 1915; 1916; X64-AVX-LABEL: mul_2xi16_varconst2: 1917; X64-AVX: # %bb.0: # %entry 1918; X64-AVX-NEXT: movq {{.*}}(%rip), %rax 1919; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1920; X64-AVX-NEXT: vpmovsxwd %xmm0, %xmm0 1921; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 1922; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) 1923; X64-AVX-NEXT: retq 1924entry: 1925 %pre = load i32*, i32** @c 1926 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index 1927 %tmp7 = bitcast i8* %tmp6 to <2 x i16>* 1928 %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1 1929 %tmp8 = sext <2 x i16> %wide.load to <2 x i32> 1930 %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -32768, i32 32767> 1931 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index 1932 %tmp15 = bitcast i32* %tmp14 to <2 x i32>* 1933 store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 1934 ret void 1935} 1936 1937; %val = load <2 x i16> 1938; %op1 = zext<2 x i32> %val 1939; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 65536) 1940; %rst = mul <2 x i32> %op1, %op2 1941; 1942define void @mul_2xi16_varconst3(i8* nocapture readonly %a, i64 %index) { 1943; X86-SSE-LABEL: mul_2xi16_varconst3: 1944; X86-SSE: # %bb.0: # %entry 1945; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax 1946; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx 1947; X86-SSE-NEXT: movl c, %edx 1948; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1949; X86-SSE-NEXT: pxor %xmm1, %xmm1 1950; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1951; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] 1952; X86-SSE-NEXT: pmuludq {{\.LCPI.*}}, %xmm0 1953; X86-SSE-NEXT: pmuludq {{\.LCPI.*}}, %xmm1 1954; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1955; 
X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) 1956; X86-SSE-NEXT: retl 1957; 1958; X86-AVX-LABEL: mul_2xi16_varconst3: 1959; X86-AVX: # %bb.0: # %entry 1960; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 1961; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx 1962; X86-AVX-NEXT: movl c, %edx 1963; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1964; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 1965; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 1966; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) 1967; X86-AVX-NEXT: retl 1968; 1969; X64-SSE-LABEL: mul_2xi16_varconst3: 1970; X64-SSE: # %bb.0: # %entry 1971; X64-SSE-NEXT: movq {{.*}}(%rip), %rax 1972; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1973; X64-SSE-NEXT: pxor %xmm1, %xmm1 1974; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1975; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] 1976; X64-SSE-NEXT: pmuludq {{.*}}(%rip), %xmm0 1977; X64-SSE-NEXT: pmuludq {{.*}}(%rip), %xmm1 1978; X64-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1979; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) 1980; X64-SSE-NEXT: retq 1981; 1982; X64-AVX-LABEL: mul_2xi16_varconst3: 1983; X64-AVX: # %bb.0: # %entry 1984; X64-AVX-NEXT: movq {{.*}}(%rip), %rax 1985; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1986; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 1987; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 1988; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) 1989; X64-AVX-NEXT: retq 1990entry: 1991 %pre = load i32*, i32** @c 1992 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index 1993 %tmp7 = bitcast i8* %tmp6 to <2 x i16>* 1994 %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1 1995 %tmp8 = zext <2 x i16> %wide.load to <2 x i32> 1996 %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 65536> 1997 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index 1998 %tmp15 = bitcast i32* %tmp14 to <2 x i32>* 1999 store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 2000 ret void 2001} 2002 2003; %val = load <2 x i16> 2004; %op1 = sext<2 x i32> %val 2005; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 32768) 2006; %rst = mul <2 x i32> %op1, %op2 2007; 2008define void @mul_2xi16_varconst4(i8* nocapture readonly %a, i64 %index) { 2009; X86-SSE-LABEL: mul_2xi16_varconst4: 2010; X86-SSE: # %bb.0: # %entry 2011; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax 2012; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx 2013; X86-SSE-NEXT: movl c, %edx 2014; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 2015; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] 2016; X86-SSE-NEXT: psrad $16, %xmm0 2017; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] 2018; X86-SSE-NEXT: pmuludq {{\.LCPI.*}}, %xmm0 2019; X86-SSE-NEXT: pmuludq {{\.LCPI.*}}, %xmm1 2020; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 2021; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) 2022; X86-SSE-NEXT: retl 2023; 2024; X86-AVX-LABEL: mul_2xi16_varconst4: 2025; X86-AVX: # %bb.0: # %entry 2026; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 2027; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx 2028; X86-AVX-NEXT: movl c, %edx 2029; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 2030; X86-AVX-NEXT: vpmovsxwd %xmm0, %xmm0 2031; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 2032; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) 2033; X86-AVX-NEXT: retl 2034; 2035; X64-SSE-LABEL: mul_2xi16_varconst4: 
2036; X64-SSE: # %bb.0: # %entry 2037; X64-SSE-NEXT: movq {{.*}}(%rip), %rax 2038; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 2039; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] 2040; X64-SSE-NEXT: psrad $16, %xmm0 2041; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] 2042; X64-SSE-NEXT: pmuludq {{.*}}(%rip), %xmm0 2043; X64-SSE-NEXT: pmuludq {{.*}}(%rip), %xmm1 2044; X64-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 2045; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) 2046; X64-SSE-NEXT: retq 2047; 2048; X64-AVX-LABEL: mul_2xi16_varconst4: 2049; X64-AVX: # %bb.0: # %entry 2050; X64-AVX-NEXT: movq {{.*}}(%rip), %rax 2051; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 2052; X64-AVX-NEXT: vpmovsxwd %xmm0, %xmm0 2053; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 2054; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) 2055; X64-AVX-NEXT: retq 2056entry: 2057 %pre = load i32*, i32** @c 2058 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index 2059 %tmp7 = bitcast i8* %tmp6 to <2 x i16>* 2060 %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1 2061 %tmp8 = sext <2 x i16> %wide.load to <2 x i32> 2062 %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 32768> 2063 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index 2064 %tmp15 = bitcast i32* %tmp14 to <2 x i32>* 2065 store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 2066 ret void 2067} 2068 2069; 2070; Illegal Types 2071; 2072 2073define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind { 2074; X86-SSE-LABEL: PR34947: 2075; X86-SSE: # %bb.0: 2076; X86-SSE-NEXT: pushl %esi 2077; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx 2078; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax 2079; X86-SSE-NEXT: movdqa (%eax), %xmm4 2080; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 2081; X86-SSE-NEXT: movdqa (%ecx), %xmm1 2082; X86-SSE-NEXT: movdqa 16(%ecx), %xmm5 2083; X86-SSE-NEXT: pxor %xmm3, %xmm3 2084; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] 2085; X86-SSE-NEXT: movdqa %xmm4, %xmm2 2086; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] 2087; X86-SSE-NEXT: movdqa %xmm4, %xmm6 2088; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] 2089; X86-SSE-NEXT: movdqa %xmm4, %xmm3 2090; X86-SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 2091; X86-SSE-NEXT: movd %xmm3, %eax 2092; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[3,3,3,3] 2093; X86-SSE-NEXT: movd %xmm3, %esi 2094; X86-SSE-NEXT: xorl %edx, %edx 2095; X86-SSE-NEXT: divl %esi 2096; X86-SSE-NEXT: movd %edx, %xmm3 2097; X86-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[2,3,2,3] 2098; X86-SSE-NEXT: movd %xmm7, %eax 2099; X86-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,3,2,3] 2100; X86-SSE-NEXT: movd %xmm7, %esi 2101; X86-SSE-NEXT: xorl %edx, %edx 2102; X86-SSE-NEXT: divl %esi 2103; X86-SSE-NEXT: movd %edx, %xmm7 2104; X86-SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] 2105; X86-SSE-NEXT: movd %xmm6, %eax 2106; X86-SSE-NEXT: movd %xmm5, %esi 2107; X86-SSE-NEXT: xorl %edx, %edx 2108; X86-SSE-NEXT: divl %esi 2109; X86-SSE-NEXT: movd %edx, %xmm3 2110; X86-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,1,1] 2111; X86-SSE-NEXT: movd %xmm6, %eax 2112; X86-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,1,1] 2113; X86-SSE-NEXT: movd %xmm5, %esi 2114; X86-SSE-NEXT: xorl %edx, %edx 2115; X86-SSE-NEXT: divl %esi 2116; 
X86-SSE-NEXT: movd %edx, %xmm5 2117; X86-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] 2118; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0] 2119; X86-SSE-NEXT: movdqa %xmm4, %xmm5 2120; X86-SSE-NEXT: psrld $16, %xmm5 2121; X86-SSE-NEXT: movd %xmm5, %eax 2122; X86-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] 2123; X86-SSE-NEXT: movd %xmm5, %esi 2124; X86-SSE-NEXT: xorl %edx, %edx 2125; X86-SSE-NEXT: divl %esi 2126; X86-SSE-NEXT: movd %edx, %xmm6 2127; X86-SSE-NEXT: movd %xmm2, %eax 2128; X86-SSE-NEXT: movd %xmm1, %esi 2129; X86-SSE-NEXT: xorl %edx, %edx 2130; X86-SSE-NEXT: divl %esi 2131; X86-SSE-NEXT: movd %edx, %xmm5 2132; X86-SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] 2133; X86-SSE-NEXT: psrlq $48, %xmm4 2134; X86-SSE-NEXT: movd %xmm4, %eax 2135; X86-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[3,3,3,3] 2136; X86-SSE-NEXT: movd %xmm4, %esi 2137; X86-SSE-NEXT: xorl %edx, %edx 2138; X86-SSE-NEXT: divl %esi 2139; X86-SSE-NEXT: movd %edx, %xmm4 2140; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] 2141; X86-SSE-NEXT: movd %xmm2, %eax 2142; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 2143; X86-SSE-NEXT: movd %xmm1, %esi 2144; X86-SSE-NEXT: xorl %edx, %edx 2145; X86-SSE-NEXT: divl %esi 2146; X86-SSE-NEXT: movd %edx, %xmm1 2147; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] 2148; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm1[0] 2149; X86-SSE-NEXT: movd %xmm0, %eax 2150; X86-SSE-NEXT: xorl %edx, %edx 2151; X86-SSE-NEXT: divl 32(%ecx) 2152; X86-SSE-NEXT: movdqa {{.*#+}} xmm0 = [8199,8199,8199,8199] 2153; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] 2154; X86-SSE-NEXT: pmuludq %xmm0, %xmm5 2155; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,2,2,3] 2156; X86-SSE-NEXT: pmuludq %xmm0, %xmm1 2157; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 2158; X86-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 2159; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] 2160; X86-SSE-NEXT: pmuludq %xmm0, %xmm3 2161; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] 2162; X86-SSE-NEXT: pmuludq %xmm0, %xmm1 2163; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] 2164; X86-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] 2165; X86-SSE-NEXT: imull $8199, %edx, %eax # imm = 0x2007 2166; X86-SSE-NEXT: movl %eax, (%eax) 2167; X86-SSE-NEXT: movdqa %xmm3, (%eax) 2168; X86-SSE-NEXT: movdqa %xmm2, (%eax) 2169; X86-SSE-NEXT: popl %esi 2170; X86-SSE-NEXT: retl 2171; 2172; X86-AVX1-LABEL: PR34947: 2173; X86-AVX1: # %bb.0: 2174; X86-AVX1-NEXT: pushl %ebp 2175; X86-AVX1-NEXT: pushl %ebx 2176; X86-AVX1-NEXT: pushl %edi 2177; X86-AVX1-NEXT: pushl %esi 2178; X86-AVX1-NEXT: subl $16, %esp 2179; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx 2180; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax 2181; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 2182; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 2183; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 2184; X86-AVX1-NEXT: vmovd %xmm1, %eax 2185; X86-AVX1-NEXT: xorl %edx, %edx 2186; X86-AVX1-NEXT: divl 32(%ecx) 2187; X86-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill 2188; X86-AVX1-NEXT: vpextrd $3, %xmm2, %eax 2189; X86-AVX1-NEXT: vmovdqa (%ecx), %xmm1 2190; X86-AVX1-NEXT: vmovdqa 16(%ecx), %xmm3 2191; X86-AVX1-NEXT: vpextrd $3, %xmm3, %ecx 2192; X86-AVX1-NEXT: xorl %edx, %edx 2193; X86-AVX1-NEXT: divl %ecx 2194; 
X86-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill 2195; X86-AVX1-NEXT: vpextrd $2, %xmm2, %eax 2196; X86-AVX1-NEXT: vpextrd $2, %xmm3, %ecx 2197; X86-AVX1-NEXT: xorl %edx, %edx 2198; X86-AVX1-NEXT: divl %ecx 2199; X86-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill 2200; X86-AVX1-NEXT: vpextrd $1, %xmm2, %eax 2201; X86-AVX1-NEXT: vpextrd $1, %xmm3, %ecx 2202; X86-AVX1-NEXT: xorl %edx, %edx 2203; X86-AVX1-NEXT: divl %ecx 2204; X86-AVX1-NEXT: movl %edx, (%esp) # 4-byte Spill 2205; X86-AVX1-NEXT: vmovd %xmm2, %eax 2206; X86-AVX1-NEXT: vmovd %xmm3, %ecx 2207; X86-AVX1-NEXT: xorl %edx, %edx 2208; X86-AVX1-NEXT: divl %ecx 2209; X86-AVX1-NEXT: movl %edx, %ebp 2210; X86-AVX1-NEXT: vpextrd $3, %xmm0, %eax 2211; X86-AVX1-NEXT: vpextrd $3, %xmm1, %ecx 2212; X86-AVX1-NEXT: xorl %edx, %edx 2213; X86-AVX1-NEXT: divl %ecx 2214; X86-AVX1-NEXT: movl %edx, %ebx 2215; X86-AVX1-NEXT: vpextrd $2, %xmm0, %eax 2216; X86-AVX1-NEXT: vpextrd $2, %xmm1, %esi 2217; X86-AVX1-NEXT: xorl %edx, %edx 2218; X86-AVX1-NEXT: divl %esi 2219; X86-AVX1-NEXT: movl %edx, %esi 2220; X86-AVX1-NEXT: vpextrd $1, %xmm0, %eax 2221; X86-AVX1-NEXT: vpextrd $1, %xmm1, %edi 2222; X86-AVX1-NEXT: xorl %edx, %edx 2223; X86-AVX1-NEXT: divl %edi 2224; X86-AVX1-NEXT: movl %edx, %edi 2225; X86-AVX1-NEXT: vmovd %xmm0, %eax 2226; X86-AVX1-NEXT: vmovd %xmm1, %ecx 2227; X86-AVX1-NEXT: xorl %edx, %edx 2228; X86-AVX1-NEXT: divl %ecx 2229; X86-AVX1-NEXT: vmovd %edx, %xmm0 2230; X86-AVX1-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 2231; X86-AVX1-NEXT: vpinsrd $2, %esi, %xmm0, %xmm0 2232; X86-AVX1-NEXT: vpinsrd $3, %ebx, %xmm0, %xmm0 2233; X86-AVX1-NEXT: vmovd %ebp, %xmm1 2234; X86-AVX1-NEXT: vpinsrd $1, (%esp), %xmm1, %xmm1 # 4-byte Folded Reload 2235; X86-AVX1-NEXT: vpinsrd $2, {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload 2236; X86-AVX1-NEXT: vpinsrd $3, {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload 2237; X86-AVX1-NEXT: imull $8199, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload 2238; X86-AVX1-NEXT: # imm = 0x2007 2239; X86-AVX1-NEXT: movl %eax, (%eax) 2240; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [8199,8199,8199,8199] 2241; X86-AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0 2242; X86-AVX1-NEXT: vpmulld %xmm2, %xmm1, %xmm1 2243; X86-AVX1-NEXT: vmovdqa %xmm1, (%eax) 2244; X86-AVX1-NEXT: vmovdqa %xmm0, (%eax) 2245; X86-AVX1-NEXT: addl $16, %esp 2246; X86-AVX1-NEXT: popl %esi 2247; X86-AVX1-NEXT: popl %edi 2248; X86-AVX1-NEXT: popl %ebx 2249; X86-AVX1-NEXT: popl %ebp 2250; X86-AVX1-NEXT: retl 2251; 2252; X86-AVX2-LABEL: PR34947: 2253; X86-AVX2: # %bb.0: 2254; X86-AVX2-NEXT: pushl %edi 2255; X86-AVX2-NEXT: pushl %esi 2256; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %esi 2257; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax 2258; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 2259; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 2260; X86-AVX2-NEXT: vmovdqa (%esi), %xmm2 2261; X86-AVX2-NEXT: vmovdqa 16(%esi), %xmm3 2262; X86-AVX2-NEXT: vpextrd $1, %xmm3, %ecx 2263; X86-AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4 2264; X86-AVX2-NEXT: vpextrd $1, %xmm4, %eax 2265; X86-AVX2-NEXT: xorl %edx, %edx 2266; X86-AVX2-NEXT: divl %ecx 2267; X86-AVX2-NEXT: movl %edx, %ecx 2268; X86-AVX2-NEXT: vmovd %xmm3, %edi 2269; X86-AVX2-NEXT: vmovd %xmm4, %eax 2270; X86-AVX2-NEXT: xorl %edx, %edx 2271; X86-AVX2-NEXT: divl %edi 2272; X86-AVX2-NEXT: vmovd %edx, %xmm5 
2273; X86-AVX2-NEXT: vpinsrd $1, %ecx, %xmm5, %xmm5 2274; X86-AVX2-NEXT: vpextrd $2, %xmm3, %ecx 2275; X86-AVX2-NEXT: vpextrd $2, %xmm4, %eax 2276; X86-AVX2-NEXT: xorl %edx, %edx 2277; X86-AVX2-NEXT: divl %ecx 2278; X86-AVX2-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5 2279; X86-AVX2-NEXT: vpextrd $3, %xmm3, %ecx 2280; X86-AVX2-NEXT: vpextrd $3, %xmm4, %eax 2281; X86-AVX2-NEXT: xorl %edx, %edx 2282; X86-AVX2-NEXT: divl %ecx 2283; X86-AVX2-NEXT: vpinsrd $3, %edx, %xmm5, %xmm3 2284; X86-AVX2-NEXT: vpextrd $1, %xmm2, %ecx 2285; X86-AVX2-NEXT: vpextrd $1, %xmm1, %eax 2286; X86-AVX2-NEXT: xorl %edx, %edx 2287; X86-AVX2-NEXT: divl %ecx 2288; X86-AVX2-NEXT: movl %edx, %ecx 2289; X86-AVX2-NEXT: vmovd %xmm2, %edi 2290; X86-AVX2-NEXT: vmovd %xmm1, %eax 2291; X86-AVX2-NEXT: xorl %edx, %edx 2292; X86-AVX2-NEXT: divl %edi 2293; X86-AVX2-NEXT: vmovd %edx, %xmm4 2294; X86-AVX2-NEXT: vpinsrd $1, %ecx, %xmm4, %xmm4 2295; X86-AVX2-NEXT: vpextrd $2, %xmm2, %ecx 2296; X86-AVX2-NEXT: vpextrd $2, %xmm1, %eax 2297; X86-AVX2-NEXT: xorl %edx, %edx 2298; X86-AVX2-NEXT: divl %ecx 2299; X86-AVX2-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4 2300; X86-AVX2-NEXT: vpextrd $3, %xmm2, %ecx 2301; X86-AVX2-NEXT: vpextrd $3, %xmm1, %eax 2302; X86-AVX2-NEXT: xorl %edx, %edx 2303; X86-AVX2-NEXT: divl %ecx 2304; X86-AVX2-NEXT: vpinsrd $3, %edx, %xmm4, %xmm1 2305; X86-AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 2306; X86-AVX2-NEXT: vmovd %xmm0, %eax 2307; X86-AVX2-NEXT: xorl %edx, %edx 2308; X86-AVX2-NEXT: divl 32(%esi) 2309; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm0 = [8199,8199,8199,8199,8199,8199,8199,8199] 2310; X86-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0 2311; X86-AVX2-NEXT: imull $8199, %edx, %eax # imm = 0x2007 2312; X86-AVX2-NEXT: movl %eax, (%eax) 2313; X86-AVX2-NEXT: vmovdqa %ymm0, (%eax) 2314; X86-AVX2-NEXT: popl %esi 2315; X86-AVX2-NEXT: popl %edi 2316; X86-AVX2-NEXT: vzeroupper 2317; X86-AVX2-NEXT: retl 2318; 2319; X64-SSE-LABEL: PR34947: 2320; X64-SSE: # %bb.0: 2321; X64-SSE-NEXT: movdqa (%rdi), %xmm4 2322; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 2323; X64-SSE-NEXT: movdqa (%rsi), %xmm1 2324; X64-SSE-NEXT: movdqa 16(%rsi), %xmm5 2325; X64-SSE-NEXT: pxor %xmm3, %xmm3 2326; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] 2327; X64-SSE-NEXT: movdqa %xmm4, %xmm2 2328; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] 2329; X64-SSE-NEXT: movdqa %xmm4, %xmm6 2330; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] 2331; X64-SSE-NEXT: movdqa %xmm4, %xmm3 2332; X64-SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 2333; X64-SSE-NEXT: movd %xmm3, %eax 2334; X64-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[3,3,3,3] 2335; X64-SSE-NEXT: movd %xmm3, %ecx 2336; X64-SSE-NEXT: xorl %edx, %edx 2337; X64-SSE-NEXT: divl %ecx 2338; X64-SSE-NEXT: movd %edx, %xmm3 2339; X64-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[2,3,2,3] 2340; X64-SSE-NEXT: movd %xmm7, %eax 2341; X64-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,3,2,3] 2342; X64-SSE-NEXT: movd %xmm7, %ecx 2343; X64-SSE-NEXT: xorl %edx, %edx 2344; X64-SSE-NEXT: divl %ecx 2345; X64-SSE-NEXT: movd %edx, %xmm7 2346; X64-SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] 2347; X64-SSE-NEXT: movd %xmm6, %eax 2348; X64-SSE-NEXT: movd %xmm5, %ecx 2349; X64-SSE-NEXT: xorl %edx, %edx 2350; X64-SSE-NEXT: divl %ecx 2351; X64-SSE-NEXT: movd %edx, %xmm3 2352; 
X64-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,1,1] 2353; X64-SSE-NEXT: movd %xmm6, %eax 2354; X64-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,1,1] 2355; X64-SSE-NEXT: movd %xmm5, %ecx 2356; X64-SSE-NEXT: xorl %edx, %edx 2357; X64-SSE-NEXT: divl %ecx 2358; X64-SSE-NEXT: movd %edx, %xmm5 2359; X64-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] 2360; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0] 2361; X64-SSE-NEXT: movdqa %xmm4, %xmm5 2362; X64-SSE-NEXT: psrld $16, %xmm5 2363; X64-SSE-NEXT: movd %xmm5, %eax 2364; X64-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] 2365; X64-SSE-NEXT: movd %xmm5, %ecx 2366; X64-SSE-NEXT: xorl %edx, %edx 2367; X64-SSE-NEXT: divl %ecx 2368; X64-SSE-NEXT: movd %edx, %xmm6 2369; X64-SSE-NEXT: movd %xmm2, %eax 2370; X64-SSE-NEXT: movd %xmm1, %ecx 2371; X64-SSE-NEXT: xorl %edx, %edx 2372; X64-SSE-NEXT: divl %ecx 2373; X64-SSE-NEXT: movd %edx, %xmm5 2374; X64-SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] 2375; X64-SSE-NEXT: psrlq $48, %xmm4 2376; X64-SSE-NEXT: movd %xmm4, %eax 2377; X64-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[3,3,3,3] 2378; X64-SSE-NEXT: movd %xmm4, %ecx 2379; X64-SSE-NEXT: xorl %edx, %edx 2380; X64-SSE-NEXT: divl %ecx 2381; X64-SSE-NEXT: movd %edx, %xmm4 2382; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] 2383; X64-SSE-NEXT: movd %xmm2, %eax 2384; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 2385; X64-SSE-NEXT: movd %xmm1, %ecx 2386; X64-SSE-NEXT: xorl %edx, %edx 2387; X64-SSE-NEXT: divl %ecx 2388; X64-SSE-NEXT: movd %edx, %xmm1 2389; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] 2390; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm1[0] 2391; X64-SSE-NEXT: movd %xmm0, %eax 2392; X64-SSE-NEXT: xorl %edx, %edx 2393; X64-SSE-NEXT: divl 32(%rsi) 2394; X64-SSE-NEXT: movdqa {{.*#+}} xmm0 = [8199,8199,8199,8199] 2395; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] 2396; X64-SSE-NEXT: pmuludq %xmm0, %xmm5 2397; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,2,2,3] 2398; X64-SSE-NEXT: pmuludq %xmm0, %xmm1 2399; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 2400; X64-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 2401; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] 2402; X64-SSE-NEXT: pmuludq %xmm0, %xmm3 2403; X64-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] 2404; X64-SSE-NEXT: pmuludq %xmm0, %xmm1 2405; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] 2406; X64-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] 2407; X64-SSE-NEXT: imull $8199, %edx, %eax # imm = 0x2007 2408; X64-SSE-NEXT: movl %eax, (%rax) 2409; X64-SSE-NEXT: movdqa %xmm3, (%rax) 2410; X64-SSE-NEXT: movdqa %xmm2, (%rax) 2411; X64-SSE-NEXT: retq 2412; 2413; X64-AVX1-LABEL: PR34947: 2414; X64-AVX1: # %bb.0: 2415; X64-AVX1-NEXT: pushq %rbp 2416; X64-AVX1-NEXT: pushq %rbx 2417; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 2418; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 2419; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 2420; X64-AVX1-NEXT: vmovd %xmm1, %eax 2421; X64-AVX1-NEXT: xorl %edx, %edx 2422; X64-AVX1-NEXT: divl 32(%rsi) 2423; X64-AVX1-NEXT: movl %edx, %r8d 2424; X64-AVX1-NEXT: vpextrd $3, %xmm2, %eax 2425; X64-AVX1-NEXT: vmovdqa (%rsi), %xmm1 2426; X64-AVX1-NEXT: vmovdqa 16(%rsi), %xmm3 2427; X64-AVX1-NEXT: vpextrd $3, %xmm3, %ecx 2428; X64-AVX1-NEXT: xorl %edx, %edx 2429; X64-AVX1-NEXT: divl %ecx 2430; 
X64-AVX1-NEXT: movl %edx, %r9d 2431; X64-AVX1-NEXT: vpextrd $2, %xmm2, %eax 2432; X64-AVX1-NEXT: vpextrd $2, %xmm3, %ecx 2433; X64-AVX1-NEXT: xorl %edx, %edx 2434; X64-AVX1-NEXT: divl %ecx 2435; X64-AVX1-NEXT: movl %edx, %r10d 2436; X64-AVX1-NEXT: vpextrd $1, %xmm2, %eax 2437; X64-AVX1-NEXT: vpextrd $1, %xmm3, %ecx 2438; X64-AVX1-NEXT: xorl %edx, %edx 2439; X64-AVX1-NEXT: divl %ecx 2440; X64-AVX1-NEXT: movl %edx, %r11d 2441; X64-AVX1-NEXT: vmovd %xmm2, %eax 2442; X64-AVX1-NEXT: vmovd %xmm3, %ecx 2443; X64-AVX1-NEXT: xorl %edx, %edx 2444; X64-AVX1-NEXT: divl %ecx 2445; X64-AVX1-NEXT: movl %edx, %esi 2446; X64-AVX1-NEXT: vpextrd $3, %xmm0, %eax 2447; X64-AVX1-NEXT: vpextrd $3, %xmm1, %ecx 2448; X64-AVX1-NEXT: xorl %edx, %edx 2449; X64-AVX1-NEXT: divl %ecx 2450; X64-AVX1-NEXT: movl %edx, %edi 2451; X64-AVX1-NEXT: vpextrd $2, %xmm0, %eax 2452; X64-AVX1-NEXT: vpextrd $2, %xmm1, %ecx 2453; X64-AVX1-NEXT: xorl %edx, %edx 2454; X64-AVX1-NEXT: divl %ecx 2455; X64-AVX1-NEXT: movl %edx, %ecx 2456; X64-AVX1-NEXT: vpextrd $1, %xmm0, %eax 2457; X64-AVX1-NEXT: vpextrd $1, %xmm1, %ebx 2458; X64-AVX1-NEXT: xorl %edx, %edx 2459; X64-AVX1-NEXT: divl %ebx 2460; X64-AVX1-NEXT: movl %edx, %ebx 2461; X64-AVX1-NEXT: vmovd %xmm0, %eax 2462; X64-AVX1-NEXT: vmovd %xmm1, %ebp 2463; X64-AVX1-NEXT: xorl %edx, %edx 2464; X64-AVX1-NEXT: divl %ebp 2465; X64-AVX1-NEXT: vmovd %edx, %xmm0 2466; X64-AVX1-NEXT: vpinsrd $1, %ebx, %xmm0, %xmm0 2467; X64-AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 2468; X64-AVX1-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0 2469; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8199,8199,8199,8199] 2470; X64-AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 2471; X64-AVX1-NEXT: vmovd %esi, %xmm2 2472; X64-AVX1-NEXT: vpinsrd $1, %r11d, %xmm2, %xmm2 2473; X64-AVX1-NEXT: vpinsrd $2, %r10d, %xmm2, %xmm2 2474; X64-AVX1-NEXT: vpinsrd $3, %r9d, %xmm2, %xmm2 2475; X64-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1 2476; X64-AVX1-NEXT: imull $8199, %r8d, %eax # imm = 0x2007 2477; X64-AVX1-NEXT: movl %eax, (%rax) 2478; X64-AVX1-NEXT: vmovdqa %xmm1, (%rax) 2479; X64-AVX1-NEXT: vmovdqa %xmm0, (%rax) 2480; X64-AVX1-NEXT: popq %rbx 2481; X64-AVX1-NEXT: popq %rbp 2482; X64-AVX1-NEXT: retq 2483; 2484; X64-AVX2-LABEL: PR34947: 2485; X64-AVX2: # %bb.0: 2486; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 2487; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 2488; X64-AVX2-NEXT: vmovdqa (%rsi), %xmm2 2489; X64-AVX2-NEXT: vmovdqa 16(%rsi), %xmm3 2490; X64-AVX2-NEXT: vpextrd $1, %xmm3, %ecx 2491; X64-AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4 2492; X64-AVX2-NEXT: vpextrd $1, %xmm4, %eax 2493; X64-AVX2-NEXT: xorl %edx, %edx 2494; X64-AVX2-NEXT: divl %ecx 2495; X64-AVX2-NEXT: movl %edx, %ecx 2496; X64-AVX2-NEXT: vmovd %xmm3, %edi 2497; X64-AVX2-NEXT: vmovd %xmm4, %eax 2498; X64-AVX2-NEXT: xorl %edx, %edx 2499; X64-AVX2-NEXT: divl %edi 2500; X64-AVX2-NEXT: vmovd %edx, %xmm5 2501; X64-AVX2-NEXT: vpinsrd $1, %ecx, %xmm5, %xmm5 2502; X64-AVX2-NEXT: vpextrd $2, %xmm3, %ecx 2503; X64-AVX2-NEXT: vpextrd $2, %xmm4, %eax 2504; X64-AVX2-NEXT: xorl %edx, %edx 2505; X64-AVX2-NEXT: divl %ecx 2506; X64-AVX2-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5 2507; X64-AVX2-NEXT: vpextrd $3, %xmm3, %ecx 2508; X64-AVX2-NEXT: vpextrd $3, %xmm4, %eax 2509; X64-AVX2-NEXT: xorl %edx, %edx 2510; X64-AVX2-NEXT: divl %ecx 2511; X64-AVX2-NEXT: vpinsrd $3, %edx, %xmm5, %xmm3 2512; X64-AVX2-NEXT: vpextrd $1, %xmm2, %ecx 
; X64-AVX2-NEXT: vpextrd $1, %xmm1, %eax
; X64-AVX2-NEXT: xorl %edx, %edx
; X64-AVX2-NEXT: divl %ecx
; X64-AVX2-NEXT: movl %edx, %ecx
; X64-AVX2-NEXT: vmovd %xmm2, %edi
; X64-AVX2-NEXT: vmovd %xmm1, %eax
; X64-AVX2-NEXT: xorl %edx, %edx
; X64-AVX2-NEXT: divl %edi
; X64-AVX2-NEXT: vmovd %edx, %xmm4
; X64-AVX2-NEXT: vpinsrd $1, %ecx, %xmm4, %xmm4
; X64-AVX2-NEXT: vpextrd $2, %xmm2, %ecx
; X64-AVX2-NEXT: vpextrd $2, %xmm1, %eax
; X64-AVX2-NEXT: xorl %edx, %edx
; X64-AVX2-NEXT: divl %ecx
; X64-AVX2-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
; X64-AVX2-NEXT: vpextrd $3, %xmm2, %ecx
; X64-AVX2-NEXT: vpextrd $3, %xmm1, %eax
; X64-AVX2-NEXT: xorl %edx, %edx
; X64-AVX2-NEXT: divl %ecx
; X64-AVX2-NEXT: vpinsrd $3, %edx, %xmm4, %xmm1
; X64-AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
; X64-AVX2-NEXT: vmovd %xmm0, %eax
; X64-AVX2-NEXT: xorl %edx, %edx
; X64-AVX2-NEXT: divl 32(%rsi)
; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm0 = [8199,8199,8199,8199,8199,8199,8199,8199]
; X64-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: imull $8199, %edx, %eax # imm = 0x2007
; X64-AVX2-NEXT: movl %eax, (%rax)
; X64-AVX2-NEXT: vmovdqa %ymm0, (%rax)
; X64-AVX2-NEXT: vzeroupper
; X64-AVX2-NEXT: retq
  %a0 = load <9 x i16>, <9 x i16>* %p0, align 64
  %a1 = load <9 x i32>, <9 x i32>* %p1, align 64
  %ext0 = zext <9 x i16> %a0 to <9 x i32>
  %rem = urem <9 x i32> %ext0, %a1
  %mul = mul <9 x i32> <i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199>, %rem
  store <9 x i32> %mul, <9 x i32>* undef, align 64
  ret void
}
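
; PR34947 note: x86 has no vector integer division, so the <9 x i32> urem above is
; scalarized into per-element divl instructions in every configuration checked here;
; only the multiply by 8199 stays vectorized (pmuludq under SSE, vpmulld under
; AVX/AVX2), with the ninth element finished by a scalar imull.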