; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s

@c = external global i32*, align 8

; %val1 = load <2 x i8>
; %op1 = zext<2 x i32> %val1
; %val2 = load <2 x i8>
; %op2 = zext<2 x i32> %val2
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_2xi8:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movzwl (%rdi,%rdx), %ecx
; CHECK-NEXT:    movd %ecx, %xmm0
; CHECK-NEXT:    movzwl (%rsi,%rdx), %ecx
; CHECK-NEXT:    movd %ecx, %xmm1
; CHECK-NEXT:    pxor %xmm2, %xmm2
; CHECK-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; CHECK-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; CHECK-NEXT:    pmullw %xmm0, %xmm1
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; CHECK-NEXT:    movq %xmm1, (%rax,%rdx,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <2 x i8>*
  %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1
  %tmp12 = zext <2 x i8> %wide.load17 to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <4 x i8>
; %op1 = zext<4 x i32> %val1
; %val2 = load <4 x i8>
; %op2 = zext<4 x i32> %val2
; %rst = mul <4 x i32> %op1, %op2
;
define void @mul_4xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_4xi8:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT:    pxor %xmm2, %xmm2
; CHECK-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; CHECK-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; CHECK-NEXT:    pmullw %xmm0, %xmm1
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; CHECK-NEXT:    movdqu %xmm1, (%rax,%rdx,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <4 x i8>*
  %wide.load = load <4 x i8>, <4 x i8>* %tmp7, align 1
  %tmp8 = zext <4 x i8> %wide.load to <4 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <4 x i8>*
  %wide.load17 = load <4 x i8>, <4 x i8>* %tmp11, align 1
  %tmp12 = zext <4 x i8> %wide.load17 to <4 x i32>
  %tmp13 = mul nuw nsw <4 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <4 x i32>*
  store <4 x i32> %tmp13, <4 x i32>* %tmp15, align 4
  ret void
}
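
; Note: in both zero-extend tests above the 32-bit multiply shrinks to
; pmullw: each zero-extended byte fits in 8 bits, so the 16-bit product is
; exact, and unpacking against a zeroed register widens it back to 32-bit
; lanes.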

; %val1 = load <8 x i8>
; %op1 = zext<8 x i32> %val1
; %val2 = load <8 x i8>
; %op2 = zext<8 x i32> %val2
; %rst = mul <8 x i32> %op1, %op2
;
define void @mul_8xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_8xi8:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT:    pxor %xmm2, %xmm2
; CHECK-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; CHECK-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; CHECK-NEXT:    pmullw %xmm0, %xmm1
; CHECK-NEXT:    movdqa %xmm1, %xmm0
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; CHECK-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; CHECK-NEXT:    movdqu %xmm1, 16(%rax,%rdx,4)
; CHECK-NEXT:    movdqu %xmm0, (%rax,%rdx,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <8 x i8>*
  %wide.load = load <8 x i8>, <8 x i8>* %tmp7, align 1
  %tmp8 = zext <8 x i8> %wide.load to <8 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <8 x i8>*
  %wide.load17 = load <8 x i8>, <8 x i8>* %tmp11, align 1
  %tmp12 = zext <8 x i8> %wide.load17 to <8 x i32>
  %tmp13 = mul nuw nsw <8 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <8 x i32>*
  store <8 x i32> %tmp13, <8 x i32>* %tmp15, align 4
  ret void
}
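
; Note: from 8 elements up the widened products no longer fit in a single
; xmm register, so the low and high word halves are unpacked separately and
; stored as one 16-byte chunk per <4 x i32>.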

; %val1 = load <16 x i8>
; %op1 = zext<16 x i32> %val1
; %val2 = load <16 x i8>
; %op2 = zext<16 x i32> %val2
; %rst = mul <16 x i32> %op1, %op2
;
define void @mul_16xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_16xi8:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movdqu (%rdi,%rdx), %xmm0
; CHECK-NEXT:    movdqu (%rsi,%rdx), %xmm1
; CHECK-NEXT:    pxor %xmm2, %xmm2
; CHECK-NEXT:    movdqa %xmm0, %xmm3
; CHECK-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
; CHECK-NEXT:    movdqa %xmm1, %xmm4
; CHECK-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
; CHECK-NEXT:    pmullw %xmm3, %xmm4
; CHECK-NEXT:    movdqa %xmm4, %xmm3
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; CHECK-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
; CHECK-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
; CHECK-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
; CHECK-NEXT:    pmullw %xmm0, %xmm1
; CHECK-NEXT:    movdqa %xmm1, %xmm0
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; CHECK-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; CHECK-NEXT:    movdqu %xmm1, 48(%rax,%rdx,4)
; CHECK-NEXT:    movdqu %xmm0, 32(%rax,%rdx,4)
; CHECK-NEXT:    movdqu %xmm4, 16(%rax,%rdx,4)
; CHECK-NEXT:    movdqu %xmm3, (%rax,%rdx,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <16 x i8>*
  %wide.load = load <16 x i8>, <16 x i8>* %tmp7, align 1
  %tmp8 = zext <16 x i8> %wide.load to <16 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <16 x i8>*
  %wide.load17 = load <16 x i8>, <16 x i8>* %tmp11, align 1
  %tmp12 = zext <16 x i8> %wide.load17 to <16 x i32>
  %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <16 x i32>*
  store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <2 x i16>
; %op1 = zext<2 x i32> %val1
; %val2 = load <2 x i16>
; %op2 = zext<2 x i32> %val2
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_2xi16:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT:    movdqa %xmm1, %xmm2
; CHECK-NEXT:    pmulhuw %xmm0, %xmm2
; CHECK-NEXT:    pmullw %xmm0, %xmm1
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; CHECK-NEXT:    movq %xmm1, (%rax,%rdx,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
  %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <2 x i16>*
  %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1
  %tmp12 = zext <2 x i16> %wide.load17 to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}
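
; Note: an i16*i16 product needs a full 32 bits, so the i16 tests pair
; pmullw (low half) with pmulhuw (high half) and interleave the two with
; punpcklwd to reassemble the 32-bit results.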

; %val1 = load <4 x i16>
; %op1 = zext<4 x i32> %val1
; %val2 = load <4 x i16>
; %op2 = zext<4 x i32> %val2
; %rst = mul <4 x i32> %op1, %op2
;
define void @mul_4xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_4xi16:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT:    movdqa %xmm1, %xmm2
; CHECK-NEXT:    pmulhuw %xmm0, %xmm2
; CHECK-NEXT:    pmullw %xmm0, %xmm1
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; CHECK-NEXT:    movdqu %xmm1, (%rax,%rdx,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <4 x i16>*
  %wide.load = load <4 x i16>, <4 x i16>* %tmp7, align 1
  %tmp8 = zext <4 x i16> %wide.load to <4 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <4 x i16>*
  %wide.load17 = load <4 x i16>, <4 x i16>* %tmp11, align 1
  %tmp12 = zext <4 x i16> %wide.load17 to <4 x i32>
  %tmp13 = mul nuw nsw <4 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <4 x i32>*
  store <4 x i32> %tmp13, <4 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <8 x i16>
; %op1 = zext<8 x i32> %val1
; %val2 = load <8 x i16>
; %op2 = zext<8 x i32> %val2
; %rst = mul <8 x i32> %op1, %op2
;
define void @mul_8xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_8xi16:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movdqu (%rdi,%rdx), %xmm0
; CHECK-NEXT:    movdqu (%rsi,%rdx), %xmm1
; CHECK-NEXT:    movdqa %xmm1, %xmm2
; CHECK-NEXT:    pmulhuw %xmm0, %xmm2
; CHECK-NEXT:    pmullw %xmm0, %xmm1
; CHECK-NEXT:    movdqa %xmm1, %xmm0
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; CHECK-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; CHECK-NEXT:    movdqu %xmm1, 16(%rax,%rdx,4)
; CHECK-NEXT:    movdqu %xmm0, (%rax,%rdx,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <8 x i16>*
  %wide.load = load <8 x i16>, <8 x i16>* %tmp7, align 1
  %tmp8 = zext <8 x i16> %wide.load to <8 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <8 x i16>*
  %wide.load17 = load <8 x i16>, <8 x i16>* %tmp11, align 1
  %tmp12 = zext <8 x i16> %wide.load17 to <8 x i32>
  %tmp13 = mul nuw nsw <8 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <8 x i32>*
  store <8 x i32> %tmp13, <8 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <16 x i16>
; %op1 = zext<16 x i32> %val1
; %val2 = load <16 x i16>
; %op2 = zext<16 x i32> %val2
; %rst = mul <16 x i32> %op1, %op2
;
define void @mul_16xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_16xi16:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movdqu (%rdi,%rdx), %xmm0
; CHECK-NEXT:    movdqu 16(%rdi,%rdx), %xmm1
; CHECK-NEXT:    movdqu (%rsi,%rdx), %xmm2
; CHECK-NEXT:    movdqu 16(%rsi,%rdx), %xmm3
; CHECK-NEXT:    movdqa %xmm2, %xmm4
; CHECK-NEXT:    pmulhuw %xmm0, %xmm4
; CHECK-NEXT:    pmullw %xmm0, %xmm2
; CHECK-NEXT:    movdqa %xmm2, %xmm0
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; CHECK-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
; CHECK-NEXT:    movdqa %xmm3, %xmm4
; CHECK-NEXT:    pmulhuw %xmm1, %xmm4
; CHECK-NEXT:    pmullw %xmm1, %xmm3
; CHECK-NEXT:    movdqa %xmm3, %xmm1
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
; CHECK-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; CHECK-NEXT:    movdqu %xmm3, 48(%rax,%rdx,4)
; CHECK-NEXT:    movdqu %xmm1, 32(%rax,%rdx,4)
; CHECK-NEXT:    movdqu %xmm2, 16(%rax,%rdx,4)
; CHECK-NEXT:    movdqu %xmm0, (%rax,%rdx,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <16 x i16>*
  %wide.load = load <16 x i16>, <16 x i16>* %tmp7, align 1
  %tmp8 = zext <16 x i16> %wide.load to <16 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <16 x i16>*
  %wide.load17 = load <16 x i16>, <16 x i16>* %tmp11, align 1
  %tmp12 = zext <16 x i16> %wide.load17 to <16 x i32>
  %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <16 x i32>*
  store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <2 x i8>
; %op1 = sext<2 x i32> %val1
; %val2 = load <2 x i8>
; %op2 = sext<2 x i32> %val2
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_2xi8_sext:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movzwl (%rdi,%rdx), %ecx
; CHECK-NEXT:    movd %ecx, %xmm0
; CHECK-NEXT:    movzwl (%rsi,%rdx), %ecx
; CHECK-NEXT:    movd %ecx, %xmm1
; CHECK-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; CHECK-NEXT:    psraw $8, %xmm0
; CHECK-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; CHECK-NEXT:    psraw $8, %xmm1
; CHECK-NEXT:    pmullw %xmm0, %xmm1
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; CHECK-NEXT:    psrad $16, %xmm0
; CHECK-NEXT:    movq %xmm0, (%rax,%rdx,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <2 x i8>*
  %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1
  %tmp12 = sext <2 x i8> %wide.load17 to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}
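
; Note: the sign-extend variants need no zero register: punpcklbw duplicates
; each byte into a word and psraw $8 shifts the copy down arithmetically,
; and the product words are widened the same way by an interleave followed
; by psrad $16.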

; %val1 = load <2 x i8>
; %op1 = sext<2 x i32> %val1
; %val2 = load <2 x i8>
; %op2 = zext<2 x i32> %val2
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_2xi8_sext_zext:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movzwl (%rdi,%rdx), %ecx
; CHECK-NEXT:    movd %ecx, %xmm0
; CHECK-NEXT:    movzwl (%rsi,%rdx), %ecx
; CHECK-NEXT:    movd %ecx, %xmm1
; CHECK-NEXT:    pxor %xmm2, %xmm2
; CHECK-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; CHECK-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; CHECK-NEXT:    psraw $8, %xmm0
; CHECK-NEXT:    movdqa %xmm1, %xmm2
; CHECK-NEXT:    pmulhw %xmm0, %xmm2
; CHECK-NEXT:    pmullw %xmm1, %xmm0
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; CHECK-NEXT:    movq %xmm0, (%rax,%rdx,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <2 x i8>*
  %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1
  %tmp12 = zext <2 x i8> %wide.load17 to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <2 x i16>
; %op1 = sext<2 x i32> %val1
; %val2 = load <2 x i16>
; %op2 = sext<2 x i32> %val2
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_2xi16_sext:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT:    movdqa %xmm1, %xmm2
; CHECK-NEXT:    pmulhw %xmm0, %xmm2
; CHECK-NEXT:    pmullw %xmm0, %xmm1
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; CHECK-NEXT:    movq %xmm1, (%rax,%rdx,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
  %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <2 x i16>*
  %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1
  %tmp12 = sext <2 x i16> %wide.load17 to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}
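
; Note: with mixed extensions on i16, neither pmulhw nor pmulhuw computes
; the correct high half (the zero-extended operand may use all 16 bits), so
; the next test expands to a 32-bit multiply built from pmuludq, shifts and
; paddq.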

; %val1 = load <2 x i16>
; %op1 = sext<2 x i32> %val1
; %val2 = load <2 x i16>
; %op2 = zext<2 x i32> %val2
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_2xi16_sext_zext:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; CHECK-NEXT:    psrad $16, %xmm0
; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; CHECK-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT:    pxor %xmm2, %xmm2
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
; CHECK-NEXT:    movdqa %xmm1, %xmm2
; CHECK-NEXT:    pmuludq %xmm0, %xmm2
; CHECK-NEXT:    movdqa %xmm0, %xmm3
; CHECK-NEXT:    psrlq $32, %xmm3
; CHECK-NEXT:    pmuludq %xmm1, %xmm3
; CHECK-NEXT:    psllq $32, %xmm3
; CHECK-NEXT:    paddq %xmm2, %xmm3
; CHECK-NEXT:    psrlq $32, %xmm1
; CHECK-NEXT:    pmuludq %xmm0, %xmm1
; CHECK-NEXT:    psllq $32, %xmm1
; CHECK-NEXT:    paddq %xmm3, %xmm1
; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; CHECK-NEXT:    movq %xmm0, (%rax,%rdx,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
  %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <2 x i16>*
  %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1
  %tmp12 = zext <2 x i16> %wide.load17 to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <16 x i16>
; %op1 = sext<16 x i32> %val1
; %val2 = load <16 x i16>
; %op2 = sext<16 x i32> %val2
; %rst = mul <16 x i32> %op1, %op2
;
define void @mul_16xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_16xi16_sext:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movdqu (%rdi,%rdx), %xmm0
; CHECK-NEXT:    movdqu 16(%rdi,%rdx), %xmm1
; CHECK-NEXT:    movdqu (%rsi,%rdx), %xmm2
; CHECK-NEXT:    movdqu 16(%rsi,%rdx), %xmm3
; CHECK-NEXT:    movdqa %xmm2, %xmm4
; CHECK-NEXT:    pmulhw %xmm0, %xmm4
; CHECK-NEXT:    pmullw %xmm0, %xmm2
; CHECK-NEXT:    movdqa %xmm2, %xmm0
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; CHECK-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
; CHECK-NEXT:    movdqa %xmm3, %xmm4
; CHECK-NEXT:    pmulhw %xmm1, %xmm4
; CHECK-NEXT:    pmullw %xmm1, %xmm3
; CHECK-NEXT:    movdqa %xmm3, %xmm1
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
; CHECK-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; CHECK-NEXT:    movdqu %xmm3, 48(%rax,%rdx,4)
; CHECK-NEXT:    movdqu %xmm1, 32(%rax,%rdx,4)
; CHECK-NEXT:    movdqu %xmm2, 16(%rax,%rdx,4)
; CHECK-NEXT:    movdqu %xmm0, (%rax,%rdx,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <16 x i16>*
  %wide.load = load <16 x i16>, <16 x i16>* %tmp7, align 1
  %tmp8 = sext <16 x i16> %wide.load to <16 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <16 x i16>*
  %wide.load17 = load <16 x i16>, <16 x i16>* %tmp11, align 1
  %tmp12 = sext <16 x i16> %wide.load17 to <16 x i32>
  %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <16 x i32>*
  store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4
  ret void
}
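
; Note: the varconst tests below choose constants just inside or just
; outside the representable zext/sext source range, toggling between pmullw
; alone, pmullw plus pmulh(u)w, and the full pmuludq expansion.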

; %val = load <2 x i8>
; %op1 = zext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 255)
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_varconst1(i8* nocapture readonly %a, i64 %index) {
; CHECK-LABEL: mul_2xi8_varconst1:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movzwl (%rdi,%rsi), %ecx
; CHECK-NEXT:    movd %ecx, %xmm0
; CHECK-NEXT:    pxor %xmm1, %xmm1
; CHECK-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; CHECK-NEXT:    pmullw {{.*}}(%rip), %xmm0
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; CHECK-NEXT:    movq %xmm0, (%rax,%rsi,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 255>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i8>
; %op1 = sext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-128 ~ 127)
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_varconst2(i8* nocapture readonly %a, i64 %index) {
; CHECK-LABEL: mul_2xi8_varconst2:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movzwl (%rdi,%rsi), %ecx
; CHECK-NEXT:    movd %ecx, %xmm0
; CHECK-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; CHECK-NEXT:    psraw $8, %xmm0
; CHECK-NEXT:    pmullw {{.*}}(%rip), %xmm0
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; CHECK-NEXT:    psrad $16, %xmm0
; CHECK-NEXT:    movq %xmm0, (%rax,%rsi,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -128, i32 127>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}
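
; Note: from here the i8 constants fall outside the zext (0 ~ 255) or sext
; (-128 ~ 127) byte range, so the high product bits are no longer known to
; be zero (or a pure sign) and pmulhw is emitted alongside pmullw.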

; %val = load <2 x i8>
; %op1 = zext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 256)
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_varconst3(i8* nocapture readonly %a, i64 %index) {
; CHECK-LABEL: mul_2xi8_varconst3:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movzwl (%rdi,%rsi), %ecx
; CHECK-NEXT:    movd %ecx, %xmm0
; CHECK-NEXT:    pxor %xmm1, %xmm1
; CHECK-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = <0,256,u,u,u,u,u,u>
; CHECK-NEXT:    movdqa %xmm0, %xmm2
; CHECK-NEXT:    pmulhw %xmm1, %xmm2
; CHECK-NEXT:    pmullw %xmm1, %xmm0
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; CHECK-NEXT:    movq %xmm0, (%rax,%rsi,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 256>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i8>
; %op1 = zext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-1 ~ 255)
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_varconst4(i8* nocapture readonly %a, i64 %index) {
; CHECK-LABEL: mul_2xi8_varconst4:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movzwl (%rdi,%rsi), %ecx
; CHECK-NEXT:    movd %ecx, %xmm0
; CHECK-NEXT:    pxor %xmm1, %xmm1
; CHECK-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u>
; CHECK-NEXT:    movdqa %xmm0, %xmm2
; CHECK-NEXT:    pmulhw %xmm1, %xmm2
; CHECK-NEXT:    pmullw %xmm1, %xmm0
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; CHECK-NEXT:    movq %xmm0, (%rax,%rsi,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -1, i32 255>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i8>
; %op1 = sext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-129 ~ 127)
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_varconst5(i8* nocapture readonly %a, i64 %index) {
; CHECK-LABEL: mul_2xi8_varconst5:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movzwl (%rdi,%rsi), %ecx
; CHECK-NEXT:    movd %ecx, %xmm0
; CHECK-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; CHECK-NEXT:    psraw $8, %xmm0
; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = <65407,127,u,u,u,u,u,u>
; CHECK-NEXT:    movdqa %xmm0, %xmm2
; CHECK-NEXT:    pmulhw %xmm1, %xmm2
; CHECK-NEXT:    pmullw %xmm1, %xmm0
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; CHECK-NEXT:    movq %xmm0, (%rax,%rsi,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -129, i32 127>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i8>
; %op1 = sext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-128 ~ 128)
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_varconst6(i8* nocapture readonly %a, i64 %index) {
; CHECK-LABEL: mul_2xi8_varconst6:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movzwl (%rdi,%rsi), %ecx
; CHECK-NEXT:    movd %ecx, %xmm0
; CHECK-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; CHECK-NEXT:    psraw $8, %xmm0
; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u>
; CHECK-NEXT:    movdqa %xmm0, %xmm2
; CHECK-NEXT:    pmulhw %xmm1, %xmm2
; CHECK-NEXT:    pmullw %xmm1, %xmm0
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; CHECK-NEXT:    movq %xmm0, (%rax,%rsi,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -128, i32 128>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i16>
; %op1 = zext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 65535)
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi16_varconst1(i8* nocapture readonly %a, i64 %index) {
; CHECK-LABEL: mul_2xi16_varconst1:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u>
; CHECK-NEXT:    movdqa %xmm0, %xmm2
; CHECK-NEXT:    pmulhuw %xmm1, %xmm2
; CHECK-NEXT:    pmullw %xmm1, %xmm0
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; CHECK-NEXT:    movq %xmm0, (%rax,%rsi,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
  %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 65535>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i16>
; %op1 = sext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-32768 ~ 32767)
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi16_varconst2(i8* nocapture readonly %a, i64 %index) {
; CHECK-LABEL: mul_2xi16_varconst2:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = <32768,32767,u,u,u,u,u,u>
; CHECK-NEXT:    movdqa %xmm0, %xmm2
; CHECK-NEXT:    pmulhw %xmm1, %xmm2
; CHECK-NEXT:    pmullw %xmm1, %xmm0
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; CHECK-NEXT:    movq %xmm0, (%rax,%rsi,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
  %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -32768, i32 32767>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}
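
; Note: 65536 and 32768 in the final two tests do not fit a 16-bit lane of
; the required signedness, so the multiply expands to pmuludq with the
; constant materialized through movl/movd/pslldq.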

; %val = load <2 x i16>
; %op1 = zext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 65536)
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi16_varconst3(i8* nocapture readonly %a, i64 %index) {
; CHECK-LABEL: mul_2xi16_varconst3:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT:    pxor %xmm1, %xmm1
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; CHECK-NEXT:    movl $65536, %ecx # imm = 0x10000
; CHECK-NEXT:    movd %rcx, %xmm1
; CHECK-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
; CHECK-NEXT:    movdqa %xmm0, %xmm2
; CHECK-NEXT:    pmuludq %xmm1, %xmm2
; CHECK-NEXT:    psrlq $32, %xmm0
; CHECK-NEXT:    pmuludq %xmm1, %xmm0
; CHECK-NEXT:    psllq $32, %xmm0
; CHECK-NEXT:    paddq %xmm2, %xmm0
; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-NEXT:    movq %xmm0, (%rax,%rsi,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
  %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 65536>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i16>
; %op1 = sext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 32768)
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi16_varconst4(i8* nocapture readonly %a, i64 %index) {
; CHECK-LABEL: mul_2xi16_varconst4:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; CHECK-NEXT:    psrad $16, %xmm0
; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; CHECK-NEXT:    movl $32768, %ecx # imm = 0x8000
; CHECK-NEXT:    movd %rcx, %xmm1
; CHECK-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
; CHECK-NEXT:    movdqa %xmm0, %xmm2
; CHECK-NEXT:    pmuludq %xmm1, %xmm2
; CHECK-NEXT:    psrlq $32, %xmm0
; CHECK-NEXT:    pmuludq %xmm1, %xmm0
; CHECK-NEXT:    psllq $32, %xmm0
; CHECK-NEXT:    paddq %xmm2, %xmm0
; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-NEXT:    movq %xmm0, (%rax,%rsi,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
  %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 32768>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}