; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=silvermont | FileCheck %s --check-prefixes=CHECK32,SLM32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=silvermont | FileCheck %s --check-prefixes=CHECK64,SLM64
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse4.2,+slow-pmulld | FileCheck %s --check-prefixes=CHECK32,SLOW32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2,+slow-pmulld | FileCheck %s --check-prefixes=CHECK64,SLOW64
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE4-32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE4-64
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX-32,AVX2-32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX-64,AVX2-64
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX-32,AVX512-32,AVX512DQ-32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX-64,AVX512-64,AVX512DQ-64
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX-32,AVX512-32,AVX512BW-32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX-64,AVX512-64,AVX512BW-64
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=knl | FileCheck %s --check-prefixes=AVX-32,AVX512-32,KNL-32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefixes=AVX-64,AVX512-64,KNL-64

; Make sure that the slow-pmulld feature can be used without SSE4.1.
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=silvermont -mattr=-sse4.1

define <4 x i32> @test_mul_v4i32_v4i8(<4 x i8> %A) {
; CHECK32-LABEL: test_mul_v4i32_v4i8:
; CHECK32:       # %bb.0:
; CHECK32-NEXT:    pand {{\.LCPI.*}}, %xmm0
; CHECK32-NEXT:    pmaddwd {{\.LCPI.*}}, %xmm0
; CHECK32-NEXT:    retl
;
; CHECK64-LABEL: test_mul_v4i32_v4i8:
; CHECK64:       # %bb.0:
; CHECK64-NEXT:    pand {{.*}}(%rip), %xmm0
; CHECK64-NEXT:    pmaddwd {{.*}}(%rip), %xmm0
; CHECK64-NEXT:    retq
;
; SSE4-32-LABEL: test_mul_v4i32_v4i8:
; SSE4-32:       # %bb.0:
; SSE4-32-NEXT:    pand {{\.LCPI.*}}, %xmm0
; SSE4-32-NEXT:    pmaddwd {{\.LCPI.*}}, %xmm0
; SSE4-32-NEXT:    retl
;
; SSE4-64-LABEL: test_mul_v4i32_v4i8:
; SSE4-64:       # %bb.0:
; SSE4-64-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE4-64-NEXT:    pmaddwd {{.*}}(%rip), %xmm0
; SSE4-64-NEXT:    retq
;
; AVX2-32-LABEL: test_mul_v4i32_v4i8:
; AVX2-32:       # %bb.0:
; AVX2-32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
; AVX2-32-NEXT:    vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
; AVX2-32-NEXT:    retl
;
; AVX2-64-LABEL: test_mul_v4i32_v4i8:
; AVX2-64:       # %bb.0:
; AVX2-64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX2-64-NEXT:    vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-64-NEXT:    retq
;
; AVX512DQ-32-LABEL: test_mul_v4i32_v4i8:
; AVX512DQ-32:       # %bb.0:
; AVX512DQ-32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
; AVX512DQ-32-NEXT:    vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
; AVX512DQ-32-NEXT:    retl
;
; AVX512DQ-64-LABEL: test_mul_v4i32_v4i8:
; AVX512DQ-64:       # %bb.0:
; AVX512DQ-64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512DQ-64-NEXT:    vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
; AVX512DQ-64-NEXT:    retq
;
; AVX512BW-32-LABEL: test_mul_v4i32_v4i8:
; AVX512BW-32:       # %bb.0:
; AVX512BW-32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
; AVX512BW-32-NEXT:    vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
; AVX512BW-32-NEXT:    retl
;
; AVX512BW-64-LABEL: test_mul_v4i32_v4i8:
; AVX512BW-64:       # %bb.0:
; AVX512BW-64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-64-NEXT:    vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-64-NEXT:    retq
;
; KNL-32-LABEL: test_mul_v4i32_v4i8:
; KNL-32:       # %bb.0:
; KNL-32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
; KNL-32-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; KNL-32-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; KNL-32-NEXT:    retl
;
; KNL-64-LABEL: test_mul_v4i32_v4i8:
; KNL-64:       # %bb.0:
; KNL-64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; KNL-64-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; KNL-64-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; KNL-64-NEXT:    retq
  %z = zext <4 x i8> %A to <4 x i32>
  %m = mul nuw nsw <4 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778>
  ret <4 x i32> %m
}

define <8 x i32> @test_mul_v8i32_v8i8(<8 x i8> %A) {
; SLM32-LABEL: test_mul_v8i32_v8i8:
; SLM32:       # %bb.0:
; SLM32-NEXT:    movdqa %xmm0, %xmm1
; SLM32-NEXT:    pand {{\.LCPI.*}}, %xmm1
; SLM32-NEXT:    movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLM32-NEXT:    movdqa %xmm1, %xmm2
; SLM32-NEXT:    pmullw %xmm0, %xmm1
; SLM32-NEXT:    pmulhw %xmm0, %xmm2
; SLM32-NEXT:    movdqa %xmm1, %xmm0
; SLM32-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLM32-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SLM32-NEXT:    retl
;
; SLM64-LABEL: test_mul_v8i32_v8i8:
; SLM64:       # %bb.0:
; SLM64-NEXT:    movdqa %xmm0, %xmm1
; SLM64-NEXT:    pand {{.*}}(%rip), %xmm1
117; SLM64-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778] 118; SLM64-NEXT: movdqa %xmm1, %xmm2 119; SLM64-NEXT: pmullw %xmm0, %xmm1 120; SLM64-NEXT: pmulhw %xmm0, %xmm2 121; SLM64-NEXT: movdqa %xmm1, %xmm0 122; SLM64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 123; SLM64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 124; SLM64-NEXT: retq 125; 126; SLOW32-LABEL: test_mul_v8i32_v8i8: 127; SLOW32: # %bb.0: 128; SLOW32-NEXT: movdqa %xmm0, %xmm1 129; SLOW32-NEXT: pand {{\.LCPI.*}}, %xmm1 130; SLOW32-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778] 131; SLOW32-NEXT: movdqa %xmm1, %xmm2 132; SLOW32-NEXT: pmulhw %xmm0, %xmm2 133; SLOW32-NEXT: pmullw %xmm0, %xmm1 134; SLOW32-NEXT: movdqa %xmm1, %xmm0 135; SLOW32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 136; SLOW32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 137; SLOW32-NEXT: retl 138; 139; SLOW64-LABEL: test_mul_v8i32_v8i8: 140; SLOW64: # %bb.0: 141; SLOW64-NEXT: movdqa %xmm0, %xmm1 142; SLOW64-NEXT: pand {{.*}}(%rip), %xmm1 143; SLOW64-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778] 144; SLOW64-NEXT: movdqa %xmm1, %xmm2 145; SLOW64-NEXT: pmulhw %xmm0, %xmm2 146; SLOW64-NEXT: pmullw %xmm0, %xmm1 147; SLOW64-NEXT: movdqa %xmm1, %xmm0 148; SLOW64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 149; SLOW64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 150; SLOW64-NEXT: retq 151; 152; SSE4-32-LABEL: test_mul_v8i32_v8i8: 153; SSE4-32: # %bb.0: 154; SSE4-32-NEXT: pand {{\.LCPI.*}}, %xmm0 155; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 156; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 157; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 158; SSE4-32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778] 159; SSE4-32-NEXT: pmaddwd %xmm2, %xmm0 160; SSE4-32-NEXT: pmaddwd %xmm2, %xmm1 161; SSE4-32-NEXT: retl 162; 163; SSE4-64-LABEL: test_mul_v8i32_v8i8: 164; SSE4-64: # %bb.0: 165; SSE4-64-NEXT: pand {{.*}}(%rip), %xmm0 166; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 167; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 168; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 169; SSE4-64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778] 170; SSE4-64-NEXT: pmaddwd %xmm2, %xmm0 171; SSE4-64-NEXT: pmaddwd %xmm2, %xmm1 172; SSE4-64-NEXT: retq 173; 174; AVX2-32-LABEL: test_mul_v8i32_v8i8: 175; AVX2-32: # %bb.0: 176; AVX2-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 177; AVX2-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 178; AVX2-32-NEXT: vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0 179; AVX2-32-NEXT: retl 180; 181; AVX2-64-LABEL: test_mul_v8i32_v8i8: 182; AVX2-64: # %bb.0: 183; AVX2-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 184; AVX2-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 185; AVX2-64-NEXT: vpmaddwd {{.*}}(%rip), %ymm0, %ymm0 186; AVX2-64-NEXT: retq 187; 188; AVX512DQ-32-LABEL: test_mul_v8i32_v8i8: 189; 
AVX512DQ-32: # %bb.0: 190; AVX512DQ-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 191; AVX512DQ-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 192; AVX512DQ-32-NEXT: vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0 193; AVX512DQ-32-NEXT: retl 194; 195; AVX512DQ-64-LABEL: test_mul_v8i32_v8i8: 196; AVX512DQ-64: # %bb.0: 197; AVX512DQ-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 198; AVX512DQ-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 199; AVX512DQ-64-NEXT: vpmaddwd {{.*}}(%rip), %ymm0, %ymm0 200; AVX512DQ-64-NEXT: retq 201; 202; AVX512BW-32-LABEL: test_mul_v8i32_v8i8: 203; AVX512BW-32: # %bb.0: 204; AVX512BW-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 205; AVX512BW-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 206; AVX512BW-32-NEXT: vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0 207; AVX512BW-32-NEXT: retl 208; 209; AVX512BW-64-LABEL: test_mul_v8i32_v8i8: 210; AVX512BW-64: # %bb.0: 211; AVX512BW-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 212; AVX512BW-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 213; AVX512BW-64-NEXT: vpmaddwd {{.*}}(%rip), %ymm0, %ymm0 214; AVX512BW-64-NEXT: retq 215; 216; KNL-32-LABEL: test_mul_v8i32_v8i8: 217; KNL-32: # %bb.0: 218; KNL-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 219; KNL-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 220; KNL-32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778] 221; KNL-32-NEXT: vpmulld %ymm1, %ymm0, %ymm0 222; KNL-32-NEXT: retl 223; 224; KNL-64-LABEL: test_mul_v8i32_v8i8: 225; KNL-64: # %bb.0: 226; KNL-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 227; KNL-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 228; KNL-64-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778] 229; KNL-64-NEXT: vpmulld %ymm1, %ymm0, %ymm0 230; KNL-64-NEXT: retq 231 %z = zext <8 x i8> %A to <8 x i32> 232 %m = mul nuw nsw <8 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778> 233 ret <8 x i32> %m 234} 235 236define <16 x i32> @test_mul_v16i32_v16i8(<16 x i8> %A) { 237; SLM32-LABEL: test_mul_v16i32_v16i8: 238; SLM32: # %bb.0: 239; SLM32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778] 240; SLM32-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 241; SLM32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 242; SLM32-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 243; SLM32-NEXT: movdqa %xmm1, %xmm4 244; SLM32-NEXT: movdqa %xmm3, %xmm5 245; SLM32-NEXT: pmullw %xmm2, %xmm1 246; SLM32-NEXT: pmullw %xmm2, %xmm3 247; SLM32-NEXT: pmulhw %xmm2, %xmm4 248; SLM32-NEXT: pmulhw %xmm2, %xmm5 249; SLM32-NEXT: movdqa %xmm1, %xmm0 250; SLM32-NEXT: movdqa %xmm3, %xmm2 251; SLM32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] 252; SLM32-NEXT: punpckhwd {{.*#+}} xmm3 = 
xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] 253; SLM32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] 254; SLM32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] 255; SLM32-NEXT: retl 256; 257; SLM64-LABEL: test_mul_v16i32_v16i8: 258; SLM64: # %bb.0: 259; SLM64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778] 260; SLM64-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 261; SLM64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 262; SLM64-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 263; SLM64-NEXT: movdqa %xmm1, %xmm4 264; SLM64-NEXT: movdqa %xmm3, %xmm5 265; SLM64-NEXT: pmullw %xmm2, %xmm1 266; SLM64-NEXT: pmullw %xmm2, %xmm3 267; SLM64-NEXT: pmulhw %xmm2, %xmm4 268; SLM64-NEXT: pmulhw %xmm2, %xmm5 269; SLM64-NEXT: movdqa %xmm1, %xmm0 270; SLM64-NEXT: movdqa %xmm3, %xmm2 271; SLM64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] 272; SLM64-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] 273; SLM64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] 274; SLM64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] 275; SLM64-NEXT: retq 276; 277; SLOW32-LABEL: test_mul_v16i32_v16i8: 278; SLOW32: # %bb.0: 279; SLOW32-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 280; SLOW32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778] 281; SLOW32-NEXT: movdqa %xmm1, %xmm3 282; SLOW32-NEXT: pmulhw %xmm2, %xmm3 283; SLOW32-NEXT: pmullw %xmm2, %xmm1 284; SLOW32-NEXT: movdqa %xmm1, %xmm4 285; SLOW32-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] 286; SLOW32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] 287; SLOW32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 288; SLOW32-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 289; SLOW32-NEXT: movdqa %xmm3, %xmm0 290; SLOW32-NEXT: pmulhw %xmm2, %xmm0 291; SLOW32-NEXT: pmullw %xmm2, %xmm3 292; SLOW32-NEXT: movdqa %xmm3, %xmm2 293; SLOW32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] 294; SLOW32-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] 295; SLOW32-NEXT: movdqa %xmm4, %xmm0 296; SLOW32-NEXT: retl 297; 298; SLOW64-LABEL: test_mul_v16i32_v16i8: 299; SLOW64: # %bb.0: 300; SLOW64-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 301; SLOW64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778] 302; SLOW64-NEXT: movdqa %xmm1, %xmm3 303; SLOW64-NEXT: pmulhw %xmm2, %xmm3 304; SLOW64-NEXT: pmullw %xmm2, %xmm1 305; SLOW64-NEXT: movdqa %xmm1, %xmm4 306; SLOW64-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] 307; SLOW64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] 308; SLOW64-NEXT: pshufd 
{{.*#+}} xmm0 = xmm0[2,3,0,1] 309; SLOW64-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 310; SLOW64-NEXT: movdqa %xmm3, %xmm0 311; SLOW64-NEXT: pmulhw %xmm2, %xmm0 312; SLOW64-NEXT: pmullw %xmm2, %xmm3 313; SLOW64-NEXT: movdqa %xmm3, %xmm2 314; SLOW64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] 315; SLOW64-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] 316; SLOW64-NEXT: movdqa %xmm4, %xmm0 317; SLOW64-NEXT: retq 318; 319; SSE4-32-LABEL: test_mul_v16i32_v16i8: 320; SSE4-32: # %bb.0: 321; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] 322; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero 323; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 324; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero 325; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 326; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero 327; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 328; SSE4-32-NEXT: movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778] 329; SSE4-32-NEXT: pmaddwd %xmm4, %xmm0 330; SSE4-32-NEXT: pmaddwd %xmm4, %xmm1 331; SSE4-32-NEXT: pmaddwd %xmm4, %xmm2 332; SSE4-32-NEXT: pmaddwd %xmm4, %xmm3 333; SSE4-32-NEXT: retl 334; 335; SSE4-64-LABEL: test_mul_v16i32_v16i8: 336; SSE4-64: # %bb.0: 337; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] 338; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero 339; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 340; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero 341; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 342; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero 343; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 344; SSE4-64-NEXT: movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778] 345; SSE4-64-NEXT: pmaddwd %xmm4, %xmm0 346; SSE4-64-NEXT: pmaddwd %xmm4, %xmm1 347; SSE4-64-NEXT: pmaddwd %xmm4, %xmm2 348; SSE4-64-NEXT: pmaddwd %xmm4, %xmm3 349; SSE4-64-NEXT: retq 350; 351; AVX2-32-LABEL: test_mul_v16i32_v16i8: 352; AVX2-32: # %bb.0: 353; AVX2-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 354; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero 355; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 356; AVX2-32-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778] 357; AVX2-32-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0 358; AVX2-32-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1 359; AVX2-32-NEXT: retl 360; 361; AVX2-64-LABEL: test_mul_v16i32_v16i8: 362; AVX2-64: # %bb.0: 
363; AVX2-64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 364; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero 365; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 366; AVX2-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778] 367; AVX2-64-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0 368; AVX2-64-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1 369; AVX2-64-NEXT: retq 370; 371; AVX512DQ-32-LABEL: test_mul_v16i32_v16i8: 372; AVX512DQ-32: # %bb.0: 373; AVX512DQ-32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 374; AVX512DQ-32-NEXT: vpmulld {{\.LCPI.*}}{1to16}, %zmm0, %zmm0 375; AVX512DQ-32-NEXT: retl 376; 377; AVX512DQ-64-LABEL: test_mul_v16i32_v16i8: 378; AVX512DQ-64: # %bb.0: 379; AVX512DQ-64-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 380; AVX512DQ-64-NEXT: vpmulld {{.*}}(%rip){1to16}, %zmm0, %zmm0 381; AVX512DQ-64-NEXT: retq 382; 383; AVX512BW-32-LABEL: test_mul_v16i32_v16i8: 384; AVX512BW-32: # %bb.0: 385; AVX512BW-32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 386; AVX512BW-32-NEXT: vpmaddwd {{\.LCPI.*}}, %zmm0, %zmm0 387; AVX512BW-32-NEXT: retl 388; 389; AVX512BW-64-LABEL: test_mul_v16i32_v16i8: 390; AVX512BW-64: # %bb.0: 391; AVX512BW-64-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 392; AVX512BW-64-NEXT: vpmaddwd {{.*}}(%rip), %zmm0, %zmm0 393; AVX512BW-64-NEXT: retq 394; 395; KNL-32-LABEL: test_mul_v16i32_v16i8: 396; KNL-32: # %bb.0: 397; KNL-32-NEXT: vpmovzxbd {{.*#+}} zmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 398; KNL-32-NEXT: vpmulld {{\.LCPI.*}}{1to16}, %zmm0, %zmm0 399; KNL-32-NEXT: retl 400; 401; KNL-64-LABEL: test_mul_v16i32_v16i8: 402; KNL-64: # %bb.0: 403; KNL-64-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 404; KNL-64-NEXT: vpmulld {{.*}}(%rip){1to16}, %zmm0, %zmm0 405; KNL-64-NEXT: retq 406 %z = zext <16 x i8> %A to <16 x i32> 407 %m = mul nuw nsw <16 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778> 408 ret <16 x i32> %m 409} 410 411define <4 x i32> @test_mul_v4i32_v4i16(<4 x i16> %A) { 412; SLM32-LABEL: test_mul_v4i32_v4i16: 413; SLM32: # %bb.0: 414; SLM32-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 415; SLM32-NEXT: movdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u> 416; SLM32-NEXT: movdqa %xmm0, %xmm2 417; SLM32-NEXT: pmullw %xmm1, %xmm0 418; SLM32-NEXT: pmulhuw %xmm1, %xmm2 419; SLM32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 420; SLM32-NEXT: retl 421; 422; SLM64-LABEL: test_mul_v4i32_v4i16: 423; SLM64: # %bb.0: 424; SLM64-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 425; SLM64-NEXT: movdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u> 426; SLM64-NEXT: movdqa %xmm0, %xmm2 427; SLM64-NEXT: pmullw %xmm1, %xmm0 428; SLM64-NEXT: pmulhuw %xmm1, %xmm2 429; SLM64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 430; SLM64-NEXT: retq 431; 432; SLOW32-LABEL: test_mul_v4i32_v4i16: 433; SLOW32: # %bb.0: 434; SLOW32-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 435; SLOW32-NEXT: movdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u> 436; SLOW32-NEXT: movdqa %xmm0, %xmm2 437; SLOW32-NEXT: pmulhuw %xmm1, %xmm2 438; SLOW32-NEXT: pmullw %xmm1, %xmm0 439; SLOW32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 440; SLOW32-NEXT: retl 441; 442; SLOW64-LABEL: test_mul_v4i32_v4i16: 443; SLOW64: # %bb.0: 444; SLOW64-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 445; SLOW64-NEXT: movdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u> 446; SLOW64-NEXT: movdqa %xmm0, %xmm2 447; SLOW64-NEXT: pmulhuw %xmm1, %xmm2 448; SLOW64-NEXT: pmullw %xmm1, %xmm0 449; SLOW64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 450; SLOW64-NEXT: retq 451; 452; SSE4-32-LABEL: test_mul_v4i32_v4i16: 453; SSE4-32: # %bb.0: 454; SSE4-32-NEXT: pxor %xmm1, %xmm1 455; SSE4-32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] 456; SSE4-32-NEXT: pmulld {{\.LCPI.*}}, %xmm0 457; SSE4-32-NEXT: retl 458; 459; 
SSE4-64-LABEL: test_mul_v4i32_v4i16: 460; SSE4-64: # %bb.0: 461; SSE4-64-NEXT: pxor %xmm1, %xmm1 462; SSE4-64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] 463; SSE4-64-NEXT: pmulld {{.*}}(%rip), %xmm0 464; SSE4-64-NEXT: retq 465; 466; AVX-32-LABEL: test_mul_v4i32_v4i16: 467; AVX-32: # %bb.0: 468; AVX-32-NEXT: vpxor %xmm1, %xmm1, %xmm1 469; AVX-32-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] 470; AVX-32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778] 471; AVX-32-NEXT: vpmulld %xmm1, %xmm0, %xmm0 472; AVX-32-NEXT: retl 473; 474; AVX-64-LABEL: test_mul_v4i32_v4i16: 475; AVX-64: # %bb.0: 476; AVX-64-NEXT: vpxor %xmm1, %xmm1, %xmm1 477; AVX-64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] 478; AVX-64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778] 479; AVX-64-NEXT: vpmulld %xmm1, %xmm0, %xmm0 480; AVX-64-NEXT: retq 481 %z = zext <4 x i16> %A to <4 x i32> 482 %m = mul nuw nsw <4 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778> 483 ret <4 x i32> %m 484} 485 486define <8 x i32> @test_mul_v8i32_v8i16(<8 x i16> %A) { 487; SLM32-LABEL: test_mul_v8i32_v8i16: 488; SLM32: # %bb.0: 489; SLM32-NEXT: movdqa %xmm0, %xmm1 490; SLM32-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778] 491; SLM32-NEXT: movdqa %xmm1, %xmm2 492; SLM32-NEXT: pmullw %xmm0, %xmm1 493; SLM32-NEXT: pmulhuw %xmm0, %xmm2 494; SLM32-NEXT: movdqa %xmm1, %xmm0 495; SLM32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 496; SLM32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 497; SLM32-NEXT: retl 498; 499; SLM64-LABEL: test_mul_v8i32_v8i16: 500; SLM64: # %bb.0: 501; SLM64-NEXT: movdqa %xmm0, %xmm1 502; SLM64-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778] 503; SLM64-NEXT: movdqa %xmm1, %xmm2 504; SLM64-NEXT: pmullw %xmm0, %xmm1 505; SLM64-NEXT: pmulhuw %xmm0, %xmm2 506; SLM64-NEXT: movdqa %xmm1, %xmm0 507; SLM64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 508; SLM64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 509; SLM64-NEXT: retq 510; 511; SLOW32-LABEL: test_mul_v8i32_v8i16: 512; SLOW32: # %bb.0: 513; SLOW32-NEXT: movdqa %xmm0, %xmm1 514; SLOW32-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778] 515; SLOW32-NEXT: movdqa %xmm1, %xmm2 516; SLOW32-NEXT: pmulhuw %xmm0, %xmm2 517; SLOW32-NEXT: pmullw %xmm0, %xmm1 518; SLOW32-NEXT: movdqa %xmm1, %xmm0 519; SLOW32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 520; SLOW32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 521; SLOW32-NEXT: retl 522; 523; SLOW64-LABEL: test_mul_v8i32_v8i16: 524; SLOW64: # %bb.0: 525; SLOW64-NEXT: movdqa %xmm0, %xmm1 526; SLOW64-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778] 527; SLOW64-NEXT: movdqa %xmm1, %xmm2 528; SLOW64-NEXT: pmulhuw %xmm0, %xmm2 529; SLOW64-NEXT: pmullw %xmm0, %xmm1 530; SLOW64-NEXT: movdqa %xmm1, %xmm0 531; SLOW64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 532; SLOW64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 533; SLOW64-NEXT: retq 534; 535; 
SSE4-32-LABEL: test_mul_v8i32_v8i16: 536; SSE4-32: # %bb.0: 537; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 538; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 539; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 540; SSE4-32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778] 541; SSE4-32-NEXT: pmulld %xmm2, %xmm0 542; SSE4-32-NEXT: pmulld %xmm2, %xmm1 543; SSE4-32-NEXT: retl 544; 545; SSE4-64-LABEL: test_mul_v8i32_v8i16: 546; SSE4-64: # %bb.0: 547; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 548; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 549; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 550; SSE4-64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778] 551; SSE4-64-NEXT: pmulld %xmm2, %xmm0 552; SSE4-64-NEXT: pmulld %xmm2, %xmm1 553; SSE4-64-NEXT: retq 554; 555; AVX-32-LABEL: test_mul_v8i32_v8i16: 556; AVX-32: # %bb.0: 557; AVX-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 558; AVX-32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778] 559; AVX-32-NEXT: vpmulld %ymm1, %ymm0, %ymm0 560; AVX-32-NEXT: retl 561; 562; AVX-64-LABEL: test_mul_v8i32_v8i16: 563; AVX-64: # %bb.0: 564; AVX-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 565; AVX-64-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778] 566; AVX-64-NEXT: vpmulld %ymm1, %ymm0, %ymm0 567; AVX-64-NEXT: retq 568 %z = zext <8 x i16> %A to <8 x i32> 569 %m = mul nuw nsw <8 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778> 570 ret <8 x i32> %m 571} 572 573define <16 x i32> @test_mul_v16i32_v16i16(<16 x i16> %A) { 574; SLM32-LABEL: test_mul_v16i32_v16i16: 575; SLM32: # %bb.0: 576; SLM32-NEXT: movdqa %xmm1, %xmm3 577; SLM32-NEXT: movdqa %xmm0, %xmm1 578; SLM32-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778] 579; SLM32-NEXT: movdqa %xmm1, %xmm2 580; SLM32-NEXT: movdqa %xmm3, %xmm4 581; SLM32-NEXT: pmullw %xmm0, %xmm1 582; SLM32-NEXT: pmulhuw %xmm0, %xmm2 583; SLM32-NEXT: pmullw %xmm0, %xmm3 584; SLM32-NEXT: pmulhuw %xmm0, %xmm4 585; SLM32-NEXT: movdqa %xmm1, %xmm0 586; SLM32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 587; SLM32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 588; SLM32-NEXT: movdqa %xmm3, %xmm2 589; SLM32-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] 590; SLM32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] 591; SLM32-NEXT: retl 592; 593; SLM64-LABEL: test_mul_v16i32_v16i16: 594; SLM64: # %bb.0: 595; SLM64-NEXT: movdqa %xmm1, %xmm3 596; SLM64-NEXT: movdqa %xmm0, %xmm1 597; SLM64-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778] 598; SLM64-NEXT: movdqa %xmm1, %xmm2 599; SLM64-NEXT: movdqa %xmm3, %xmm4 600; SLM64-NEXT: pmullw %xmm0, %xmm1 601; SLM64-NEXT: pmulhuw %xmm0, %xmm2 602; SLM64-NEXT: pmullw %xmm0, %xmm3 603; SLM64-NEXT: pmulhuw %xmm0, %xmm4 604; SLM64-NEXT: movdqa %xmm1, %xmm0 605; SLM64-NEXT: punpckhwd {{.*#+}} xmm1 = 
xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 606; SLM64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 607; SLM64-NEXT: movdqa %xmm3, %xmm2 608; SLM64-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] 609; SLM64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] 610; SLM64-NEXT: retq 611; 612; SLOW32-LABEL: test_mul_v16i32_v16i16: 613; SLOW32: # %bb.0: 614; SLOW32-NEXT: movdqa %xmm1, %xmm3 615; SLOW32-NEXT: movdqa %xmm0, %xmm1 616; SLOW32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778] 617; SLOW32-NEXT: movdqa %xmm0, %xmm4 618; SLOW32-NEXT: pmulhuw %xmm2, %xmm4 619; SLOW32-NEXT: pmullw %xmm2, %xmm1 620; SLOW32-NEXT: movdqa %xmm1, %xmm0 621; SLOW32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] 622; SLOW32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] 623; SLOW32-NEXT: movdqa %xmm3, %xmm4 624; SLOW32-NEXT: pmulhuw %xmm2, %xmm4 625; SLOW32-NEXT: pmullw %xmm2, %xmm3 626; SLOW32-NEXT: movdqa %xmm3, %xmm2 627; SLOW32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] 628; SLOW32-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] 629; SLOW32-NEXT: retl 630; 631; SLOW64-LABEL: test_mul_v16i32_v16i16: 632; SLOW64: # %bb.0: 633; SLOW64-NEXT: movdqa %xmm1, %xmm3 634; SLOW64-NEXT: movdqa %xmm0, %xmm1 635; SLOW64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778] 636; SLOW64-NEXT: movdqa %xmm0, %xmm4 637; SLOW64-NEXT: pmulhuw %xmm2, %xmm4 638; SLOW64-NEXT: pmullw %xmm2, %xmm1 639; SLOW64-NEXT: movdqa %xmm1, %xmm0 640; SLOW64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] 641; SLOW64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] 642; SLOW64-NEXT: movdqa %xmm3, %xmm4 643; SLOW64-NEXT: pmulhuw %xmm2, %xmm4 644; SLOW64-NEXT: pmullw %xmm2, %xmm3 645; SLOW64-NEXT: movdqa %xmm3, %xmm2 646; SLOW64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] 647; SLOW64-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] 648; SLOW64-NEXT: retq 649; 650; SSE4-32-LABEL: test_mul_v16i32_v16i16: 651; SSE4-32: # %bb.0: 652; SSE4-32-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] 653; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero 654; SSE4-32-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] 655; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero 656; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 657; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 658; SSE4-32-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778] 659; SSE4-32-NEXT: pmulld %xmm1, %xmm0 660; SSE4-32-NEXT: pmulld %xmm1, %xmm2 661; SSE4-32-NEXT: pmulld %xmm1, %xmm4 662; SSE4-32-NEXT: pmulld %xmm1, %xmm3 663; SSE4-32-NEXT: movdqa %xmm4, %xmm1 664; SSE4-32-NEXT: retl 665; 666; SSE4-64-LABEL: test_mul_v16i32_v16i16: 667; SSE4-64: # %bb.0: 668; SSE4-64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] 669; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero 670; SSE4-64-NEXT: pshufd {{.*#+}} 
xmm2 = xmm0[2,3,0,1] 671; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero 672; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 673; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 674; SSE4-64-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778] 675; SSE4-64-NEXT: pmulld %xmm1, %xmm0 676; SSE4-64-NEXT: pmulld %xmm1, %xmm2 677; SSE4-64-NEXT: pmulld %xmm1, %xmm4 678; SSE4-64-NEXT: pmulld %xmm1, %xmm3 679; SSE4-64-NEXT: movdqa %xmm4, %xmm1 680; SSE4-64-NEXT: retq 681; 682; AVX2-32-LABEL: test_mul_v16i32_v16i16: 683; AVX2-32: # %bb.0: 684; AVX2-32-NEXT: vextracti128 $1, %ymm0, %xmm1 685; AVX2-32-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 686; AVX2-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 687; AVX2-32-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778] 688; AVX2-32-NEXT: vpmulld %ymm2, %ymm0, %ymm0 689; AVX2-32-NEXT: vpmulld %ymm2, %ymm1, %ymm1 690; AVX2-32-NEXT: retl 691; 692; AVX2-64-LABEL: test_mul_v16i32_v16i16: 693; AVX2-64: # %bb.0: 694; AVX2-64-NEXT: vextracti128 $1, %ymm0, %xmm1 695; AVX2-64-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 696; AVX2-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 697; AVX2-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778] 698; AVX2-64-NEXT: vpmulld %ymm2, %ymm0, %ymm0 699; AVX2-64-NEXT: vpmulld %ymm2, %ymm1, %ymm1 700; AVX2-64-NEXT: retq 701; 702; AVX512-32-LABEL: test_mul_v16i32_v16i16: 703; AVX512-32: # %bb.0: 704; AVX512-32-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 705; AVX512-32-NEXT: vpmulld {{\.LCPI.*}}{1to16}, %zmm0, %zmm0 706; AVX512-32-NEXT: retl 707; 708; AVX512-64-LABEL: test_mul_v16i32_v16i16: 709; AVX512-64: # %bb.0: 710; AVX512-64-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 711; AVX512-64-NEXT: vpmulld {{.*}}(%rip){1to16}, %zmm0, %zmm0 712; AVX512-64-NEXT: retq 713 %z = zext <16 x i16> %A to <16 x i32> 714 %m = mul nuw nsw <16 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778> 715 ret <16 x i32> %m 716} 717 718; 719; MinSize Tests 720; 721 722define <4 x i32> @test_mul_v4i32_v4i8_minsize(<4 x i8> %A) minsize { 723; CHECK32-LABEL: test_mul_v4i32_v4i8_minsize: 724; CHECK32: # %bb.0: 725; CHECK32-NEXT: pand {{\.LCPI.*}}, %xmm0 726; CHECK32-NEXT: pmaddwd {{\.LCPI.*}}, %xmm0 727; CHECK32-NEXT: retl 728; 729; CHECK64-LABEL: test_mul_v4i32_v4i8_minsize: 730; CHECK64: # %bb.0: 731; CHECK64-NEXT: pand {{.*}}(%rip), %xmm0 732; CHECK64-NEXT: pmaddwd {{.*}}(%rip), %xmm0 733; CHECK64-NEXT: retq 734; 735; SSE4-32-LABEL: test_mul_v4i32_v4i8_minsize: 736; 
SSE4-32: # %bb.0: 737; SSE4-32-NEXT: pand {{\.LCPI.*}}, %xmm0 738; SSE4-32-NEXT: pmaddwd {{\.LCPI.*}}, %xmm0 739; SSE4-32-NEXT: retl 740; 741; SSE4-64-LABEL: test_mul_v4i32_v4i8_minsize: 742; SSE4-64: # %bb.0: 743; SSE4-64-NEXT: pand {{.*}}(%rip), %xmm0 744; SSE4-64-NEXT: pmaddwd {{.*}}(%rip), %xmm0 745; SSE4-64-NEXT: retq 746; 747; AVX2-32-LABEL: test_mul_v4i32_v4i8_minsize: 748; AVX2-32: # %bb.0: 749; AVX2-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 750; AVX2-32-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0 751; AVX2-32-NEXT: retl 752; 753; AVX2-64-LABEL: test_mul_v4i32_v4i8_minsize: 754; AVX2-64: # %bb.0: 755; AVX2-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 756; AVX2-64-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0 757; AVX2-64-NEXT: retq 758; 759; AVX512DQ-32-LABEL: test_mul_v4i32_v4i8_minsize: 760; AVX512DQ-32: # %bb.0: 761; AVX512DQ-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 762; AVX512DQ-32-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0 763; AVX512DQ-32-NEXT: retl 764; 765; AVX512DQ-64-LABEL: test_mul_v4i32_v4i8_minsize: 766; AVX512DQ-64: # %bb.0: 767; AVX512DQ-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 768; AVX512DQ-64-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0 769; AVX512DQ-64-NEXT: retq 770; 771; AVX512BW-32-LABEL: test_mul_v4i32_v4i8_minsize: 772; AVX512BW-32: # %bb.0: 773; AVX512BW-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 774; AVX512BW-32-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0 775; AVX512BW-32-NEXT: retl 776; 777; AVX512BW-64-LABEL: test_mul_v4i32_v4i8_minsize: 778; AVX512BW-64: # %bb.0: 779; AVX512BW-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 780; AVX512BW-64-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0 781; AVX512BW-64-NEXT: retq 782; 783; KNL-32-LABEL: test_mul_v4i32_v4i8_minsize: 784; KNL-32: # %bb.0: 785; KNL-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 786; KNL-32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778] 787; KNL-32-NEXT: vpmulld %xmm1, %xmm0, %xmm0 788; KNL-32-NEXT: retl 789; 790; KNL-64-LABEL: test_mul_v4i32_v4i8_minsize: 791; KNL-64: # %bb.0: 792; KNL-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 793; KNL-64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778] 794; KNL-64-NEXT: vpmulld %xmm1, %xmm0, %xmm0 795; KNL-64-NEXT: retq 796 %z = zext <4 x i8> %A to <4 x i32> 797 %m = mul nuw nsw <4 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778> 798 ret <4 x i32> %m 799} 800 801define <8 x i32> @test_mul_v8i32_v8i8_minsize(<8 x i8> %A) minsize { 802; SLM32-LABEL: test_mul_v8i32_v8i8_minsize: 803; SLM32: # %bb.0: 804; SLM32-NEXT: pand {{\.LCPI.*}}, %xmm0 805; SLM32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778] 806; SLM32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 807; SLM32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 808; SLM32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 809; SLM32-NEXT: pmaddwd %xmm2, %xmm0 810; SLM32-NEXT: pmaddwd %xmm2, %xmm1 811; SLM32-NEXT: retl 812; 813; SLM64-LABEL: test_mul_v8i32_v8i8_minsize: 814; SLM64: # %bb.0: 815; SLM64-NEXT: pand {{.*}}(%rip), %xmm0 816; SLM64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778] 817; SLM64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 818; SLM64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 819; SLM64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 820; SLM64-NEXT: pmaddwd %xmm2, %xmm0 821; SLM64-NEXT: pmaddwd %xmm2, %xmm1 822; SLM64-NEXT: retq 823; 824; SLOW32-LABEL: test_mul_v8i32_v8i8_minsize: 825; SLOW32: # %bb.0: 826; SLOW32-NEXT: pand 
{{\.LCPI.*}}, %xmm0 827; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 828; SLOW32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 829; SLOW32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 830; SLOW32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778] 831; SLOW32-NEXT: pmaddwd %xmm2, %xmm0 832; SLOW32-NEXT: pmaddwd %xmm2, %xmm1 833; SLOW32-NEXT: retl 834; 835; SLOW64-LABEL: test_mul_v8i32_v8i8_minsize: 836; SLOW64: # %bb.0: 837; SLOW64-NEXT: pand {{.*}}(%rip), %xmm0 838; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 839; SLOW64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 840; SLOW64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 841; SLOW64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778] 842; SLOW64-NEXT: pmaddwd %xmm2, %xmm0 843; SLOW64-NEXT: pmaddwd %xmm2, %xmm1 844; SLOW64-NEXT: retq 845; 846; SSE4-32-LABEL: test_mul_v8i32_v8i8_minsize: 847; SSE4-32: # %bb.0: 848; SSE4-32-NEXT: pand {{\.LCPI.*}}, %xmm0 849; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 850; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 851; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 852; SSE4-32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778] 853; SSE4-32-NEXT: pmaddwd %xmm2, %xmm0 854; SSE4-32-NEXT: pmaddwd %xmm2, %xmm1 855; SSE4-32-NEXT: retl 856; 857; SSE4-64-LABEL: test_mul_v8i32_v8i8_minsize: 858; SSE4-64: # %bb.0: 859; SSE4-64-NEXT: pand {{.*}}(%rip), %xmm0 860; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 861; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 862; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 863; SSE4-64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778] 864; SSE4-64-NEXT: pmaddwd %xmm2, %xmm0 865; SSE4-64-NEXT: pmaddwd %xmm2, %xmm1 866; SSE4-64-NEXT: retq 867; 868; AVX2-32-LABEL: test_mul_v8i32_v8i8_minsize: 869; AVX2-32: # %bb.0: 870; AVX2-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 871; AVX2-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 872; AVX2-32-NEXT: vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0 873; AVX2-32-NEXT: retl 874; 875; AVX2-64-LABEL: test_mul_v8i32_v8i8_minsize: 876; AVX2-64: # %bb.0: 877; AVX2-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 878; AVX2-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 879; AVX2-64-NEXT: vpmaddwd {{.*}}(%rip), %ymm0, %ymm0 880; AVX2-64-NEXT: retq 881; 882; AVX512DQ-32-LABEL: test_mul_v8i32_v8i8_minsize: 883; AVX512DQ-32: # %bb.0: 884; AVX512DQ-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 885; AVX512DQ-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 886; AVX512DQ-32-NEXT: vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0 887; AVX512DQ-32-NEXT: retl 888; 889; AVX512DQ-64-LABEL: test_mul_v8i32_v8i8_minsize: 890; AVX512DQ-64: # %bb.0: 891; AVX512DQ-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 892; AVX512DQ-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 893; AVX512DQ-64-NEXT: vpmaddwd {{.*}}(%rip), %ymm0, %ymm0 894; AVX512DQ-64-NEXT: retq 895; 896; AVX512BW-32-LABEL: 
test_mul_v8i32_v8i8_minsize: 897; AVX512BW-32: # %bb.0: 898; AVX512BW-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 899; AVX512BW-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 900; AVX512BW-32-NEXT: vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0 901; AVX512BW-32-NEXT: retl 902; 903; AVX512BW-64-LABEL: test_mul_v8i32_v8i8_minsize: 904; AVX512BW-64: # %bb.0: 905; AVX512BW-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 906; AVX512BW-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 907; AVX512BW-64-NEXT: vpmaddwd {{.*}}(%rip), %ymm0, %ymm0 908; AVX512BW-64-NEXT: retq 909; 910; KNL-32-LABEL: test_mul_v8i32_v8i8_minsize: 911; KNL-32: # %bb.0: 912; KNL-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 913; KNL-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 914; KNL-32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778] 915; KNL-32-NEXT: vpmulld %ymm1, %ymm0, %ymm0 916; KNL-32-NEXT: retl 917; 918; KNL-64-LABEL: test_mul_v8i32_v8i8_minsize: 919; KNL-64: # %bb.0: 920; KNL-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 921; KNL-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 922; KNL-64-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778] 923; KNL-64-NEXT: vpmulld %ymm1, %ymm0, %ymm0 924; KNL-64-NEXT: retq 925 %z = zext <8 x i8> %A to <8 x i32> 926 %m = mul nuw nsw <8 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778> 927 ret <8 x i32> %m 928} 929 930define <16 x i32> @test_mul_v16i32_v16i8_minsize(<16 x i8> %A) minsize { 931; SLM32-LABEL: test_mul_v16i32_v16i8_minsize: 932; SLM32: # %bb.0: 933; SLM32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] 934; SLM32-NEXT: movdqa {{.*#+}} xmm5 = [18778,18778,18778,18778] 935; SLM32-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3] 936; SLM32-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero 937; SLM32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 938; SLM32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 939; SLM32-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero 940; SLM32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero 941; SLM32-NEXT: pmaddwd %xmm5, %xmm0 942; SLM32-NEXT: pmaddwd %xmm5, %xmm1 943; SLM32-NEXT: pmaddwd %xmm5, %xmm2 944; SLM32-NEXT: pmaddwd %xmm5, %xmm3 945; SLM32-NEXT: retl 946; 947; SLM64-LABEL: test_mul_v16i32_v16i8_minsize: 948; SLM64: # %bb.0: 949; SLM64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] 950; SLM64-NEXT: movdqa {{.*#+}} xmm5 = [18778,18778,18778,18778] 951; SLM64-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3] 952; SLM64-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero 953; SLM64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 954; SLM64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 955; SLM64-NEXT: pmovzxbd {{.*#+}} xmm2 = 
xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLM64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
; SLM64-NEXT: pmaddwd %xmm5, %xmm0
; SLM64-NEXT: pmaddwd %xmm5, %xmm1
; SLM64-NEXT: pmaddwd %xmm5, %xmm2
; SLM64-NEXT: pmaddwd %xmm5, %xmm3
; SLM64-NEXT: retq
;
; SLOW32-LABEL: test_mul_v16i32_v16i8_minsize:
; SLOW32: # %bb.0:
; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SLOW32-NEXT: movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778]
; SLOW32-NEXT: pmaddwd %xmm4, %xmm0
; SLOW32-NEXT: pmaddwd %xmm4, %xmm1
; SLOW32-NEXT: pmaddwd %xmm4, %xmm2
; SLOW32-NEXT: pmaddwd %xmm4, %xmm3
; SLOW32-NEXT: retl
;
; SLOW64-LABEL: test_mul_v16i32_v16i8_minsize:
; SLOW64: # %bb.0:
; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SLOW64-NEXT: movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778]
; SLOW64-NEXT: pmaddwd %xmm4, %xmm0
; SLOW64-NEXT: pmaddwd %xmm4, %xmm1
; SLOW64-NEXT: pmaddwd %xmm4, %xmm2
; SLOW64-NEXT: pmaddwd %xmm4, %xmm3
; SLOW64-NEXT: retq
;
; SSE4-32-LABEL: test_mul_v16i32_v16i8_minsize:
; SSE4-32: # %bb.0:
; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE4-32-NEXT: movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778]
; SSE4-32-NEXT: pmaddwd %xmm4, %xmm0
; SSE4-32-NEXT: pmaddwd %xmm4, %xmm1
; SSE4-32-NEXT: pmaddwd %xmm4, %xmm2
; SSE4-32-NEXT: pmaddwd %xmm4, %xmm3
; SSE4-32-NEXT: retl
;
; SSE4-64-LABEL: test_mul_v16i32_v16i8_minsize:
; SSE4-64: # %bb.0:
; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE4-64-NEXT: movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778]
; SSE4-64-NEXT: pmaddwd %xmm4, %xmm0
; SSE4-64-NEXT: pmaddwd %xmm4, %xmm1
; SSE4-64-NEXT: pmaddwd %xmm4, %xmm2
; SSE4-64-NEXT: pmaddwd %xmm4, %xmm3
; SSE4-64-NEXT: retq
;
; AVX2-32-LABEL: test_mul_v16i32_v16i8_minsize:
; AVX2-32: # %bb.0:
; AVX2-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-32-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX2-32-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0
; AVX2-32-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1
; AVX2-32-NEXT: retl
;
; AVX2-64-LABEL: test_mul_v16i32_v16i8_minsize:
; AVX2-64: # %bb.0:
; AVX2-64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX2-64-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT: retq
;
; AVX512DQ-32-LABEL: test_mul_v16i32_v16i8_minsize:
; AVX512DQ-32: # %bb.0:
; AVX512DQ-32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512DQ-32-NEXT: vpmulld {{\.LCPI.*}}{1to16}, %zmm0, %zmm0
; AVX512DQ-32-NEXT: retl
;
; AVX512DQ-64-LABEL: test_mul_v16i32_v16i8_minsize:
; AVX512DQ-64: # %bb.0:
; AVX512DQ-64-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512DQ-64-NEXT: vpmulld {{.*}}(%rip){1to16}, %zmm0, %zmm0
; AVX512DQ-64-NEXT: retq
;
; AVX512BW-32-LABEL: test_mul_v16i32_v16i8_minsize:
; AVX512BW-32: # %bb.0:
; AVX512BW-32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512BW-32-NEXT: vpmaddwd {{\.LCPI.*}}, %zmm0, %zmm0
; AVX512BW-32-NEXT: retl
;
; AVX512BW-64-LABEL: test_mul_v16i32_v16i8_minsize:
; AVX512BW-64: # %bb.0:
; AVX512BW-64-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512BW-64-NEXT: vpmaddwd {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-64-NEXT: retq
;
; KNL-32-LABEL: test_mul_v16i32_v16i8_minsize:
; KNL-32: # %bb.0:
; KNL-32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; KNL-32-NEXT: vpmulld {{\.LCPI.*}}{1to16}, %zmm0, %zmm0
; KNL-32-NEXT: retl
;
; KNL-64-LABEL: test_mul_v16i32_v16i8_minsize:
; KNL-64: # %bb.0:
; KNL-64-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; KNL-64-NEXT: vpmulld {{.*}}(%rip){1to16}, %zmm0, %zmm0
; KNL-64-NEXT: retq
  %z = zext <16 x i8> %A to <16 x i32>
  %m = mul nuw nsw <16 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
  ret <16 x i32> %m
}

define <4 x i32> @test_mul_v4i32_v4i16_minsize(<4 x i16> %A) minsize {
; CHECK32-LABEL: test_mul_v4i32_v4i16_minsize:
; CHECK32: # %bb.0:
; CHECK32-NEXT: pxor %xmm1, %xmm1
; CHECK32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; CHECK32-NEXT: pmulld {{\.LCPI.*}}, %xmm0
; CHECK32-NEXT: retl
;
; CHECK64-LABEL: test_mul_v4i32_v4i16_minsize:
; CHECK64: # %bb.0:
; CHECK64-NEXT: pxor %xmm1, %xmm1
; CHECK64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; CHECK64-NEXT: pmulld {{.*}}(%rip), %xmm0
; CHECK64-NEXT: retq
;
; SSE4-32-LABEL: test_mul_v4i32_v4i16_minsize:
; SSE4-32: # %bb.0:
; SSE4-32-NEXT: pxor %xmm1, %xmm1
; SSE4-32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; SSE4-32-NEXT: pmulld {{\.LCPI.*}}, %xmm0
; SSE4-32-NEXT: retl
;
; SSE4-64-LABEL: test_mul_v4i32_v4i16_minsize:
; SSE4-64: # %bb.0:
; SSE4-64-NEXT: pxor %xmm1, %xmm1
; SSE4-64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; SSE4-64-NEXT: pmulld {{.*}}(%rip), %xmm0
; SSE4-64-NEXT: retq
;
; AVX-32-LABEL: test_mul_v4i32_v4i16_minsize:
; AVX-32: # %bb.0:
; AVX-32-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-32-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX-32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; AVX-32-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-32-NEXT: retl
;
; AVX-64-LABEL: test_mul_v4i32_v4i16_minsize:
; AVX-64: # %bb.0:
; AVX-64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX-64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; AVX-64-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-64-NEXT: retq
  %z = zext <4 x i16> %A to <4 x i32>
  %m = mul nuw nsw <4 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778>
  ret <4 x i32> %m
}

define <8 x i32> @test_mul_v8i32_v8i16_minsize(<8 x i16> %A) minsize {
; SLM32-LABEL: test_mul_v8i32_v8i16_minsize:
; SLM32: # %bb.0:
; SLM32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SLM32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SLM32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLM32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SLM32-NEXT: pmulld %xmm2, %xmm0
; SLM32-NEXT: pmulld %xmm2, %xmm1
; SLM32-NEXT: retl
;
; SLM64-LABEL: test_mul_v8i32_v8i16_minsize:
; SLM64: # %bb.0:
; SLM64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SLM64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SLM64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLM64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SLM64-NEXT: pmulld %xmm2, %xmm0
; SLM64-NEXT: pmulld %xmm2, %xmm1
; SLM64-NEXT: retq
;
; SLOW32-LABEL: test_mul_v8i32_v8i16_minsize:
; SLOW32: # %bb.0:
; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SLOW32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SLOW32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLOW32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SLOW32-NEXT: pmulld %xmm2, %xmm0
; SLOW32-NEXT: pmulld %xmm2, %xmm1
; SLOW32-NEXT: retl
;
; SLOW64-LABEL: test_mul_v8i32_v8i16_minsize:
; SLOW64: # %bb.0:
; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SLOW64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SLOW64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLOW64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SLOW64-NEXT: pmulld %xmm2, %xmm0
; SLOW64-NEXT: pmulld %xmm2, %xmm1
; SLOW64-NEXT: retq
;
; SSE4-32-LABEL: test_mul_v8i32_v8i16_minsize:
; SSE4-32: # %bb.0:
; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SSE4-32-NEXT: pmulld %xmm2, %xmm0
; SSE4-32-NEXT: pmulld %xmm2, %xmm1
; SSE4-32-NEXT: retl
;
; SSE4-64-LABEL: test_mul_v8i32_v8i16_minsize:
; SSE4-64: # %bb.0:
; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SSE4-64-NEXT: pmulld %xmm2, %xmm0
; SSE4-64-NEXT: pmulld %xmm2, %xmm1
; SSE4-64-NEXT: retq
;
; AVX-32-LABEL: test_mul_v8i32_v8i16_minsize:
; AVX-32: # %bb.0:
; AVX-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX-32-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX-32-NEXT: retl
;
; AVX-64-LABEL: test_mul_v8i32_v8i16_minsize:
; AVX-64: # %bb.0:
; AVX-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-64-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX-64-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX-64-NEXT: retq
  %z = zext <8 x i16> %A to <8 x i32>
  %m = mul nuw nsw <8 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
  ret <8 x i32> %m
}

define <16 x i32> @test_mul_v16i32_v16i16_minsize(<16 x i16> %A) minsize {
; SLM32-LABEL: test_mul_v16i32_v16i16_minsize:
; SLM32: # %bb.0:
; SLM32-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; SLM32-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SLM32-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SLM32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLM32-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SLM32-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SLM32-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
; SLM32-NEXT: pmulld %xmm1, %xmm4
; SLM32-NEXT: pmulld %xmm1, %xmm0
; SLM32-NEXT: pmulld %xmm1, %xmm2
; SLM32-NEXT: pmulld %xmm1, %xmm3
; SLM32-NEXT: movdqa %xmm4, %xmm1
; SLM32-NEXT: retl
;
; SLM64-LABEL: test_mul_v16i32_v16i16_minsize:
; SLM64: # %bb.0:
; SLM64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; SLM64-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SLM64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SLM64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLM64-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SLM64-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SLM64-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
; SLM64-NEXT: pmulld %xmm1, %xmm4
; SLM64-NEXT: pmulld %xmm1, %xmm0
; SLM64-NEXT: pmulld %xmm1, %xmm2
; SLM64-NEXT: pmulld %xmm1, %xmm3
; SLM64-NEXT: movdqa %xmm4, %xmm1
; SLM64-NEXT: retq
;
; SLOW32-LABEL: test_mul_v16i32_v16i16_minsize:
; SLOW32: # %bb.0:
; SLOW32-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; SLOW32-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SLOW32-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SLOW32-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SLOW32-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SLOW32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLOW32-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
; SLOW32-NEXT: pmulld %xmm1, %xmm0
; SLOW32-NEXT: pmulld %xmm1, %xmm2
; SLOW32-NEXT: pmulld %xmm1, %xmm4
; SLOW32-NEXT: pmulld %xmm1, %xmm3
; SLOW32-NEXT: movdqa %xmm4, %xmm1
; SLOW32-NEXT: retl
;
; SLOW64-LABEL: test_mul_v16i32_v16i16_minsize:
; SLOW64: # %bb.0:
; SLOW64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; SLOW64-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SLOW64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SLOW64-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SLOW64-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SLOW64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLOW64-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
; SLOW64-NEXT: pmulld %xmm1, %xmm0
; SLOW64-NEXT: pmulld %xmm1, %xmm2
; SLOW64-NEXT: pmulld %xmm1, %xmm4
; SLOW64-NEXT: pmulld %xmm1, %xmm3
; SLOW64-NEXT: movdqa %xmm4, %xmm1
; SLOW64-NEXT: retq
;
; SSE4-32-LABEL: test_mul_v16i32_v16i16_minsize:
; SSE4-32: # %bb.0:
; SSE4-32-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SSE4-32-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-32-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
; SSE4-32-NEXT: pmulld %xmm1, %xmm0
; SSE4-32-NEXT: pmulld %xmm1, %xmm2
; SSE4-32-NEXT: pmulld %xmm1, %xmm4
; SSE4-32-NEXT: pmulld %xmm1, %xmm3
; SSE4-32-NEXT: movdqa %xmm4, %xmm1
; SSE4-32-NEXT: retl
;
; SSE4-64-LABEL: test_mul_v16i32_v16i16_minsize:
; SSE4-64: # %bb.0:
; SSE4-64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SSE4-64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-64-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
; SSE4-64-NEXT: pmulld %xmm1, %xmm0
; SSE4-64-NEXT: pmulld %xmm1, %xmm2
; SSE4-64-NEXT: pmulld %xmm1, %xmm4
; SSE4-64-NEXT: pmulld %xmm1, %xmm3
; SSE4-64-NEXT: movdqa %xmm4, %xmm1
; SSE4-64-NEXT: retq
;
; AVX2-32-LABEL: test_mul_v16i32_v16i16_minsize:
; AVX2-32: # %bb.0:
; AVX2-32-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-32-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-32-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX2-32-NEXT: vpmulld %ymm2, %ymm0, %ymm0
; AVX2-32-NEXT: vpmulld %ymm2, %ymm1, %ymm1
; AVX2-32-NEXT: retl
;
; AVX2-64-LABEL: test_mul_v16i32_v16i16_minsize:
; AVX2-64: # %bb.0:
; AVX2-64-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-64-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX2-64-NEXT: vpmulld %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT: vpmulld %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT: retq
;
; AVX512-32-LABEL: test_mul_v16i32_v16i16_minsize:
; AVX512-32: # %bb.0:
; AVX512-32-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512-32-NEXT: vpmulld {{\.LCPI.*}}{1to16}, %zmm0, %zmm0
; AVX512-32-NEXT: retl
;
; AVX512-64-LABEL: test_mul_v16i32_v16i16_minsize:
; AVX512-64: # %bb.0:
; AVX512-64-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512-64-NEXT: vpmulld {{.*}}(%rip){1to16}, %zmm0, %zmm0
; AVX512-64-NEXT: retq
  %z = zext <16 x i16> %A to <16 x i32>
  %m = mul nuw nsw <16 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
  ret <16 x i32> %m
}