; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CDVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,-avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CD

define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
; AVX1-LABEL: testv4i64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsubq %xmm0, %xmm2, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1]
; AVX1-NEXT:    vpsubq %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm5
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
; AVX1-NEXT:    vpaddb %xmm5, %xmm1, %xmm1
; AVX1-NEXT:    vpsadbw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsubq %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm3
; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsadbw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv4i64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpsubq %ymm0, %ymm1, %ymm2
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm2
; AVX2-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm3
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: testv4i64:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vpxord %ymm1, %ymm1, %ymm1
; AVX512CDVL-NEXT:    vpsubq %ymm0, %ymm1, %ymm2
; AVX512CDVL-NEXT:    vpandq %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpsubq {{.*}}(%rip){1to4}, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDVL-NEXT:    vpandq %ymm2, %ymm0, %ymm3
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDVL-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
; AVX512CDVL-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpandq %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
; AVX512CDVL-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: testv4i64:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT:    vpsubq %ymm0, %ymm1, %ymm2
; AVX512CD-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm2
; AVX512CD-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT:    vpand %ymm2, %ymm0, %ymm3
; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CD-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
; AVX512CD-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512CD-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
; AVX512CD-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
; AVX512CD-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT:    retq
  %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %in, i1 0)
  ret <4 x i64> %out
}

define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
; AVX1-LABEL: testv4i64u:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsubq %xmm0, %xmm2, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1]
; AVX1-NEXT:    vpsubq %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm5
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
; AVX1-NEXT:    vpaddb %xmm5, %xmm1, %xmm1
; AVX1-NEXT:    vpsadbw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsubq %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm3
; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsadbw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv4i64u:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpsubq %ymm0, %ymm1, %ymm2
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm2
; AVX2-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm3
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: testv4i64u:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vpxord %ymm1, %ymm1, %ymm1
; AVX512CDVL-NEXT:    vpsubq %ymm0, %ymm1, %ymm1
; AVX512CDVL-NEXT:    vpandq %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vplzcntq %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm1
; AVX512CDVL-NEXT:    vpsubq %ymm0, %ymm1, %ymm0
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: testv4i64u:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT:    vpsubq %ymm0, %ymm1, %ymm1
; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT:    vplzcntq %zmm0, %zmm0
; AVX512CD-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm1
; AVX512CD-NEXT:    vpsubq %ymm0, %ymm1, %ymm0
; AVX512CD-NEXT:    retq
  %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %in, i1 -1)
  ret <4 x i64> %out
}

define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
; AVX1-LABEL: testv8i32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpsubd %xmm2, %xmm1, %xmm2
; AVX1-NEXT:    vpsubd %xmm0, %xmm1, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1,1,1]
; AVX1-NEXT:    vpsubd %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
; AVX1-NEXT:    vpaddb %xmm5, %xmm2, %xmm2
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX1-NEXT:    vpsadbw %xmm1, %xmm5, %xmm5
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX1-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpackuswb %xmm5, %xmm2, %xmm2
; AVX1-NEXT:    vpsubd %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm3
; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    vpsadbw %xmm1, %xmm3, %xmm3
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv8i32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpsubd %ymm0, %ymm1, %ymm2
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm2
; AVX2-NEXT:    vpsubd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm3
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-NEXT:    vpsadbw %ymm1, %ymm2, %ymm2
; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX2-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: testv8i32:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vpxord %ymm1, %ymm1, %ymm1
; AVX512CDVL-NEXT:    vpsubd %ymm0, %ymm1, %ymm2
; AVX512CDVL-NEXT:    vpandd %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpsubd {{.*}}(%rip){1to8}, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDVL-NEXT:    vpandq %ymm2, %ymm0, %ymm3
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDVL-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
; AVX512CDVL-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpandq %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
; AVX512CDVL-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX512CDVL-NEXT:    vpsadbw %ymm1, %ymm2, %ymm2
; AVX512CDVL-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX512CDVL-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: testv8i32:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT:    vpsubd %ymm0, %ymm1, %ymm2
; AVX512CD-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm2
; AVX512CD-NEXT:    vpsubd %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT:    vpand %ymm2, %ymm0, %ymm3
; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CD-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
; AVX512CD-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512CD-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
; AVX512CD-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
; AVX512CD-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX512CD-NEXT:    vpsadbw %ymm1, %ymm2, %ymm2
; AVX512CD-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX512CD-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT:    retq
  %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %in, i1 0)
  ret <8 x i32> %out
}

define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
; AVX1-LABEL: testv8i32u:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpsubd %xmm2, %xmm1, %xmm2
; AVX1-NEXT:    vpsubd %xmm0, %xmm1, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1,1,1]
; AVX1-NEXT:    vpsubd %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
; AVX1-NEXT:    vpaddb %xmm5, %xmm2, %xmm2
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX1-NEXT:    vpsadbw %xmm1, %xmm5, %xmm5
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX1-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpackuswb %xmm5, %xmm2, %xmm2
; AVX1-NEXT:    vpsubd %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm3
; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    vpsadbw %xmm1, %xmm3, %xmm3
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv8i32u:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpsubd %ymm0, %ymm1, %ymm2
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm2
; AVX2-NEXT:    vpsubd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm3
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-NEXT:    vpsadbw %ymm1, %ymm2, %ymm2
; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX2-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: testv8i32u:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vpxord %ymm1, %ymm1, %ymm1
; AVX512CDVL-NEXT:    vpsubd %ymm0, %ymm1, %ymm1
; AVX512CDVL-NEXT:    vpandd %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vplzcntd %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm1
; AVX512CDVL-NEXT:    vpsubd %ymm0, %ymm1, %ymm0
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: testv8i32u:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT:    vpsubd %ymm0, %ymm1, %ymm1
; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT:    vplzcntd %zmm0, %zmm0
; AVX512CD-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm1
; AVX512CD-NEXT:    vpsubd %ymm0, %ymm1, %ymm0
; AVX512CD-NEXT:    retq
  %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %in, i1 -1)
  ret <8 x i32> %out
}

define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
; AVX1-LABEL: testv16i16:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpsubw %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsubw %xmm0, %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpaddb %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsllw $8, %xmm2, %xmm4
; AVX1-NEXT:    vpaddb %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv16i16:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
; AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsllw $8, %ymm0, %ymm1
; AVX2-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: testv16i16:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vpxord %ymm1, %ymm1, %ymm1
; AVX512CDVL-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
; AVX512CDVL-NEXT:    vpandq %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDVL-NEXT:    vpandq %ymm1, %ymm0, %ymm2
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDVL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX512CDVL-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpandq %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
; AVX512CDVL-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpsllw $8, %ymm0, %ymm1
; AVX512CDVL-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
; AVX512CDVL-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: testv16i16:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT:    vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CD-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX512CD-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
; AVX512CD-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT:    vpsllw $8, %ymm0, %ymm1
; AVX512CD-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
; AVX512CD-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512CD-NEXT:    retq
  %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %in, i1 0)
  ret <16 x i16> %out
}

define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
; AVX1-LABEL: testv16i16u:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpsubw %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsubw %xmm0, %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpaddb %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsllw $8, %xmm2, %xmm4
; AVX1-NEXT:    vpaddb %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv16i16u:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
; AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsllw $8, %ymm0, %ymm1
; AVX2-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: testv16i16u:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vpxord %ymm1, %ymm1, %ymm1
; AVX512CDVL-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
; AVX512CDVL-NEXT:    vpandq %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDVL-NEXT:    vpandq %ymm1, %ymm0, %ymm2
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDVL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX512CDVL-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpandq %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
; AVX512CDVL-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpsllw $8, %ymm0, %ymm1
; AVX512CDVL-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
; AVX512CDVL-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: testv16i16u:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT:    vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CD-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX512CD-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
; AVX512CD-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT:    vpsllw $8, %ymm0, %ymm1
; AVX512CD-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
; AVX512CD-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512CD-NEXT:    retq
  %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %in, i1 -1)
  ret <16 x i16> %out
}

define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
; AVX1-LABEL: testv32i8:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsubb %xmm0, %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
; AVX1-NEXT:    vpaddb %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpsubb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv32i8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsubb {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
; AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: testv32i8:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vpxord %ymm1, %ymm1, %ymm1
; AVX512CDVL-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
; AVX512CDVL-NEXT:    vpandq %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpsubb {{.*}}(%rip), %ymm0, %ymm0
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDVL-NEXT:    vpandq %ymm1, %ymm0, %ymm2
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDVL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX512CDVL-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpandq %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
; AVX512CDVL-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: testv32i8:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT:    vpsubb {{.*}}(%rip), %ymm0, %ymm0
; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CD-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX512CD-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
; AVX512CD-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT:    retq
  %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %in, i1 0)
  ret <32 x i8> %out
}

define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
; AVX1-LABEL: testv32i8u:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsubb %xmm0, %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
; AVX1-NEXT:    vpaddb %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpsubb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv32i8u:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsubb {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
; AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: testv32i8u:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vpxord %ymm1, %ymm1, %ymm1
; AVX512CDVL-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
; AVX512CDVL-NEXT:    vpandq %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpsubb {{.*}}(%rip), %ymm0, %ymm0
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDVL-NEXT:    vpandq %ymm1, %ymm0, %ymm2
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDVL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX512CDVL-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpandq %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
; AVX512CDVL-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: testv32i8u:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT:    vpsubb {{.*}}(%rip), %ymm0, %ymm0
; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CD-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX512CD-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
; AVX512CD-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT:    retq
  %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %in, i1 -1)
  ret <32 x i8> %out
}

define <4 x i64> @foldv4i64() nounwind {
; AVX1-LABEL: foldv4i64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,64,0]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: foldv4i64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,64,0]
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: foldv4i64:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} ymm0 = [8,0,64,0]
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: foldv4i64:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,64,0]
; AVX512CD-NEXT:    retq
  %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> <i64 256, i64 -1, i64 0, i64 255>, i1 0)
  ret <4 x i64> %out
}

define <4 x i64> @foldv4i64u() nounwind {
; AVX1-LABEL: foldv4i64u:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,64,0]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: foldv4i64u:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,64,0]
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: foldv4i64u:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} ymm0 = [8,0,64,0]
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: foldv4i64u:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,64,0]
; AVX512CD-NEXT:    retq
  %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> <i64 256, i64 -1, i64 0, i64 255>, i1 -1)
  ret <4 x i64> %out
}

define <8 x i32> @foldv8i32() nounwind {
; AVX1-LABEL: foldv8i32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: foldv8i32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: foldv8i32:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vmovdqa32 {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: foldv8i32:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
; AVX512CD-NEXT:    retq
  %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> <i32 256, i32 -1, i32 0, i32 255, i32 -65536, i32 7, i32 24, i32 88>, i1 0)
  ret <8 x i32> %out
}

define <8 x i32> @foldv8i32u() nounwind {
; AVX1-LABEL: foldv8i32u:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: foldv8i32u:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: foldv8i32u:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vmovdqa32 {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: foldv8i32u:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
; AVX512CD-NEXT:    retq
  %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> <i32 256, i32 -1, i32 0, i32 255, i32 -65536, i32 7, i32 24, i32 88>, i1 -1)
  ret <8 x i32> %out
}

define <16 x i16> @foldv16i16() nounwind {
; AVX1-LABEL: foldv16i16:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: foldv16i16:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: foldv16i16:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: foldv16i16:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
; AVX512CD-NEXT:    retq
  %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88, i16 -2, i16 254, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32>, i1 0)
  ret <16 x i16> %out
}

define <16 x i16> @foldv16i16u() nounwind {
; AVX1-LABEL: foldv16i16u:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: foldv16i16u:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: foldv16i16u:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: foldv16i16u:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
; AVX512CD-NEXT:    retq
  %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88, i16 -2, i16 254, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32>, i1 -1)
  ret <16 x i16> %out
}

define <32 x i8> @foldv32i8() nounwind {
; AVX1-LABEL: foldv32i8:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: foldv32i8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: foldv32i8:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: foldv32i8:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
; AVX512CD-NEXT:    retq
  %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128, i8 256, i8 -256, i8 -128, i8 -64, i8 -32, i8 -16, i8 -8, i8 -4, i8 -2, i8 -1, i8 3, i8 5, i8 7, i8 127>, i1 0)
  ret <32 x i8> %out
}

define <32 x i8> @foldv32i8u() nounwind {
; AVX1-LABEL: foldv32i8u:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: foldv32i8u:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: foldv32i8u:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: foldv32i8u:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
; AVX512CD-NEXT:    retq
  %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128, i8 256, i8 -256, i8 -128, i8 -64, i8 -32, i8 -16, i8 -8, i8 -4, i8 -2, i8 -1, i8 3, i8 5, i8 7, i8 127>, i1 -1)
  ret <32 x i8> %out
}

declare <4 x i64> @llvm.cttz.v4i64(<4 x i64>, i1)
declare <8 x i32> @llvm.cttz.v8i32(<8 x i32>, i1)
declare <16 x i16> @llvm.cttz.v16i16(<16 x i16>, i1)
declare <32 x i8> @llvm.cttz.v32i8(<32 x i8>, i1)