1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 -mattr=prefer-256-bit | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512 3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 -mattr=prefer-256-bit,avx512vbmi | FileCheck %s --check-prefixes=CHECK,CHECK-VBMI 4; Make sure CPUs default to prefer-256-bit. avx512vnni isn't interesting as it just adds an isel peephole for vpmaddwd+vpaddd 5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512 6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx512vnni -mcpu=cascadelake | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512 7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx512vnni -mcpu=cooperlake | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512 8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=cannonlake | FileCheck %s --check-prefixes=CHECK,CHECK-VBMI 9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx512vnni -mcpu=icelake-client | FileCheck %s --check-prefixes=CHECK,CHECK-VBMI 10; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx512vnni -mcpu=icelake-server | FileCheck %s --check-prefixes=CHECK,CHECK-VBMI 11; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx512vnni -mcpu=tigerlake | FileCheck %s --check-prefixes=CHECK,CHECK-VBMI 12 13; This file primarily contains tests for specific places in X86ISelLowering.cpp that needed be made aware of the legalizer not allowing 512-bit vectors due to prefer-256-bit even though AVX512 is enabled. 14 15define void @add256(<16 x i32>* %a, <16 x i32>* %b, <16 x i32>* %c) "min-legal-vector-width"="256" { 16; CHECK-LABEL: add256: 17; CHECK: # %bb.0: 18; CHECK-NEXT: vmovdqa (%rdi), %ymm0 19; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 20; CHECK-NEXT: vpaddd 32(%rsi), %ymm1, %ymm1 21; CHECK-NEXT: vpaddd (%rsi), %ymm0, %ymm0 22; CHECK-NEXT: vmovdqa %ymm0, (%rdx) 23; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx) 24; CHECK-NEXT: vzeroupper 25; CHECK-NEXT: retq 26 %d = load <16 x i32>, <16 x i32>* %a 27 %e = load <16 x i32>, <16 x i32>* %b 28 %f = add <16 x i32> %d, %e 29 store <16 x i32> %f, <16 x i32>* %c 30 ret void 31} 32 33define void @add512(<16 x i32>* %a, <16 x i32>* %b, <16 x i32>* %c) "min-legal-vector-width"="512" { 34; CHECK-LABEL: add512: 35; CHECK: # %bb.0: 36; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 37; CHECK-NEXT: vpaddd (%rsi), %zmm0, %zmm0 38; CHECK-NEXT: vmovdqa64 %zmm0, (%rdx) 39; CHECK-NEXT: vzeroupper 40; CHECK-NEXT: retq 41 %d = load <16 x i32>, <16 x i32>* %a 42 %e = load <16 x i32>, <16 x i32>* %b 43 %f = add <16 x i32> %d, %e 44 store <16 x i32> %f, <16 x i32>* %c 45 ret void 46} 47 48define void @avg_v64i8_256(<64 x i8>* %a, <64 x i8>* %b) "min-legal-vector-width"="256" { 49; CHECK-LABEL: avg_v64i8_256: 50; CHECK: # %bb.0: 51; CHECK-NEXT: vmovdqa (%rsi), %ymm0 52; CHECK-NEXT: vmovdqa 32(%rsi), %ymm1 53; CHECK-NEXT: vpavgb (%rdi), %ymm0, %ymm0 54; CHECK-NEXT: vpavgb 32(%rdi), %ymm1, %ymm1 55; CHECK-NEXT: vmovdqu %ymm1, (%rax) 56; CHECK-NEXT: vmovdqu %ymm0, (%rax) 57; CHECK-NEXT: vzeroupper 58; CHECK-NEXT: retq 59 %1 = load <64 x i8>, <64 x i8>* %a 60 %2 = load <64 x i8>, <64 x i8>* %b 61 %3 = zext <64 x i8> %1 to <64 x i32> 62 %4 = zext <64 x i8> %2 to <64 x i32> 63 %5 = add nuw nsw <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 64 %6 = add nuw nsw <64 x i32> %5, %4 65 %7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 66 %8 = trunc <64 x i32> %7 to <64 x i8> 67 store <64 x i8> %8, <64 x i8>* undef, align 4 68 ret void 69} 70 71 72define void @avg_v64i8_512(<64 x i8>* %a, <64 x i8>* %b) "min-legal-vector-width"="512" { 73; CHECK-LABEL: avg_v64i8_512: 74; CHECK: # %bb.0: 75; CHECK-NEXT: vmovdqa64 (%rsi), %zmm0 76; CHECK-NEXT: vpavgb (%rdi), %zmm0, %zmm0 77; CHECK-NEXT: vmovdqu64 %zmm0, (%rax) 78; CHECK-NEXT: vzeroupper 79; CHECK-NEXT: retq 80 %1 = load <64 x i8>, <64 x i8>* %a 81 %2 = load <64 x i8>, <64 x i8>* %b 82 %3 = zext <64 x i8> %1 to <64 x i32> 83 %4 = zext <64 x i8> %2 to <64 x i32> 84 %5 = add nuw nsw <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 85 %6 = add nuw nsw <64 x i32> %5, %4 86 %7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 87 %8 = trunc <64 x i32> %7 to <64 x i8> 88 store <64 x i8> %8, <64 x i8>* undef, align 4 89 ret void 90} 91 92define void @pmaddwd_32_256(<32 x i16>* %APtr, <32 x i16>* %BPtr, <16 x i32>* %CPtr) "min-legal-vector-width"="256" { 93; CHECK-LABEL: pmaddwd_32_256: 94; CHECK: # %bb.0: 95; CHECK-NEXT: vmovdqa (%rdi), %ymm0 96; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 97; CHECK-NEXT: vpmaddwd 32(%rsi), %ymm1, %ymm1 98; CHECK-NEXT: vpmaddwd (%rsi), %ymm0, %ymm0 99; CHECK-NEXT: vmovdqa %ymm0, (%rdx) 100; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx) 101; CHECK-NEXT: vzeroupper 102; CHECK-NEXT: retq 103 %A = load <32 x i16>, <32 x i16>* %APtr 104 %B = load <32 x i16>, <32 x i16>* %BPtr 105 %a = sext <32 x i16> %A to <32 x i32> 106 %b = sext <32 x i16> %B to <32 x i32> 107 %m = mul nsw <32 x i32> %a, %b 108 %odd = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30> 109 %even = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31> 110 %ret = add <16 x i32> %odd, %even 111 store <16 x i32> %ret, <16 x i32>* %CPtr 112 ret void 113} 114 115define void @pmaddwd_32_512(<32 x i16>* %APtr, <32 x i16>* %BPtr, <16 x i32>* %CPtr) "min-legal-vector-width"="512" { 116; CHECK-LABEL: pmaddwd_32_512: 117; CHECK: # %bb.0: 118; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 119; CHECK-NEXT: vpmaddwd (%rsi), %zmm0, %zmm0 120; CHECK-NEXT: vmovdqa64 %zmm0, (%rdx) 121; CHECK-NEXT: vzeroupper 122; CHECK-NEXT: retq 123 %A = load <32 x i16>, <32 x i16>* %APtr 124 %B = load <32 x i16>, <32 x i16>* %BPtr 125 %a = sext <32 x i16> %A to <32 x i32> 126 %b = sext <32 x i16> %B to <32 x i32> 127 %m = mul nsw <32 x i32> %a, %b 128 %odd = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30> 129 %even = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31> 130 %ret = add <16 x i32> %odd, %even 131 store <16 x i32> %ret, <16 x i32>* %CPtr 132 ret void 133} 134 135define void @psubus_64i8_max_256(<64 x i8>* %xptr, <64 x i8>* %yptr, <64 x i8>* %zptr) "min-legal-vector-width"="256" { 136; CHECK-LABEL: psubus_64i8_max_256: 137; CHECK: # %bb.0: 138; CHECK-NEXT: vmovdqa (%rdi), %ymm0 139; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 140; CHECK-NEXT: vpsubusb 32(%rsi), %ymm1, %ymm1 141; CHECK-NEXT: vpsubusb (%rsi), %ymm0, %ymm0 142; CHECK-NEXT: vmovdqa %ymm0, (%rdx) 143; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx) 144; CHECK-NEXT: vzeroupper 145; CHECK-NEXT: retq 146 %x = load <64 x i8>, <64 x i8>* %xptr 147 %y = load <64 x i8>, <64 x i8>* %yptr 148 %cmp = icmp ult <64 x i8> %x, %y 149 %max = select <64 x i1> %cmp, <64 x i8> %y, <64 x i8> %x 150 %res = sub <64 x i8> %max, %y 151 store <64 x i8> %res, <64 x i8>* %zptr 152 ret void 153} 154 155define void @psubus_64i8_max_512(<64 x i8>* %xptr, <64 x i8>* %yptr, <64 x i8>* %zptr) "min-legal-vector-width"="512" { 156; CHECK-LABEL: psubus_64i8_max_512: 157; CHECK: # %bb.0: 158; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 159; CHECK-NEXT: vpsubusb (%rsi), %zmm0, %zmm0 160; CHECK-NEXT: vmovdqa64 %zmm0, (%rdx) 161; CHECK-NEXT: vzeroupper 162; CHECK-NEXT: retq 163 %x = load <64 x i8>, <64 x i8>* %xptr 164 %y = load <64 x i8>, <64 x i8>* %yptr 165 %cmp = icmp ult <64 x i8> %x, %y 166 %max = select <64 x i1> %cmp, <64 x i8> %y, <64 x i8> %x 167 %res = sub <64 x i8> %max, %y 168 store <64 x i8> %res, <64 x i8>* %zptr 169 ret void 170} 171 172define i32 @_Z9test_charPcS_i_256(i8* nocapture readonly, i8* nocapture readonly, i32) "min-legal-vector-width"="256" { 173; CHECK-LABEL: _Z9test_charPcS_i_256: 174; CHECK: # %bb.0: # %entry 175; CHECK-NEXT: movl %edx, %eax 176; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 177; CHECK-NEXT: xorl %ecx, %ecx 178; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 179; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 180; CHECK-NEXT: .p2align 4, 0x90 181; CHECK-NEXT: .LBB8_1: # %vector.body 182; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 183; CHECK-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm3 184; CHECK-NEXT: vpmovsxbw (%rdi,%rcx), %ymm4 185; CHECK-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm5 186; CHECK-NEXT: vpmaddwd %ymm3, %ymm5, %ymm3 187; CHECK-NEXT: vpaddd %ymm2, %ymm3, %ymm2 188; CHECK-NEXT: vpmovsxbw (%rsi,%rcx), %ymm3 189; CHECK-NEXT: vpmaddwd %ymm4, %ymm3, %ymm3 190; CHECK-NEXT: vpaddd %ymm1, %ymm3, %ymm1 191; CHECK-NEXT: addq $32, %rcx 192; CHECK-NEXT: cmpq %rcx, %rax 193; CHECK-NEXT: jne .LBB8_1 194; CHECK-NEXT: # %bb.2: # %middle.block 195; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm1 196; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 197; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 198; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 199; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 200; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 201; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 202; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 203; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 204; CHECK-NEXT: vmovd %xmm0, %eax 205; CHECK-NEXT: vzeroupper 206; CHECK-NEXT: retq 207entry: 208 %3 = zext i32 %2 to i64 209 br label %vector.body 210 211vector.body: 212 %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ] 213 %vec.phi = phi <32 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ] 214 %4 = getelementptr inbounds i8, i8* %0, i64 %index 215 %5 = bitcast i8* %4 to <32 x i8>* 216 %wide.load = load <32 x i8>, <32 x i8>* %5, align 1 217 %6 = sext <32 x i8> %wide.load to <32 x i32> 218 %7 = getelementptr inbounds i8, i8* %1, i64 %index 219 %8 = bitcast i8* %7 to <32 x i8>* 220 %wide.load14 = load <32 x i8>, <32 x i8>* %8, align 1 221 %9 = sext <32 x i8> %wide.load14 to <32 x i32> 222 %10 = mul nsw <32 x i32> %9, %6 223 %11 = add nsw <32 x i32> %10, %vec.phi 224 %index.next = add i64 %index, 32 225 %12 = icmp eq i64 %index.next, %3 226 br i1 %12, label %middle.block, label %vector.body 227 228middle.block: 229 %rdx.shuf1 = shufflevector <32 x i32> %11, <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 230 %bin.rdx1 = add <32 x i32> %11, %rdx.shuf1 231 %rdx.shuf = shufflevector <32 x i32> %bin.rdx1, <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 232 %bin.rdx = add <32 x i32> %bin.rdx1, %rdx.shuf 233 %rdx.shuf15 = shufflevector <32 x i32> %bin.rdx, <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 234 %bin.rdx32 = add <32 x i32> %bin.rdx, %rdx.shuf15 235 %rdx.shuf17 = shufflevector <32 x i32> %bin.rdx32, <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 236 %bin.rdx18 = add <32 x i32> %bin.rdx32, %rdx.shuf17 237 %rdx.shuf19 = shufflevector <32 x i32> %bin.rdx18, <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 238 %bin.rdx20 = add <32 x i32> %bin.rdx18, %rdx.shuf19 239 %13 = extractelement <32 x i32> %bin.rdx20, i32 0 240 ret i32 %13 241} 242 243define i32 @_Z9test_charPcS_i_512(i8* nocapture readonly, i8* nocapture readonly, i32) "min-legal-vector-width"="512" { 244; CHECK-LABEL: _Z9test_charPcS_i_512: 245; CHECK: # %bb.0: # %entry 246; CHECK-NEXT: movl %edx, %eax 247; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 248; CHECK-NEXT: xorl %ecx, %ecx 249; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 250; CHECK-NEXT: .p2align 4, 0x90 251; CHECK-NEXT: .LBB9_1: # %vector.body 252; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 253; CHECK-NEXT: vpmovsxbw (%rdi,%rcx), %zmm2 254; CHECK-NEXT: vpmovsxbw (%rsi,%rcx), %zmm3 255; CHECK-NEXT: vpmaddwd %zmm2, %zmm3, %zmm2 256; CHECK-NEXT: vpaddd %zmm1, %zmm2, %zmm1 257; CHECK-NEXT: addq $32, %rcx 258; CHECK-NEXT: cmpq %rcx, %rax 259; CHECK-NEXT: jne .LBB9_1 260; CHECK-NEXT: # %bb.2: # %middle.block 261; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 262; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 263; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 264; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 265; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 266; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 267; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 268; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 269; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 270; CHECK-NEXT: vmovd %xmm0, %eax 271; CHECK-NEXT: vzeroupper 272; CHECK-NEXT: retq 273entry: 274 %3 = zext i32 %2 to i64 275 br label %vector.body 276 277vector.body: 278 %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ] 279 %vec.phi = phi <32 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ] 280 %4 = getelementptr inbounds i8, i8* %0, i64 %index 281 %5 = bitcast i8* %4 to <32 x i8>* 282 %wide.load = load <32 x i8>, <32 x i8>* %5, align 1 283 %6 = sext <32 x i8> %wide.load to <32 x i32> 284 %7 = getelementptr inbounds i8, i8* %1, i64 %index 285 %8 = bitcast i8* %7 to <32 x i8>* 286 %wide.load14 = load <32 x i8>, <32 x i8>* %8, align 1 287 %9 = sext <32 x i8> %wide.load14 to <32 x i32> 288 %10 = mul nsw <32 x i32> %9, %6 289 %11 = add nsw <32 x i32> %10, %vec.phi 290 %index.next = add i64 %index, 32 291 %12 = icmp eq i64 %index.next, %3 292 br i1 %12, label %middle.block, label %vector.body 293 294middle.block: 295 %rdx.shuf1 = shufflevector <32 x i32> %11, <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 296 %bin.rdx1 = add <32 x i32> %11, %rdx.shuf1 297 %rdx.shuf = shufflevector <32 x i32> %bin.rdx1, <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 298 %bin.rdx = add <32 x i32> %bin.rdx1, %rdx.shuf 299 %rdx.shuf15 = shufflevector <32 x i32> %bin.rdx, <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 300 %bin.rdx32 = add <32 x i32> %bin.rdx, %rdx.shuf15 301 %rdx.shuf17 = shufflevector <32 x i32> %bin.rdx32, <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 302 %bin.rdx18 = add <32 x i32> %bin.rdx32, %rdx.shuf17 303 %rdx.shuf19 = shufflevector <32 x i32> %bin.rdx18, <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 304 %bin.rdx20 = add <32 x i32> %bin.rdx18, %rdx.shuf19 305 %13 = extractelement <32 x i32> %bin.rdx20, i32 0 306 ret i32 %13 307} 308 309@a = global [1024 x i8] zeroinitializer, align 16 310@b = global [1024 x i8] zeroinitializer, align 16 311 312define i32 @sad_16i8_256() "min-legal-vector-width"="256" { 313; CHECK-LABEL: sad_16i8_256: 314; CHECK: # %bb.0: # %entry 315; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 316; CHECK-NEXT: movq $-1024, %rax # imm = 0xFC00 317; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 318; CHECK-NEXT: .p2align 4, 0x90 319; CHECK-NEXT: .LBB10_1: # %vector.body 320; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 321; CHECK-NEXT: vmovdqu a+1024(%rax), %xmm2 322; CHECK-NEXT: vpsadbw b+1024(%rax), %xmm2, %xmm2 323; CHECK-NEXT: vpaddd %ymm1, %ymm2, %ymm1 324; CHECK-NEXT: addq $4, %rax 325; CHECK-NEXT: jne .LBB10_1 326; CHECK-NEXT: # %bb.2: # %middle.block 327; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 328; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 329; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 330; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 331; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 332; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 333; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 334; CHECK-NEXT: vmovd %xmm0, %eax 335; CHECK-NEXT: vzeroupper 336; CHECK-NEXT: retq 337entry: 338 br label %vector.body 339 340vector.body: 341 %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] 342 %vec.phi = phi <16 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ] 343 %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index 344 %1 = bitcast i8* %0 to <16 x i8>* 345 %wide.load = load <16 x i8>, <16 x i8>* %1, align 4 346 %2 = zext <16 x i8> %wide.load to <16 x i32> 347 %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index 348 %4 = bitcast i8* %3 to <16 x i8>* 349 %wide.load1 = load <16 x i8>, <16 x i8>* %4, align 4 350 %5 = zext <16 x i8> %wide.load1 to <16 x i32> 351 %6 = sub nsw <16 x i32> %2, %5 352 %7 = icmp sgt <16 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1> 353 %8 = sub nsw <16 x i32> zeroinitializer, %6 354 %9 = select <16 x i1> %7, <16 x i32> %6, <16 x i32> %8 355 %10 = add nsw <16 x i32> %9, %vec.phi 356 %index.next = add i64 %index, 4 357 %11 = icmp eq i64 %index.next, 1024 358 br i1 %11, label %middle.block, label %vector.body 359 360middle.block: 361 %rdx.shuf = shufflevector <16 x i32> %10, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 362 %bin.rdx = add <16 x i32> %10, %rdx.shuf 363 %rdx.shuf2 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 364 %bin.rdx2 = add <16 x i32> %bin.rdx, %rdx.shuf2 365 %rdx.shuf3 = shufflevector <16 x i32> %bin.rdx2, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 366 %bin.rdx3 = add <16 x i32> %bin.rdx2, %rdx.shuf3 367 %rdx.shuf4 = shufflevector <16 x i32> %bin.rdx3, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 368 %bin.rdx4 = add <16 x i32> %bin.rdx3, %rdx.shuf4 369 %12 = extractelement <16 x i32> %bin.rdx4, i32 0 370 ret i32 %12 371} 372 373define i32 @sad_16i8_512() "min-legal-vector-width"="512" { 374; CHECK-LABEL: sad_16i8_512: 375; CHECK: # %bb.0: # %entry 376; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 377; CHECK-NEXT: movq $-1024, %rax # imm = 0xFC00 378; CHECK-NEXT: .p2align 4, 0x90 379; CHECK-NEXT: .LBB11_1: # %vector.body 380; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 381; CHECK-NEXT: vmovdqu a+1024(%rax), %xmm1 382; CHECK-NEXT: vpsadbw b+1024(%rax), %xmm1, %xmm1 383; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 384; CHECK-NEXT: addq $4, %rax 385; CHECK-NEXT: jne .LBB11_1 386; CHECK-NEXT: # %bb.2: # %middle.block 387; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 388; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 389; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 390; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 391; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 392; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 393; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 394; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 395; CHECK-NEXT: vmovd %xmm0, %eax 396; CHECK-NEXT: vzeroupper 397; CHECK-NEXT: retq 398entry: 399 br label %vector.body 400 401vector.body: 402 %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] 403 %vec.phi = phi <16 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ] 404 %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index 405 %1 = bitcast i8* %0 to <16 x i8>* 406 %wide.load = load <16 x i8>, <16 x i8>* %1, align 4 407 %2 = zext <16 x i8> %wide.load to <16 x i32> 408 %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index 409 %4 = bitcast i8* %3 to <16 x i8>* 410 %wide.load1 = load <16 x i8>, <16 x i8>* %4, align 4 411 %5 = zext <16 x i8> %wide.load1 to <16 x i32> 412 %6 = sub nsw <16 x i32> %2, %5 413 %7 = icmp sgt <16 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1> 414 %8 = sub nsw <16 x i32> zeroinitializer, %6 415 %9 = select <16 x i1> %7, <16 x i32> %6, <16 x i32> %8 416 %10 = add nsw <16 x i32> %9, %vec.phi 417 %index.next = add i64 %index, 4 418 %11 = icmp eq i64 %index.next, 1024 419 br i1 %11, label %middle.block, label %vector.body 420 421middle.block: 422 %rdx.shuf = shufflevector <16 x i32> %10, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 423 %bin.rdx = add <16 x i32> %10, %rdx.shuf 424 %rdx.shuf2 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 425 %bin.rdx2 = add <16 x i32> %bin.rdx, %rdx.shuf2 426 %rdx.shuf3 = shufflevector <16 x i32> %bin.rdx2, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 427 %bin.rdx3 = add <16 x i32> %bin.rdx2, %rdx.shuf3 428 %rdx.shuf4 = shufflevector <16 x i32> %bin.rdx3, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 429 %bin.rdx4 = add <16 x i32> %bin.rdx3, %rdx.shuf4 430 %12 = extractelement <16 x i32> %bin.rdx4, i32 0 431 ret i32 %12 432} 433 434define void @sbto16f32_256(<16 x i16> %a, <16 x float>* %res) "min-legal-vector-width"="256" { 435; CHECK-LABEL: sbto16f32_256: 436; CHECK: # %bb.0: 437; CHECK-NEXT: vpmovw2m %ymm0, %k0 438; CHECK-NEXT: kshiftrw $8, %k0, %k1 439; CHECK-NEXT: vpmovm2d %k1, %ymm0 440; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0 441; CHECK-NEXT: vpmovm2d %k0, %ymm1 442; CHECK-NEXT: vcvtdq2ps %ymm1, %ymm1 443; CHECK-NEXT: vmovaps %ymm1, (%rdi) 444; CHECK-NEXT: vmovaps %ymm0, 32(%rdi) 445; CHECK-NEXT: vzeroupper 446; CHECK-NEXT: retq 447 %mask = icmp slt <16 x i16> %a, zeroinitializer 448 %1 = sitofp <16 x i1> %mask to <16 x float> 449 store <16 x float> %1, <16 x float>* %res 450 ret void 451} 452 453define void @sbto16f32_512(<16 x i16> %a, <16 x float>* %res) "min-legal-vector-width"="512" { 454; CHECK-LABEL: sbto16f32_512: 455; CHECK: # %bb.0: 456; CHECK-NEXT: vpmovw2m %ymm0, %k0 457; CHECK-NEXT: vpmovm2d %k0, %zmm0 458; CHECK-NEXT: vcvtdq2ps %zmm0, %zmm0 459; CHECK-NEXT: vmovaps %zmm0, (%rdi) 460; CHECK-NEXT: vzeroupper 461; CHECK-NEXT: retq 462 %mask = icmp slt <16 x i16> %a, zeroinitializer 463 %1 = sitofp <16 x i1> %mask to <16 x float> 464 store <16 x float> %1, <16 x float>* %res 465 ret void 466} 467 468define void @sbto16f64_256(<16 x i16> %a, <16 x double>* %res) "min-legal-vector-width"="256" { 469; CHECK-LABEL: sbto16f64_256: 470; CHECK: # %bb.0: 471; CHECK-NEXT: vpmovw2m %ymm0, %k0 472; CHECK-NEXT: kshiftrw $8, %k0, %k1 473; CHECK-NEXT: vpmovm2d %k1, %ymm0 474; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm1 475; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 476; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm0 477; CHECK-NEXT: vpmovm2d %k0, %ymm2 478; CHECK-NEXT: vcvtdq2pd %xmm2, %ymm3 479; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm2 480; CHECK-NEXT: vcvtdq2pd %xmm2, %ymm2 481; CHECK-NEXT: vmovaps %ymm2, 32(%rdi) 482; CHECK-NEXT: vmovaps %ymm3, (%rdi) 483; CHECK-NEXT: vmovaps %ymm0, 96(%rdi) 484; CHECK-NEXT: vmovaps %ymm1, 64(%rdi) 485; CHECK-NEXT: vzeroupper 486; CHECK-NEXT: retq 487 %mask = icmp slt <16 x i16> %a, zeroinitializer 488 %1 = sitofp <16 x i1> %mask to <16 x double> 489 store <16 x double> %1, <16 x double>* %res 490 ret void 491} 492 493define void @sbto16f64_512(<16 x i16> %a, <16 x double>* %res) "min-legal-vector-width"="512" { 494; CHECK-LABEL: sbto16f64_512: 495; CHECK: # %bb.0: 496; CHECK-NEXT: vpmovw2m %ymm0, %k0 497; CHECK-NEXT: vpmovm2d %k0, %zmm0 498; CHECK-NEXT: vcvtdq2pd %ymm0, %zmm1 499; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0 500; CHECK-NEXT: vcvtdq2pd %ymm0, %zmm0 501; CHECK-NEXT: vmovaps %zmm0, 64(%rdi) 502; CHECK-NEXT: vmovaps %zmm1, (%rdi) 503; CHECK-NEXT: vzeroupper 504; CHECK-NEXT: retq 505 %mask = icmp slt <16 x i16> %a, zeroinitializer 506 %1 = sitofp <16 x i1> %mask to <16 x double> 507 store <16 x double> %1, <16 x double>* %res 508 ret void 509} 510 511define void @ubto16f32_256(<16 x i16> %a, <16 x float>* %res) "min-legal-vector-width"="256" { 512; CHECK-LABEL: ubto16f32_256: 513; CHECK: # %bb.0: 514; CHECK-NEXT: vpmovw2m %ymm0, %k0 515; CHECK-NEXT: kshiftrw $8, %k0, %k1 516; CHECK-NEXT: vpmovm2d %k1, %ymm0 517; CHECK-NEXT: vpsrld $31, %ymm0, %ymm0 518; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0 519; CHECK-NEXT: vpmovm2d %k0, %ymm1 520; CHECK-NEXT: vpsrld $31, %ymm1, %ymm1 521; CHECK-NEXT: vcvtdq2ps %ymm1, %ymm1 522; CHECK-NEXT: vmovaps %ymm1, (%rdi) 523; CHECK-NEXT: vmovaps %ymm0, 32(%rdi) 524; CHECK-NEXT: vzeroupper 525; CHECK-NEXT: retq 526 %mask = icmp slt <16 x i16> %a, zeroinitializer 527 %1 = uitofp <16 x i1> %mask to <16 x float> 528 store <16 x float> %1, <16 x float>* %res 529 ret void 530} 531 532define void @ubto16f32_512(<16 x i16> %a, <16 x float>* %res) "min-legal-vector-width"="512" { 533; CHECK-LABEL: ubto16f32_512: 534; CHECK: # %bb.0: 535; CHECK-NEXT: vpmovw2m %ymm0, %k0 536; CHECK-NEXT: vpmovm2d %k0, %zmm0 537; CHECK-NEXT: vpsrld $31, %zmm0, %zmm0 538; CHECK-NEXT: vcvtdq2ps %zmm0, %zmm0 539; CHECK-NEXT: vmovaps %zmm0, (%rdi) 540; CHECK-NEXT: vzeroupper 541; CHECK-NEXT: retq 542 %mask = icmp slt <16 x i16> %a, zeroinitializer 543 %1 = uitofp <16 x i1> %mask to <16 x float> 544 store <16 x float> %1, <16 x float>* %res 545 ret void 546} 547 548define void @ubto16f64_256(<16 x i16> %a, <16 x double>* %res) "min-legal-vector-width"="256" { 549; CHECK-LABEL: ubto16f64_256: 550; CHECK: # %bb.0: 551; CHECK-NEXT: vpmovw2m %ymm0, %k0 552; CHECK-NEXT: kshiftrw $8, %k0, %k1 553; CHECK-NEXT: vpmovm2d %k1, %ymm0 554; CHECK-NEXT: vpsrld $31, %ymm0, %ymm0 555; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm1 556; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 557; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm0 558; CHECK-NEXT: vpmovm2d %k0, %ymm2 559; CHECK-NEXT: vpsrld $31, %ymm2, %ymm2 560; CHECK-NEXT: vcvtdq2pd %xmm2, %ymm3 561; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm2 562; CHECK-NEXT: vcvtdq2pd %xmm2, %ymm2 563; CHECK-NEXT: vmovaps %ymm2, 32(%rdi) 564; CHECK-NEXT: vmovaps %ymm3, (%rdi) 565; CHECK-NEXT: vmovaps %ymm0, 96(%rdi) 566; CHECK-NEXT: vmovaps %ymm1, 64(%rdi) 567; CHECK-NEXT: vzeroupper 568; CHECK-NEXT: retq 569 %mask = icmp slt <16 x i16> %a, zeroinitializer 570 %1 = uitofp <16 x i1> %mask to <16 x double> 571 store <16 x double> %1, <16 x double>* %res 572 ret void 573} 574 575define void @ubto16f64_512(<16 x i16> %a, <16 x double>* %res) "min-legal-vector-width"="512" { 576; CHECK-LABEL: ubto16f64_512: 577; CHECK: # %bb.0: 578; CHECK-NEXT: vpmovw2m %ymm0, %k0 579; CHECK-NEXT: vpmovm2d %k0, %zmm0 580; CHECK-NEXT: vpsrld $31, %zmm0, %zmm0 581; CHECK-NEXT: vcvtdq2pd %ymm0, %zmm1 582; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0 583; CHECK-NEXT: vcvtdq2pd %ymm0, %zmm0 584; CHECK-NEXT: vmovaps %zmm0, 64(%rdi) 585; CHECK-NEXT: vmovaps %zmm1, (%rdi) 586; CHECK-NEXT: vzeroupper 587; CHECK-NEXT: retq 588 %mask = icmp slt <16 x i16> %a, zeroinitializer 589 %1 = uitofp <16 x i1> %mask to <16 x double> 590 store <16 x double> %1, <16 x double>* %res 591 ret void 592} 593 594define <16 x i16> @test_16f32toub_256(<16 x float>* %ptr, <16 x i16> %passthru) "min-legal-vector-width"="256" { 595; CHECK-LABEL: test_16f32toub_256: 596; CHECK: # %bb.0: 597; CHECK-NEXT: vcvttps2dq (%rdi), %ymm1 598; CHECK-NEXT: vpslld $31, %ymm1, %ymm1 599; CHECK-NEXT: vpmovd2m %ymm1, %k0 600; CHECK-NEXT: vcvttps2dq 32(%rdi), %ymm1 601; CHECK-NEXT: vpslld $31, %ymm1, %ymm1 602; CHECK-NEXT: vpmovd2m %ymm1, %k1 603; CHECK-NEXT: kunpckbw %k0, %k1, %k1 604; CHECK-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z} 605; CHECK-NEXT: retq 606 %a = load <16 x float>, <16 x float>* %ptr 607 %mask = fptoui <16 x float> %a to <16 x i1> 608 %select = select <16 x i1> %mask, <16 x i16> %passthru, <16 x i16> zeroinitializer 609 ret <16 x i16> %select 610} 611 612define <16 x i16> @test_16f32toub_512(<16 x float>* %ptr, <16 x i16> %passthru) "min-legal-vector-width"="512" { 613; CHECK-LABEL: test_16f32toub_512: 614; CHECK: # %bb.0: 615; CHECK-NEXT: vcvttps2dq (%rdi), %zmm1 616; CHECK-NEXT: vpslld $31, %zmm1, %zmm1 617; CHECK-NEXT: vpmovd2m %zmm1, %k1 618; CHECK-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z} 619; CHECK-NEXT: retq 620 %a = load <16 x float>, <16 x float>* %ptr 621 %mask = fptoui <16 x float> %a to <16 x i1> 622 %select = select <16 x i1> %mask, <16 x i16> %passthru, <16 x i16> zeroinitializer 623 ret <16 x i16> %select 624} 625 626define <16 x i16> @test_16f32tosb_256(<16 x float>* %ptr, <16 x i16> %passthru) "min-legal-vector-width"="256" { 627; CHECK-LABEL: test_16f32tosb_256: 628; CHECK: # %bb.0: 629; CHECK-NEXT: vcvttps2dq (%rdi), %ymm1 630; CHECK-NEXT: vpmovd2m %ymm1, %k0 631; CHECK-NEXT: vcvttps2dq 32(%rdi), %ymm1 632; CHECK-NEXT: vpmovd2m %ymm1, %k1 633; CHECK-NEXT: kunpckbw %k0, %k1, %k1 634; CHECK-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z} 635; CHECK-NEXT: retq 636 %a = load <16 x float>, <16 x float>* %ptr 637 %mask = fptosi <16 x float> %a to <16 x i1> 638 %select = select <16 x i1> %mask, <16 x i16> %passthru, <16 x i16> zeroinitializer 639 ret <16 x i16> %select 640} 641 642define <16 x i16> @test_16f32tosb_512(<16 x float>* %ptr, <16 x i16> %passthru) "min-legal-vector-width"="512" { 643; CHECK-LABEL: test_16f32tosb_512: 644; CHECK: # %bb.0: 645; CHECK-NEXT: vcvttps2dq (%rdi), %zmm1 646; CHECK-NEXT: vpmovd2m %zmm1, %k1 647; CHECK-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z} 648; CHECK-NEXT: retq 649 %a = load <16 x float>, <16 x float>* %ptr 650 %mask = fptosi <16 x float> %a to <16 x i1> 651 %select = select <16 x i1> %mask, <16 x i16> %passthru, <16 x i16> zeroinitializer 652 ret <16 x i16> %select 653} 654 655define void @mul256(<64 x i8>* %a, <64 x i8>* %b, <64 x i8>* %c) "min-legal-vector-width"="256" { 656; CHECK-AVX512-LABEL: mul256: 657; CHECK-AVX512: # %bb.0: 658; CHECK-AVX512-NEXT: vmovdqa (%rdi), %ymm0 659; CHECK-AVX512-NEXT: vmovdqa 32(%rdi), %ymm1 660; CHECK-AVX512-NEXT: vmovdqa (%rsi), %ymm2 661; CHECK-AVX512-NEXT: vmovdqa 32(%rsi), %ymm3 662; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 663; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 664; CHECK-AVX512-NEXT: vpmullw %ymm4, %ymm5, %ymm4 665; CHECK-AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 666; CHECK-AVX512-NEXT: vpand %ymm5, %ymm4, %ymm4 667; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 668; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 669; CHECK-AVX512-NEXT: vpmullw %ymm3, %ymm1, %ymm1 670; CHECK-AVX512-NEXT: vpand %ymm5, %ymm1, %ymm1 671; CHECK-AVX512-NEXT: vpackuswb %ymm4, %ymm1, %ymm1 672; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 673; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 674; CHECK-AVX512-NEXT: vpmullw %ymm3, %ymm4, %ymm3 675; CHECK-AVX512-NEXT: vpand %ymm5, %ymm3, %ymm3 676; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 677; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 678; CHECK-AVX512-NEXT: vpmullw %ymm2, %ymm0, %ymm0 679; CHECK-AVX512-NEXT: vpand %ymm5, %ymm0, %ymm0 680; CHECK-AVX512-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 681; CHECK-AVX512-NEXT: vmovdqa %ymm0, (%rdx) 682; CHECK-AVX512-NEXT: vmovdqa %ymm1, 32(%rdx) 683; CHECK-AVX512-NEXT: vzeroupper 684; CHECK-AVX512-NEXT: retq 685; 686; CHECK-VBMI-LABEL: mul256: 687; CHECK-VBMI: # %bb.0: 688; CHECK-VBMI-NEXT: vmovdqa (%rdi), %ymm0 689; CHECK-VBMI-NEXT: vmovdqa 32(%rdi), %ymm1 690; CHECK-VBMI-NEXT: vmovdqa (%rsi), %ymm2 691; CHECK-VBMI-NEXT: vmovdqa 32(%rsi), %ymm3 692; CHECK-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 693; CHECK-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 694; CHECK-VBMI-NEXT: vpmullw %ymm4, %ymm5, %ymm4 695; CHECK-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 696; CHECK-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 697; CHECK-VBMI-NEXT: vpmullw %ymm3, %ymm1, %ymm1 698; CHECK-VBMI-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,4,6,8,10,12,14,32,34,36,38,40,42,44,46,16,18,20,22,24,26,28,30,48,50,52,54,56,58,60,62] 699; CHECK-VBMI-NEXT: vpermt2b %ymm4, %ymm3, %ymm1 700; CHECK-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 701; CHECK-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 702; CHECK-VBMI-NEXT: vpmullw %ymm4, %ymm5, %ymm4 703; CHECK-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 704; CHECK-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 705; CHECK-VBMI-NEXT: vpmullw %ymm2, %ymm0, %ymm0 706; CHECK-VBMI-NEXT: vpermt2b %ymm4, %ymm3, %ymm0 707; CHECK-VBMI-NEXT: vmovdqa %ymm0, (%rdx) 708; CHECK-VBMI-NEXT: vmovdqa %ymm1, 32(%rdx) 709; CHECK-VBMI-NEXT: vzeroupper 710; CHECK-VBMI-NEXT: retq 711 %d = load <64 x i8>, <64 x i8>* %a 712 %e = load <64 x i8>, <64 x i8>* %b 713 %f = mul <64 x i8> %d, %e 714 store <64 x i8> %f, <64 x i8>* %c 715 ret void 716} 717 718define void @mul512(<64 x i8>* %a, <64 x i8>* %b, <64 x i8>* %c) "min-legal-vector-width"="512" { 719; CHECK-AVX512-LABEL: mul512: 720; CHECK-AVX512: # %bb.0: 721; CHECK-AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 722; CHECK-AVX512-NEXT: vmovdqa64 (%rsi), %zmm1 723; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] 724; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] 725; CHECK-AVX512-NEXT: vpmullw %zmm2, %zmm3, %zmm2 726; CHECK-AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 727; CHECK-AVX512-NEXT: vpandq %zmm3, %zmm2, %zmm2 728; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] 729; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] 730; CHECK-AVX512-NEXT: vpmullw %zmm1, %zmm0, %zmm0 731; CHECK-AVX512-NEXT: vpandq %zmm3, %zmm0, %zmm0 732; CHECK-AVX512-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 733; CHECK-AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) 734; CHECK-AVX512-NEXT: vzeroupper 735; CHECK-AVX512-NEXT: retq 736; 737; CHECK-VBMI-LABEL: mul512: 738; CHECK-VBMI: # %bb.0: 739; CHECK-VBMI-NEXT: vmovdqa64 (%rdi), %zmm0 740; CHECK-VBMI-NEXT: vmovdqa64 (%rsi), %zmm1 741; CHECK-VBMI-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] 742; CHECK-VBMI-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] 743; CHECK-VBMI-NEXT: vpmullw %zmm2, %zmm3, %zmm2 744; CHECK-VBMI-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] 745; CHECK-VBMI-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] 746; CHECK-VBMI-NEXT: vpmullw %zmm1, %zmm0, %zmm0 747; CHECK-VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,4,6,8,10,12,14,64,66,68,70,72,74,76,78,16,18,20,22,24,26,28,30,80,82,84,86,88,90,92,94,32,34,36,38,40,42,44,46,96,98,100,102,104,106,108,110,48,50,52,54,56,58,60,62,112,114,116,118,120,122,124,126] 748; CHECK-VBMI-NEXT: vpermi2b %zmm2, %zmm0, %zmm1 749; CHECK-VBMI-NEXT: vmovdqa64 %zmm1, (%rdx) 750; CHECK-VBMI-NEXT: vzeroupper 751; CHECK-VBMI-NEXT: retq 752 %d = load <64 x i8>, <64 x i8>* %a 753 %e = load <64 x i8>, <64 x i8>* %b 754 %f = mul <64 x i8> %d, %e 755 store <64 x i8> %f, <64 x i8>* %c 756 ret void 757} 758 759; This threw an assertion at one point. 760define <4 x i32> @mload_v4i32(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %dst) "min-legal-vector-width"="256" { 761; CHECK-LABEL: mload_v4i32: 762; CHECK: # %bb.0: 763; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 764; CHECK-NEXT: vpblendmd (%rdi), %xmm1, %xmm0 {%k1} 765; CHECK-NEXT: retq 766 %mask = icmp eq <4 x i32> %trigger, zeroinitializer 767 %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst) 768 ret <4 x i32> %res 769} 770declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>) 771 772define <16 x i32> @trunc_v16i64_v16i32(<16 x i64>* %x) nounwind "min-legal-vector-width"="256" { 773; CHECK-LABEL: trunc_v16i64_v16i32: 774; CHECK: # %bb.0: 775; CHECK-NEXT: vmovdqa (%rdi), %ymm0 776; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 777; CHECK-NEXT: vmovdqa 64(%rdi), %ymm2 778; CHECK-NEXT: vmovdqa 96(%rdi), %ymm3 779; CHECK-NEXT: vpmovqd %ymm0, %xmm0 780; CHECK-NEXT: vpmovqd %ymm1, %xmm1 781; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 782; CHECK-NEXT: vpmovqd %ymm2, %xmm1 783; CHECK-NEXT: vpmovqd %ymm3, %xmm2 784; CHECK-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 785; CHECK-NEXT: retq 786 %a = load <16 x i64>, <16 x i64>* %x 787 %b = trunc <16 x i64> %a to <16 x i32> 788 ret <16 x i32> %b 789} 790 791define <16 x i8> @trunc_v16i64_v16i8(<16 x i64>* %x) nounwind "min-legal-vector-width"="256" { 792; CHECK-LABEL: trunc_v16i64_v16i8: 793; CHECK: # %bb.0: 794; CHECK-NEXT: vmovdqa (%rdi), %ymm0 795; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 796; CHECK-NEXT: vmovdqa 64(%rdi), %ymm2 797; CHECK-NEXT: vmovdqa 96(%rdi), %ymm3 798; CHECK-NEXT: vpmovqb %ymm3, %xmm3 799; CHECK-NEXT: vpmovqb %ymm2, %xmm2 800; CHECK-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] 801; CHECK-NEXT: vpmovqb %ymm1, %xmm1 802; CHECK-NEXT: vpmovqb %ymm0, %xmm0 803; CHECK-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 804; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 805; CHECK-NEXT: vzeroupper 806; CHECK-NEXT: retq 807 %a = load <16 x i64>, <16 x i64>* %x 808 %b = trunc <16 x i64> %a to <16 x i8> 809 ret <16 x i8> %b 810} 811 812define <16 x i8> @trunc_v16i32_v16i8(<16 x i32>* %x) nounwind "min-legal-vector-width"="256" { 813; CHECK-LABEL: trunc_v16i32_v16i8: 814; CHECK: # %bb.0: 815; CHECK-NEXT: vmovdqa (%rdi), %ymm0 816; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 817; CHECK-NEXT: vpmovdb %ymm1, %xmm1 818; CHECK-NEXT: vpmovdb %ymm0, %xmm0 819; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 820; CHECK-NEXT: vzeroupper 821; CHECK-NEXT: retq 822 %a = load <16 x i32>, <16 x i32>* %x 823 %b = trunc <16 x i32> %a to <16 x i8> 824 ret <16 x i8> %b 825} 826 827define <8 x i8> @trunc_v8i64_v8i8(<8 x i64>* %x) nounwind "min-legal-vector-width"="256" { 828; CHECK-LABEL: trunc_v8i64_v8i8: 829; CHECK: # %bb.0: 830; CHECK-NEXT: vmovdqa (%rdi), %ymm0 831; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 832; CHECK-NEXT: vpmovqb %ymm1, %xmm1 833; CHECK-NEXT: vpmovqb %ymm0, %xmm0 834; CHECK-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 835; CHECK-NEXT: vzeroupper 836; CHECK-NEXT: retq 837 %a = load <8 x i64>, <8 x i64>* %x 838 %b = trunc <8 x i64> %a to <8 x i8> 839 ret <8 x i8> %b 840} 841 842define <8 x i16> @trunc_v8i64_v8i16(<8 x i64>* %x) nounwind "min-legal-vector-width"="256" { 843; CHECK-LABEL: trunc_v8i64_v8i16: 844; CHECK: # %bb.0: 845; CHECK-NEXT: vmovdqa (%rdi), %ymm0 846; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 847; CHECK-NEXT: vpmovqw %ymm1, %xmm1 848; CHECK-NEXT: vpmovqw %ymm0, %xmm0 849; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 850; CHECK-NEXT: vzeroupper 851; CHECK-NEXT: retq 852 %a = load <8 x i64>, <8 x i64>* %x 853 %b = trunc <8 x i64> %a to <8 x i16> 854 ret <8 x i16> %b 855} 856 857define <8 x i32> @trunc_v8i64_v8i32_zeroes(<8 x i64>* %x) nounwind "min-legal-vector-width"="256" { 858; CHECK-LABEL: trunc_v8i64_v8i32_zeroes: 859; CHECK: # %bb.0: 860; CHECK-NEXT: vpsrlq $48, 32(%rdi), %ymm0 861; CHECK-NEXT: vpsrlq $48, (%rdi), %ymm1 862; CHECK-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 863; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 864; CHECK-NEXT: retq 865 %a = load <8 x i64>, <8 x i64>* %x 866 %b = lshr <8 x i64> %a, <i64 48, i64 48, i64 48, i64 48, i64 48, i64 48, i64 48, i64 48> 867 %c = trunc <8 x i64> %b to <8 x i32> 868 ret <8 x i32> %c 869} 870 871define <16 x i16> @trunc_v16i32_v16i16_zeroes(<16 x i32>* %x) nounwind "min-legal-vector-width"="256" { 872; CHECK-LABEL: trunc_v16i32_v16i16_zeroes: 873; CHECK: # %bb.0: 874; CHECK-NEXT: vmovdqa (%rdi), %ymm1 875; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] 876; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm0 877; CHECK-NEXT: retq 878 %a = load <16 x i32>, <16 x i32>* %x 879 %b = lshr <16 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> 880 %c = trunc <16 x i32> %b to <16 x i16> 881 ret <16 x i16> %c 882} 883 884define <32 x i8> @trunc_v32i16_v32i8_zeroes(<32 x i16>* %x) nounwind "min-legal-vector-width"="256" { 885; CHECK-AVX512-LABEL: trunc_v32i16_v32i8_zeroes: 886; CHECK-AVX512: # %bb.0: 887; CHECK-AVX512-NEXT: vpsrlw $8, 32(%rdi), %ymm0 888; CHECK-AVX512-NEXT: vpsrlw $8, (%rdi), %ymm1 889; CHECK-AVX512-NEXT: vpackuswb %ymm0, %ymm1, %ymm0 890; CHECK-AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 891; CHECK-AVX512-NEXT: retq 892; 893; CHECK-VBMI-LABEL: trunc_v32i16_v32i8_zeroes: 894; CHECK-VBMI: # %bb.0: 895; CHECK-VBMI-NEXT: vmovdqa (%rdi), %ymm1 896; CHECK-VBMI-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63] 897; CHECK-VBMI-NEXT: vpermi2b 32(%rdi), %ymm1, %ymm0 898; CHECK-VBMI-NEXT: retq 899 %a = load <32 x i16>, <32 x i16>* %x 900 %b = lshr <32 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> 901 %c = trunc <32 x i16> %b to <32 x i8> 902 ret <32 x i8> %c 903} 904 905define <8 x i32> @trunc_v8i64_v8i32_sign(<8 x i64>* %x) nounwind "min-legal-vector-width"="256" { 906; CHECK-LABEL: trunc_v8i64_v8i32_sign: 907; CHECK: # %bb.0: 908; CHECK-NEXT: vpsraq $48, 32(%rdi), %ymm0 909; CHECK-NEXT: vpsraq $48, (%rdi), %ymm1 910; CHECK-NEXT: vpmovqd %ymm1, %xmm1 911; CHECK-NEXT: vpmovqd %ymm0, %xmm0 912; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 913; CHECK-NEXT: retq 914 %a = load <8 x i64>, <8 x i64>* %x 915 %b = ashr <8 x i64> %a, <i64 48, i64 48, i64 48, i64 48, i64 48, i64 48, i64 48, i64 48> 916 %c = trunc <8 x i64> %b to <8 x i32> 917 ret <8 x i32> %c 918} 919 920define <16 x i16> @trunc_v16i32_v16i16_sign(<16 x i32>* %x) nounwind "min-legal-vector-width"="256" { 921; CHECK-LABEL: trunc_v16i32_v16i16_sign: 922; CHECK: # %bb.0: 923; CHECK-NEXT: vpsrad $16, 32(%rdi), %ymm0 924; CHECK-NEXT: vpsrad $16, (%rdi), %ymm1 925; CHECK-NEXT: vpackssdw %ymm0, %ymm1, %ymm0 926; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 927; CHECK-NEXT: retq 928 %a = load <16 x i32>, <16 x i32>* %x 929 %b = ashr <16 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> 930 %c = trunc <16 x i32> %b to <16 x i16> 931 ret <16 x i16> %c 932} 933 934define <32 x i8> @trunc_v32i16_v32i8_sign(<32 x i16>* %x) nounwind "min-legal-vector-width"="256" { 935; CHECK-LABEL: trunc_v32i16_v32i8_sign: 936; CHECK: # %bb.0: 937; CHECK-NEXT: vpsraw $8, 32(%rdi), %ymm0 938; CHECK-NEXT: vpsraw $8, (%rdi), %ymm1 939; CHECK-NEXT: vpacksswb %ymm0, %ymm1, %ymm0 940; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 941; CHECK-NEXT: retq 942 %a = load <32 x i16>, <32 x i16>* %x 943 %b = ashr <32 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> 944 %c = trunc <32 x i16> %b to <32 x i8> 945 ret <32 x i8> %c 946} 947 948define void @zext_v16i8_v16i64(<16 x i8> %x, <16 x i64>* %y) nounwind "min-legal-vector-width"="256" { 949; CHECK-LABEL: zext_v16i8_v16i64: 950; CHECK: # %bb.0: 951; CHECK-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 952; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] 953; CHECK-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero 954; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm1 955; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] 956; CHECK-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero 957; CHECK-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero 958; CHECK-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero 959; CHECK-NEXT: vmovdqa %ymm0, (%rdi) 960; CHECK-NEXT: vmovdqa %ymm1, 64(%rdi) 961; CHECK-NEXT: vmovdqa %ymm3, 96(%rdi) 962; CHECK-NEXT: vmovdqa %ymm2, 32(%rdi) 963; CHECK-NEXT: vzeroupper 964; CHECK-NEXT: retq 965 %a = zext <16 x i8> %x to <16 x i64> 966 store <16 x i64> %a, <16 x i64>* %y 967 ret void 968} 969 970define void @sext_v16i8_v16i64(<16 x i8> %x, <16 x i64>* %y) nounwind "min-legal-vector-width"="256" { 971; CHECK-LABEL: sext_v16i8_v16i64: 972; CHECK: # %bb.0: 973; CHECK-NEXT: vpmovsxbw %xmm0, %ymm1 974; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] 975; CHECK-NEXT: vpmovsxwq %xmm2, %ymm2 976; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm1 977; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] 978; CHECK-NEXT: vpmovsxwq %xmm3, %ymm3 979; CHECK-NEXT: vpmovsxwq %xmm1, %ymm1 980; CHECK-NEXT: vpmovsxbq %xmm0, %ymm0 981; CHECK-NEXT: vmovdqa %ymm0, (%rdi) 982; CHECK-NEXT: vmovdqa %ymm1, 64(%rdi) 983; CHECK-NEXT: vmovdqa %ymm3, 96(%rdi) 984; CHECK-NEXT: vmovdqa %ymm2, 32(%rdi) 985; CHECK-NEXT: vzeroupper 986; CHECK-NEXT: retq 987 %a = sext <16 x i8> %x to <16 x i64> 988 store <16 x i64> %a, <16 x i64>* %y 989 ret void 990} 991 992define void @vselect_split_v8i16_setcc(<8 x i16> %s, <8 x i16> %t, <8 x i64>* %p, <8 x i64>* %q, <8 x i64>* %r) "min-legal-vector-width"="256" { 993; CHECK-LABEL: vselect_split_v8i16_setcc: 994; CHECK: # %bb.0: 995; CHECK-NEXT: vmovdqa (%rsi), %ymm2 996; CHECK-NEXT: vmovdqa 32(%rsi), %ymm3 997; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 998; CHECK-NEXT: kshiftrb $4, %k1, %k2 999; CHECK-NEXT: vmovdqa64 32(%rdi), %ymm3 {%k2} 1000; CHECK-NEXT: vmovdqa64 (%rdi), %ymm2 {%k1} 1001; CHECK-NEXT: vmovdqa %ymm2, (%rdx) 1002; CHECK-NEXT: vmovdqa %ymm3, 32(%rdx) 1003; CHECK-NEXT: vzeroupper 1004; CHECK-NEXT: retq 1005 %x = load <8 x i64>, <8 x i64>* %p 1006 %y = load <8 x i64>, <8 x i64>* %q 1007 %a = icmp eq <8 x i16> %s, %t 1008 %b = select <8 x i1> %a, <8 x i64> %x, <8 x i64> %y 1009 store <8 x i64> %b, <8 x i64>* %r 1010 ret void 1011} 1012 1013define void @vselect_split_v8i32_setcc(<8 x i32> %s, <8 x i32> %t, <8 x i64>* %p, <8 x i64>* %q, <8 x i64>* %r) "min-legal-vector-width"="256" { 1014; CHECK-LABEL: vselect_split_v8i32_setcc: 1015; CHECK: # %bb.0: 1016; CHECK-NEXT: vmovdqa (%rsi), %ymm2 1017; CHECK-NEXT: vmovdqa 32(%rsi), %ymm3 1018; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 1019; CHECK-NEXT: kshiftrb $4, %k1, %k2 1020; CHECK-NEXT: vmovdqa64 32(%rdi), %ymm3 {%k2} 1021; CHECK-NEXT: vmovdqa64 (%rdi), %ymm2 {%k1} 1022; CHECK-NEXT: vmovdqa %ymm2, (%rdx) 1023; CHECK-NEXT: vmovdqa %ymm3, 32(%rdx) 1024; CHECK-NEXT: vzeroupper 1025; CHECK-NEXT: retq 1026 %x = load <8 x i64>, <8 x i64>* %p 1027 %y = load <8 x i64>, <8 x i64>* %q 1028 %a = icmp eq <8 x i32> %s, %t 1029 %b = select <8 x i1> %a, <8 x i64> %x, <8 x i64> %y 1030 store <8 x i64> %b, <8 x i64>* %r 1031 ret void 1032} 1033 1034define void @vselect_split_v16i8_setcc(<16 x i8> %s, <16 x i8> %t, <16 x i32>* %p, <16 x i32>* %q, <16 x i32>* %r) "min-legal-vector-width"="256" { 1035; CHECK-LABEL: vselect_split_v16i8_setcc: 1036; CHECK: # %bb.0: 1037; CHECK-NEXT: vmovdqa (%rsi), %ymm2 1038; CHECK-NEXT: vmovdqa 32(%rsi), %ymm3 1039; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k1 1040; CHECK-NEXT: kshiftrw $8, %k1, %k2 1041; CHECK-NEXT: vmovdqa32 32(%rdi), %ymm3 {%k2} 1042; CHECK-NEXT: vmovdqa32 (%rdi), %ymm2 {%k1} 1043; CHECK-NEXT: vmovdqa %ymm2, (%rdx) 1044; CHECK-NEXT: vmovdqa %ymm3, 32(%rdx) 1045; CHECK-NEXT: vzeroupper 1046; CHECK-NEXT: retq 1047 %x = load <16 x i32>, <16 x i32>* %p 1048 %y = load <16 x i32>, <16 x i32>* %q 1049 %a = icmp eq <16 x i8> %s, %t 1050 %b = select <16 x i1> %a, <16 x i32> %x, <16 x i32> %y 1051 store <16 x i32> %b, <16 x i32>* %r 1052 ret void 1053} 1054 1055define void @vselect_split_v16i16_setcc(<16 x i16> %s, <16 x i16> %t, <16 x i32>* %p, <16 x i32>* %q, <16 x i32>* %r) "min-legal-vector-width"="256" { 1056; CHECK-LABEL: vselect_split_v16i16_setcc: 1057; CHECK: # %bb.0: 1058; CHECK-NEXT: vmovdqa (%rsi), %ymm2 1059; CHECK-NEXT: vmovdqa 32(%rsi), %ymm3 1060; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 1061; CHECK-NEXT: kshiftrw $8, %k1, %k2 1062; CHECK-NEXT: vmovdqa32 32(%rdi), %ymm3 {%k2} 1063; CHECK-NEXT: vmovdqa32 (%rdi), %ymm2 {%k1} 1064; CHECK-NEXT: vmovdqa %ymm2, (%rdx) 1065; CHECK-NEXT: vmovdqa %ymm3, 32(%rdx) 1066; CHECK-NEXT: vzeroupper 1067; CHECK-NEXT: retq 1068 %x = load <16 x i32>, <16 x i32>* %p 1069 %y = load <16 x i32>, <16 x i32>* %q 1070 %a = icmp eq <16 x i16> %s, %t 1071 %b = select <16 x i1> %a, <16 x i32> %x, <16 x i32> %y 1072 store <16 x i32> %b, <16 x i32>* %r 1073 ret void 1074} 1075 1076define <16 x i8> @trunc_packus_v16i32_v16i8(<16 x i32>* %p) "min-legal-vector-width"="256" { 1077; CHECK-LABEL: trunc_packus_v16i32_v16i8: 1078; CHECK: # %bb.0: 1079; CHECK-NEXT: vmovdqa (%rdi), %ymm0 1080; CHECK-NEXT: vpackusdw 32(%rdi), %ymm0, %ymm0 1081; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 1082; CHECK-NEXT: vpmovuswb %ymm0, %xmm0 1083; CHECK-NEXT: vzeroupper 1084; CHECK-NEXT: retq 1085 %a = load <16 x i32>, <16 x i32>* %p 1086 %b = icmp slt <16 x i32> %a, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255> 1087 %c = select <16 x i1> %b, <16 x i32> %a, <16 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255> 1088 %d = icmp sgt <16 x i32> %c, zeroinitializer 1089 %e = select <16 x i1> %d, <16 x i32> %c, <16 x i32> zeroinitializer 1090 %f = trunc <16 x i32> %e to <16 x i8> 1091 ret <16 x i8> %f 1092} 1093 1094define void @trunc_packus_v16i32_v16i8_store(<16 x i32>* %p, <16 x i8>* %q) "min-legal-vector-width"="256" { 1095; CHECK-LABEL: trunc_packus_v16i32_v16i8_store: 1096; CHECK: # %bb.0: 1097; CHECK-NEXT: vmovdqa (%rdi), %ymm0 1098; CHECK-NEXT: vpackusdw 32(%rdi), %ymm0, %ymm0 1099; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 1100; CHECK-NEXT: vpmovuswb %ymm0, (%rsi) 1101; CHECK-NEXT: vzeroupper 1102; CHECK-NEXT: retq 1103 %a = load <16 x i32>, <16 x i32>* %p 1104 %b = icmp slt <16 x i32> %a, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255> 1105 %c = select <16 x i1> %b, <16 x i32> %a, <16 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255> 1106 %d = icmp sgt <16 x i32> %c, zeroinitializer 1107 %e = select <16 x i1> %d, <16 x i32> %c, <16 x i32> zeroinitializer 1108 %f = trunc <16 x i32> %e to <16 x i8> 1109 store <16 x i8> %f, <16 x i8>* %q 1110 ret void 1111} 1112 1113define <64 x i1> @v64i1_argument_return(<64 x i1> %x) "min-legal-vector-width"="256" { 1114; CHECK-LABEL: v64i1_argument_return: 1115; CHECK: # %bb.0: 1116; CHECK-NEXT: retq 1117 ret <64 x i1> %x 1118} 1119 1120define void @v64i1_shuffle(<64 x i8>* %x, <64 x i8>* %y) "min-legal-vector-width"="256" { 1121; CHECK-LABEL: v64i1_shuffle: 1122; CHECK: # %bb.0: # %entry 1123; CHECK-NEXT: vmovdqa (%rdi), %ymm1 1124; CHECK-NEXT: vmovdqa 32(%rdi), %ymm0 1125; CHECK-NEXT: vptestnmb %ymm1, %ymm1, %k0 1126; CHECK-NEXT: kshiftrd $1, %k0, %k1 1127; CHECK-NEXT: movq $-3, %rax 1128; CHECK-NEXT: kmovq %rax, %k2 1129; CHECK-NEXT: kandq %k2, %k1, %k1 1130; CHECK-NEXT: kshiftlq $63, %k0, %k2 1131; CHECK-NEXT: kshiftrq $62, %k2, %k2 1132; CHECK-NEXT: korq %k2, %k1, %k1 1133; CHECK-NEXT: movq $-5, %rax 1134; CHECK-NEXT: kmovq %rax, %k2 1135; CHECK-NEXT: kandq %k2, %k1, %k1 1136; CHECK-NEXT: kshiftrd $3, %k0, %k2 1137; CHECK-NEXT: kshiftlq $63, %k2, %k2 1138; CHECK-NEXT: kshiftrq $61, %k2, %k2 1139; CHECK-NEXT: korq %k2, %k1, %k1 1140; CHECK-NEXT: movq $-9, %rax 1141; CHECK-NEXT: kmovq %rax, %k2 1142; CHECK-NEXT: kandq %k2, %k1, %k1 1143; CHECK-NEXT: kshiftrd $2, %k0, %k2 1144; CHECK-NEXT: kshiftlq $63, %k2, %k2 1145; CHECK-NEXT: kshiftrq $60, %k2, %k2 1146; CHECK-NEXT: korq %k2, %k1, %k1 1147; CHECK-NEXT: movq $-17, %rax 1148; CHECK-NEXT: kmovq %rax, %k2 1149; CHECK-NEXT: kandq %k2, %k1, %k1 1150; CHECK-NEXT: kshiftrd $5, %k0, %k2 1151; CHECK-NEXT: kshiftlq $63, %k2, %k2 1152; CHECK-NEXT: kshiftrq $59, %k2, %k2 1153; CHECK-NEXT: korq %k2, %k1, %k1 1154; CHECK-NEXT: movq $-33, %rax 1155; CHECK-NEXT: kmovq %rax, %k2 1156; CHECK-NEXT: kandq %k2, %k1, %k1 1157; CHECK-NEXT: kshiftrd $4, %k0, %k2 1158; CHECK-NEXT: kshiftlq $63, %k2, %k2 1159; CHECK-NEXT: kshiftrq $58, %k2, %k2 1160; CHECK-NEXT: korq %k2, %k1, %k1 1161; CHECK-NEXT: movq $-65, %rax 1162; CHECK-NEXT: kmovq %rax, %k2 1163; CHECK-NEXT: kandq %k2, %k1, %k1 1164; CHECK-NEXT: kshiftrd $7, %k0, %k2 1165; CHECK-NEXT: kshiftlq $63, %k2, %k2 1166; CHECK-NEXT: kshiftrq $57, %k2, %k2 1167; CHECK-NEXT: korq %k2, %k1, %k1 1168; CHECK-NEXT: movq $-129, %rax 1169; CHECK-NEXT: kmovq %rax, %k2 1170; CHECK-NEXT: kandq %k2, %k1, %k1 1171; CHECK-NEXT: kshiftrd $6, %k0, %k2 1172; CHECK-NEXT: kshiftlq $63, %k2, %k2 1173; CHECK-NEXT: kshiftrq $56, %k2, %k2 1174; CHECK-NEXT: korq %k2, %k1, %k1 1175; CHECK-NEXT: movq $-257, %rax # imm = 0xFEFF 1176; CHECK-NEXT: kmovq %rax, %k2 1177; CHECK-NEXT: kandq %k2, %k1, %k1 1178; CHECK-NEXT: kshiftrd $9, %k0, %k2 1179; CHECK-NEXT: kshiftlq $63, %k2, %k2 1180; CHECK-NEXT: kshiftrq $55, %k2, %k2 1181; CHECK-NEXT: korq %k2, %k1, %k1 1182; CHECK-NEXT: movq $-513, %rax # imm = 0xFDFF 1183; CHECK-NEXT: kmovq %rax, %k2 1184; CHECK-NEXT: kandq %k2, %k1, %k1 1185; CHECK-NEXT: kshiftrd $8, %k0, %k2 1186; CHECK-NEXT: kshiftlq $63, %k2, %k2 1187; CHECK-NEXT: kshiftrq $54, %k2, %k2 1188; CHECK-NEXT: korq %k2, %k1, %k1 1189; CHECK-NEXT: movq $-1025, %rax # imm = 0xFBFF 1190; CHECK-NEXT: kmovq %rax, %k2 1191; CHECK-NEXT: kandq %k2, %k1, %k1 1192; CHECK-NEXT: kshiftrd $11, %k0, %k2 1193; CHECK-NEXT: kshiftlq $63, %k2, %k2 1194; CHECK-NEXT: kshiftrq $53, %k2, %k2 1195; CHECK-NEXT: korq %k2, %k1, %k1 1196; CHECK-NEXT: movq $-2049, %rax # imm = 0xF7FF 1197; CHECK-NEXT: kmovq %rax, %k2 1198; CHECK-NEXT: kandq %k2, %k1, %k1 1199; CHECK-NEXT: kshiftrd $10, %k0, %k2 1200; CHECK-NEXT: kshiftlq $63, %k2, %k2 1201; CHECK-NEXT: kshiftrq $52, %k2, %k2 1202; CHECK-NEXT: korq %k2, %k1, %k1 1203; CHECK-NEXT: movq $-4097, %rax # imm = 0xEFFF 1204; CHECK-NEXT: kmovq %rax, %k2 1205; CHECK-NEXT: kandq %k2, %k1, %k1 1206; CHECK-NEXT: kshiftrd $13, %k0, %k2 1207; CHECK-NEXT: kshiftlq $63, %k2, %k2 1208; CHECK-NEXT: kshiftrq $51, %k2, %k2 1209; CHECK-NEXT: korq %k2, %k1, %k1 1210; CHECK-NEXT: movq $-8193, %rax # imm = 0xDFFF 1211; CHECK-NEXT: kmovq %rax, %k2 1212; CHECK-NEXT: kandq %k2, %k1, %k1 1213; CHECK-NEXT: kshiftrd $12, %k0, %k2 1214; CHECK-NEXT: kshiftlq $63, %k2, %k2 1215; CHECK-NEXT: kshiftrq $50, %k2, %k2 1216; CHECK-NEXT: korq %k2, %k1, %k1 1217; CHECK-NEXT: movq $-16385, %rax # imm = 0xBFFF 1218; CHECK-NEXT: kmovq %rax, %k2 1219; CHECK-NEXT: kandq %k2, %k1, %k1 1220; CHECK-NEXT: kshiftrd $15, %k0, %k2 1221; CHECK-NEXT: kshiftlq $63, %k2, %k2 1222; CHECK-NEXT: kshiftrq $49, %k2, %k2 1223; CHECK-NEXT: korq %k2, %k1, %k1 1224; CHECK-NEXT: movq $-32769, %rax # imm = 0xFFFF7FFF 1225; CHECK-NEXT: kmovq %rax, %k2 1226; CHECK-NEXT: kandq %k2, %k1, %k1 1227; CHECK-NEXT: kshiftrd $14, %k0, %k2 1228; CHECK-NEXT: kshiftlq $63, %k2, %k2 1229; CHECK-NEXT: kshiftrq $48, %k2, %k2 1230; CHECK-NEXT: korq %k2, %k1, %k1 1231; CHECK-NEXT: movq $-65537, %rax # imm = 0xFFFEFFFF 1232; CHECK-NEXT: kmovq %rax, %k2 1233; CHECK-NEXT: kandq %k2, %k1, %k1 1234; CHECK-NEXT: kshiftrd $17, %k0, %k2 1235; CHECK-NEXT: kshiftlq $63, %k2, %k2 1236; CHECK-NEXT: kshiftrq $47, %k2, %k2 1237; CHECK-NEXT: korq %k2, %k1, %k1 1238; CHECK-NEXT: movq $-131073, %rax # imm = 0xFFFDFFFF 1239; CHECK-NEXT: kmovq %rax, %k2 1240; CHECK-NEXT: kandq %k2, %k1, %k1 1241; CHECK-NEXT: kshiftrd $16, %k0, %k2 1242; CHECK-NEXT: kshiftlq $63, %k2, %k2 1243; CHECK-NEXT: kshiftrq $46, %k2, %k2 1244; CHECK-NEXT: korq %k2, %k1, %k1 1245; CHECK-NEXT: movq $-262145, %rax # imm = 0xFFFBFFFF 1246; CHECK-NEXT: kmovq %rax, %k2 1247; CHECK-NEXT: kandq %k2, %k1, %k1 1248; CHECK-NEXT: kshiftrd $19, %k0, %k2 1249; CHECK-NEXT: kshiftlq $63, %k2, %k2 1250; CHECK-NEXT: kshiftrq $45, %k2, %k2 1251; CHECK-NEXT: korq %k2, %k1, %k1 1252; CHECK-NEXT: movq $-524289, %rax # imm = 0xFFF7FFFF 1253; CHECK-NEXT: kmovq %rax, %k2 1254; CHECK-NEXT: kandq %k2, %k1, %k1 1255; CHECK-NEXT: kshiftrd $18, %k0, %k2 1256; CHECK-NEXT: kshiftlq $63, %k2, %k2 1257; CHECK-NEXT: kshiftrq $44, %k2, %k2 1258; CHECK-NEXT: korq %k2, %k1, %k1 1259; CHECK-NEXT: movq $-1048577, %rax # imm = 0xFFEFFFFF 1260; CHECK-NEXT: kmovq %rax, %k2 1261; CHECK-NEXT: kandq %k2, %k1, %k1 1262; CHECK-NEXT: kshiftrd $21, %k0, %k2 1263; CHECK-NEXT: kshiftlq $63, %k2, %k2 1264; CHECK-NEXT: kshiftrq $43, %k2, %k2 1265; CHECK-NEXT: korq %k2, %k1, %k1 1266; CHECK-NEXT: movq $-2097153, %rax # imm = 0xFFDFFFFF 1267; CHECK-NEXT: kmovq %rax, %k2 1268; CHECK-NEXT: kandq %k2, %k1, %k1 1269; CHECK-NEXT: kshiftrd $20, %k0, %k2 1270; CHECK-NEXT: kshiftlq $63, %k2, %k2 1271; CHECK-NEXT: kshiftrq $42, %k2, %k2 1272; CHECK-NEXT: korq %k2, %k1, %k1 1273; CHECK-NEXT: movq $-4194305, %rax # imm = 0xFFBFFFFF 1274; CHECK-NEXT: kmovq %rax, %k2 1275; CHECK-NEXT: kandq %k2, %k1, %k1 1276; CHECK-NEXT: kshiftrd $23, %k0, %k2 1277; CHECK-NEXT: kshiftlq $63, %k2, %k2 1278; CHECK-NEXT: kshiftrq $41, %k2, %k2 1279; CHECK-NEXT: korq %k2, %k1, %k1 1280; CHECK-NEXT: movq $-8388609, %rax # imm = 0xFF7FFFFF 1281; CHECK-NEXT: kmovq %rax, %k2 1282; CHECK-NEXT: kandq %k2, %k1, %k1 1283; CHECK-NEXT: kshiftrd $22, %k0, %k2 1284; CHECK-NEXT: kshiftlq $63, %k2, %k2 1285; CHECK-NEXT: kshiftrq $40, %k2, %k2 1286; CHECK-NEXT: korq %k2, %k1, %k1 1287; CHECK-NEXT: movq $-16777217, %rax # imm = 0xFEFFFFFF 1288; CHECK-NEXT: kmovq %rax, %k2 1289; CHECK-NEXT: kandq %k2, %k1, %k1 1290; CHECK-NEXT: kshiftrd $25, %k0, %k2 1291; CHECK-NEXT: kshiftlq $63, %k2, %k2 1292; CHECK-NEXT: kshiftrq $39, %k2, %k2 1293; CHECK-NEXT: korq %k2, %k1, %k1 1294; CHECK-NEXT: movq $-33554433, %rax # imm = 0xFDFFFFFF 1295; CHECK-NEXT: kmovq %rax, %k2 1296; CHECK-NEXT: kandq %k2, %k1, %k1 1297; CHECK-NEXT: kshiftrd $24, %k0, %k2 1298; CHECK-NEXT: kshiftlq $63, %k2, %k2 1299; CHECK-NEXT: kshiftrq $38, %k2, %k2 1300; CHECK-NEXT: korq %k2, %k1, %k1 1301; CHECK-NEXT: movq $-67108865, %rax # imm = 0xFBFFFFFF 1302; CHECK-NEXT: kmovq %rax, %k2 1303; CHECK-NEXT: kandq %k2, %k1, %k1 1304; CHECK-NEXT: kshiftrd $27, %k0, %k2 1305; CHECK-NEXT: kshiftlq $63, %k2, %k2 1306; CHECK-NEXT: kshiftrq $37, %k2, %k2 1307; CHECK-NEXT: korq %k2, %k1, %k1 1308; CHECK-NEXT: movq $-134217729, %rax # imm = 0xF7FFFFFF 1309; CHECK-NEXT: kmovq %rax, %k2 1310; CHECK-NEXT: kandq %k2, %k1, %k1 1311; CHECK-NEXT: kshiftrd $26, %k0, %k2 1312; CHECK-NEXT: kshiftlq $63, %k2, %k2 1313; CHECK-NEXT: kshiftrq $36, %k2, %k2 1314; CHECK-NEXT: korq %k2, %k1, %k1 1315; CHECK-NEXT: movq $-268435457, %rax # imm = 0xEFFFFFFF 1316; CHECK-NEXT: kmovq %rax, %k2 1317; CHECK-NEXT: kandq %k2, %k1, %k1 1318; CHECK-NEXT: kshiftrd $29, %k0, %k2 1319; CHECK-NEXT: kshiftlq $63, %k2, %k2 1320; CHECK-NEXT: kshiftrq $35, %k2, %k2 1321; CHECK-NEXT: korq %k2, %k1, %k1 1322; CHECK-NEXT: movq $-536870913, %rax # imm = 0xDFFFFFFF 1323; CHECK-NEXT: kmovq %rax, %k2 1324; CHECK-NEXT: kandq %k2, %k1, %k1 1325; CHECK-NEXT: kshiftrd $28, %k0, %k2 1326; CHECK-NEXT: kshiftlq $63, %k2, %k2 1327; CHECK-NEXT: kshiftrq $34, %k2, %k2 1328; CHECK-NEXT: korq %k2, %k1, %k1 1329; CHECK-NEXT: movq $-1073741825, %rax # imm = 0xBFFFFFFF 1330; CHECK-NEXT: kmovq %rax, %k2 1331; CHECK-NEXT: kandq %k2, %k1, %k1 1332; CHECK-NEXT: kshiftrd $31, %k0, %k2 1333; CHECK-NEXT: kshiftlq $63, %k2, %k2 1334; CHECK-NEXT: kshiftrq $33, %k2, %k2 1335; CHECK-NEXT: korq %k2, %k1, %k1 1336; CHECK-NEXT: movabsq $-2147483649, %rax # imm = 0xFFFFFFFF7FFFFFFF 1337; CHECK-NEXT: kmovq %rax, %k2 1338; CHECK-NEXT: kandq %k2, %k1, %k2 1339; CHECK-NEXT: vptestnmb %ymm0, %ymm0, %k1 1340; CHECK-NEXT: kshiftrd $30, %k0, %k0 1341; CHECK-NEXT: kshiftlq $63, %k0, %k0 1342; CHECK-NEXT: kshiftrq $32, %k0, %k0 1343; CHECK-NEXT: korq %k0, %k2, %k0 1344; CHECK-NEXT: movabsq $-4294967297, %rax # imm = 0xFFFFFFFEFFFFFFFF 1345; CHECK-NEXT: kmovq %rax, %k2 1346; CHECK-NEXT: kandq %k2, %k0, %k0 1347; CHECK-NEXT: kshiftrd $1, %k1, %k2 1348; CHECK-NEXT: kshiftlq $63, %k2, %k2 1349; CHECK-NEXT: kshiftrq $31, %k2, %k2 1350; CHECK-NEXT: korq %k2, %k0, %k0 1351; CHECK-NEXT: movabsq $-8589934593, %rax # imm = 0xFFFFFFFDFFFFFFFF 1352; CHECK-NEXT: kmovq %rax, %k2 1353; CHECK-NEXT: kandq %k2, %k0, %k0 1354; CHECK-NEXT: kshiftlq $63, %k1, %k2 1355; CHECK-NEXT: kshiftrq $30, %k2, %k2 1356; CHECK-NEXT: korq %k2, %k0, %k0 1357; CHECK-NEXT: movabsq $-17179869185, %rax # imm = 0xFFFFFFFBFFFFFFFF 1358; CHECK-NEXT: kmovq %rax, %k2 1359; CHECK-NEXT: kandq %k2, %k0, %k0 1360; CHECK-NEXT: kshiftrd $3, %k1, %k2 1361; CHECK-NEXT: kshiftlq $63, %k2, %k2 1362; CHECK-NEXT: kshiftrq $29, %k2, %k2 1363; CHECK-NEXT: korq %k2, %k0, %k0 1364; CHECK-NEXT: movabsq $-34359738369, %rax # imm = 0xFFFFFFF7FFFFFFFF 1365; CHECK-NEXT: kmovq %rax, %k2 1366; CHECK-NEXT: kandq %k2, %k0, %k0 1367; CHECK-NEXT: kshiftrd $2, %k1, %k2 1368; CHECK-NEXT: kshiftlq $63, %k2, %k2 1369; CHECK-NEXT: kshiftrq $28, %k2, %k2 1370; CHECK-NEXT: korq %k2, %k0, %k0 1371; CHECK-NEXT: movabsq $-68719476737, %rax # imm = 0xFFFFFFEFFFFFFFFF 1372; CHECK-NEXT: kmovq %rax, %k2 1373; CHECK-NEXT: kandq %k2, %k0, %k0 1374; CHECK-NEXT: kshiftrd $5, %k1, %k2 1375; CHECK-NEXT: kshiftlq $63, %k2, %k2 1376; CHECK-NEXT: kshiftrq $27, %k2, %k2 1377; CHECK-NEXT: korq %k2, %k0, %k0 1378; CHECK-NEXT: movabsq $-137438953473, %rax # imm = 0xFFFFFFDFFFFFFFFF 1379; CHECK-NEXT: kmovq %rax, %k2 1380; CHECK-NEXT: kandq %k2, %k0, %k0 1381; CHECK-NEXT: kshiftrd $4, %k1, %k2 1382; CHECK-NEXT: kshiftlq $63, %k2, %k2 1383; CHECK-NEXT: kshiftrq $26, %k2, %k2 1384; CHECK-NEXT: korq %k2, %k0, %k0 1385; CHECK-NEXT: movabsq $-274877906945, %rax # imm = 0xFFFFFFBFFFFFFFFF 1386; CHECK-NEXT: kmovq %rax, %k2 1387; CHECK-NEXT: kandq %k2, %k0, %k0 1388; CHECK-NEXT: kshiftrd $7, %k1, %k2 1389; CHECK-NEXT: kshiftlq $63, %k2, %k2 1390; CHECK-NEXT: kshiftrq $25, %k2, %k2 1391; CHECK-NEXT: korq %k2, %k0, %k0 1392; CHECK-NEXT: movabsq $-549755813889, %rax # imm = 0xFFFFFF7FFFFFFFFF 1393; CHECK-NEXT: kmovq %rax, %k2 1394; CHECK-NEXT: kandq %k2, %k0, %k0 1395; CHECK-NEXT: kshiftrd $6, %k1, %k2 1396; CHECK-NEXT: kshiftlq $63, %k2, %k2 1397; CHECK-NEXT: kshiftrq $24, %k2, %k2 1398; CHECK-NEXT: korq %k2, %k0, %k0 1399; CHECK-NEXT: movabsq $-1099511627777, %rax # imm = 0xFFFFFEFFFFFFFFFF 1400; CHECK-NEXT: kmovq %rax, %k2 1401; CHECK-NEXT: kandq %k2, %k0, %k0 1402; CHECK-NEXT: kshiftrd $9, %k1, %k2 1403; CHECK-NEXT: kshiftlq $63, %k2, %k2 1404; CHECK-NEXT: kshiftrq $23, %k2, %k2 1405; CHECK-NEXT: korq %k2, %k0, %k0 1406; CHECK-NEXT: movabsq $-2199023255553, %rax # imm = 0xFFFFFDFFFFFFFFFF 1407; CHECK-NEXT: kmovq %rax, %k2 1408; CHECK-NEXT: kandq %k2, %k0, %k0 1409; CHECK-NEXT: kshiftrd $8, %k1, %k2 1410; CHECK-NEXT: kshiftlq $63, %k2, %k2 1411; CHECK-NEXT: kshiftrq $22, %k2, %k2 1412; CHECK-NEXT: korq %k2, %k0, %k0 1413; CHECK-NEXT: movabsq $-4398046511105, %rax # imm = 0xFFFFFBFFFFFFFFFF 1414; CHECK-NEXT: kmovq %rax, %k2 1415; CHECK-NEXT: kandq %k2, %k0, %k0 1416; CHECK-NEXT: kshiftrd $11, %k1, %k2 1417; CHECK-NEXT: kshiftlq $63, %k2, %k2 1418; CHECK-NEXT: kshiftrq $21, %k2, %k2 1419; CHECK-NEXT: korq %k2, %k0, %k0 1420; CHECK-NEXT: movabsq $-8796093022209, %rax # imm = 0xFFFFF7FFFFFFFFFF 1421; CHECK-NEXT: kmovq %rax, %k2 1422; CHECK-NEXT: kandq %k2, %k0, %k0 1423; CHECK-NEXT: kshiftrd $10, %k1, %k2 1424; CHECK-NEXT: kshiftlq $63, %k2, %k2 1425; CHECK-NEXT: kshiftrq $20, %k2, %k2 1426; CHECK-NEXT: korq %k2, %k0, %k0 1427; CHECK-NEXT: movabsq $-17592186044417, %rax # imm = 0xFFFFEFFFFFFFFFFF 1428; CHECK-NEXT: kmovq %rax, %k2 1429; CHECK-NEXT: kandq %k2, %k0, %k0 1430; CHECK-NEXT: kshiftrd $13, %k1, %k2 1431; CHECK-NEXT: kshiftlq $63, %k2, %k2 1432; CHECK-NEXT: kshiftrq $19, %k2, %k2 1433; CHECK-NEXT: korq %k2, %k0, %k0 1434; CHECK-NEXT: movabsq $-35184372088833, %rax # imm = 0xFFFFDFFFFFFFFFFF 1435; CHECK-NEXT: kmovq %rax, %k2 1436; CHECK-NEXT: kandq %k2, %k0, %k0 1437; CHECK-NEXT: kshiftrd $12, %k1, %k2 1438; CHECK-NEXT: kshiftlq $63, %k2, %k2 1439; CHECK-NEXT: kshiftrq $18, %k2, %k2 1440; CHECK-NEXT: korq %k2, %k0, %k0 1441; CHECK-NEXT: movabsq $-70368744177665, %rax # imm = 0xFFFFBFFFFFFFFFFF 1442; CHECK-NEXT: kmovq %rax, %k2 1443; CHECK-NEXT: kandq %k2, %k0, %k0 1444; CHECK-NEXT: kshiftrd $15, %k1, %k2 1445; CHECK-NEXT: kshiftlq $63, %k2, %k2 1446; CHECK-NEXT: kshiftrq $17, %k2, %k2 1447; CHECK-NEXT: korq %k2, %k0, %k0 1448; CHECK-NEXT: movabsq $-140737488355329, %rax # imm = 0xFFFF7FFFFFFFFFFF 1449; CHECK-NEXT: kmovq %rax, %k2 1450; CHECK-NEXT: kandq %k2, %k0, %k0 1451; CHECK-NEXT: kshiftrd $14, %k1, %k2 1452; CHECK-NEXT: kshiftlq $63, %k2, %k2 1453; CHECK-NEXT: kshiftrq $16, %k2, %k2 1454; CHECK-NEXT: korq %k2, %k0, %k0 1455; CHECK-NEXT: movabsq $-281474976710657, %rax # imm = 0xFFFEFFFFFFFFFFFF 1456; CHECK-NEXT: kmovq %rax, %k2 1457; CHECK-NEXT: kandq %k2, %k0, %k0 1458; CHECK-NEXT: kshiftrd $17, %k1, %k2 1459; CHECK-NEXT: kshiftlq $63, %k2, %k2 1460; CHECK-NEXT: kshiftrq $15, %k2, %k2 1461; CHECK-NEXT: korq %k2, %k0, %k0 1462; CHECK-NEXT: movabsq $-562949953421313, %rax # imm = 0xFFFDFFFFFFFFFFFF 1463; CHECK-NEXT: kmovq %rax, %k2 1464; CHECK-NEXT: kandq %k2, %k0, %k0 1465; CHECK-NEXT: kshiftrd $16, %k1, %k2 1466; CHECK-NEXT: kshiftlq $63, %k2, %k2 1467; CHECK-NEXT: kshiftrq $14, %k2, %k2 1468; CHECK-NEXT: korq %k2, %k0, %k0 1469; CHECK-NEXT: movabsq $-1125899906842625, %rax # imm = 0xFFFBFFFFFFFFFFFF 1470; CHECK-NEXT: kmovq %rax, %k2 1471; CHECK-NEXT: kandq %k2, %k0, %k0 1472; CHECK-NEXT: kshiftrd $19, %k1, %k2 1473; CHECK-NEXT: kshiftlq $63, %k2, %k2 1474; CHECK-NEXT: kshiftrq $13, %k2, %k2 1475; CHECK-NEXT: korq %k2, %k0, %k0 1476; CHECK-NEXT: movabsq $-2251799813685249, %rax # imm = 0xFFF7FFFFFFFFFFFF 1477; CHECK-NEXT: kmovq %rax, %k2 1478; CHECK-NEXT: kandq %k2, %k0, %k0 1479; CHECK-NEXT: kshiftrd $18, %k1, %k2 1480; CHECK-NEXT: kshiftlq $63, %k2, %k2 1481; CHECK-NEXT: kshiftrq $12, %k2, %k2 1482; CHECK-NEXT: korq %k2, %k0, %k0 1483; CHECK-NEXT: movabsq $-4503599627370497, %rax # imm = 0xFFEFFFFFFFFFFFFF 1484; CHECK-NEXT: kmovq %rax, %k2 1485; CHECK-NEXT: kandq %k2, %k0, %k0 1486; CHECK-NEXT: kshiftrd $21, %k1, %k2 1487; CHECK-NEXT: kshiftlq $63, %k2, %k2 1488; CHECK-NEXT: kshiftrq $11, %k2, %k2 1489; CHECK-NEXT: korq %k2, %k0, %k0 1490; CHECK-NEXT: movabsq $-9007199254740993, %rax # imm = 0xFFDFFFFFFFFFFFFF 1491; CHECK-NEXT: kmovq %rax, %k2 1492; CHECK-NEXT: kandq %k2, %k0, %k0 1493; CHECK-NEXT: kshiftrd $20, %k1, %k2 1494; CHECK-NEXT: kshiftlq $63, %k2, %k2 1495; CHECK-NEXT: kshiftrq $10, %k2, %k2 1496; CHECK-NEXT: korq %k2, %k0, %k0 1497; CHECK-NEXT: movabsq $-18014398509481985, %rax # imm = 0xFFBFFFFFFFFFFFFF 1498; CHECK-NEXT: kmovq %rax, %k2 1499; CHECK-NEXT: kandq %k2, %k0, %k0 1500; CHECK-NEXT: kshiftrd $23, %k1, %k2 1501; CHECK-NEXT: kshiftlq $63, %k2, %k2 1502; CHECK-NEXT: kshiftrq $9, %k2, %k2 1503; CHECK-NEXT: korq %k2, %k0, %k0 1504; CHECK-NEXT: movabsq $-36028797018963969, %rax # imm = 0xFF7FFFFFFFFFFFFF 1505; CHECK-NEXT: kmovq %rax, %k2 1506; CHECK-NEXT: kandq %k2, %k0, %k0 1507; CHECK-NEXT: kshiftrd $22, %k1, %k2 1508; CHECK-NEXT: kshiftlq $63, %k2, %k2 1509; CHECK-NEXT: kshiftrq $8, %k2, %k2 1510; CHECK-NEXT: korq %k2, %k0, %k0 1511; CHECK-NEXT: movabsq $-72057594037927937, %rax # imm = 0xFEFFFFFFFFFFFFFF 1512; CHECK-NEXT: kmovq %rax, %k2 1513; CHECK-NEXT: kandq %k2, %k0, %k0 1514; CHECK-NEXT: kshiftrd $25, %k1, %k2 1515; CHECK-NEXT: kshiftlq $63, %k2, %k2 1516; CHECK-NEXT: kshiftrq $7, %k2, %k2 1517; CHECK-NEXT: korq %k2, %k0, %k0 1518; CHECK-NEXT: movabsq $-144115188075855873, %rax # imm = 0xFDFFFFFFFFFFFFFF 1519; CHECK-NEXT: kmovq %rax, %k2 1520; CHECK-NEXT: kandq %k2, %k0, %k0 1521; CHECK-NEXT: kshiftrd $24, %k1, %k2 1522; CHECK-NEXT: kshiftlq $63, %k2, %k2 1523; CHECK-NEXT: kshiftrq $6, %k2, %k2 1524; CHECK-NEXT: korq %k2, %k0, %k0 1525; CHECK-NEXT: movabsq $-288230376151711745, %rax # imm = 0xFBFFFFFFFFFFFFFF 1526; CHECK-NEXT: kmovq %rax, %k2 1527; CHECK-NEXT: kandq %k2, %k0, %k0 1528; CHECK-NEXT: kshiftrd $27, %k1, %k2 1529; CHECK-NEXT: kshiftlq $63, %k2, %k2 1530; CHECK-NEXT: kshiftrq $5, %k2, %k2 1531; CHECK-NEXT: korq %k2, %k0, %k0 1532; CHECK-NEXT: movabsq $-576460752303423489, %rax # imm = 0xF7FFFFFFFFFFFFFF 1533; CHECK-NEXT: kmovq %rax, %k2 1534; CHECK-NEXT: kandq %k2, %k0, %k0 1535; CHECK-NEXT: kshiftrd $26, %k1, %k2 1536; CHECK-NEXT: kshiftlq $63, %k2, %k2 1537; CHECK-NEXT: kshiftrq $4, %k2, %k2 1538; CHECK-NEXT: korq %k2, %k0, %k0 1539; CHECK-NEXT: movabsq $-1152921504606846977, %rax # imm = 0xEFFFFFFFFFFFFFFF 1540; CHECK-NEXT: kmovq %rax, %k2 1541; CHECK-NEXT: kandq %k2, %k0, %k0 1542; CHECK-NEXT: kshiftrd $29, %k1, %k2 1543; CHECK-NEXT: kshiftlq $63, %k2, %k2 1544; CHECK-NEXT: kshiftrq $3, %k2, %k2 1545; CHECK-NEXT: korq %k2, %k0, %k0 1546; CHECK-NEXT: movabsq $-2305843009213693953, %rax # imm = 0xDFFFFFFFFFFFFFFF 1547; CHECK-NEXT: kmovq %rax, %k2 1548; CHECK-NEXT: kandq %k2, %k0, %k0 1549; CHECK-NEXT: kshiftrd $28, %k1, %k2 1550; CHECK-NEXT: kshiftlq $63, %k2, %k2 1551; CHECK-NEXT: kshiftrq $2, %k2, %k2 1552; CHECK-NEXT: korq %k2, %k0, %k0 1553; CHECK-NEXT: movabsq $-4611686018427387905, %rax # imm = 0xBFFFFFFFFFFFFFFF 1554; CHECK-NEXT: kmovq %rax, %k2 1555; CHECK-NEXT: kandq %k2, %k0, %k0 1556; CHECK-NEXT: kshiftrd $31, %k1, %k2 1557; CHECK-NEXT: kshiftlq $62, %k2, %k2 1558; CHECK-NEXT: korq %k2, %k0, %k0 1559; CHECK-NEXT: kshiftrd $30, %k1, %k1 1560; CHECK-NEXT: kshiftlq $1, %k0, %k0 1561; CHECK-NEXT: kshiftrq $1, %k0, %k0 1562; CHECK-NEXT: kshiftlq $63, %k1, %k1 1563; CHECK-NEXT: korq %k1, %k0, %k1 1564; CHECK-NEXT: vmovdqu8 %ymm1, (%rsi) {%k1} 1565; CHECK-NEXT: kshiftrq $32, %k1, %k1 1566; CHECK-NEXT: vmovdqu8 %ymm0, 32(%rsi) {%k1} 1567; CHECK-NEXT: vzeroupper 1568; CHECK-NEXT: retq 1569entry: 1570 %a = load <64 x i8>, <64 x i8>* %x 1571 %b = icmp eq <64 x i8> %a, zeroinitializer 1572 %shuf = shufflevector <64 x i1> %b, <64 x i1> undef, <64 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14, i32 17, i32 16, i32 19, i32 18, i32 21, i32 20, i32 23, i32 22, i32 25, i32 24, i32 27, i32 26, i32 29, i32 28, i32 31, i32 30, i32 33, i32 32, i32 35, i32 34, i32 37, i32 36, i32 39, i32 38, i32 41, i32 40, i32 43, i32 42, i32 45, i32 44, i32 47, i32 46, i32 49, i32 48, i32 51, i32 50, i32 53, i32 52, i32 55, i32 54, i32 57, i32 56, i32 59, i32 58, i32 61, i32 60, i32 63, i32 62> 1573 call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> %a, <64 x i8>* %y, i32 1, <64 x i1> %shuf) 1574 ret void 1575} 1576declare void @llvm.masked.store.v64i8.p0v64i8(<64 x i8>, <64 x i8>*, i32, <64 x i1>) 1577 1578@mem64_dst = global i64 0, align 8 1579@mem64_src = global i64 0, align 8 1580define i32 @v64i1_inline_asm() "min-legal-vector-width"="256" { 1581; CHECK-LABEL: v64i1_inline_asm: 1582; CHECK: # %bb.0: 1583; CHECK-NEXT: kmovq {{.*}}(%rip), %k0 1584; CHECK-NEXT: #APP 1585; CHECK-NEXT: #NO_APP 1586; CHECK-NEXT: kmovq %k0, {{.*}}(%rip) 1587; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax 1588; CHECK-NEXT: retq 1589 %1 = alloca i32, align 4 1590 %2 = load i64, i64* @mem64_src, align 8 1591 %3 = call i64 asm "", "=k,k,~{dirflag},~{fpsr},~{flags}"(i64 %2) 1592 store i64 %3, i64* @mem64_dst, align 8 1593 %4 = load i32, i32* %1, align 4 1594 ret i32 %4 1595} 1596 1597define void @cmp_v8i64_sext(<8 x i64>* %xptr, <8 x i64>* %yptr, <8 x i64>* %zptr) "min-legal-vector-width"="256" { 1598; CHECK-LABEL: cmp_v8i64_sext: 1599; CHECK: # %bb.0: 1600; CHECK-NEXT: vmovdqa (%rsi), %ymm0 1601; CHECK-NEXT: vmovdqa 32(%rsi), %ymm1 1602; CHECK-NEXT: vpcmpgtq 32(%rdi), %ymm1, %ymm1 1603; CHECK-NEXT: vpcmpgtq (%rdi), %ymm0, %ymm0 1604; CHECK-NEXT: vmovdqa %ymm0, (%rdx) 1605; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx) 1606; CHECK-NEXT: vzeroupper 1607; CHECK-NEXT: retq 1608 %x = load <8 x i64>, <8 x i64>* %xptr 1609 %y = load <8 x i64>, <8 x i64>* %yptr 1610 %cmp = icmp slt <8 x i64> %x, %y 1611 %ext = sext <8 x i1> %cmp to <8 x i64> 1612 store <8 x i64> %ext, <8 x i64>* %zptr 1613 ret void 1614} 1615 1616define void @cmp_v8i64_zext(<8 x i64>* %xptr, <8 x i64>* %yptr, <8 x i64>* %zptr) "min-legal-vector-width"="256" { 1617; CHECK-LABEL: cmp_v8i64_zext: 1618; CHECK: # %bb.0: 1619; CHECK-NEXT: vmovdqa (%rsi), %ymm0 1620; CHECK-NEXT: vmovdqa 32(%rsi), %ymm1 1621; CHECK-NEXT: vpcmpgtq 32(%rdi), %ymm1, %ymm1 1622; CHECK-NEXT: vpcmpgtq (%rdi), %ymm0, %ymm0 1623; CHECK-NEXT: vpsrlq $63, %ymm1, %ymm1 1624; CHECK-NEXT: vpsrlq $63, %ymm0, %ymm0 1625; CHECK-NEXT: vmovdqa %ymm0, (%rdx) 1626; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx) 1627; CHECK-NEXT: vzeroupper 1628; CHECK-NEXT: retq 1629 %x = load <8 x i64>, <8 x i64>* %xptr 1630 %y = load <8 x i64>, <8 x i64>* %yptr 1631 %cmp = icmp slt <8 x i64> %x, %y 1632 %ext = zext <8 x i1> %cmp to <8 x i64> 1633 store <8 x i64> %ext, <8 x i64>* %zptr 1634 ret void 1635} 1636 1637define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind "min-legal-vector-width"="256" { 1638; CHECK-LABEL: var_rotate_v16i8: 1639; CHECK: # %bb.0: 1640; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 1641; CHECK-NEXT: vpsubb %xmm1, %xmm2, %xmm2 1642; CHECK-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 1643; CHECK-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1644; CHECK-NEXT: vpsllvw %ymm1, %ymm0, %ymm1 1645; CHECK-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero 1646; CHECK-NEXT: vpsrlvw %ymm2, %ymm0, %ymm0 1647; CHECK-NEXT: vpor %ymm0, %ymm1, %ymm0 1648; CHECK-NEXT: vpmovwb %ymm0, %xmm0 1649; CHECK-NEXT: vzeroupper 1650; CHECK-NEXT: retq 1651 %b8 = sub <16 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %b 1652 %shl = shl <16 x i8> %a, %b 1653 %lshr = lshr <16 x i8> %a, %b8 1654 %or = or <16 x i8> %shl, %lshr 1655 ret <16 x i8> %or 1656} 1657 1658define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind "min-legal-vector-width"="256" { 1659; CHECK-LABEL: var_rotate_v32i8: 1660; CHECK: # %bb.0: 1661; CHECK-NEXT: vpsllw $4, %ymm0, %ymm2 1662; CHECK-NEXT: vpsrlw $4, %ymm0, %ymm3 1663; CHECK-NEXT: vpternlogq $216, {{.*}}(%rip), %ymm2, %ymm3 1664; CHECK-NEXT: vpsllw $5, %ymm1, %ymm1 1665; CHECK-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 1666; CHECK-NEXT: vpsllw $2, %ymm0, %ymm2 1667; CHECK-NEXT: vpsrlw $6, %ymm0, %ymm3 1668; CHECK-NEXT: vpternlogq $216, {{.*}}(%rip), %ymm2, %ymm3 1669; CHECK-NEXT: vpaddb %ymm1, %ymm1, %ymm1 1670; CHECK-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 1671; CHECK-NEXT: vpsrlw $7, %ymm0, %ymm2 1672; CHECK-NEXT: vpaddb %ymm0, %ymm0, %ymm3 1673; CHECK-NEXT: vpternlogq $248, {{.*}}(%rip), %ymm2, %ymm3 1674; CHECK-NEXT: vpaddb %ymm1, %ymm1, %ymm1 1675; CHECK-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 1676; CHECK-NEXT: retq 1677 %b8 = sub <32 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %b 1678 %shl = shl <32 x i8> %a, %b 1679 %lshr = lshr <32 x i8> %a, %b8 1680 %or = or <32 x i8> %shl, %lshr 1681 ret <32 x i8> %or 1682} 1683 1684define <32 x i8> @splatvar_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind "min-legal-vector-width"="256" { 1685; CHECK-LABEL: splatvar_rotate_v32i8: 1686; CHECK: # %bb.0: 1687; CHECK-NEXT: vpbroadcastb %xmm1, %xmm1 1688; CHECK-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 1689; CHECK-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1690; CHECK-NEXT: vpsllw %xmm2, %ymm0, %ymm3 1691; CHECK-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 1692; CHECK-NEXT: vpsubb %xmm1, %xmm4, %xmm1 1693; CHECK-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 1694; CHECK-NEXT: vpsllw %xmm2, %xmm4, %xmm2 1695; CHECK-NEXT: vpbroadcastb %xmm2, %ymm2 1696; CHECK-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1697; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm5 1698; CHECK-NEXT: vpand %ymm2, %ymm3, %ymm2 1699; CHECK-NEXT: vpsrlw %xmm1, %xmm4, %xmm0 1700; CHECK-NEXT: vpsrlw $8, %xmm0, %xmm0 1701; CHECK-NEXT: vpbroadcastb %xmm0, %ymm0 1702; CHECK-NEXT: vpternlogq $236, %ymm5, %ymm2, %ymm0 1703; CHECK-NEXT: retq 1704 %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer 1705 %splat8 = sub <32 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %splat 1706 %shl = shl <32 x i8> %a, %splat 1707 %lshr = lshr <32 x i8> %a, %splat8 1708 %or = or <32 x i8> %shl, %lshr 1709 ret <32 x i8> %or 1710} 1711 1712define <32 x i8> @constant_rotate_v32i8(<32 x i8> %a) nounwind "min-legal-vector-width"="256" { 1713; CHECK-AVX512-LABEL: constant_rotate_v32i8: 1714; CHECK-AVX512: # %bb.0: 1715; CHECK-AVX512-NEXT: vpsllw $4, %ymm0, %ymm1 1716; CHECK-AVX512-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 1717; CHECK-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] 1718; CHECK-AVX512-NEXT: # ymm2 = mem[0,1,0,1] 1719; CHECK-AVX512-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1 1720; CHECK-AVX512-NEXT: vpsllw $2, %ymm1, %ymm3 1721; CHECK-AVX512-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3 1722; CHECK-AVX512-NEXT: vpaddb %ymm2, %ymm2, %ymm2 1723; CHECK-AVX512-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1 1724; CHECK-AVX512-NEXT: vpaddb %ymm1, %ymm1, %ymm3 1725; CHECK-AVX512-NEXT: vpaddb %ymm2, %ymm2, %ymm2 1726; CHECK-AVX512-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1 1727; CHECK-AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 1728; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31] 1729; CHECK-AVX512-NEXT: vpsllvw {{.*}}(%rip), %ymm3, %ymm3 1730; CHECK-AVX512-NEXT: vpsrlw $8, %ymm3, %ymm3 1731; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23] 1732; CHECK-AVX512-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0 1733; CHECK-AVX512-NEXT: vpsrlw $8, %ymm0, %ymm0 1734; CHECK-AVX512-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 1735; CHECK-AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 1736; CHECK-AVX512-NEXT: retq 1737; 1738; CHECK-VBMI-LABEL: constant_rotate_v32i8: 1739; CHECK-VBMI: # %bb.0: 1740; CHECK-VBMI-NEXT: vpsllw $4, %ymm0, %ymm1 1741; CHECK-VBMI-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 1742; CHECK-VBMI-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] 1743; CHECK-VBMI-NEXT: # ymm2 = mem[0,1,0,1] 1744; CHECK-VBMI-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1 1745; CHECK-VBMI-NEXT: vpsllw $2, %ymm1, %ymm3 1746; CHECK-VBMI-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3 1747; CHECK-VBMI-NEXT: vpaddb %ymm2, %ymm2, %ymm2 1748; CHECK-VBMI-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1 1749; CHECK-VBMI-NEXT: vpaddb %ymm1, %ymm1, %ymm3 1750; CHECK-VBMI-NEXT: vpaddb %ymm2, %ymm2, %ymm2 1751; CHECK-VBMI-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1 1752; CHECK-VBMI-NEXT: vpxor %xmm2, %xmm2, %xmm2 1753; CHECK-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31] 1754; CHECK-VBMI-NEXT: vpsllvw {{.*}}(%rip), %ymm3, %ymm3 1755; CHECK-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23] 1756; CHECK-VBMI-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0 1757; CHECK-VBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15,33,35,37,39,41,43,45,47,17,19,21,23,25,27,29,31,49,51,53,55,57,59,61,63] 1758; CHECK-VBMI-NEXT: vpermi2b %ymm3, %ymm0, %ymm2 1759; CHECK-VBMI-NEXT: vpor %ymm2, %ymm1, %ymm0 1760; CHECK-VBMI-NEXT: retq 1761 %shl = shl <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1> 1762 %lshr = lshr <32 x i8> %a, <i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7> 1763 %or = or <32 x i8> %shl, %lshr 1764 ret <32 x i8> %or 1765} 1766 1767define <32 x i8> @splatconstant_rotate_v32i8(<32 x i8> %a) nounwind "min-legal-vector-width"="256" { 1768; CHECK-LABEL: splatconstant_rotate_v32i8: 1769; CHECK: # %bb.0: 1770; CHECK-NEXT: vpsllw $4, %ymm0, %ymm1 1771; CHECK-NEXT: vpsrlw $4, %ymm0, %ymm0 1772; CHECK-NEXT: vpternlogq $216, {{.*}}(%rip), %ymm1, %ymm0 1773; CHECK-NEXT: retq 1774 %shl = shl <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4> 1775 %lshr = lshr <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4> 1776 %or = or <32 x i8> %shl, %lshr 1777 ret <32 x i8> %or 1778} 1779 1780define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind "min-legal-vector-width"="256" { 1781; CHECK-LABEL: splatconstant_rotate_mask_v32i8: 1782; CHECK: # %bb.0: 1783; CHECK-NEXT: vpsllw $4, %ymm0, %ymm1 1784; CHECK-NEXT: vpsrlw $4, %ymm0, %ymm0 1785; CHECK-NEXT: vpternlogq $216, {{.*}}(%rip), %ymm1, %ymm0 1786; CHECK-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 1787; CHECK-NEXT: retq 1788 %shl = shl <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4> 1789 %lshr = lshr <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4> 1790 %rmask = and <32 x i8> %lshr, <i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55> 1791 %lmask = and <32 x i8> %shl, <i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33> 1792 %or = or <32 x i8> %lmask, %rmask 1793 ret <32 x i8> %or 1794} 1795