; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl,avx512bw,avx512dq,prefer-256-bit | FileCheck %s

; This file primarily contains tests for specific places in X86ISelLowering.cpp
; that needed to be made aware of the legalizer not allowing 512-bit vectors due
; to prefer-256-bit even though AVX512 is enabled.
;
; Each test comes in a pair: a "_256" version carrying
; "required-vector-width"="256" (must be legalized to two 256-bit ymm halves)
; and a "_512" version carrying "required-vector-width"="512" (allowed to use a
; single 512-bit zmm operation).

; <16 x i32> add: split into two vpaddd ymm ops at width 256, one zmm op at 512.
define void @add256(<16 x i32>* %a, <16 x i32>* %b, <16 x i32>* %c) "required-vector-width"="256" {
; CHECK-LABEL: add256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rdi), %ymm0
; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm1
; CHECK-NEXT:    vpaddd (%rsi), %ymm0, %ymm0
; CHECK-NEXT:    vpaddd 32(%rsi), %ymm1, %ymm1
; CHECK-NEXT:    vmovdqa %ymm1, 32(%rdx)
; CHECK-NEXT:    vmovdqa %ymm0, (%rdx)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %d = load <16 x i32>, <16 x i32>* %a
  %e = load <16 x i32>, <16 x i32>* %b
  %f = add <16 x i32> %d, %e
  store <16 x i32> %f, <16 x i32>* %c
  ret void
}

define void @add512(<16 x i32>* %a, <16 x i32>* %b, <16 x i32>* %c) "required-vector-width"="512" {
; CHECK-LABEL: add512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
; CHECK-NEXT:    vpaddd (%rsi), %zmm0, %zmm0
; CHECK-NEXT:    vmovdqa64 %zmm0, (%rdx)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %d = load <16 x i32>, <16 x i32>* %a
  %e = load <16 x i32>, <16 x i32>* %b
  %f = add <16 x i32> %d, %e
  store <16 x i32> %f, <16 x i32>* %c
  ret void
}

; Averaging idiom ((zext + 1 + zext) >> 1, truncated back to i8) must still be
; recognized as vpavgb when the <64 x i8> type is split to two ymm halves.
define void @avg_v64i8_256(<64 x i8>* %a, <64 x i8>* %b) "required-vector-width"="256" {
; CHECK-LABEL: avg_v64i8_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rsi), %ymm0
; CHECK-NEXT:    vmovdqa 32(%rsi), %ymm1
; CHECK-NEXT:    vpavgb (%rdi), %ymm0, %ymm0
; CHECK-NEXT:    vpavgb 32(%rdi), %ymm1, %ymm1
; CHECK-NEXT:    vmovdqu %ymm1, (%rax)
; CHECK-NEXT:    vmovdqu %ymm0, (%rax)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = load <64 x i8>, <64 x i8>* %a
  %2 = load <64 x i8>, <64 x i8>* %b
  %3 = zext <64 x i8> %1 to <64 x i32>
  %4 = zext <64 x i8> %2 to <64 x i32>
  %5 = add nuw nsw <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <64 x i32> %5, %4
  %7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <64 x i32> %7 to <64 x i8>
  store <64 x i8> %8, <64 x i8>* undef, align 4
  ret void
}

define void @avg_v64i8_512(<64 x i8>* %a, <64 x i8>* %b) "required-vector-width"="512" {
; CHECK-LABEL: avg_v64i8_512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm0
; CHECK-NEXT:    vpavgb (%rdi), %zmm0, %zmm0
; CHECK-NEXT:    vmovdqu64 %zmm0, (%rax)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = load <64 x i8>, <64 x i8>* %a
  %2 = load <64 x i8>, <64 x i8>* %b
  %3 = zext <64 x i8> %1 to <64 x i32>
  %4 = zext <64 x i8> %2 to <64 x i32>
  %5 = add nuw nsw <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <64 x i32> %5, %4
  %7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <64 x i32> %7 to <64 x i8>
  store <64 x i8> %8, <64 x i8>* undef, align 4
  ret void
}

; sext(mul) + odd/even shuffle + add must still form vpmaddwd per legal width.
define void @pmaddwd_32_256(<32 x i16>* %APtr, <32 x i16>* %BPtr, <16 x i32>* %CPtr) "required-vector-width"="256" {
; CHECK-LABEL: pmaddwd_32_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rdi), %ymm0
; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm1
; CHECK-NEXT:    vpmaddwd (%rsi), %ymm0, %ymm0
; CHECK-NEXT:    vpmaddwd 32(%rsi), %ymm1, %ymm1
; CHECK-NEXT:    vmovdqa %ymm1, 32(%rdx)
; CHECK-NEXT:    vmovdqa %ymm0, (%rdx)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %A = load <32 x i16>, <32 x i16>* %APtr
  %B = load <32 x i16>, <32 x i16>* %BPtr
  %a = sext <32 x i16> %A to <32 x i32>
  %b = sext <32 x i16> %B to <32 x i32>
  %m = mul nsw <32 x i32> %a, %b
  %odd = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  %even = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  %ret = add <16 x i32> %odd, %even
  store <16 x i32> %ret, <16 x i32>* %CPtr
  ret void
}

define void @pmaddwd_32_512(<32 x i16>* %APtr, <32 x i16>* %BPtr, <16 x i32>* %CPtr) "required-vector-width"="512" {
; CHECK-LABEL: pmaddwd_32_512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
; CHECK-NEXT:    vpmaddwd (%rsi), %zmm0, %zmm0
; CHECK-NEXT:    vmovdqa64 %zmm0, (%rdx)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %A = load <32 x i16>, <32 x i16>* %APtr
  %B = load <32 x i16>, <32 x i16>* %BPtr
  %a = sext <32 x i16> %A to <32 x i32>
  %b = sext <32 x i16> %B to <32 x i32>
  %m = mul nsw <32 x i32> %a, %b
  %odd = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  %even = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  %ret = add <16 x i32> %odd, %even
  store <16 x i32> %ret, <16 x i32>* %CPtr
  ret void
}

; max(x,y)-y saturating-subtract idiom must still form vpsubusb per legal width.
define void @psubus_64i8_max_256(<64 x i8>* %xptr, <64 x i8>* %yptr, <64 x i8>* %zptr) "required-vector-width"="256" {
; CHECK-LABEL: psubus_64i8_max_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rdi), %ymm0
; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm1
; CHECK-NEXT:    vpsubusb (%rsi), %ymm0, %ymm0
; CHECK-NEXT:    vpsubusb 32(%rsi), %ymm1, %ymm1
; CHECK-NEXT:    vmovdqa %ymm1, 32(%rdx)
; CHECK-NEXT:    vmovdqa %ymm0, (%rdx)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = load <64 x i8>, <64 x i8>* %xptr
  %y = load <64 x i8>, <64 x i8>* %yptr
  %cmp = icmp ult <64 x i8> %x, %y
  %max = select <64 x i1> %cmp, <64 x i8> %y, <64 x i8> %x
  %res = sub <64 x i8> %max, %y
  store <64 x i8> %res, <64 x i8>* %zptr
  ret void
}

define void @psubus_64i8_max_512(<64 x i8>* %xptr, <64 x i8>* %yptr, <64 x i8>* %zptr) "required-vector-width"="512" {
; CHECK-LABEL: psubus_64i8_max_512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
; CHECK-NEXT:    vpsubusb (%rsi), %zmm0, %zmm0
; CHECK-NEXT:    vmovdqa64 %zmm0, (%rdx)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = load <64 x i8>, <64 x i8>* %xptr
  %y = load <64 x i8>, <64 x i8>* %yptr
  %cmp = icmp ult <64 x i8> %x, %y
  %max = select <64 x i1> %cmp, <64 x i8> %y, <64 x i8> %x
  %res = sub <64 x i8> %max, %y
  store <64 x i8> %res, <64 x i8>* %zptr
  ret void
}

; Dot-product reduction loop: vpmaddwd formation plus shuffle-tree reduction,
; at 256-bit (two accumulators) vs 512-bit (single zmm accumulator) widths.
define i32 @_Z9test_charPcS_i_256(i8* nocapture readonly, i8* nocapture readonly, i32) "required-vector-width"="256" {
; CHECK-LABEL: _Z9test_charPcS_i_256:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    movl %edx, %eax
; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    xorl %ecx, %ecx
; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB8_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpmovsxbw (%rdi,%rcx), %ymm3
; CHECK-NEXT:    vpmovsxbw 16(%rdi,%rcx), %ymm4
; CHECK-NEXT:    vpmovsxbw (%rsi,%rcx), %ymm5
; CHECK-NEXT:    vpmaddwd %ymm3, %ymm5, %ymm3
; CHECK-NEXT:    vpaddd %ymm1, %ymm3, %ymm1
; CHECK-NEXT:    vpmovsxbw 16(%rsi,%rcx), %ymm3
; CHECK-NEXT:    vpmaddwd %ymm4, %ymm3, %ymm3
; CHECK-NEXT:    vpaddd %ymm2, %ymm3, %ymm2
; CHECK-NEXT:    addq $32, %rcx
; CHECK-NEXT:    cmpq %rcx, %rax
; CHECK-NEXT:    jne .LBB8_1
; CHECK-NEXT:  # %bb.2: # %middle.block
; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm1
; CHECK-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
; CHECK-NEXT:    vmovd %xmm0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
entry:
  %3 = zext i32 %2 to i64
  br label %vector.body

vector.body:
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
  %vec.phi = phi <32 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
  %4 = getelementptr inbounds i8, i8* %0, i64 %index
  %5 = bitcast i8* %4 to <32 x i8>*
  %wide.load = load <32 x i8>, <32 x i8>* %5, align 1
  %6 = sext <32 x i8> %wide.load to <32 x i32>
  %7 = getelementptr inbounds i8, i8* %1, i64 %index
  %8 = bitcast i8* %7 to <32 x i8>*
  %wide.load14 = load <32 x i8>, <32 x i8>* %8, align 1
  %9 = sext <32 x i8> %wide.load14 to <32 x i32>
  %10 = mul nsw <32 x i32> %9, %6
  %11 = add nsw <32 x i32> %10, %vec.phi
  %index.next = add i64 %index, 32
  %12 = icmp eq i64 %index.next, %3
  br i1 %12, label %middle.block, label %vector.body

middle.block:
  %rdx.shuf1 = shufflevector <32 x i32> %11, <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx1 = add <32 x i32> %11, %rdx.shuf1
  %rdx.shuf = shufflevector <32 x i32> %bin.rdx1, <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <32 x i32> %bin.rdx1, %rdx.shuf
  %rdx.shuf15 = shufflevector <32 x i32> %bin.rdx, <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx32 = add <32 x i32> %bin.rdx, %rdx.shuf15
  %rdx.shuf17 = shufflevector <32 x i32> %bin.rdx32, <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx18 = add <32 x i32> %bin.rdx32, %rdx.shuf17
  %rdx.shuf19 = shufflevector <32 x i32> %bin.rdx18, <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx20 = add <32 x i32> %bin.rdx18, %rdx.shuf19
  %13 = extractelement <32 x i32> %bin.rdx20, i32 0
  ret i32 %13
}

define i32 @_Z9test_charPcS_i_512(i8* nocapture readonly, i8* nocapture readonly, i32) "required-vector-width"="512" {
; CHECK-LABEL: _Z9test_charPcS_i_512:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    movl %edx, %eax
; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    xorl %ecx, %ecx
; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB9_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpmovsxbw (%rdi,%rcx), %zmm2
; CHECK-NEXT:    vpmovsxbw (%rsi,%rcx), %zmm3
; CHECK-NEXT:    vpmaddwd %zmm2, %zmm3, %zmm2
; CHECK-NEXT:    vpaddd %zmm1, %zmm2, %zmm1
; CHECK-NEXT:    addq $32, %rcx
; CHECK-NEXT:    cmpq %rcx, %rax
; CHECK-NEXT:    jne .LBB9_1
; CHECK-NEXT:  # %bb.2: # %middle.block
; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vmovd %xmm0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
entry:
  %3 = zext i32 %2 to i64
  br label %vector.body

vector.body:
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
  %vec.phi = phi <32 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
  %4 = getelementptr inbounds i8, i8* %0, i64 %index
  %5 = bitcast i8* %4 to <32 x i8>*
  %wide.load = load <32 x i8>, <32 x i8>* %5, align 1
  %6 = sext <32 x i8> %wide.load to <32 x i32>
  %7 = getelementptr inbounds i8, i8* %1, i64 %index
  %8 = bitcast i8* %7 to <32 x i8>*
  %wide.load14 = load <32 x i8>, <32 x i8>* %8, align 1
  %9 = sext <32 x i8> %wide.load14 to <32 x i32>
  %10 = mul nsw <32 x i32> %9, %6
  %11 = add nsw <32 x i32> %10, %vec.phi
  %index.next = add i64 %index, 32
  %12 = icmp eq i64 %index.next, %3
  br i1 %12, label %middle.block, label %vector.body

middle.block:
  %rdx.shuf1 = shufflevector <32 x i32> %11, <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx1 = add <32 x i32> %11, %rdx.shuf1
  %rdx.shuf = shufflevector <32 x i32> %bin.rdx1, <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <32 x i32> %bin.rdx1, %rdx.shuf
  %rdx.shuf15 = shufflevector <32 x i32> %bin.rdx, <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx32 = add <32 x i32> %bin.rdx, %rdx.shuf15
  %rdx.shuf17 = shufflevector <32 x i32> %bin.rdx32, <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx18 = add <32 x i32> %bin.rdx32, %rdx.shuf17
  %rdx.shuf19 = shufflevector <32 x i32> %bin.rdx18, <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx20 = add <32 x i32> %bin.rdx18, %rdx.shuf19
  %13 = extractelement <32 x i32> %bin.rdx20, i32 0
  ret i32 %13
}

; Global buffers for the SAD (sum of absolute differences) tests below.
@a = global [1024 x i8] zeroinitializer, align 16
@b = global [1024 x i8] zeroinitializer, align 16

; abs-difference reduction loop must still form vpsadbw per legal width.
define i32 @sad_16i8_256() "required-vector-width"="256" {
; CHECK-LABEL: sad_16i8_256:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    movq $-1024, %rax # imm = 0xFC00
; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB10_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu a+1024(%rax), %xmm2
; CHECK-NEXT:    vpsadbw b+1024(%rax), %xmm2, %xmm2
; CHECK-NEXT:    vpaddd %ymm1, %ymm2, %ymm1
; CHECK-NEXT:    addq $4, %rax
; CHECK-NEXT:    jne .LBB10_1
; CHECK-NEXT:  # %bb.2: # %middle.block
; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
; CHECK-NEXT:    vmovd %xmm0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
entry:
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.phi = phi <16 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ]
  %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index
  %1 = bitcast i8* %0 to <16 x i8>*
  %wide.load = load <16 x i8>, <16 x i8>* %1, align 4
  %2 = zext <16 x i8> %wide.load to <16 x i32>
  %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index
  %4 = bitcast i8* %3 to <16 x i8>*
  %wide.load1 = load <16 x i8>, <16 x i8>* %4, align 4
  %5 = zext <16 x i8> %wide.load1 to <16 x i32>
  %6 = sub nsw <16 x i32> %2, %5
  %7 = icmp sgt <16 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %8 = sub nsw <16 x i32> zeroinitializer, %6
  %9 = select <16 x i1> %7, <16 x i32> %6, <16 x i32> %8
  %10 = add nsw <16 x i32> %9, %vec.phi
  %index.next = add i64 %index, 4
  %11 = icmp eq i64 %index.next, 1024
  br i1 %11, label %middle.block, label %vector.body

middle.block:
  %.lcssa = phi <16 x i32> [ %10, %vector.body ]
  %rdx.shuf = shufflevector <16 x i32> %.lcssa, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <16 x i32> %.lcssa, %rdx.shuf
  %rdx.shuf2 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx2 = add <16 x i32> %bin.rdx, %rdx.shuf2
  %rdx.shuf3 = shufflevector <16 x i32> %bin.rdx2, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx3 = add <16 x i32> %bin.rdx2, %rdx.shuf3
  %rdx.shuf4 = shufflevector <16 x i32> %bin.rdx3, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx4 = add <16 x i32> %bin.rdx3, %rdx.shuf4
  %12 = extractelement <16 x i32> %bin.rdx4, i32 0
  ret i32 %12
}

define i32 @sad_16i8_512() "required-vector-width"="512" {
; CHECK-LABEL: sad_16i8_512:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    movq $-1024, %rax # imm = 0xFC00
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB11_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu a+1024(%rax), %xmm1
; CHECK-NEXT:    vpsadbw b+1024(%rax), %xmm1, %xmm1
; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    addq $4, %rax
; CHECK-NEXT:    jne .LBB11_1
; CHECK-NEXT:  # %bb.2: # %middle.block
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vmovd %xmm0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
entry:
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.phi = phi <16 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ]
  %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index
  %1 = bitcast i8* %0 to <16 x i8>*
  %wide.load = load <16 x i8>, <16 x i8>* %1, align 4
  %2 = zext <16 x i8> %wide.load to <16 x i32>
  %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index
  %4 = bitcast i8* %3 to <16 x i8>*
  %wide.load1 = load <16 x i8>, <16 x i8>* %4, align 4
  %5 = zext <16 x i8> %wide.load1 to <16 x i32>
  %6 = sub nsw <16 x i32> %2, %5
  %7 = icmp sgt <16 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %8 = sub nsw <16 x i32> zeroinitializer, %6
  %9 = select <16 x i1> %7, <16 x i32> %6, <16 x i32> %8
  %10 = add nsw <16 x i32> %9, %vec.phi
  %index.next = add i64 %index, 4
  %11 = icmp eq i64 %index.next, 1024
  br i1 %11, label %middle.block, label %vector.body

middle.block:
  %.lcssa = phi <16 x i32> [ %10, %vector.body ]
  %rdx.shuf = shufflevector <16 x i32> %.lcssa, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <16 x i32> %.lcssa, %rdx.shuf
  %rdx.shuf2 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx2 = add <16 x i32> %bin.rdx, %rdx.shuf2
  %rdx.shuf3 = shufflevector <16 x i32> %bin.rdx2, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx3 = add <16 x i32> %bin.rdx2, %rdx.shuf3
  %rdx.shuf4 = shufflevector <16 x i32> %bin.rdx3, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx4 = add <16 x i32> %bin.rdx3, %rdx.shuf4
  %12 = extractelement <16 x i32> %bin.rdx4, i32 0
  ret i32 %12
}

; sitofp of an i1 mask vector to <16 x float>, per legal width.
define void @sbto16f32_256(<16 x i16> %a, <16 x float>* %res) "required-vector-width"="256" {
; CHECK-LABEL: sbto16f32_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovw2m %ymm0, %k0
; CHECK-NEXT:    kshiftrw $8, %k0, %k1
; CHECK-NEXT:    vpmovm2d %k1, %ymm0
; CHECK-NEXT:    vcvtdq2ps %ymm0, %ymm0
; CHECK-NEXT:    vpmovm2d %k0, %ymm1
; CHECK-NEXT:    vcvtdq2ps %ymm1, %ymm1
; CHECK-NEXT:    vmovaps %ymm1, (%rdi)
; CHECK-NEXT:    vmovaps %ymm0, 32(%rdi)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %mask = icmp slt <16 x i16> %a, zeroinitializer
  %1 = sitofp <16 x i1> %mask to <16 x float>
  store <16 x float> %1, <16 x float>* %res
  ret void
}

define void @sbto16f32_512(<16 x i16> %a, <16 x float>* %res) "required-vector-width"="512" {
; CHECK-LABEL: sbto16f32_512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovw2m %ymm0, %k0
; CHECK-NEXT:    vpmovm2d %k0, %zmm0
; CHECK-NEXT:    vcvtdq2ps %zmm0, %zmm0
; CHECK-NEXT:    vmovaps %zmm0, (%rdi)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %mask = icmp slt <16 x i16> %a, zeroinitializer
  %1 = sitofp <16 x i1> %mask to <16 x float>
  store <16 x float> %1, <16 x float>* %res
  ret void
}

; sitofp of an i1 mask vector to <16 x double>, per legal width.
define void @sbto16f64_256(<16 x i16> %a, <16 x double>* %res) "required-vector-width"="256" {
; CHECK-LABEL: sbto16f64_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovw2m %ymm0, %k0
; CHECK-NEXT:    kshiftrw $8, %k0, %k1
; CHECK-NEXT:    vpmovm2d %k1, %ymm0
; CHECK-NEXT:    vcvtdq2pd %xmm0, %ymm1
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vcvtdq2pd %xmm0, %ymm0
; CHECK-NEXT:    vpmovm2d %k0, %ymm2
; CHECK-NEXT:    vcvtdq2pd %xmm2, %ymm3
; CHECK-NEXT:    vextracti128 $1, %ymm2, %xmm2
; CHECK-NEXT:    vcvtdq2pd %xmm2, %ymm2
; CHECK-NEXT:    vmovaps %ymm2, 32(%rdi)
; CHECK-NEXT:    vmovaps %ymm3, (%rdi)
; CHECK-NEXT:    vmovaps %ymm0, 96(%rdi)
; CHECK-NEXT:    vmovaps %ymm1, 64(%rdi)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %mask = icmp slt <16 x i16> %a, zeroinitializer
  %1 = sitofp <16 x i1> %mask to <16 x double>
  store <16 x double> %1, <16 x double>* %res
  ret void
}

define void @sbto16f64_512(<16 x i16> %a, <16 x double>* %res) "required-vector-width"="512" {
; CHECK-LABEL: sbto16f64_512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovw2m %ymm0, %k0
; CHECK-NEXT:    vpmovm2d %k0, %zmm0
; CHECK-NEXT:    vcvtdq2pd %ymm0, %zmm1
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; CHECK-NEXT:    vcvtdq2pd %ymm0, %zmm0
; CHECK-NEXT:    vmovaps %zmm0, 64(%rdi)
; CHECK-NEXT:    vmovaps %zmm1, (%rdi)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %mask = icmp slt <16 x i16> %a, zeroinitializer
  %1 = sitofp <16 x i1> %mask to <16 x double>
  store <16 x double> %1, <16 x double>* %res
  ret void
}

; uitofp of an i1 mask vector to <16 x float>, per legal width.
define void @ubto16f32_256(<16 x i16> %a, <16 x float>* %res) "required-vector-width"="256" {
; CHECK-LABEL: ubto16f32_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovw2m %ymm0, %k0
; CHECK-NEXT:    kshiftrw $8, %k0, %k1
; CHECK-NEXT:    vpmovm2d %k1, %ymm0
; CHECK-NEXT:    vpsrld $31, %ymm0, %ymm0
; CHECK-NEXT:    vcvtdq2ps %ymm0, %ymm0
; CHECK-NEXT:    vpmovm2d %k0, %ymm1
; CHECK-NEXT:    vpsrld $31, %ymm1, %ymm1
; CHECK-NEXT:    vcvtdq2ps %ymm1, %ymm1
; CHECK-NEXT:    vmovaps %ymm1, (%rdi)
; CHECK-NEXT:    vmovaps %ymm0, 32(%rdi)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %mask = icmp slt <16 x i16> %a, zeroinitializer
  %1 = uitofp <16 x i1> %mask to <16 x float>
  store <16 x float> %1, <16 x float>* %res
  ret void
}

define void @ubto16f32_512(<16 x i16> %a, <16 x float>* %res) "required-vector-width"="512" {
; CHECK-LABEL: ubto16f32_512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovw2m %ymm0, %k0
; CHECK-NEXT:    vpmovm2d %k0, %zmm0
; CHECK-NEXT:    vpsrld $31, %zmm0, %zmm0
; CHECK-NEXT:    vcvtdq2ps %zmm0, %zmm0
; CHECK-NEXT:    vmovaps %zmm0, (%rdi)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %mask = icmp slt <16 x i16> %a, zeroinitializer
  %1 = uitofp <16 x i1> %mask to <16 x float>
  store <16 x float> %1, <16 x float>* %res
  ret void
}

; uitofp of an i1 mask vector to <16 x double>, per legal width.
define void @ubto16f64_256(<16 x i16> %a, <16 x double>* %res) "required-vector-width"="256" {
; CHECK-LABEL: ubto16f64_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovw2m %ymm0, %k0
; CHECK-NEXT:    kshiftrw $8, %k0, %k1
; CHECK-NEXT:    vpmovm2d %k1, %ymm0
; CHECK-NEXT:    vpsrld $31, %ymm0, %ymm0
; CHECK-NEXT:    vcvtdq2pd %xmm0, %ymm1
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vcvtdq2pd %xmm0, %ymm0
; CHECK-NEXT:    vpmovm2d %k0, %ymm2
; CHECK-NEXT:    vpsrld $31, %ymm2, %ymm2
; CHECK-NEXT:    vcvtdq2pd %xmm2, %ymm3
; CHECK-NEXT:    vextracti128 $1, %ymm2, %xmm2
; CHECK-NEXT:    vcvtdq2pd %xmm2, %ymm2
; CHECK-NEXT:    vmovaps %ymm2, 32(%rdi)
; CHECK-NEXT:    vmovaps %ymm3, (%rdi)
; CHECK-NEXT:    vmovaps %ymm0, 96(%rdi)
; CHECK-NEXT:    vmovaps %ymm1, 64(%rdi)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %mask = icmp slt <16 x i16> %a, zeroinitializer
  %1 = uitofp <16 x i1> %mask to <16 x double>
  store <16 x double> %1, <16 x double>* %res
  ret void
}

define void @ubto16f64_512(<16 x i16> %a, <16 x double>* %res) "required-vector-width"="512" {
; CHECK-LABEL: ubto16f64_512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovw2m %ymm0, %k0
; CHECK-NEXT:    vpmovm2d %k0, %zmm0
; CHECK-NEXT:    vpsrld $31, %zmm0, %zmm0
; CHECK-NEXT:    vcvtdq2pd %ymm0, %zmm1
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; CHECK-NEXT:    vcvtdq2pd %ymm0, %zmm0
; CHECK-NEXT:    vmovaps %zmm0, 64(%rdi)
; CHECK-NEXT:    vmovaps %zmm1, (%rdi)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %mask = icmp slt <16 x i16> %a, zeroinitializer
  %1 = uitofp <16 x i1> %mask to <16 x double>
  store <16 x double> %1, <16 x double>* %res
  ret void
}

; fptoui to an i1 mask used by a select, per legal width.
define <16 x i16> @test_16f32toub_256(<16 x float>* %ptr, <16 x i16> %passthru) "required-vector-width"="256" {
; CHECK-LABEL: test_16f32toub_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcvttps2dq (%rdi), %ymm1
; CHECK-NEXT:    vpmovdw %ymm1, %xmm1
; CHECK-NEXT:    vcvttps2dq 32(%rdi), %ymm2
; CHECK-NEXT:    vpmovdw %ymm2, %xmm2
; CHECK-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; CHECK-NEXT:    vpsllw $15, %ymm1, %ymm1
; CHECK-NEXT:    vpmovw2m %ymm1, %k1
; CHECK-NEXT:    vmovdqu16 %ymm0, %ymm0 {%k1} {z}
; CHECK-NEXT:    retq
  %a = load <16 x float>, <16 x float>* %ptr
  %mask = fptoui <16 x float> %a to <16 x i1>
  %select = select <16 x i1> %mask, <16 x i16> %passthru, <16 x i16> zeroinitializer
  ret <16 x i16> %select
}

define <16 x i16> @test_16f32toub_512(<16 x float>* %ptr, <16 x i16> %passthru) "required-vector-width"="512" {
; CHECK-LABEL: test_16f32toub_512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcvttps2dq (%rdi), %zmm1
; CHECK-NEXT:    vpslld $31, %zmm1, %zmm1
; CHECK-NEXT:    vpmovd2m %zmm1, %k1
; CHECK-NEXT:    vmovdqu16 %ymm0, %ymm0 {%k1} {z}
; CHECK-NEXT:    retq
  %a = load <16 x float>, <16 x float>* %ptr
  %mask = fptoui <16 x float> %a to <16 x i1>
  %select = select <16 x i1> %mask, <16 x i16> %passthru, <16 x i16> zeroinitializer
  ret <16 x i16> %select
}

; fptosi to an i1 mask used by a select, per legal width.
define <16 x i16> @test_16f32tosb_256(<16 x float>* %ptr, <16 x i16> %passthru) "required-vector-width"="256" {
; CHECK-LABEL: test_16f32tosb_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcvttps2dq (%rdi), %ymm1
; CHECK-NEXT:    vpmovdw %ymm1, %xmm1
; CHECK-NEXT:    vcvttps2dq 32(%rdi), %ymm2
; CHECK-NEXT:    vpmovdw %ymm2, %xmm2
; CHECK-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; CHECK-NEXT:    vpsllw $15, %ymm1, %ymm1
; CHECK-NEXT:    vpmovw2m %ymm1, %k1
; CHECK-NEXT:    vmovdqu16 %ymm0, %ymm0 {%k1} {z}
; CHECK-NEXT:    retq
  %a = load <16 x float>, <16 x float>* %ptr
  %mask = fptosi <16 x float> %a to <16 x i1>
  %select = select <16 x i1> %mask, <16 x i16> %passthru, <16 x i16> zeroinitializer
  ret <16 x i16> %select
}

define <16 x i16> @test_16f32tosb_512(<16 x float>* %ptr, <16 x i16> %passthru) "required-vector-width"="512" {
; CHECK-LABEL: test_16f32tosb_512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcvttps2dq (%rdi), %zmm1
; CHECK-NEXT:    vpmovd2m %zmm1, %k1
; CHECK-NEXT:    vmovdqu16 %ymm0, %ymm0 {%k1} {z}
; CHECK-NEXT:    retq
  %a = load <16 x float>, <16 x float>* %ptr
  %mask = fptosi <16 x float> %a to <16 x i1>
  %select = select <16 x i1> %mask, <16 x i16> %passthru, <16 x i16> zeroinitializer
  ret <16 x i16> %select
}