; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vbmi,+avx512cd,+avx512vpopcntdq,+avx512vnni < %s | FileCheck %s

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"

; Stack reload folding tests.
;
; By including a nop call with sideeffects we can force a partial register spill of the
; relevant registers and check that the reload is correctly folded into the instruction.

define <16 x i32> @stack_fold_valignd(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: stack_fold_valignd:
; CHECK: # %bb.0:
; CHECK-NEXT: subq $56, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 64
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; CHECK-NEXT: valignd $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: # zmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0]
; CHECK-NEXT: addq $56, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32><i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
  ret <16 x i32> %2
}
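
; The tests below all use the same device: the inline asm clobbers most of the vector
; register file, forcing the arguments that are live across the #APP/#NO_APP block to
; be spilled, and the CHECK lines then verify that a spill slot is consumed directly
; as a folded memory operand (the "Folded Reload"/"Folded Spill" lines) rather than
; being moved through a register first.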

define <16 x i32> @stack_fold_valignd_mask(<16 x i32> %a, <16 x i32> %b, <16 x i32>* %passthru, i16 %mask) {
; CHECK-LABEL: stack_fold_valignd_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: subq $56, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 64
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; CHECK-NEXT: valignd $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: # zmm1 {%k1} = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0]
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: addq $56, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = load <16 x i32>, <16 x i32>* %passthru
  %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
  ret <16 x i32> %5
}

define <16 x i32> @stack_fold_valignd_maskz(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
; CHECK-LABEL: stack_fold_valignd_maskz:
; CHECK: # %bb.0:
; CHECK-NEXT: subq $56, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 64
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; CHECK-NEXT: valignd $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT: # zmm0 {%k1} {z} = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0]
; CHECK-NEXT: addq $56, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
  ret <16 x i32> %4
}

define <8 x i64> @stack_fold_valignq(<8 x i64> %a, <8 x i64> %b) {
; CHECK-LABEL: stack_fold_valignq:
; CHECK: # %bb.0:
; CHECK-NEXT: subq $56, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 64
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; CHECK-NEXT: valignq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: # zmm0 = mem[1,2,3,4,5,6,7],zmm0[0]
; CHECK-NEXT: addq $56, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  ret <8 x i64> %2
}

define <8 x i64> @stack_fold_valignq_mask(<8 x i64> %a, <8 x i64> %b, <8 x i64>* %passthru, i8 %mask) {
; CHECK-LABEL: stack_fold_valignq_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: subq $56, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 64
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; CHECK-NEXT: valignq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: # zmm1 {%k1} = mem[1,2,3,4,5,6,7],zmm0[0]
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: addq $56, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
"=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 125 %2 = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> 126 %3 = bitcast i8 %mask to <8 x i1> 127 %4 = load <8 x i64>, <8 x i64>* %passthru 128 %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4 129 ret <8 x i64> %5 130} 131 132define <8 x i64> @stack_fold_valignq_maskz(<8 x i64> %a, <8 x i64> %b, i8 %mask) { 133; CHECK-LABEL: stack_fold_valignq_maskz: 134; CHECK: # %bb.0: 135; CHECK-NEXT: subq $56, %rsp 136; CHECK-NEXT: .cfi_def_cfa_offset 64 137; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 138; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 139; CHECK-NEXT: #APP 140; CHECK-NEXT: nop 141; CHECK-NEXT: #NO_APP 142; CHECK-NEXT: kmovd %edi, %k1 143; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 144; CHECK-NEXT: valignq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 145; CHECK-NEXT: # zmm0 {%k1} {z} = mem[1,2,3,4,5,6,7],zmm0[0] 146; CHECK-NEXT: addq $56, %rsp 147; CHECK-NEXT: .cfi_def_cfa_offset 8 148; CHECK-NEXT: retq 149 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 150 %2 = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> 151 %3 = bitcast i8 %mask to <8 x i1> 152 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer 153 ret <8 x i64> %4 154} 155 156define <64 x i8> @stack_fold_pavgb(<64 x i8> %a0, <64 x i8> %a1) { 157; CHECK-LABEL: stack_fold_pavgb: 158; CHECK: # %bb.0: 159; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 160; CHECK-NEXT: #APP 161; CHECK-NEXT: nop 162; CHECK-NEXT: #NO_APP 163; CHECK-NEXT: vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 164; CHECK-NEXT: retq 165 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 166 %2 = call <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8> %a0, <64 x i8> %a1) 167 ret <64 x i8> %2 168} 169declare <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8>, <64 x i8>) 170 171define <64 x i8> @stack_fold_pavgb_commuted(<64 x i8> %a0, <64 x i8> %a1) { 172; CHECK-LABEL: stack_fold_pavgb_commuted: 173; CHECK: # %bb.0: 174; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 175; CHECK-NEXT: #APP 176; CHECK-NEXT: nop 177; CHECK-NEXT: #NO_APP 178; CHECK-NEXT: vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 179; CHECK-NEXT: retq 180 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 181 %2 = call <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8> %a1, <64 x i8> %a0) 182 ret <64 x i8> %2 183} 184 185define <64 x i8> @stack_fold_pavgb_mask(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %a2, i64 %mask) { 186; CHECK-LABEL: stack_fold_pavgb_mask: 187; CHECK: # %bb.0: 188; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 189; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 190; CHECK-NEXT: #APP 191; CHECK-NEXT: nop 192; CHECK-NEXT: #NO_APP 193; CHECK-NEXT: kmovq %rsi, %k1 194; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 195; CHECK-NEXT: vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 196; CHECK-NEXT: retq 197 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 198 %2 = call <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8> %a0, <64 x i8> %a1) 199 %3 = bitcast i64 %mask to <64 x i1> 200 ; load needed to keep the operation from being scheduled about the asm block 201 %4 = load <64 x i8>, <64 x i8>* %a2 202 %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4 203 ret <64 x i8> %5 204} 205 206define <64 x i8> @stack_fold_pavgb_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %a2, i64 %mask) { 207; CHECK-LABEL: stack_fold_pavgb_mask_commuted: 208; CHECK: # %bb.0: 209; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 210; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 211; CHECK-NEXT: #APP 212; CHECK-NEXT: nop 213; CHECK-NEXT: #NO_APP 214; CHECK-NEXT: kmovq %rsi, %k1 215; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 216; CHECK-NEXT: vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 217; CHECK-NEXT: retq 218 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 219 %2 = call <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8> %a1, <64 x i8> %a0) 220 %3 = bitcast i64 %mask to <64 x i1> 221 ; load needed to keep the operation from being scheduled about the asm block 222 %4 = load <64 x i8>, <64 x i8>* %a2 223 %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4 224 ret <64 x i8> %5 225} 226 227define <64 x i8> @stack_fold_pavgb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { 228; CHECK-LABEL: stack_fold_pavgb_maskz: 229; CHECK: # %bb.0: 230; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 231; CHECK-NEXT: #APP 232; CHECK-NEXT: nop 233; CHECK-NEXT: #NO_APP 234; CHECK-NEXT: kmovq %rdi, %k1 235; CHECK-NEXT: vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 236; CHECK-NEXT: retq 237 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 238 %2 = call <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8> %a0, <64 x i8> %a1) 239 %3 = bitcast i64 %mask to <64 x i1> 240 %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer 241 ret <64 x i8> %4 242} 243 244define <64 x i8> @stack_fold_pavgb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { 245; CHECK-LABEL: stack_fold_pavgb_maskz_commuted: 246; CHECK: # %bb.0: 247; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 248; CHECK-NEXT: #APP 249; CHECK-NEXT: nop 250; CHECK-NEXT: #NO_APP 251; CHECK-NEXT: kmovq %rdi, %k1 252; CHECK-NEXT: vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 253; CHECK-NEXT: retq 254 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 255 %2 = call <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8> %a1, <64 x i8> %a0) 256 %3 = bitcast i64 %mask to <64 x i1> 257 %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer 258 ret <64 x i8> %4 259} 260 261define <32 x i16> @stack_fold_pavgw(<32 x i16> %a0, <32 x i16> %a1) { 262; CHECK-LABEL: stack_fold_pavgw: 263; CHECK: # %bb.0: 264; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 265; CHECK-NEXT: #APP 266; CHECK-NEXT: nop 267; CHECK-NEXT: #NO_APP 268; CHECK-NEXT: vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 269; CHECK-NEXT: retq 270 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 271 %2 = call <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16> %a0, <32 x i16> %a1) 272 ret <32 x i16> %2 273} 274declare <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16>, <32 x i16>) 275 276define <32 x i16> @stack_fold_pavgw_commuted(<32 x i16> %a0, <32 x i16> %a1) { 277; CHECK-LABEL: stack_fold_pavgw_commuted: 278; CHECK: # %bb.0: 279; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 280; CHECK-NEXT: #APP 281; CHECK-NEXT: nop 282; CHECK-NEXT: #NO_APP 283; CHECK-NEXT: vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 284; CHECK-NEXT: retq 285 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 286 %2 = call <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16> %a1, <32 x i16> %a0) 287 ret <32 x i16> %2 288} 289 290define <32 x i16> @stack_fold_pavgw_mask(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) { 291; CHECK-LABEL: stack_fold_pavgw_mask: 292; CHECK: # %bb.0: 293; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 294; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 295; CHECK-NEXT: 
#APP 296; CHECK-NEXT: nop 297; CHECK-NEXT: #NO_APP 298; CHECK-NEXT: kmovd %esi, %k1 299; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 300; CHECK-NEXT: vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 301; CHECK-NEXT: retq 302 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 303 %2 = call <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16> %a0, <32 x i16> %a1) 304 %3 = bitcast i32 %mask to <32 x i1> 305 ; load needed to keep the operation from being scheduled about the asm block 306 %4 = load <32 x i16>, <32 x i16>* %a2 307 %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4 308 ret <32 x i16> %5 309} 310 311define <32 x i16> @stack_fold_pavgw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) { 312; CHECK-LABEL: stack_fold_pavgw_mask_commuted: 313; CHECK: # %bb.0: 314; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 315; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 316; CHECK-NEXT: #APP 317; CHECK-NEXT: nop 318; CHECK-NEXT: #NO_APP 319; CHECK-NEXT: kmovd %esi, %k1 320; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 321; CHECK-NEXT: vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 322; CHECK-NEXT: retq 323 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 324 %2 = call <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16> %a1, <32 x i16> %a0) 325 %3 = bitcast i32 %mask to <32 x i1> 326 ; load needed to keep the operation from being scheduled about the asm block 327 %4 = load <32 x i16>, <32 x i16>* %a2 328 %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4 329 ret <32 x i16> %5 330} 331 332define <32 x i16> @stack_fold_pavgw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { 333; CHECK-LABEL: stack_fold_pavgw_maskz: 334; CHECK: # %bb.0: 335; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 336; CHECK-NEXT: #APP 337; CHECK-NEXT: nop 338; CHECK-NEXT: #NO_APP 339; CHECK-NEXT: kmovd %edi, %k1 340; CHECK-NEXT: vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 341; CHECK-NEXT: retq 342 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 343 %2 = call <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16> %a0, <32 x i16> %a1) 344 %3 = bitcast i32 %mask to <32 x i1> 345 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer 346 ret <32 x i16> %4 347} 348 349define <32 x i16> @stack_fold_pavgw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { 350; CHECK-LABEL: stack_fold_pavgw_maskz_commuted: 351; CHECK: # %bb.0: 352; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 353; CHECK-NEXT: #APP 354; CHECK-NEXT: nop 355; CHECK-NEXT: #NO_APP 356; CHECK-NEXT: kmovd %edi, %k1 357; CHECK-NEXT: vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 
64-byte Folded Reload 358; CHECK-NEXT: retq 359 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 360 %2 = call <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16> %a1, <32 x i16> %a0) 361 %3 = bitcast i32 %mask to <32 x i1> 362 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer 363 ret <32 x i16> %4 364} 365 366define <4 x i32> @stack_fold_extracti32x4(<16 x i16> %a0, <16 x i32> %a1) { 367; CHECK-LABEL: stack_fold_extracti32x4: 368; CHECK: # %bb.0: 369; CHECK-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 370; CHECK-NEXT: vextracti32x4 $3, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill 371; CHECK-NEXT: #APP 372; CHECK-NEXT: nop 373; CHECK-NEXT: #NO_APP 374; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 375; CHECK-NEXT: vzeroupper 376; CHECK-NEXT: retq 377 ; zext forces execution domain 378 %1 = zext <16 x i16> %a0 to <16 x i32> 379 %2 = shufflevector <16 x i32> %1, <16 x i32> %a1, <4 x i32> <i32 12, i32 13, i32 14, i32 15> 380 %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 381 ret <4 x i32> %2 382} 383 384define <2 x i64> @stack_fold_extracti64x2(<8 x i32> %a0, <8 x i64> %a1) { 385; CHECK-LABEL: stack_fold_extracti64x2: 386; CHECK: # %bb.0: 387; CHECK-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero 388; CHECK-NEXT: vextracti32x4 $3, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill 389; CHECK-NEXT: #APP 390; CHECK-NEXT: nop 391; CHECK-NEXT: #NO_APP 392; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 393; CHECK-NEXT: vzeroupper 394; CHECK-NEXT: retq 395 ; zext forces execution domain 396 %1 = zext <8 x i32> %a0 to <8 x i64> 397 %2 = shufflevector <8 x i64> %1, <8 x i64> %a1, <2 x i32> <i32 6, i32 7> 398 %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 399 ret <2 x i64> %2 400} 401 402define <8 x i32> @stack_fold_extracti32x8(<16 x i16> %a0, <16 x i32> %a1) { 403; CHECK-LABEL: stack_fold_extracti32x8: 404; CHECK: # %bb.0: 405; CHECK-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 406; CHECK-NEXT: vextracti64x4 $1, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill 407; CHECK-NEXT: #APP 408; CHECK-NEXT: nop 409; CHECK-NEXT: #NO_APP 410; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 411; 
CHECK-NEXT: retq 412 ; zext forces execution domain 413 %1 = zext <16 x i16> %a0 to <16 x i32> 414 %2 = shufflevector <16 x i32> %1, <16 x i32> %a1, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 415 %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 416 ret <8 x i32> %2 417} 418 419define <4 x i64> @stack_fold_extracti64x4(<8 x i32> %a0, <8 x i64> %a1) { 420; CHECK-LABEL: stack_fold_extracti64x4: 421; CHECK: # %bb.0: 422; CHECK-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero 423; CHECK-NEXT: vextracti64x4 $1, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill 424; CHECK-NEXT: #APP 425; CHECK-NEXT: nop 426; CHECK-NEXT: #NO_APP 427; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 428; CHECK-NEXT: retq 429 ; zext forces execution domain 430 %1 = zext <8 x i32> %a0 to <8 x i64> 431 %2 = shufflevector <8 x i64> %1, <8 x i64> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 432 %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 433 ret <4 x i64> %2 434} 435 436define <16 x i32> @stack_fold_inserti32x8(<8 x i32> %a0, <8 x i32> %a1) { 437; CHECK-LABEL: stack_fold_inserti32x8: 438; CHECK: # %bb.0: 439; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 440; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 441; CHECK-NEXT: #APP 442; CHECK-NEXT: nop 443; CHECK-NEXT: #NO_APP 444; CHECK-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload 445; CHECK-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 446; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm0 447; CHECK-NEXT: retq 448 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 449 %2 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 450 ; add forces execution domain 451 %3 = add <16 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 452 ret <16 x i32> %3 453} 454 455define <8 x i64> @stack_fold_inserti64x4(<4 x i64> %a0, <4 x i64> %a1) { 456; CHECK-LABEL: stack_fold_inserti64x4: 457; CHECK: # %bb.0: 458; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 459; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 460; CHECK-NEXT: #APP 461; CHECK-NEXT: nop 462; CHECK-NEXT: #NO_APP 463; CHECK-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload 464; CHECK-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 465; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm0 466; CHECK-NEXT: retq 467 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 468 %2 = shufflevector <4 x i64> %a0, <4 x i64> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 469 ; add forces execution domain 470 %3 = add <8 x i64> %2, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> 471 ret <8 x i64> %3 472} 473 474define <64 x i8> @stack_fold_pabsb(<64 x i8> %a0) { 475; CHECK-LABEL: stack_fold_pabsb: 476; CHECK: # %bb.0: 477; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 478; CHECK-NEXT: #APP 479; CHECK-NEXT: nop 480; CHECK-NEXT: #NO_APP 481; CHECK-NEXT: vpabsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload 482; CHECK-NEXT: retq 483 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 484 %2 = icmp sgt <64 x i8> %a0, zeroinitializer 485 %3 = sub <64 x i8> zeroinitializer, %a0 486 %4 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %3 487 ret <64 x i8> %4 488} 489 490define <64 x i8> @stack_fold_pabsb_mask(<64 x i8> %passthru, <64 x i8> %a0, i64 %mask) { 491; CHECK-LABEL: stack_fold_pabsb_mask: 492; CHECK: # %bb.0: 493; CHECK-NEXT: subq $56, %rsp 494; CHECK-NEXT: .cfi_def_cfa_offset 64 495; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 496; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 497; CHECK-NEXT: #APP 498; CHECK-NEXT: nop 499; CHECK-NEXT: #NO_APP 500; CHECK-NEXT: kmovq %rdi, %k1 501; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 502; CHECK-NEXT: vpabsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload 503; CHECK-NEXT: addq $56, %rsp 504; CHECK-NEXT: .cfi_def_cfa_offset 8 505; CHECK-NEXT: retq 506 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 507 %2 = icmp sgt <64 x i8> %a0, zeroinitializer 508 %3 = sub <64 x i8> zeroinitializer, %a0 509 %4 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %3 510 %5 = bitcast i64 %mask to <64 x i1> 511 %6 = select <64 x i1> %5, <64 x i8> %4, <64 x i8> %passthru 512 ret <64 x i8> %6 513} 514 515define <64 x i8> @stack_fold_pabsb_maskz(<64 x i8> %a0, i64 %mask) { 516; CHECK-LABEL: stack_fold_pabsb_maskz: 517; CHECK: # %bb.0: 518; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 519; CHECK-NEXT: #APP 520; CHECK-NEXT: nop 521; CHECK-NEXT: #NO_APP 522; CHECK-NEXT: kmovq %rdi, %k1 523; CHECK-NEXT: vpabsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload 524; CHECK-NEXT: retq 525 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 526 %2 = icmp sgt <64 x i8> 
%a0, zeroinitializer 527 %3 = sub <64 x i8> zeroinitializer, %a0 528 %4 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %3 529 %5 = bitcast i64 %mask to <64 x i1> 530 %6 = select <64 x i1> %5, <64 x i8> %4, <64 x i8> zeroinitializer 531 ret <64 x i8> %6 532} 533 534define <16 x i32> @stack_fold_pabsd(<16 x i32> %a0) { 535; CHECK-LABEL: stack_fold_pabsd: 536; CHECK: # %bb.0: 537; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 538; CHECK-NEXT: #APP 539; CHECK-NEXT: nop 540; CHECK-NEXT: #NO_APP 541; CHECK-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload 542; CHECK-NEXT: retq 543 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 544 %2 = icmp sgt <16 x i32> %a0, zeroinitializer 545 %3 = sub <16 x i32> zeroinitializer, %a0 546 %4 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %3 547 ret <16 x i32> %4 548} 549 550define <16 x i32> @stack_fold_pabsd_mask(<16 x i32> %passthru, <16 x i32> %a0, i16 %mask) { 551; CHECK-LABEL: stack_fold_pabsd_mask: 552; CHECK: # %bb.0: 553; CHECK-NEXT: subq $56, %rsp 554; CHECK-NEXT: .cfi_def_cfa_offset 64 555; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 556; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 557; CHECK-NEXT: #APP 558; CHECK-NEXT: nop 559; CHECK-NEXT: #NO_APP 560; CHECK-NEXT: kmovd %edi, %k1 561; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 562; CHECK-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload 563; CHECK-NEXT: addq $56, %rsp 564; CHECK-NEXT: .cfi_def_cfa_offset 8 565; CHECK-NEXT: retq 566 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 567 %2 = icmp sgt <16 x i32> %a0, zeroinitializer 568 %3 = sub <16 x i32> zeroinitializer, %a0 569 %4 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %3 570 %5 = bitcast i16 %mask to <16 x i1> 571 %6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> %passthru 572 ret <16 x i32> %6 573} 574 575define <16 x i32> @stack_fold_pabsd_maskz(<16 x i32> %a0, i16 %mask) { 576; CHECK-LABEL: stack_fold_pabsd_maskz: 577; CHECK: # %bb.0: 578; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 579; CHECK-NEXT: #APP 580; CHECK-NEXT: nop 581; CHECK-NEXT: #NO_APP 582; CHECK-NEXT: kmovd %edi, %k1 583; CHECK-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload 584; CHECK-NEXT: retq 585 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 586 %2 = icmp sgt <16 x i32> %a0, zeroinitializer 587 %3 = sub <16 x i32> zeroinitializer, %a0 588 %4 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %3 589 %5 = bitcast i16 %mask to <16 x i1> 590 %6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> zeroinitializer 591 ret <16 x i32> %6 
592} 593 594define <8 x i64> @stack_fold_pabsq(<8 x i64> %a0) { 595; CHECK-LABEL: stack_fold_pabsq: 596; CHECK: # %bb.0: 597; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 598; CHECK-NEXT: #APP 599; CHECK-NEXT: nop 600; CHECK-NEXT: #NO_APP 601; CHECK-NEXT: vpabsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload 602; CHECK-NEXT: retq 603 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 604 %2 = icmp sgt <8 x i64> %a0, zeroinitializer 605 %3 = sub <8 x i64> zeroinitializer, %a0 606 %4 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %3 607 ret <8 x i64> %4 608} 609 610define <8 x i64> @stack_fold_pabsq_mask(<8 x i64> %passthru, <8 x i64> %a0, i8 %mask) { 611; CHECK-LABEL: stack_fold_pabsq_mask: 612; CHECK: # %bb.0: 613; CHECK-NEXT: subq $56, %rsp 614; CHECK-NEXT: .cfi_def_cfa_offset 64 615; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 616; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 617; CHECK-NEXT: #APP 618; CHECK-NEXT: nop 619; CHECK-NEXT: #NO_APP 620; CHECK-NEXT: kmovd %edi, %k1 621; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 622; CHECK-NEXT: vpabsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload 623; CHECK-NEXT: addq $56, %rsp 624; CHECK-NEXT: .cfi_def_cfa_offset 8 625; CHECK-NEXT: retq 626 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 627 %2 = icmp sgt <8 x i64> %a0, zeroinitializer 628 %3 = sub <8 x i64> zeroinitializer, %a0 629 %4 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %3 630 %5 = bitcast i8 %mask to <8 x i1> 631 %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> %passthru 632 ret <8 x i64> %6 633} 634 635define <8 x i64> @stack_fold_pabsq_maskz(<8 x i64> %a0, i8 %mask) { 636; CHECK-LABEL: stack_fold_pabsq_maskz: 637; CHECK: # %bb.0: 638; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 639; CHECK-NEXT: #APP 640; CHECK-NEXT: nop 641; CHECK-NEXT: #NO_APP 642; CHECK-NEXT: kmovd %edi, %k1 643; CHECK-NEXT: vpabsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload 644; CHECK-NEXT: retq 645 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 646 %2 = icmp sgt <8 x i64> %a0, zeroinitializer 647 %3 = sub <8 x i64> zeroinitializer, %a0 648 %4 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %3 649 %5 = bitcast i8 %mask to <8 x i1> 650 %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> zeroinitializer 651 ret <8 x i64> %6 652} 653 654define <32 x i16> @stack_fold_pabsw(<32 x i16> %a0) { 655; CHECK-LABEL: stack_fold_pabsw: 656; CHECK: # %bb.0: 657; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 658; CHECK-NEXT: #APP 659; CHECK-NEXT: nop 660; CHECK-NEXT: #NO_APP 661; CHECK-NEXT: vpabsw 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload 662; CHECK-NEXT: retq 663 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 664 %2 = icmp sgt <32 x i16> %a0, zeroinitializer 665 %3 = sub <32 x i16> zeroinitializer, %a0 666 %4 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %3 667 ret <32 x i16> %4 668} 669 670define <32 x i16> @stack_fold_pabsw_mask(<32 x i16> %passthru, <32 x i16> %a0, i32 %mask) { 671; CHECK-LABEL: stack_fold_pabsw_mask: 672; CHECK: # %bb.0: 673; CHECK-NEXT: subq $56, %rsp 674; CHECK-NEXT: .cfi_def_cfa_offset 64 675; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 676; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 677; CHECK-NEXT: #APP 678; CHECK-NEXT: nop 679; CHECK-NEXT: #NO_APP 680; CHECK-NEXT: kmovd %edi, %k1 681; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 682; CHECK-NEXT: vpabsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload 683; CHECK-NEXT: addq $56, %rsp 684; CHECK-NEXT: .cfi_def_cfa_offset 8 685; CHECK-NEXT: retq 686 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 687 %2 = icmp sgt <32 x i16> %a0, zeroinitializer 688 %3 = sub <32 x i16> zeroinitializer, %a0 689 %4 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %3 690 %5 = bitcast i32 %mask to <32 x i1> 691 %6 = select <32 x i1> %5, <32 x i16> %4, <32 x i16> %passthru 692 ret <32 x i16> %6 693} 694 695define <32 x i16> @stack_fold_pabsw_maskz(<32 x i16> %a0, i32 %mask) { 696; CHECK-LABEL: stack_fold_pabsw_maskz: 697; CHECK: # %bb.0: 698; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 699; CHECK-NEXT: #APP 700; CHECK-NEXT: nop 701; CHECK-NEXT: #NO_APP 702; CHECK-NEXT: kmovd %edi, %k1 703; CHECK-NEXT: vpabsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload 704; CHECK-NEXT: retq 705 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 706 %2 = icmp sgt <32 x i16> %a0, zeroinitializer 707 %3 = sub <32 x i16> zeroinitializer, %a0 708 %4 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %3 709 %5 = bitcast i32 %mask to <32 x i1> 710 %6 = select <32 x i1> %5, <32 x i16> %4, <32 x i16> zeroinitializer 711 ret <32 x i16> %6 712} 713 714define <32 x i16> @stack_fold_packssdw(<16 x i32> %a0, <16 x i32> %a1) { 715; CHECK-LABEL: stack_fold_packssdw: 716; CHECK: # %bb.0: 717; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 718; CHECK-NEXT: #APP 719; CHECK-NEXT: nop 720; CHECK-NEXT: #NO_APP 721; CHECK-NEXT: vpackssdw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 722; CHECK-NEXT: retq 723 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 724 %2 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a0, <16 x i32> %a1) 725 ret <32 x i16> %2 726} 727declare <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32>, <16 x i32>) nounwind readnone 728 729define <64 x i8> @stack_fold_packsswb(<32 x i16> %a0, <32 x i16> %a1) { 730; CHECK-LABEL: stack_fold_packsswb: 731; CHECK: # %bb.0: 732; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 733; CHECK-NEXT: #APP 734; CHECK-NEXT: nop 735; CHECK-NEXT: #NO_APP 736; CHECK-NEXT: vpacksswb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 737; CHECK-NEXT: retq 738 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 739 %2 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a0, <32 x i16> %a1) 740 ret <64 x i8> %2 741} 742declare <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16>, <32 x i16>) nounwind readnone 743 744define <32 x i16> @stack_fold_packusdw(<16 x i32> %a0, <16 x i32> %a1) { 745; CHECK-LABEL: stack_fold_packusdw: 746; CHECK: # %bb.0: 747; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 748; CHECK-NEXT: #APP 749; CHECK-NEXT: nop 750; CHECK-NEXT: #NO_APP 751; CHECK-NEXT: vpackusdw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 752; CHECK-NEXT: retq 753 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 754 %2 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a0, <16 x i32> %a1) 755 ret <32 x i16> %2 756} 757declare <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32>, <16 x i32>) nounwind readnone 758 759define <32 x i16> @stack_fold_packusdw_mask(<32 x i16>* %passthru, <16 x i32> %a0, <16 x i32> %a1, i32 %mask) { 760; CHECK-LABEL: stack_fold_packusdw_mask: 761; CHECK: # %bb.0: 762; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 763; CHECK-NEXT: #APP 764; CHECK-NEXT: nop 765; CHECK-NEXT: #NO_APP 766; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 767; CHECK-NEXT: kmovd %esi, %k1 768; CHECK-NEXT: vpackusdw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 769; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 770; CHECK-NEXT: retq 771 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 772 %2 = load <32 x i16>, <32 x i16>* %passthru 773 %3 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a0, <16 x i32> %a1) 774 %4 = bitcast i32 %mask to <32 x i1> 775 %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %2 776 ret <32 x i16> %5 777} 778 779define <32 x i16> @stack_fold_packusdw_maskz(<16 x i32> 
%a0, <16 x i32> %a1, i32 %mask) { 780; CHECK-LABEL: stack_fold_packusdw_maskz: 781; CHECK: # %bb.0: 782; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 783; CHECK-NEXT: #APP 784; CHECK-NEXT: nop 785; CHECK-NEXT: #NO_APP 786; CHECK-NEXT: kmovd %edi, %k1 787; CHECK-NEXT: vpackusdw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 788; CHECK-NEXT: retq 789 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 790 %2 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a0, <16 x i32> %a1) 791 %3 = bitcast i32 %mask to <32 x i1> 792 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer 793 ret <32 x i16> %4 794} 795 796define <64 x i8> @stack_fold_packuswb(<32 x i16> %a0, <32 x i16> %a1) { 797; CHECK-LABEL: stack_fold_packuswb: 798; CHECK: # %bb.0: 799; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 800; CHECK-NEXT: #APP 801; CHECK-NEXT: nop 802; CHECK-NEXT: #NO_APP 803; CHECK-NEXT: vpackuswb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 804; CHECK-NEXT: retq 805 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 806 %2 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a0, <32 x i16> %a1) 807 ret <64 x i8> %2 808} 809declare <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16>, <32 x i16>) nounwind readnone 810 811define <64 x i8> @stack_fold_paddb(<64 x i8> %a0, <64 x i8> %a1) { 812; CHECK-LABEL: stack_fold_paddb: 813; CHECK: # %bb.0: 814; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 815; CHECK-NEXT: #APP 816; CHECK-NEXT: nop 817; CHECK-NEXT: #NO_APP 818; CHECK-NEXT: vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 819; CHECK-NEXT: retq 820 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 821 %2 = add <64 x i8> %a0, %a1 822 ret <64 x i8> %2 823} 824 825define <64 x i8> @stack_fold_paddb_commuted(<64 x i8> %a0, <64 x i8> %a1) { 826; CHECK-LABEL: stack_fold_paddb_commuted: 827; CHECK: # %bb.0: 828; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 829; CHECK-NEXT: #APP 830; CHECK-NEXT: nop 831; CHECK-NEXT: #NO_APP 832; CHECK-NEXT: vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 833; CHECK-NEXT: retq 834 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 835 %2 = add <64 x i8> %a1, %a0 836 ret <64 x i8> %2 837} 838 839define <64 x i8> @stack_fold_paddb_mask(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %a2, i64 %mask) { 840; CHECK-LABEL: 
stack_fold_paddb_mask: 841; CHECK: # %bb.0: 842; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 843; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 844; CHECK-NEXT: #APP 845; CHECK-NEXT: nop 846; CHECK-NEXT: #NO_APP 847; CHECK-NEXT: kmovq %rsi, %k1 848; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 849; CHECK-NEXT: vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 850; CHECK-NEXT: retq 851 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 852 %2 = add <64 x i8> %a0, %a1 853 %3 = bitcast i64 %mask to <64 x i1> 854 ; load needed to keep the operation from being scheduled about the asm block 855 %4 = load <64 x i8>, <64 x i8>* %a2 856 %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4 857 ret <64 x i8> %5 858} 859 860define <64 x i8> @stack_fold_paddb_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %a2, i64 %mask) { 861; CHECK-LABEL: stack_fold_paddb_mask_commuted: 862; CHECK: # %bb.0: 863; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 864; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 865; CHECK-NEXT: #APP 866; CHECK-NEXT: nop 867; CHECK-NEXT: #NO_APP 868; CHECK-NEXT: kmovq %rsi, %k1 869; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 870; CHECK-NEXT: vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 871; CHECK-NEXT: retq 872 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 873 %2 = add <64 x i8> %a1, %a0 874 %3 = bitcast i64 %mask to <64 x i1> 875 ; load needed to keep the operation from being scheduled about the asm block 876 %4 = load <64 x i8>, <64 x i8>* %a2 877 %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4 878 ret <64 x i8> %5 879} 880 881define <64 x i8> @stack_fold_paddb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { 882; CHECK-LABEL: stack_fold_paddb_maskz: 883; CHECK: # %bb.0: 884; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 885; CHECK-NEXT: #APP 886; CHECK-NEXT: nop 887; CHECK-NEXT: #NO_APP 888; CHECK-NEXT: kmovq %rdi, %k1 889; CHECK-NEXT: vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 890; CHECK-NEXT: retq 891 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 892 %2 = add <64 x i8> %a0, %a1 893 %3 = bitcast i64 %mask to <64 x i1> 894 %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer 895 ret <64 x i8> %4 896} 897 898define <64 x i8> @stack_fold_paddb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { 899; CHECK-LABEL: stack_fold_paddb_maskz_commuted: 900; CHECK: # %bb.0: 901; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 902; CHECK-NEXT: #APP 903; CHECK-NEXT: nop 904; CHECK-NEXT: #NO_APP 905; CHECK-NEXT: kmovq %rdi, %k1 906; CHECK-NEXT: vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte 
Folded Reload 907; CHECK-NEXT: retq 908 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 909 %2 = add <64 x i8> %a1, %a0 910 %3 = bitcast i64 %mask to <64 x i1> 911 %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer 912 ret <64 x i8> %4 913} 914 915define <16 x i32> @stack_fold_paddd(<16 x i32> %a0, <16 x i32> %a1) { 916; CHECK-LABEL: stack_fold_paddd: 917; CHECK: # %bb.0: 918; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 919; CHECK-NEXT: #APP 920; CHECK-NEXT: nop 921; CHECK-NEXT: #NO_APP 922; CHECK-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 923; CHECK-NEXT: retq 924 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 925 %2 = add <16 x i32> %a0, %a1 926 ret <16 x i32> %2 927} 928 929define <16 x i32> @stack_fold_paddd_commuted(<16 x i32> %a0, <16 x i32> %a1) { 930; CHECK-LABEL: stack_fold_paddd_commuted: 931; CHECK: # %bb.0: 932; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 933; CHECK-NEXT: #APP 934; CHECK-NEXT: nop 935; CHECK-NEXT: #NO_APP 936; CHECK-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 937; CHECK-NEXT: retq 938 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 939 %2 = add <16 x i32> %a1, %a0 940 ret <16 x i32> %2 941} 942 943define <16 x i32> @stack_fold_paddd_mask(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) { 944; CHECK-LABEL: stack_fold_paddd_mask: 945; CHECK: # %bb.0: 946; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 947; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 948; CHECK-NEXT: #APP 949; CHECK-NEXT: nop 950; CHECK-NEXT: #NO_APP 951; CHECK-NEXT: kmovd %esi, %k1 952; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 953; CHECK-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 954; CHECK-NEXT: retq 955 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 956 %2 = add <16 x i32> %a0, %a1 957 %3 = bitcast i16 %mask to <16 x i1> 958 ; load needed to keep the operation from being scheduled about the asm block 959 %4 = load <16 x i32>, <16 x i32>* %a2 960 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4 961 ret <16 x i32> %5 962} 963 964define <16 x i32> @stack_fold_paddd_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) { 965; CHECK-LABEL: stack_fold_paddd_mask_commuted: 966; CHECK: # %bb.0: 967; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 968; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 
969; CHECK-NEXT: #APP 970; CHECK-NEXT: nop 971; CHECK-NEXT: #NO_APP 972; CHECK-NEXT: kmovd %esi, %k1 973; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 974; CHECK-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 975; CHECK-NEXT: retq 976 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 977 %2 = add <16 x i32> %a1, %a0 978 %3 = bitcast i16 %mask to <16 x i1> 979 ; load needed to keep the operation from being scheduled about the asm block 980 %4 = load <16 x i32>, <16 x i32>* %a2 981 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4 982 ret <16 x i32> %5 983} 984 985define <16 x i32> @stack_fold_paddd_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { 986; CHECK-LABEL: stack_fold_paddd_maskz: 987; CHECK: # %bb.0: 988; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 989; CHECK-NEXT: #APP 990; CHECK-NEXT: nop 991; CHECK-NEXT: #NO_APP 992; CHECK-NEXT: kmovd %edi, %k1 993; CHECK-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 994; CHECK-NEXT: retq 995 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 996 %2 = add <16 x i32> %a0, %a1 997 %3 = bitcast i16 %mask to <16 x i1> 998 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer 999 ret <16 x i32> %4 1000} 1001 1002define <16 x i32> @stack_fold_paddd_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { 1003; CHECK-LABEL: stack_fold_paddd_maskz_commuted: 1004; CHECK: # %bb.0: 1005; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1006; CHECK-NEXT: #APP 1007; CHECK-NEXT: nop 1008; CHECK-NEXT: #NO_APP 1009; CHECK-NEXT: kmovd %edi, %k1 1010; CHECK-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 1011; CHECK-NEXT: retq 1012 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1013 %2 = add <16 x i32> %a1, %a0 1014 %3 = bitcast i16 %mask to <16 x i1> 1015 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer 1016 ret <16 x i32> %4 1017} 1018 1019define <8 x i64> @stack_fold_paddq(<8 x i64> %a0, <8 x i64> %a1) { 1020; CHECK-LABEL: stack_fold_paddq: 1021; CHECK: # %bb.0: 1022; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1023; CHECK-NEXT: #APP 1024; CHECK-NEXT: nop 1025; CHECK-NEXT: #NO_APP 1026; CHECK-NEXT: vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 1027; CHECK-NEXT: retq 1028 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1029 %2 = add <8 x i64> 
%a0, %a1 1030 ret <8 x i64> %2 1031} 1032 1033define <8 x i64> @stack_fold_paddq_commuted(<8 x i64> %a0, <8 x i64> %a1) { 1034; CHECK-LABEL: stack_fold_paddq_commuted: 1035; CHECK: # %bb.0: 1036; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1037; CHECK-NEXT: #APP 1038; CHECK-NEXT: nop 1039; CHECK-NEXT: #NO_APP 1040; CHECK-NEXT: vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 1041; CHECK-NEXT: retq 1042 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1043 %2 = add <8 x i64> %a1, %a0 1044 ret <8 x i64> %2 1045} 1046 1047define <8 x i64> @stack_fold_paddq_mask(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) { 1048; CHECK-LABEL: stack_fold_paddq_mask: 1049; CHECK: # %bb.0: 1050; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1051; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 1052; CHECK-NEXT: #APP 1053; CHECK-NEXT: nop 1054; CHECK-NEXT: #NO_APP 1055; CHECK-NEXT: kmovd %esi, %k1 1056; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 1057; CHECK-NEXT: vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 1058; CHECK-NEXT: retq 1059 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1060 %2 = add <8 x i64> %a0, %a1 1061 %3 = bitcast i8 %mask to <8 x i1> 1062 ; load needed to keep the operation from being scheduled about the asm block 1063 %4 = load <8 x i64>, <8 x i64>* %a2 1064 %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4 1065 ret <8 x i64> %5 1066} 1067 1068define <8 x i64> @stack_fold_paddq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) { 1069; CHECK-LABEL: stack_fold_paddq_mask_commuted: 1070; CHECK: # %bb.0: 1071; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1072; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 1073; CHECK-NEXT: #APP 1074; CHECK-NEXT: nop 1075; CHECK-NEXT: #NO_APP 1076; CHECK-NEXT: kmovd %esi, %k1 1077; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 1078; CHECK-NEXT: vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 1079; CHECK-NEXT: retq 1080 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1081 %2 = add <8 x i64> %a1, %a0 1082 %3 = bitcast i8 %mask to <8 x i1> 1083 ; load needed to keep the operation from being scheduled about the asm block 1084 %4 = load <8 x i64>, <8 x i64>* %a2 1085 %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4 1086 ret <8 x i64> %5 1087} 1088 1089define <8 x i64> @stack_fold_paddq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { 1090; CHECK-LABEL: stack_fold_paddq_maskz: 1091; CHECK: # %bb.0: 1092; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1093; CHECK-NEXT: #APP 1094; CHECK-NEXT: nop 1095; CHECK-NEXT: #NO_APP 1096; CHECK-NEXT: kmovd %edi, %k1 1097; CHECK-NEXT: vpaddq 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 1098; CHECK-NEXT: retq 1099 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1100 %2 = add <8 x i64> %a0, %a1 1101 %3 = bitcast i8 %mask to <8 x i1> 1102 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer 1103 ret <8 x i64> %4 1104} 1105 1106define <8 x i64> @stack_fold_paddq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { 1107; CHECK-LABEL: stack_fold_paddq_maskz_commuted: 1108; CHECK: # %bb.0: 1109; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1110; CHECK-NEXT: #APP 1111; CHECK-NEXT: nop 1112; CHECK-NEXT: #NO_APP 1113; CHECK-NEXT: kmovd %edi, %k1 1114; CHECK-NEXT: vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 1115; CHECK-NEXT: retq 1116 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1117 %2 = add <8 x i64> %a1, %a0 1118 %3 = bitcast i8 %mask to <8 x i1> 1119 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer 1120 ret <8 x i64> %4 1121} 1122 1123define <64 x i8> @stack_fold_paddsb(<64 x i8> %a0, <64 x i8> %a1) { 1124; CHECK-LABEL: stack_fold_paddsb: 1125; CHECK: # %bb.0: 1126; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1127; CHECK-NEXT: #APP 1128; CHECK-NEXT: nop 1129; CHECK-NEXT: #NO_APP 1130; CHECK-NEXT: vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 1131; CHECK-NEXT: retq 1132 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1133 %2 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1) 1134 ret <64 x i8> %2 1135} 1136 1137define <64 x i8> @stack_fold_paddsb_commuted(<64 x i8> %a0, <64 x i8> %a1) { 1138; CHECK-LABEL: stack_fold_paddsb_commuted: 1139; CHECK: # %bb.0: 1140; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1141; CHECK-NEXT: #APP 1142; CHECK-NEXT: nop 1143; CHECK-NEXT: #NO_APP 1144; CHECK-NEXT: vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 1145; CHECK-NEXT: retq 1146 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1147 %2 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> %a1, <64 x i8> %a0) 1148 ret <64 x i8> %2 1149} 1150 1151define <64 x i8> @stack_fold_paddsb_mask(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %a2, i64 %mask) { 1152; CHECK-LABEL: stack_fold_paddsb_mask: 1153; CHECK: # %bb.0: 1154; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1155; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 1156; CHECK-NEXT: 
#APP 1157; CHECK-NEXT: nop 1158; CHECK-NEXT: #NO_APP 1159; CHECK-NEXT: kmovq %rsi, %k1 1160; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 1161; CHECK-NEXT: vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 1162; CHECK-NEXT: retq 1163 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1164 %2 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1) 1165 %3 = bitcast i64 %mask to <64 x i1> 1166 ; load needed to keep the operation from being scheduled about the asm block 1167 %4 = load <64 x i8>, <64 x i8>* %a2 1168 %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4 1169 ret <64 x i8> %5 1170} 1171 1172define <64 x i8> @stack_fold_paddsb_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %a2, i64 %mask) { 1173; CHECK-LABEL: stack_fold_paddsb_mask_commuted: 1174; CHECK: # %bb.0: 1175; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1176; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 1177; CHECK-NEXT: #APP 1178; CHECK-NEXT: nop 1179; CHECK-NEXT: #NO_APP 1180; CHECK-NEXT: kmovq %rsi, %k1 1181; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 1182; CHECK-NEXT: vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 1183; CHECK-NEXT: retq 1184 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1185 %2 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> %a1, <64 x i8> %a0) 1186 %3 = bitcast i64 %mask to <64 x i1> 1187 ; load needed to keep the operation from being scheduled about the asm block 1188 %4 = load <64 x i8>, <64 x i8>* %a2 1189 %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4 1190 ret <64 x i8> %5 1191} 1192 1193define <64 x i8> @stack_fold_paddsb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { 1194; CHECK-LABEL: stack_fold_paddsb_maskz: 1195; CHECK: # %bb.0: 1196; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1197; CHECK-NEXT: #APP 1198; CHECK-NEXT: nop 1199; CHECK-NEXT: #NO_APP 1200; CHECK-NEXT: kmovq %rdi, %k1 1201; CHECK-NEXT: vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 1202; CHECK-NEXT: retq 1203 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1204 %2 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1) 1205 %3 = bitcast i64 %mask to <64 x i1> 1206 %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer 1207 ret <64 x i8> %4 1208} 1209 1210define <64 x i8> @stack_fold_paddsb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { 1211; CHECK-LABEL: stack_fold_paddsb_maskz_commuted: 1212; CHECK: # %bb.0: 1213; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1214; CHECK-NEXT: #APP 1215; CHECK-NEXT: nop 1216; CHECK-NEXT: #NO_APP 1217; CHECK-NEXT: kmovq %rdi, %k1 1218; CHECK-NEXT: vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, 
%zmm0 {%k1} {z} # 64-byte Folded Reload 1219; CHECK-NEXT: retq 1220 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1221 %2 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> %a1, <64 x i8> %a0) 1222 %3 = bitcast i64 %mask to <64 x i1> 1223 %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer 1224 ret <64 x i8> %4 1225} 1226 1227define <32 x i16> @stack_fold_paddsw(<32 x i16> %a0, <32 x i16> %a1) { 1228; CHECK-LABEL: stack_fold_paddsw: 1229; CHECK: # %bb.0: 1230; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1231; CHECK-NEXT: #APP 1232; CHECK-NEXT: nop 1233; CHECK-NEXT: #NO_APP 1234; CHECK-NEXT: vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 1235; CHECK-NEXT: retq 1236 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1237 %2 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1) 1238 ret <32 x i16> %2 1239} 1240 1241define <32 x i16> @stack_fold_paddsw_commuted(<32 x i16> %a0, <32 x i16> %a1) { 1242; CHECK-LABEL: stack_fold_paddsw_commuted: 1243; CHECK: # %bb.0: 1244; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1245; CHECK-NEXT: #APP 1246; CHECK-NEXT: nop 1247; CHECK-NEXT: #NO_APP 1248; CHECK-NEXT: vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 1249; CHECK-NEXT: retq 1250 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1251 %2 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a1, <32 x i16> %a0) 1252 ret <32 x i16> %2 1253} 1254 1255define <32 x i16> @stack_fold_paddsw_mask(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) { 1256; CHECK-LABEL: stack_fold_paddsw_mask: 1257; CHECK: # %bb.0: 1258; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1259; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 1260; CHECK-NEXT: #APP 1261; CHECK-NEXT: nop 1262; CHECK-NEXT: #NO_APP 1263; CHECK-NEXT: kmovd %esi, %k1 1264; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 1265; CHECK-NEXT: vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 1266; CHECK-NEXT: retq 1267 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1268 %2 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1) 1269 %3 = bitcast i32 %mask to <32 x i1> 1270 ; load needed to keep the operation from being scheduled about the asm block 1271 %4 = load <32 x i16>, <32 x i16>* %a2 1272 %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4 1273 ret <32 x i16> %5 1274} 1275 1276define <32 x i16> 
@stack_fold_paddsw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) { 1277; CHECK-LABEL: stack_fold_paddsw_mask_commuted: 1278; CHECK: # %bb.0: 1279; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1280; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 1281; CHECK-NEXT: #APP 1282; CHECK-NEXT: nop 1283; CHECK-NEXT: #NO_APP 1284; CHECK-NEXT: kmovd %esi, %k1 1285; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 1286; CHECK-NEXT: vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 1287; CHECK-NEXT: retq 1288 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1289 %2 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a1, <32 x i16> %a0) 1290 %3 = bitcast i32 %mask to <32 x i1> 1291 ; load needed to keep the operation from being scheduled about the asm block 1292 %4 = load <32 x i16>, <32 x i16>* %a2 1293 %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4 1294 ret <32 x i16> %5 1295} 1296 1297define <32 x i16> @stack_fold_paddsw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { 1298; CHECK-LABEL: stack_fold_paddsw_maskz: 1299; CHECK: # %bb.0: 1300; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1301; CHECK-NEXT: #APP 1302; CHECK-NEXT: nop 1303; CHECK-NEXT: #NO_APP 1304; CHECK-NEXT: kmovd %edi, %k1 1305; CHECK-NEXT: vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 1306; CHECK-NEXT: retq 1307 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1308 %2 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1) 1309 %3 = bitcast i32 %mask to <32 x i1> 1310 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer 1311 ret <32 x i16> %4 1312} 1313 1314define <32 x i16> @stack_fold_paddsw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { 1315; CHECK-LABEL: stack_fold_paddsw_maskz_commuted: 1316; CHECK: # %bb.0: 1317; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1318; CHECK-NEXT: #APP 1319; CHECK-NEXT: nop 1320; CHECK-NEXT: #NO_APP 1321; CHECK-NEXT: kmovd %edi, %k1 1322; CHECK-NEXT: vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 1323; CHECK-NEXT: retq 1324 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1325 %2 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a1, <32 x i16> %a0) 1326 %3 = bitcast i32 %mask to <32 x i1> 1327 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer 1328 ret <32 x i16> %4 1329} 1330 1331define <64 x i8> @stack_fold_paddusb(<64 x i8> %a0, <64 x i8> %a1) { 1332; CHECK-LABEL: stack_fold_paddusb: 1333; CHECK: # %bb.0: 1334; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1335; CHECK-NEXT: #APP 1336; CHECK-NEXT: nop 1337; CHECK-NEXT: #NO_APP 
1338; CHECK-NEXT: vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 1339; CHECK-NEXT: retq 1340 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1341 %2 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1) 1342 ret <64 x i8> %2 1343} 1344 1345define <64 x i8> @stack_fold_paddusb_commuted(<64 x i8> %a0, <64 x i8> %a1) { 1346; CHECK-LABEL: stack_fold_paddusb_commuted: 1347; CHECK: # %bb.0: 1348; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1349; CHECK-NEXT: #APP 1350; CHECK-NEXT: nop 1351; CHECK-NEXT: #NO_APP 1352; CHECK-NEXT: vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 1353; CHECK-NEXT: retq 1354 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1355 %2 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %a1, <64 x i8> %a0) 1356 ret <64 x i8> %2 1357} 1358 1359define <64 x i8> @stack_fold_paddusb_mask(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %a2, i64 %mask) { 1360; CHECK-LABEL: stack_fold_paddusb_mask: 1361; CHECK: # %bb.0: 1362; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1363; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 1364; CHECK-NEXT: #APP 1365; CHECK-NEXT: nop 1366; CHECK-NEXT: #NO_APP 1367; CHECK-NEXT: kmovq %rsi, %k1 1368; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 1369; CHECK-NEXT: vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 1370; CHECK-NEXT: retq 1371 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1372 %2 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1) 1373 %3 = bitcast i64 %mask to <64 x i1> 1374 ; load needed to keep the operation from being scheduled about the asm block 1375 %4 = load <64 x i8>, <64 x i8>* %a2 1376 %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4 1377 ret <64 x i8> %5 1378} 1379 1380define <64 x i8> @stack_fold_paddusb_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %a2, i64 %mask) { 1381; CHECK-LABEL: stack_fold_paddusb_mask_commuted: 1382; CHECK: # %bb.0: 1383; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1384; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 1385; CHECK-NEXT: #APP 1386; CHECK-NEXT: nop 1387; CHECK-NEXT: #NO_APP 1388; CHECK-NEXT: kmovq %rsi, %k1 1389; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 1390; CHECK-NEXT: vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 1391; CHECK-NEXT: retq 1392 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1393 %2 = call <64 x i8> 
@llvm.uadd.sat.v64i8(<64 x i8> %a1, <64 x i8> %a0) 1394 %3 = bitcast i64 %mask to <64 x i1> 1395 ; load needed to keep the operation from being scheduled about the asm block 1396 %4 = load <64 x i8>, <64 x i8>* %a2 1397 %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4 1398 ret <64 x i8> %5 1399} 1400 1401define <64 x i8> @stack_fold_paddusb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { 1402; CHECK-LABEL: stack_fold_paddusb_maskz: 1403; CHECK: # %bb.0: 1404; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1405; CHECK-NEXT: #APP 1406; CHECK-NEXT: nop 1407; CHECK-NEXT: #NO_APP 1408; CHECK-NEXT: kmovq %rdi, %k1 1409; CHECK-NEXT: vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 1410; CHECK-NEXT: retq 1411 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1412 %2 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1) 1413 %3 = bitcast i64 %mask to <64 x i1> 1414 %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer 1415 ret <64 x i8> %4 1416} 1417 1418define <64 x i8> @stack_fold_paddusb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { 1419; CHECK-LABEL: stack_fold_paddusb_maskz_commuted: 1420; CHECK: # %bb.0: 1421; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1422; CHECK-NEXT: #APP 1423; CHECK-NEXT: nop 1424; CHECK-NEXT: #NO_APP 1425; CHECK-NEXT: kmovq %rdi, %k1 1426; CHECK-NEXT: vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 1427; CHECK-NEXT: retq 1428 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1429 %2 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %a1, <64 x i8> %a0) 1430 %3 = bitcast i64 %mask to <64 x i1> 1431 %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer 1432 ret <64 x i8> %4 1433} 1434 1435define <32 x i16> @stack_fold_paddusw(<32 x i16> %a0, <32 x i16> %a1) { 1436; CHECK-LABEL: stack_fold_paddusw: 1437; CHECK: # %bb.0: 1438; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1439; CHECK-NEXT: #APP 1440; CHECK-NEXT: nop 1441; CHECK-NEXT: #NO_APP 1442; CHECK-NEXT: vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 1443; CHECK-NEXT: retq 1444 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1445 %2 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1) 1446 ret <32 x i16> %2 1447} 1448 1449define <32 x i16> @stack_fold_paddusw_commuted(<32 x i16> %a0, <32 x i16> %a1) { 1450; CHECK-LABEL: stack_fold_paddusw_commuted: 1451; CHECK: # %bb.0: 1452; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1453; CHECK-NEXT: #APP 1454; CHECK-NEXT: nop 1455; CHECK-NEXT: #NO_APP 1456; CHECK-NEXT: vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 
64-byte Folded Reload 1457; CHECK-NEXT: retq 1458 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1459 %2 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %a1, <32 x i16> %a0) 1460 ret <32 x i16> %2 1461} 1462 1463define <32 x i16> @stack_fold_paddusw_mask(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) { 1464; CHECK-LABEL: stack_fold_paddusw_mask: 1465; CHECK: # %bb.0: 1466; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1467; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 1468; CHECK-NEXT: #APP 1469; CHECK-NEXT: nop 1470; CHECK-NEXT: #NO_APP 1471; CHECK-NEXT: kmovd %esi, %k1 1472; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 1473; CHECK-NEXT: vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 1474; CHECK-NEXT: retq 1475 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1476 %2 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1) 1477 %3 = bitcast i32 %mask to <32 x i1> 1478 ; load needed to keep the operation from being scheduled about the asm block 1479 %4 = load <32 x i16>, <32 x i16>* %a2 1480 %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4 1481 ret <32 x i16> %5 1482} 1483 1484define <32 x i16> @stack_fold_paddusw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) { 1485; CHECK-LABEL: stack_fold_paddusw_mask_commuted: 1486; CHECK: # %bb.0: 1487; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1488; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 1489; CHECK-NEXT: #APP 1490; CHECK-NEXT: nop 1491; CHECK-NEXT: #NO_APP 1492; CHECK-NEXT: kmovd %esi, %k1 1493; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 1494; CHECK-NEXT: vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 1495; CHECK-NEXT: retq 1496 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1497 %2 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %a1, <32 x i16> %a0) 1498 %3 = bitcast i32 %mask to <32 x i1> 1499 ; load needed to keep the operation from being scheduled about the asm block 1500 %4 = load <32 x i16>, <32 x i16>* %a2 1501 %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4 1502 ret <32 x i16> %5 1503} 1504 1505define <32 x i16> @stack_fold_paddusw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { 1506; CHECK-LABEL: stack_fold_paddusw_maskz: 1507; CHECK: # %bb.0: 1508; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1509; CHECK-NEXT: #APP 1510; CHECK-NEXT: nop 1511; CHECK-NEXT: #NO_APP 1512; CHECK-NEXT: kmovd %edi, %k1 1513; CHECK-NEXT: vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 1514; CHECK-NEXT: retq 1515 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1516 %2 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1) 1517 %3 = bitcast i32 %mask to <32 x i1> 1518 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer 1519 ret <32 x i16> %4 1520} 1521 1522define <32 x i16> @stack_fold_paddusw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { 1523; CHECK-LABEL: stack_fold_paddusw_maskz_commuted: 1524; CHECK: # %bb.0: 1525; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1526; CHECK-NEXT: #APP 1527; CHECK-NEXT: nop 1528; CHECK-NEXT: #NO_APP 1529; CHECK-NEXT: kmovd %edi, %k1 1530; CHECK-NEXT: vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 1531; CHECK-NEXT: retq 1532 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1533 %2 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %a1, <32 x i16> %a0) 1534 %3 = bitcast i32 %mask to <32 x i1> 1535 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer 1536 ret <32 x i16> %4 1537} 1538 1539define <32 x i16> @stack_fold_paddw(<32 x i16> %a0, <32 x i16> %a1) { 1540; CHECK-LABEL: stack_fold_paddw: 1541; CHECK: # %bb.0: 1542; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1543; CHECK-NEXT: #APP 1544; CHECK-NEXT: nop 1545; CHECK-NEXT: #NO_APP 1546; CHECK-NEXT: vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 1547; CHECK-NEXT: retq 1548 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1549 %2 = add <32 x i16> %a0, %a1 1550 ret <32 x i16> %2 1551} 1552 1553define <32 x i16> @stack_fold_paddw_commuted(<32 x i16> %a0, <32 x i16> %a1) { 1554; CHECK-LABEL: stack_fold_paddw_commuted: 1555; CHECK: # %bb.0: 1556; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1557; CHECK-NEXT: #APP 1558; CHECK-NEXT: nop 1559; CHECK-NEXT: #NO_APP 1560; CHECK-NEXT: vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 1561; CHECK-NEXT: retq 1562 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1563 %2 = add <32 x i16> %a1, %a0 1564 ret <32 x i16> %2 1565} 1566 1567define <32 x i16> @stack_fold_paddw_mask(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) { 1568; CHECK-LABEL: stack_fold_paddw_mask: 1569; CHECK: # %bb.0: 1570; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1571; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 1572; CHECK-NEXT: #APP 1573; CHECK-NEXT: nop 1574; CHECK-NEXT: #NO_APP 1575; CHECK-NEXT: kmovd %esi, %k1 1576; CHECK-NEXT: vmovdqa64 
(%rdi), %zmm0 1577; CHECK-NEXT: vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 1578; CHECK-NEXT: retq 1579 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1580 %2 = add <32 x i16> %a0, %a1 1581 %3 = bitcast i32 %mask to <32 x i1> 1582 ; load needed to keep the operation from being scheduled about the asm block 1583 %4 = load <32 x i16>, <32 x i16>* %a2 1584 %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4 1585 ret <32 x i16> %5 1586} 1587 1588define <32 x i16> @stack_fold_paddw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) { 1589; CHECK-LABEL: stack_fold_paddw_mask_commuted: 1590; CHECK: # %bb.0: 1591; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1592; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 1593; CHECK-NEXT: #APP 1594; CHECK-NEXT: nop 1595; CHECK-NEXT: #NO_APP 1596; CHECK-NEXT: kmovd %esi, %k1 1597; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 1598; CHECK-NEXT: vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 1599; CHECK-NEXT: retq 1600 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1601 %2 = add <32 x i16> %a1, %a0 1602 %3 = bitcast i32 %mask to <32 x i1> 1603 ; load needed to keep the operation from being scheduled about the asm block 1604 %4 = load <32 x i16>, <32 x i16>* %a2 1605 %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4 1606 ret <32 x i16> %5 1607} 1608 1609define <32 x i16> @stack_fold_paddw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { 1610; CHECK-LABEL: stack_fold_paddw_maskz: 1611; CHECK: # %bb.0: 1612; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1613; CHECK-NEXT: #APP 1614; CHECK-NEXT: nop 1615; CHECK-NEXT: #NO_APP 1616; CHECK-NEXT: kmovd %edi, %k1 1617; CHECK-NEXT: vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 1618; CHECK-NEXT: retq 1619 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1620 %2 = add <32 x i16> %a0, %a1 1621 %3 = bitcast i32 %mask to <32 x i1> 1622 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer 1623 ret <32 x i16> %4 1624} 1625 1626define <32 x i16> @stack_fold_paddw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { 1627; CHECK-LABEL: stack_fold_paddw_maskz_commuted: 1628; CHECK: # %bb.0: 1629; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1630; CHECK-NEXT: #APP 1631; CHECK-NEXT: nop 1632; CHECK-NEXT: #NO_APP 1633; CHECK-NEXT: kmovd %edi, %k1 1634; CHECK-NEXT: vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 1635; CHECK-NEXT: retq 1636 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1637 %2 = add <32 x i16> %a1, %a0 1638 %3 = bitcast i32 %mask to <32 x i1> 1639 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer 1640 ret <32 x i16> %4 1641} 1642 1643define <64 x i8> @stack_fold_palignr(<64 x i8> %a0, <64 x i8> %a1) { 1644; CHECK-LABEL: stack_fold_palignr: 1645; CHECK: # %bb.0: 1646; CHECK-NEXT: subq $56, %rsp 1647; CHECK-NEXT: .cfi_def_cfa_offset 64 1648; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1649; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1650; CHECK-NEXT: #APP 1651; CHECK-NEXT: nop 1652; CHECK-NEXT: #NO_APP 1653; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 1654; CHECK-NEXT: vpalignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 1655; CHECK-NEXT: # zmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0],mem[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16],mem[33,34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32],mem[49,50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48] 1656; CHECK-NEXT: addq $56, %rsp 1657; CHECK-NEXT: .cfi_def_cfa_offset 8 1658; CHECK-NEXT: retq 1659 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1660 %2 = shufflevector <64 x i8> %a1, <64 x i8> %a0, <64 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112> 1661 ret <64 x i8> %2 1662} 1663 1664define <64 x i8> @stack_fold_palignr_mask(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %passthru, i64 %mask) { 1665; CHECK-LABEL: stack_fold_palignr_mask: 1666; CHECK: # %bb.0: 1667; CHECK-NEXT: subq $56, %rsp 1668; CHECK-NEXT: .cfi_def_cfa_offset 64 1669; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1670; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1671; CHECK-NEXT: #APP 1672; CHECK-NEXT: nop 1673; CHECK-NEXT: #NO_APP 1674; CHECK-NEXT: kmovq %rsi, %k1 1675; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1 1676; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 1677; CHECK-NEXT: vpalignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload 1678; CHECK-NEXT: # zmm1 {%k1} = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0],mem[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16],mem[33,34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32],mem[49,50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48] 1679; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 1680; CHECK-NEXT: addq $56, %rsp 1681; CHECK-NEXT: .cfi_def_cfa_offset 8 1682; CHECK-NEXT: retq 1683 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1684 %2 = shufflevector <64 x i8> %a1, <64 x i8> %a0, <64 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112> 1685 %3 = bitcast i64 %mask to <64 x i1> 1686 %4 = load <64 x i8>, <64 x i8>* %passthru 1687 %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4 1688 ret <64 x i8> %5 1689} 1690 1691define <64 x i8> @stack_fold_palignr_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { 1692; CHECK-LABEL: stack_fold_palignr_maskz: 1693; CHECK: # %bb.0: 1694; CHECK-NEXT: subq $56, %rsp 1695; CHECK-NEXT: .cfi_def_cfa_offset 64 1696; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1697; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1698; CHECK-NEXT: #APP 1699; CHECK-NEXT: nop 1700; CHECK-NEXT: #NO_APP 1701; CHECK-NEXT: kmovq %rdi, %k1 1702; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 1703; CHECK-NEXT: vpalignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 1704; CHECK-NEXT: # zmm0 {%k1} {z} = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0],mem[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16],mem[33,34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32],mem[49,50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48] 1705; CHECK-NEXT: addq $56, %rsp 1706; CHECK-NEXT: .cfi_def_cfa_offset 8 1707; CHECK-NEXT: retq 1708 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1709 %2 = shufflevector <64 x i8> %a1, <64 x i8> %a0, <64 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112> 1710 %3 = bitcast i64 %mask to <64 x i1> 1711 %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer 1712 ret <64 x i8> %4 1713} 1714 1715define <16 x i32> @stack_fold_pandd(<16 x i32> %a0, <16 x i32> %a1) { 1716; CHECK-LABEL: stack_fold_pandd: 1717; CHECK: # %bb.0: 1718; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1719; CHECK-NEXT: #APP 1720; CHECK-NEXT: nop 1721; CHECK-NEXT: #NO_APP 1722; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 1723; CHECK-NEXT: retq 1724 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1725 %2 = and <16 x i32> %a0, %a1 1726 ret <16 x i32> %2 1727} 1728 1729define <16 x i32> @stack_fold_pandd_commuted(<16 x i32> %a0, <16 x i32> %a1) { 1730; CHECK-LABEL: stack_fold_pandd_commuted: 1731; CHECK: # %bb.0: 1732; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1733; CHECK-NEXT: #APP 1734; CHECK-NEXT: nop 1735; CHECK-NEXT: #NO_APP 1736; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 1737; CHECK-NEXT: retq 1738 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1739 %2 = and <16 x i32> %a1, %a0 1740 ret <16 x i32> %2 1741} 1742 1743define <16 x i32> @stack_fold_pandd_mask(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) { 1744; CHECK-LABEL: stack_fold_pandd_mask: 1745; CHECK: # %bb.0: 1746; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1747; CHECK-NEXT: vmovaps %zmm0, %zmm1 1748; CHECK-NEXT: #APP 1749; CHECK-NEXT: nop 1750; CHECK-NEXT: #NO_APP 1751; CHECK-NEXT: kmovd %esi, %k1 1752; CHECK-NEXT: vmovaps (%rdi), %zmm0 1753; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 1754; CHECK-NEXT: retq 1755 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1756 %2 = and <16 x i32> %a0, %a1 1757 %3 = bitcast i16 %mask to <16 x i1> 1758 ; load needed to keep the operation from being scheduled about the asm block 1759 %4 = load <16 x i32>, <16 x i32>* %a2 1760 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4 1761 ret <16 x i32> %5 1762} 1763 1764define <16 x i32> @stack_fold_pandd_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) { 1765; CHECK-LABEL: stack_fold_pandd_mask_commuted: 1766; CHECK: # %bb.0: 1767; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1768; CHECK-NEXT: vmovaps %zmm0, %zmm1 1769; CHECK-NEXT: #APP 1770; CHECK-NEXT: nop 1771; CHECK-NEXT: #NO_APP 1772; CHECK-NEXT: kmovd %esi, %k1 1773; CHECK-NEXT: vmovaps (%rdi), %zmm0 1774; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 1775; CHECK-NEXT: retq 1776 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1777 %2 = and <16 x i32> %a1, %a0 1778 %3 = bitcast i16 %mask to <16 x i1> 1779 ; load needed to keep the operation from being scheduled about the asm block 1780 %4 = load <16 x i32>, <16 x i32>* %a2 1781 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4 1782 ret <16 x i32> %5 1783} 1784 1785define <16 x i32> 
@stack_fold_pandd_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { 1786; CHECK-LABEL: stack_fold_pandd_maskz: 1787; CHECK: # %bb.0: 1788; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1789; CHECK-NEXT: #APP 1790; CHECK-NEXT: nop 1791; CHECK-NEXT: #NO_APP 1792; CHECK-NEXT: kmovd %edi, %k1 1793; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 1794; CHECK-NEXT: retq 1795 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1796 %2 = and <16 x i32> %a0, %a1 1797 %3 = bitcast i16 %mask to <16 x i1> 1798 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer 1799 ret <16 x i32> %4 1800} 1801 1802define <16 x i32> @stack_fold_pandd_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { 1803; CHECK-LABEL: stack_fold_pandd_maskz_commuted: 1804; CHECK: # %bb.0: 1805; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1806; CHECK-NEXT: #APP 1807; CHECK-NEXT: nop 1808; CHECK-NEXT: #NO_APP 1809; CHECK-NEXT: kmovd %edi, %k1 1810; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 1811; CHECK-NEXT: retq 1812 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1813 %2 = and <16 x i32> %a1, %a0 1814 %3 = bitcast i16 %mask to <16 x i1> 1815 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer 1816 ret <16 x i32> %4 1817} 1818 1819define <8 x i64> @stack_fold_pandq(<8 x i64> %a0, <8 x i64> %a1) { 1820; CHECK-LABEL: stack_fold_pandq: 1821; CHECK: # %bb.0: 1822; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1823; CHECK-NEXT: #APP 1824; CHECK-NEXT: nop 1825; CHECK-NEXT: #NO_APP 1826; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 1827; CHECK-NEXT: retq 1828 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1829 %2 = and <8 x i64> %a0, %a1 1830 ret <8 x i64> %2 1831} 1832 1833define <8 x i64> @stack_fold_pandq_commuted(<8 x i64> %a0, <8 x i64> %a1) { 1834; CHECK-LABEL: stack_fold_pandq_commuted: 1835; CHECK: # %bb.0: 1836; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1837; CHECK-NEXT: #APP 1838; CHECK-NEXT: nop 1839; CHECK-NEXT: #NO_APP 1840; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 1841; CHECK-NEXT: retq 1842 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1843 %2 = and <8 x i64> %a1, %a0 1844 ret <8 x i64> %2 1845} 1846 1847define <8 x i64> 
@stack_fold_pandq_mask(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) { 1848; CHECK-LABEL: stack_fold_pandq_mask: 1849; CHECK: # %bb.0: 1850; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1851; CHECK-NEXT: vmovapd %zmm0, %zmm1 1852; CHECK-NEXT: #APP 1853; CHECK-NEXT: nop 1854; CHECK-NEXT: #NO_APP 1855; CHECK-NEXT: kmovd %esi, %k1 1856; CHECK-NEXT: vmovapd (%rdi), %zmm0 1857; CHECK-NEXT: vandpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 1858; CHECK-NEXT: retq 1859 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1860 %2 = and <8 x i64> %a0, %a1 1861 %3 = bitcast i8 %mask to <8 x i1> 1862 ; load needed to keep the operation from being scheduled about the asm block 1863 %4 = load <8 x i64>, <8 x i64>* %a2 1864 %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4 1865 ret <8 x i64> %5 1866} 1867 1868define <8 x i64> @stack_fold_pandq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) { 1869; CHECK-LABEL: stack_fold_pandq_mask_commuted: 1870; CHECK: # %bb.0: 1871; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1872; CHECK-NEXT: vmovapd %zmm0, %zmm1 1873; CHECK-NEXT: #APP 1874; CHECK-NEXT: nop 1875; CHECK-NEXT: #NO_APP 1876; CHECK-NEXT: kmovd %esi, %k1 1877; CHECK-NEXT: vmovapd (%rdi), %zmm0 1878; CHECK-NEXT: vandpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 1879; CHECK-NEXT: retq 1880 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1881 %2 = and <8 x i64> %a1, %a0 1882 %3 = bitcast i8 %mask to <8 x i1> 1883 ; load needed to keep the operation from being scheduled about the asm block 1884 %4 = load <8 x i64>, <8 x i64>* %a2 1885 %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4 1886 ret <8 x i64> %5 1887} 1888 1889define <8 x i64> @stack_fold_pandq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { 1890; CHECK-LABEL: stack_fold_pandq_maskz: 1891; CHECK: # %bb.0: 1892; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1893; CHECK-NEXT: #APP 1894; CHECK-NEXT: nop 1895; CHECK-NEXT: #NO_APP 1896; CHECK-NEXT: kmovd %edi, %k1 1897; CHECK-NEXT: vandpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 1898; CHECK-NEXT: retq 1899 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1900 %2 = and <8 x i64> %a0, %a1 1901 %3 = bitcast i8 %mask to <8 x i1> 1902 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer 1903 ret <8 x i64> %4 1904} 1905 1906define <8 x i64> @stack_fold_pandq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { 1907; CHECK-LABEL: stack_fold_pandq_maskz_commuted: 1908; CHECK: # %bb.0: 1909; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1910; CHECK-NEXT: #APP 1911; CHECK-NEXT: nop 
1912; CHECK-NEXT: #NO_APP 1913; CHECK-NEXT: kmovd %edi, %k1 1914; CHECK-NEXT: vandpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 1915; CHECK-NEXT: retq 1916 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1917 %2 = and <8 x i64> %a1, %a0 1918 %3 = bitcast i8 %mask to <8 x i1> 1919 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer 1920 ret <8 x i64> %4 1921} 1922 1923define <16 x i32> @stack_fold_vpconflictd(<16 x i32> %a0) { 1924; CHECK-LABEL: stack_fold_vpconflictd: 1925; CHECK: # %bb.0: 1926; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1927; CHECK-NEXT: #APP 1928; CHECK-NEXT: nop 1929; CHECK-NEXT: #NO_APP 1930; CHECK-NEXT: vpconflictd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload 1931; CHECK-NEXT: retq 1932 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1933 %2 = call <16 x i32> @llvm.x86.avx512.conflict.d.512(<16 x i32> %a0) 1934 ret <16 x i32> %2 1935} 1936declare <16 x i32> @llvm.x86.avx512.mask.conflict.d.512(<16 x i32>, <16 x i32>, i16) nounwind readonly 1937 1938define <8 x i64> @stack_fold_vpconflictq(<8 x i64> %a0) { 1939; CHECK-LABEL: stack_fold_vpconflictq: 1940; CHECK: # %bb.0: 1941; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1942; CHECK-NEXT: #APP 1943; CHECK-NEXT: nop 1944; CHECK-NEXT: #NO_APP 1945; CHECK-NEXT: vpconflictq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload 1946; CHECK-NEXT: retq 1947 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1948 %2 = call <8 x i64> @llvm.x86.avx512.conflict.q.512(<8 x i64> %a0) 1949 ret <8 x i64> %2 1950} 1951declare <8 x i64> @llvm.x86.avx512.mask.conflict.q.512(<8 x i64>, <8 x i64>, i8) nounwind readnone 1952 1953define i64 @stack_fold_pcmpeqb(<64 x i8> %a0, <64 x i8> %a1) { 1954; CHECK-LABEL: stack_fold_pcmpeqb: 1955; CHECK: # %bb.0: 1956; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1957; CHECK-NEXT: #APP 1958; CHECK-NEXT: nop 1959; CHECK-NEXT: #NO_APP 1960; CHECK-NEXT: vpcmpeqb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload 1961; CHECK-NEXT: kmovq %k0, %rax 1962; CHECK-NEXT: vzeroupper 1963; CHECK-NEXT: retq 1964 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1965 %2 = icmp eq <64 x i8> %a0, %a1 1966 %3 = bitcast <64 x i1> %2 to i64 1967 ret i64 %3 1968} 1969 1970define i16 @stack_fold_pcmpeqd(<16 x i32> %a0, <16 x i32> %a1) { 1971; CHECK-LABEL: stack_fold_pcmpeqd: 1972; CHECK: # 
%bb.0: 1973; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1974; CHECK-NEXT: #APP 1975; CHECK-NEXT: nop 1976; CHECK-NEXT: #NO_APP 1977; CHECK-NEXT: vpcmpeqd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload 1978; CHECK-NEXT: kmovd %k0, %eax 1979; CHECK-NEXT: # kill: def $ax killed $ax killed $eax 1980; CHECK-NEXT: vzeroupper 1981; CHECK-NEXT: retq 1982 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1983 %2 = icmp eq <16 x i32> %a0, %a1 1984 %3 = bitcast <16 x i1> %2 to i16 1985 ret i16 %3 1986} 1987 1988define i8 @stack_fold_pcmpeqq(<8 x i64> %a0, <8 x i64> %a1) { 1989; CHECK-LABEL: stack_fold_pcmpeqq: 1990; CHECK: # %bb.0: 1991; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1992; CHECK-NEXT: #APP 1993; CHECK-NEXT: nop 1994; CHECK-NEXT: #NO_APP 1995; CHECK-NEXT: vpcmpeqq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload 1996; CHECK-NEXT: kmovd %k0, %eax 1997; CHECK-NEXT: # kill: def $al killed $al killed $eax 1998; CHECK-NEXT: vzeroupper 1999; CHECK-NEXT: retq 2000 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2001 %2 = icmp eq <8 x i64> %a0, %a1 2002 %3 = bitcast <8 x i1> %2 to i8 2003 ret i8 %3 2004} 2005 2006define i32 @stack_fold_pcmpeqw(<32 x i16> %a0, <32 x i16> %a1) { 2007; CHECK-LABEL: stack_fold_pcmpeqw: 2008; CHECK: # %bb.0: 2009; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2010; CHECK-NEXT: #APP 2011; CHECK-NEXT: nop 2012; CHECK-NEXT: #NO_APP 2013; CHECK-NEXT: vpcmpeqw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload 2014; CHECK-NEXT: kmovd %k0, %eax 2015; CHECK-NEXT: vzeroupper 2016; CHECK-NEXT: retq 2017 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2018 %2 = icmp eq <32 x i16> %a0, %a1 2019 %3 = bitcast <32 x i1> %2 to i32 2020 ret i32 %3 2021} 2022 2023define <16 x i32> @stack_fold_pcmpeqd_mask(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask, <16 x i32> %b0, <16 x i32> %b1) { 2024; CHECK-LABEL: stack_fold_pcmpeqd_mask: 2025; CHECK: # %bb.0: 2026; CHECK-NEXT: subq $184, %rsp 2027; CHECK-NEXT: .cfi_def_cfa_offset 192 2028; CHECK-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2029; CHECK-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill 2030; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2031; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2032; CHECK-NEXT: #APP 2033; CHECK-NEXT: nop 2034; CHECK-NEXT: #NO_APP 2035; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 2036; CHECK-NEXT: vpaddd (%rdi), %zmm0, %zmm0 2037; CHECK-NEXT: kmovd %esi, %k1 2038; CHECK-NEXT: vpcmpeqd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k1 {%k1} # 64-byte Folded Reload 2039; CHECK-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 2040; CHECK-NEXT: vpblendmd (%rsp), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload 2041; CHECK-NEXT: addq $184, %rsp 2042; CHECK-NEXT: .cfi_def_cfa_offset 8 2043; CHECK-NEXT: retq 2044 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2045 ; load and add are here to keep the operations below the side effecting block and to avoid folding the wrong load 2046 %2 = load <16 x i32>, <16 x i32>* %a2 2047 %3 = add <16 x i32> %a1, %2 2048 %4 = bitcast i16 %mask to <16 x i1> 2049 %5 = icmp eq <16 x i32> %3, %a0 2050 %6 = and <16 x i1> %4, %5 2051 %7 = select <16 x i1> %6, <16 x i32> %b0, <16 x i32> %b1 2052 ret <16 x i32> %7 2053} 2054 2055define <16 x i32> @stack_fold_pcmpeqd_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask, <16 x i32> %b0, <16 x i32> %b1) { 2056; CHECK-LABEL: stack_fold_pcmpeqd_mask_commuted: 2057; CHECK: # %bb.0: 2058; CHECK-NEXT: subq $184, %rsp 2059; CHECK-NEXT: .cfi_def_cfa_offset 192 2060; CHECK-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2061; CHECK-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill 2062; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2063; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2064; CHECK-NEXT: #APP 2065; CHECK-NEXT: nop 2066; CHECK-NEXT: #NO_APP 2067; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 2068; CHECK-NEXT: vpaddd (%rdi), %zmm0, %zmm0 2069; CHECK-NEXT: kmovd %esi, %k1 2070; CHECK-NEXT: vpcmpeqd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k1 {%k1} # 64-byte Folded Reload 2071; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 2072; CHECK-NEXT: vpblendmd (%rsp), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload 2073; CHECK-NEXT: addq $184, %rsp 2074; CHECK-NEXT: .cfi_def_cfa_offset 8 2075; CHECK-NEXT: retq 2076 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2077 ; load and add are here to keep the operations below the side effecting block and to avoid folding the wrong load 2078 %2 = load <16 x i32>, <16 x i32>* %a2 2079 %3 = add <16 x i32> %a1, %2 2080 %4 = bitcast i16 %mask to <16 x i1> 2081 %5 = icmp eq <16 x i32> %a0, %3 2082 %6 = and <16 x i1> %4, %5 2083 %7 = select <16 x i1> %6, <16 x i32> %b0, <16 x i32> %b1 2084 ret <16 x i32> %7 2085} 2086 2087define <16 x i32> @stack_fold_pcmpled_mask(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask, <16 x i32> %b0, <16 x i32> %b1) { 2088; CHECK-LABEL: stack_fold_pcmpled_mask: 2089; CHECK: # %bb.0: 2090; CHECK-NEXT: subq $184, %rsp 2091; CHECK-NEXT: .cfi_def_cfa_offset 192 2092; CHECK-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2093; CHECK-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill 2094; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2095; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2096; CHECK-NEXT: #APP 2097; CHECK-NEXT: nop 2098; CHECK-NEXT: #NO_APP 2099; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 
64-byte Reload 2100; CHECK-NEXT: vpaddd (%rdi), %zmm0, %zmm0 2101; CHECK-NEXT: kmovd %esi, %k1 2102; CHECK-NEXT: vpcmpled {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k1 {%k1} # 64-byte Folded Reload 2103; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 2104; CHECK-NEXT: vpblendmd (%rsp), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload 2105; CHECK-NEXT: addq $184, %rsp 2106; CHECK-NEXT: .cfi_def_cfa_offset 8 2107; CHECK-NEXT: retq 2108 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2109 ; load and add are here to keep the operations below the side effecting block and to avoid folding the wrong load 2110 %2 = load <16 x i32>, <16 x i32>* %a2 2111 %3 = add <16 x i32> %a1, %2 2112 %4 = bitcast i16 %mask to <16 x i1> 2113 %5 = icmp sge <16 x i32> %a0, %3 2114 %6 = and <16 x i1> %4, %5 2115 %7 = select <16 x i1> %6, <16 x i32> %b0, <16 x i32> %b1 2116 ret <16 x i32> %7 2117} 2118 2119define i16 @stack_fold_pcmpleud(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) { 2120; CHECK-LABEL: stack_fold_pcmpleud: 2121; CHECK: # %bb.0: 2122; CHECK-NEXT: subq $56, %rsp 2123; CHECK-NEXT: .cfi_def_cfa_offset 64 2124; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2125; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2126; CHECK-NEXT: #APP 2127; CHECK-NEXT: nop 2128; CHECK-NEXT: #NO_APP 2129; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 2130; CHECK-NEXT: vpaddd (%rdi), %zmm0, %zmm0 2131; CHECK-NEXT: vpcmpleud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload 2132; CHECK-NEXT: kmovd %k0, %eax 2133; CHECK-NEXT: andl %esi, %eax 2134; CHECK-NEXT: # kill: def $ax killed $ax killed $eax 2135; CHECK-NEXT: addq $56, %rsp 2136; CHECK-NEXT: .cfi_def_cfa_offset 8 2137; CHECK-NEXT: vzeroupper 2138; CHECK-NEXT: retq 2139 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2140 %2 = load <16 x i32>, <16 x i32>* %a2 2141 %3 = add <16 x i32> %a1, %2 2142 %4 = bitcast i16 %mask to <16 x i1> 2143 %5 = icmp uge <16 x i32> %a0, %3 2144 %6 = and <16 x i1> %5, %4 2145 %7 = bitcast <16 x i1> %6 to i16 2146 ret i16 %7 2147} 2148 2149define <64 x i8> @stack_fold_permbvar(<64 x i8> %a0, <64 x i8> %a1) { 2150; CHECK-LABEL: stack_fold_permbvar: 2151; CHECK: # %bb.0: 2152; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2153; CHECK-NEXT: #APP 2154; CHECK-NEXT: nop 2155; CHECK-NEXT: #NO_APP 2156; CHECK-NEXT: vpermb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 2157; CHECK-NEXT: retq 2158 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2159 %2 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a1, <64 x i8> %a0) 2160 ret <64 x i8> %2 2161} 2162declare <64 x i8> 
@llvm.x86.avx512.permvar.qi.512(<64 x i8>, <64 x i8>) nounwind readonly 2163 2164define <64 x i8> @stack_fold_permbvar_mask(<64 x i8>* %passthru, <64 x i8> %a0, <64 x i8> %a1, i64 %mask) { 2165; CHECK-LABEL: stack_fold_permbvar_mask: 2166; CHECK: # %bb.0: 2167; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2168; CHECK-NEXT: #APP 2169; CHECK-NEXT: nop 2170; CHECK-NEXT: #NO_APP 2171; CHECK-NEXT: kmovq %rsi, %k1 2172; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 2173; CHECK-NEXT: vpermb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 2174; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 2175; CHECK-NEXT: retq 2176 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2177 %2 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a1, <64 x i8> %a0) 2178 %3 = bitcast i64 %mask to <64 x i1> 2179 ; load needed to keep the operation from being scheduled above the asm block 2180 %4 = load <64 x i8>, <64 x i8>* %passthru 2181 %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4 2182 ret <64 x i8> %5 2183} 2184 2185define <64 x i8> @stack_fold_permbvar_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { 2186; CHECK-LABEL: stack_fold_permbvar_maskz: 2187; CHECK: # %bb.0: 2188; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2189; CHECK-NEXT: #APP 2190; CHECK-NEXT: nop 2191; CHECK-NEXT: #NO_APP 2192; CHECK-NEXT: kmovq %rdi, %k1 2193; CHECK-NEXT: vpermb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 2194; CHECK-NEXT: retq 2195 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2196 %2 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a1, <64 x i8> %a0) 2197 %3 = bitcast i64 %mask to <64 x i1> 2198 %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer 2199 ret <64 x i8> %4 2200} 2201 2202define <16 x i32> @stack_fold_permd(<16 x i32> %a0, <16 x i32> %a1) { 2203; CHECK-LABEL: stack_fold_permd: 2204; CHECK: # %bb.0: 2205; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2206; CHECK-NEXT: #APP 2207; CHECK-NEXT: nop 2208; CHECK-NEXT: #NO_APP 2209; CHECK-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 2210; CHECK-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 2211; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm0 2212; CHECK-NEXT: retq 2213 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2214 %2 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a1, <16 x i32> %a0) 2215 ; add forces execution domain 2216 %3 = add <16 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 2217 ret <16 x i32> %3 2218} 2219declare <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32>, <16 x i32>) nounwind readonly 2220 
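; Note (added): the vpermi2* tests that follow spill the %x2 table operand across the
; inline asm block and expect the reload to be folded as the memory operand of the
; resulting vpermi2b/d/q/w instruction, as shown by the "Folded Reload" check lines.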
2221define <64 x i8> @stack_fold_vpermi2b(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) { 2222; CHECK-LABEL: stack_fold_vpermi2b: 2223; CHECK: # %bb.0: 2224; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2225; CHECK-NEXT: #APP 2226; CHECK-NEXT: nop 2227; CHECK-NEXT: #NO_APP 2228; CHECK-NEXT: vpermi2b {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload 2229; CHECK-NEXT: retq 2230 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2231 %2 = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %x1, <64 x i8> %x0, <64 x i8> %x2) 2232 ret <64 x i8> %2 2233} 2234 2235define <16 x i32> @stack_fold_vpermi2d(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { 2236; CHECK-LABEL: stack_fold_vpermi2d: 2237; CHECK: # %bb.0: 2238; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2239; CHECK-NEXT: #APP 2240; CHECK-NEXT: nop 2241; CHECK-NEXT: #NO_APP 2242; CHECK-NEXT: vpermi2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload 2243; CHECK-NEXT: retq 2244 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2245 %2 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2) 2246 ret <16 x i32> %2 2247} 2248 2249define <8 x i64> @stack_fold_vpermi2q(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) { 2250; CHECK-LABEL: stack_fold_vpermi2q: 2251; CHECK: # %bb.0: 2252; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2253; CHECK-NEXT: #APP 2254; CHECK-NEXT: nop 2255; CHECK-NEXT: #NO_APP 2256; CHECK-NEXT: vpermi2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload 2257; CHECK-NEXT: retq 2258 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2259 %2 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x1, <8 x i64> %x0, <8 x i64> %x2) 2260 ret <8 x i64> %2 2261} 2262 2263define <32 x i16> @stack_fold_vpermi2w(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) { 2264; CHECK-LABEL: stack_fold_vpermi2w: 2265; CHECK: # %bb.0: 2266; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2267; CHECK-NEXT: #APP 2268; CHECK-NEXT: nop 2269; CHECK-NEXT: #NO_APP 2270; CHECK-NEXT: vpermi2w {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload 2271; CHECK-NEXT: retq 2272 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2273 %2 = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %x1, <32 x i16> %x0, <32 x i16> %x2) 2274 ret <32 x i16> %2 2275} 2276 2277define <8 x i64> @stack_fold_permq(<8 x i64> %a0) { 2278; 
CHECK-LABEL: stack_fold_permq: 2279; CHECK: # %bb.0: 2280; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2281; CHECK-NEXT: #APP 2282; CHECK-NEXT: nop 2283; CHECK-NEXT: #NO_APP 2284; CHECK-NEXT: vpermq $235, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload 2285; CHECK-NEXT: # zmm0 = mem[3,2,2,3,7,6,6,7] 2286; CHECK-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 2287; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm0 2288; CHECK-NEXT: retq 2289 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2290 %2 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7> 2291 ; add forces execution domain 2292 %3 = add <8 x i64> %2, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> 2293 ret <8 x i64> %3 2294} 2295 2296define <8 x i64> @stack_fold_permq_mask(<8 x i64>* %passthru, <8 x i64> %a0, i8 %mask) { 2297; CHECK-LABEL: stack_fold_permq_mask: 2298; CHECK: # %bb.0: 2299; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2300; CHECK-NEXT: #APP 2301; CHECK-NEXT: nop 2302; CHECK-NEXT: #NO_APP 2303; CHECK-NEXT: kmovd %esi, %k1 2304; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 2305; CHECK-NEXT: vpermq $235, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload 2306; CHECK-NEXT: # zmm0 {%k1} = mem[3,2,2,3,7,6,6,7] 2307; CHECK-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 2308; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm0 2309; CHECK-NEXT: retq 2310 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2311 %2 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7> 2312 %3 = bitcast i8 %mask to <8 x i1> 2313 ; load needed to keep the operation from being scheduled above the asm block 2314 %4 = load <8 x i64>, <8 x i64>* %passthru 2315 %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4 2316 ; add forces execution domain 2317 %6 = add <8 x i64> %5, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> 2318 ret <8 x i64> %6 2319} 2320 2321define <8 x i64> @stack_fold_permq_maskz(<8 x i64>* %passthru, <8 x i64> %a0, i8 %mask) { 2322; CHECK-LABEL: stack_fold_permq_maskz: 2323; CHECK: # %bb.0: 2324; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2325; CHECK-NEXT: #APP 2326; CHECK-NEXT: nop 2327; CHECK-NEXT: #NO_APP 2328; CHECK-NEXT: kmovd %esi, %k1 2329; CHECK-NEXT: vpermq $235, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload 2330; CHECK-NEXT: # zmm0 {%k1} {z} = mem[3,2,2,3,7,6,6,7] 2331; CHECK-NEXT: retq 2332 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2333 %2 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7> 2334 %3 = bitcast i8 
%mask to <8 x i1> 2335 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer 2336 ret <8 x i64> %4 2337} 2338 2339define <8 x i64> @stack_fold_permqvar(<8 x i64> %a0, <8 x i64> %a1) { 2340; CHECK-LABEL: stack_fold_permqvar: 2341; CHECK: # %bb.0: 2342; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2343; CHECK-NEXT: #APP 2344; CHECK-NEXT: nop 2345; CHECK-NEXT: #NO_APP 2346; CHECK-NEXT: vpermq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 2347; CHECK-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 2348; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm0 2349; CHECK-NEXT: retq 2350 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2351 %2 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a1, <8 x i64> %a0) 2352 ; add forces execution domain 2353 %3 = add <8 x i64> %2, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> 2354 ret <8 x i64> %3 2355} 2356declare <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64>, <8 x i64>) nounwind readonly 2357 2358define <8 x i64> @stack_fold_permqvar_mask(<8 x i64>* %passthru, <8 x i64> %a0, <8 x i64> %a1, i8 %mask) { 2359; CHECK-LABEL: stack_fold_permqvar_mask: 2360; CHECK: # %bb.0: 2361; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2362; CHECK-NEXT: #APP 2363; CHECK-NEXT: nop 2364; CHECK-NEXT: #NO_APP 2365; CHECK-NEXT: kmovd %esi, %k1 2366; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1 2367; CHECK-NEXT: vpermq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload 2368; CHECK-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 2369; CHECK-NEXT: vpsubq %zmm0, %zmm1, %zmm0 2370; CHECK-NEXT: retq 2371 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2372 %2 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a1, <8 x i64> %a0) 2373 %3 = bitcast i8 %mask to <8 x i1> 2374 ; load needed to keep the operation from being scheduled above the asm block 2375 %4 = load <8 x i64>, <8 x i64>* %passthru 2376 %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4 2377 ; add forces execution domain 2378 %6 = add <8 x i64> %5, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> 2379 ret <8 x i64> %6 2380} 2381 2382define <64 x i8> @stack_fold_vpermt2b(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) { 2383; CHECK-LABEL: stack_fold_vpermt2b: 2384; CHECK: # %bb.0: 2385; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2386; CHECK-NEXT: #APP 2387; CHECK-NEXT: nop 2388; CHECK-NEXT: #NO_APP 2389; CHECK-NEXT: vpermt2b {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload 2390; CHECK-NEXT: retq 2391 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2392 %2 = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) 2393 ret <64 x i8> %2 2394} 
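; Note (added): the stack_fold_vpermt2* tests (vpermt2b above, vpermt2d/q/w below) call the
; same vpermi2var intrinsics as the vpermi2* tests, but with the first two arguments swapped,
; so instruction selection is expected to pick the vpermt2* form while still folding the
; spilled %x2 operand from the stack.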
2395declare <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8>, <64 x i8>, <64 x i8>) 2396 2397define <16 x i32> @stack_fold_vpermt2d(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { 2398; CHECK-LABEL: stack_fold_vpermt2d: 2399; CHECK: # %bb.0: 2400; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2401; CHECK-NEXT: #APP 2402; CHECK-NEXT: nop 2403; CHECK-NEXT: #NO_APP 2404; CHECK-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload 2405; CHECK-NEXT: retq 2406 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2407 %2 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) 2408 ret <16 x i32> %2 2409} 2410declare <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>) 2411 2412define <8 x i64> @stack_fold_vpermt2q(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) { 2413; CHECK-LABEL: stack_fold_vpermt2q: 2414; CHECK: # %bb.0: 2415; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2416; CHECK-NEXT: #APP 2417; CHECK-NEXT: nop 2418; CHECK-NEXT: #NO_APP 2419; CHECK-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload 2420; CHECK-NEXT: retq 2421 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2422 %2 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) 2423 ret <8 x i64> %2 2424} 2425declare <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>) 2426 2427define <32 x i16> @stack_fold_vpermt2w(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) { 2428; CHECK-LABEL: stack_fold_vpermt2w: 2429; CHECK: # %bb.0: 2430; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2431; CHECK-NEXT: #APP 2432; CHECK-NEXT: nop 2433; CHECK-NEXT: #NO_APP 2434; CHECK-NEXT: vpermt2w {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload 2435; CHECK-NEXT: retq 2436 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2437 %2 = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) 2438 ret <32 x i16> %2 2439} 2440declare <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>) 2441 2442define <32 x i16> @stack_fold_permwvar(<32 x i16> %a0, <32 x i16> %a1) { 2443; CHECK-LABEL: stack_fold_permwvar: 2444; CHECK: # %bb.0: 2445; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2446; CHECK-NEXT: #APP 2447; CHECK-NEXT: nop 2448; CHECK-NEXT: #NO_APP 2449; CHECK-NEXT: vpermw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 2450; CHECK-NEXT: retq 2451 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2452 %2 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a1, <32 x i16> %a0) 2453 ret <32 x i16> %2 2454} 2455declare <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16>, <32 x i16>) nounwind readonly 2456 2457define <32 x i16> @stack_fold_permwvar_mask(<32 x i16>* %passthru, <32 x i16> %a0, <32 x i16> %a1, i32 %mask) { 2458; CHECK-LABEL: stack_fold_permwvar_mask: 2459; CHECK: # %bb.0: 2460; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2461; CHECK-NEXT: #APP 2462; CHECK-NEXT: nop 2463; CHECK-NEXT: #NO_APP 2464; CHECK-NEXT: kmovd %esi, %k1 2465; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 2466; CHECK-NEXT: vpermw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 2467; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 2468; CHECK-NEXT: retq 2469 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2470 %2 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a1, <32 x i16> %a0) 2471 %3 = bitcast i32 %mask to <32 x i1> 2472 ; load needed to keep the operation from being scheduled above the asm block 2473 %4 = load <32 x i16>, <32 x i16>* %passthru 2474 %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4 2475 ret <32 x i16> %5 2476} 2477 2478define <32 x i16> @stack_fold_permwvar_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { 2479; CHECK-LABEL: stack_fold_permwvar_maskz: 2480; CHECK: # %bb.0: 2481; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2482; CHECK-NEXT: #APP 2483; CHECK-NEXT: nop 2484; CHECK-NEXT: #NO_APP 2485; CHECK-NEXT: kmovd %edi, %k1 2486; CHECK-NEXT: vpermw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 2487; CHECK-NEXT: retq 2488 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2489 %2 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a1, <32 x i16> %a0) 2490 %3 = bitcast i32 %mask to <32 x i1> 2491 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer 2492 ret <32 x i16> %4 2493} 2494 2495define i32 @stack_fold_pextrd(<4 x i32> %a0, <4 x i32> %a1) { 2496; CHECK-LABEL: stack_fold_pextrd: 2497; CHECK: # %bb.0: 2498; CHECK-NEXT: pushq %rbp 2499; CHECK-NEXT: .cfi_def_cfa_offset 16 2500; CHECK-NEXT: pushq %r15 2501; CHECK-NEXT: .cfi_def_cfa_offset 24 2502; CHECK-NEXT: pushq %r14 2503; CHECK-NEXT: .cfi_def_cfa_offset 32 2504; CHECK-NEXT: pushq %r13 2505; CHECK-NEXT: .cfi_def_cfa_offset 40 2506; CHECK-NEXT: pushq %r12 2507; CHECK-NEXT: .cfi_def_cfa_offset 48 2508; CHECK-NEXT: pushq %rbx 2509; CHECK-NEXT: .cfi_def_cfa_offset 56 2510; CHECK-NEXT: .cfi_offset %rbx, -56 2511; CHECK-NEXT: .cfi_offset %r12, -48 2512; CHECK-NEXT: .cfi_offset %r13, -40 2513; CHECK-NEXT: .cfi_offset %r14, -32 2514; CHECK-NEXT: .cfi_offset %r15, -24 2515; CHECK-NEXT: .cfi_offset 
%rbp, -16 2516; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 2517; CHECK-NEXT: vpextrd $1, %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill 2518; CHECK-NEXT: #APP 2519; CHECK-NEXT: nop 2520; CHECK-NEXT: #NO_APP 2521; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload 2522; CHECK-NEXT: popq %rbx 2523; CHECK-NEXT: .cfi_def_cfa_offset 48 2524; CHECK-NEXT: popq %r12 2525; CHECK-NEXT: .cfi_def_cfa_offset 40 2526; CHECK-NEXT: popq %r13 2527; CHECK-NEXT: .cfi_def_cfa_offset 32 2528; CHECK-NEXT: popq %r14 2529; CHECK-NEXT: .cfi_def_cfa_offset 24 2530; CHECK-NEXT: popq %r15 2531; CHECK-NEXT: .cfi_def_cfa_offset 16 2532; CHECK-NEXT: popq %rbp 2533; CHECK-NEXT: .cfi_def_cfa_offset 8 2534; CHECK-NEXT: retq 2535 ; add forces execution domain 2536 %1 = add <4 x i32> %a0, %a1 2537 %2 = extractelement <4 x i32> %1, i32 1 2538 %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() 2539 ret i32 %2 2540} 2541 2542define i64 @stack_fold_pextrq(<2 x i64> %a0) { 2543; CHECK-LABEL: stack_fold_pextrq: 2544; CHECK: # %bb.0: 2545; CHECK-NEXT: pushq %rbp 2546; CHECK-NEXT: .cfi_def_cfa_offset 16 2547; CHECK-NEXT: pushq %r15 2548; CHECK-NEXT: .cfi_def_cfa_offset 24 2549; CHECK-NEXT: pushq %r14 2550; CHECK-NEXT: .cfi_def_cfa_offset 32 2551; CHECK-NEXT: pushq %r13 2552; CHECK-NEXT: .cfi_def_cfa_offset 40 2553; CHECK-NEXT: pushq %r12 2554; CHECK-NEXT: .cfi_def_cfa_offset 48 2555; CHECK-NEXT: pushq %rbx 2556; CHECK-NEXT: .cfi_def_cfa_offset 56 2557; CHECK-NEXT: .cfi_offset %rbx, -56 2558; CHECK-NEXT: .cfi_offset %r12, -48 2559; CHECK-NEXT: .cfi_offset %r13, -40 2560; CHECK-NEXT: .cfi_offset %r14, -32 2561; CHECK-NEXT: .cfi_offset %r15, -24 2562; CHECK-NEXT: .cfi_offset %rbp, -16 2563; CHECK-NEXT: vpextrq $1, %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill 2564; CHECK-NEXT: #APP 2565; CHECK-NEXT: nop 2566; CHECK-NEXT: #NO_APP 2567; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload 2568; CHECK-NEXT: popq %rbx 2569; CHECK-NEXT: .cfi_def_cfa_offset 48 2570; CHECK-NEXT: popq %r12 2571; CHECK-NEXT: .cfi_def_cfa_offset 40 2572; CHECK-NEXT: popq %r13 2573; CHECK-NEXT: .cfi_def_cfa_offset 32 2574; CHECK-NEXT: popq %r14 2575; CHECK-NEXT: .cfi_def_cfa_offset 24 2576; CHECK-NEXT: popq %r15 2577; CHECK-NEXT: .cfi_def_cfa_offset 16 2578; CHECK-NEXT: popq %rbp 2579; CHECK-NEXT: .cfi_def_cfa_offset 8 2580; CHECK-NEXT: retq 2581 %1 = extractelement <2 x i64> %a0, i32 1 2582 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() 2583 ret i64 %1 2584} 2585 2586define <16 x i8> @stack_fold_pinsrb(<16 x i8> %a0, i8 %a1) { 2587; CHECK-LABEL: stack_fold_pinsrb: 2588; CHECK: # %bb.0: 2589; CHECK-NEXT: pushq %rbp 2590; CHECK-NEXT: .cfi_def_cfa_offset 16 2591; CHECK-NEXT: pushq %r15 2592; CHECK-NEXT: .cfi_def_cfa_offset 24 2593; CHECK-NEXT: pushq %r14 2594; CHECK-NEXT: .cfi_def_cfa_offset 32 2595; CHECK-NEXT: pushq %r13 2596; CHECK-NEXT: .cfi_def_cfa_offset 40 2597; CHECK-NEXT: pushq %r12 2598; CHECK-NEXT: .cfi_def_cfa_offset 48 2599; CHECK-NEXT: pushq %rbx 2600; CHECK-NEXT: .cfi_def_cfa_offset 56 2601; CHECK-NEXT: .cfi_offset %rbx, -56 2602; CHECK-NEXT: .cfi_offset %r12, -48 2603; CHECK-NEXT: .cfi_offset %r13, -40 2604; CHECK-NEXT: .cfi_offset %r14, -32 2605; CHECK-NEXT: .cfi_offset %r15, -24 2606; CHECK-NEXT: .cfi_offset %rbp, -16 2607; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 2608; 
CHECK-NEXT: #APP 2609; CHECK-NEXT: nop 2610; CHECK-NEXT: #NO_APP 2611; CHECK-NEXT: vpinsrb $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload 2612; CHECK-NEXT: popq %rbx 2613; CHECK-NEXT: .cfi_def_cfa_offset 48 2614; CHECK-NEXT: popq %r12 2615; CHECK-NEXT: .cfi_def_cfa_offset 40 2616; CHECK-NEXT: popq %r13 2617; CHECK-NEXT: .cfi_def_cfa_offset 32 2618; CHECK-NEXT: popq %r14 2619; CHECK-NEXT: .cfi_def_cfa_offset 24 2620; CHECK-NEXT: popq %r15 2621; CHECK-NEXT: .cfi_def_cfa_offset 16 2622; CHECK-NEXT: popq %rbp 2623; CHECK-NEXT: .cfi_def_cfa_offset 8 2624; CHECK-NEXT: retq 2625 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() 2626 %2 = insertelement <16 x i8> %a0, i8 %a1, i32 1 2627 ret <16 x i8> %2 2628} 2629 2630define <4 x i32> @stack_fold_pinsrd(<4 x i32> %a0, i32 %a1) { 2631; CHECK-LABEL: stack_fold_pinsrd: 2632; CHECK: # %bb.0: 2633; CHECK-NEXT: pushq %rbp 2634; CHECK-NEXT: .cfi_def_cfa_offset 16 2635; CHECK-NEXT: pushq %r15 2636; CHECK-NEXT: .cfi_def_cfa_offset 24 2637; CHECK-NEXT: pushq %r14 2638; CHECK-NEXT: .cfi_def_cfa_offset 32 2639; CHECK-NEXT: pushq %r13 2640; CHECK-NEXT: .cfi_def_cfa_offset 40 2641; CHECK-NEXT: pushq %r12 2642; CHECK-NEXT: .cfi_def_cfa_offset 48 2643; CHECK-NEXT: pushq %rbx 2644; CHECK-NEXT: .cfi_def_cfa_offset 56 2645; CHECK-NEXT: .cfi_offset %rbx, -56 2646; CHECK-NEXT: .cfi_offset %r12, -48 2647; CHECK-NEXT: .cfi_offset %r13, -40 2648; CHECK-NEXT: .cfi_offset %r14, -32 2649; CHECK-NEXT: .cfi_offset %r15, -24 2650; CHECK-NEXT: .cfi_offset %rbp, -16 2651; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 2652; CHECK-NEXT: #APP 2653; CHECK-NEXT: nop 2654; CHECK-NEXT: #NO_APP 2655; CHECK-NEXT: vpinsrd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload 2656; CHECK-NEXT: popq %rbx 2657; CHECK-NEXT: .cfi_def_cfa_offset 48 2658; CHECK-NEXT: popq %r12 2659; CHECK-NEXT: .cfi_def_cfa_offset 40 2660; CHECK-NEXT: popq %r13 2661; CHECK-NEXT: .cfi_def_cfa_offset 32 2662; CHECK-NEXT: popq %r14 2663; CHECK-NEXT: .cfi_def_cfa_offset 24 2664; CHECK-NEXT: popq %r15 2665; CHECK-NEXT: .cfi_def_cfa_offset 16 2666; CHECK-NEXT: popq %rbp 2667; CHECK-NEXT: .cfi_def_cfa_offset 8 2668; CHECK-NEXT: retq 2669 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() 2670 %2 = insertelement <4 x i32> %a0, i32 %a1, i32 1 2671 ret <4 x i32> %2 2672} 2673 2674define <2 x i64> @stack_fold_pinsrq(<2 x i64> %a0, i64 %a1) { 2675; CHECK-LABEL: stack_fold_pinsrq: 2676; CHECK: # %bb.0: 2677; CHECK-NEXT: pushq %rbp 2678; CHECK-NEXT: .cfi_def_cfa_offset 16 2679; CHECK-NEXT: pushq %r15 2680; CHECK-NEXT: .cfi_def_cfa_offset 24 2681; CHECK-NEXT: pushq %r14 2682; CHECK-NEXT: .cfi_def_cfa_offset 32 2683; CHECK-NEXT: pushq %r13 2684; CHECK-NEXT: .cfi_def_cfa_offset 40 2685; CHECK-NEXT: pushq %r12 2686; CHECK-NEXT: .cfi_def_cfa_offset 48 2687; CHECK-NEXT: pushq %rbx 2688; CHECK-NEXT: .cfi_def_cfa_offset 56 2689; CHECK-NEXT: .cfi_offset %rbx, -56 2690; CHECK-NEXT: .cfi_offset %r12, -48 2691; CHECK-NEXT: .cfi_offset %r13, -40 2692; CHECK-NEXT: .cfi_offset %r14, -32 2693; CHECK-NEXT: .cfi_offset %r15, -24 2694; CHECK-NEXT: .cfi_offset %rbp, -16 2695; CHECK-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 2696; CHECK-NEXT: #APP 2697; CHECK-NEXT: nop 2698; CHECK-NEXT: #NO_APP 2699; CHECK-NEXT: vpinsrq $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded 
Reload 2700; CHECK-NEXT: popq %rbx 2701; CHECK-NEXT: .cfi_def_cfa_offset 48 2702; CHECK-NEXT: popq %r12 2703; CHECK-NEXT: .cfi_def_cfa_offset 40 2704; CHECK-NEXT: popq %r13 2705; CHECK-NEXT: .cfi_def_cfa_offset 32 2706; CHECK-NEXT: popq %r14 2707; CHECK-NEXT: .cfi_def_cfa_offset 24 2708; CHECK-NEXT: popq %r15 2709; CHECK-NEXT: .cfi_def_cfa_offset 16 2710; CHECK-NEXT: popq %rbp 2711; CHECK-NEXT: .cfi_def_cfa_offset 8 2712; CHECK-NEXT: retq 2713 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() 2714 %2 = insertelement <2 x i64> %a0, i64 %a1, i32 1 2715 ret <2 x i64> %2 2716} 2717 2718define <8 x i16> @stack_fold_pinsrw(<8 x i16> %a0, i16 %a1) { 2719; CHECK-LABEL: stack_fold_pinsrw: 2720; CHECK: # %bb.0: 2721; CHECK-NEXT: pushq %rbp 2722; CHECK-NEXT: .cfi_def_cfa_offset 16 2723; CHECK-NEXT: pushq %r15 2724; CHECK-NEXT: .cfi_def_cfa_offset 24 2725; CHECK-NEXT: pushq %r14 2726; CHECK-NEXT: .cfi_def_cfa_offset 32 2727; CHECK-NEXT: pushq %r13 2728; CHECK-NEXT: .cfi_def_cfa_offset 40 2729; CHECK-NEXT: pushq %r12 2730; CHECK-NEXT: .cfi_def_cfa_offset 48 2731; CHECK-NEXT: pushq %rbx 2732; CHECK-NEXT: .cfi_def_cfa_offset 56 2733; CHECK-NEXT: .cfi_offset %rbx, -56 2734; CHECK-NEXT: .cfi_offset %r12, -48 2735; CHECK-NEXT: .cfi_offset %r13, -40 2736; CHECK-NEXT: .cfi_offset %r14, -32 2737; CHECK-NEXT: .cfi_offset %r15, -24 2738; CHECK-NEXT: .cfi_offset %rbp, -16 2739; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 2740; CHECK-NEXT: #APP 2741; CHECK-NEXT: nop 2742; CHECK-NEXT: #NO_APP 2743; CHECK-NEXT: vpinsrw $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload 2744; CHECK-NEXT: popq %rbx 2745; CHECK-NEXT: .cfi_def_cfa_offset 48 2746; CHECK-NEXT: popq %r12 2747; CHECK-NEXT: .cfi_def_cfa_offset 40 2748; CHECK-NEXT: popq %r13 2749; CHECK-NEXT: .cfi_def_cfa_offset 32 2750; CHECK-NEXT: popq %r14 2751; CHECK-NEXT: .cfi_def_cfa_offset 24 2752; CHECK-NEXT: popq %r15 2753; CHECK-NEXT: .cfi_def_cfa_offset 16 2754; CHECK-NEXT: popq %rbp 2755; CHECK-NEXT: .cfi_def_cfa_offset 8 2756; CHECK-NEXT: retq 2757 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() 2758 %2 = insertelement <8 x i16> %a0, i16 %a1, i32 1 2759 ret <8 x i16> %2 2760} 2761 2762define <16 x i32> @stack_fold_vplzcntd(<16 x i32> %a0) { 2763; CHECK-LABEL: stack_fold_vplzcntd: 2764; CHECK: # %bb.0: 2765; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2766; CHECK-NEXT: #APP 2767; CHECK-NEXT: nop 2768; CHECK-NEXT: #NO_APP 2769; CHECK-NEXT: vplzcntd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload 2770; CHECK-NEXT: retq 2771 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2772 %2 = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %a0, i1 false) 2773 ret <16 x i32> %2 2774} 2775 2776define <8 x i64> @stack_fold_vplzcntq(<8 x i64> %a0) { 2777; CHECK-LABEL: stack_fold_vplzcntq: 2778; CHECK: # %bb.0: 2779; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2780; CHECK-NEXT: #APP 2781; CHECK-NEXT: nop 2782; CHECK-NEXT: #NO_APP 2783; CHECK-NEXT: vplzcntq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded 
Reload 2784; CHECK-NEXT: retq 2785 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2786 %2 = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %a0, i1 false) 2787 ret <8 x i64> %2 2788} 2789 2790define <32 x i16> @stack_fold_pmaddubsw_zmm(<64 x i8> %a0, <64 x i8> %a1) { 2791; CHECK-LABEL: stack_fold_pmaddubsw_zmm: 2792; CHECK: # %bb.0: 2793; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2794; CHECK-NEXT: #APP 2795; CHECK-NEXT: nop 2796; CHECK-NEXT: #NO_APP 2797; CHECK-NEXT: vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 2798; CHECK-NEXT: retq 2799 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2800 %2 = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> %a0, <64 x i8> %a1) 2801 ret <32 x i16> %2 2802} 2803declare <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8>, <64 x i8>) nounwind readnone 2804 2805define <32 x i16> @stack_fold_pmaddubsw_zmm_mask(<32 x i16>* %passthru, <64 x i8> %a0, <64 x i8> %a1, i32 %mask) { 2806; CHECK-LABEL: stack_fold_pmaddubsw_zmm_mask: 2807; CHECK: # %bb.0: 2808; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2809; CHECK-NEXT: #APP 2810; CHECK-NEXT: nop 2811; CHECK-NEXT: #NO_APP 2812; CHECK-NEXT: kmovd %esi, %k1 2813; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 2814; CHECK-NEXT: vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 2815; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 2816; CHECK-NEXT: retq 2817 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2818 %2 = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> %a0, <64 x i8> %a1) 2819 %3 = bitcast i32 %mask to <32 x i1> 2820 ; load needed to keep the operation from being scheduled about the asm block 2821 %4 = load <32 x i16>, <32 x i16>* %passthru 2822 %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4 2823 ret <32 x i16> %5 2824} 2825 2826define <32 x i16> @stack_fold_pmaddubsw_zmm_maskz(<64 x i8> %a0, <64 x i8> %a1, i32 %mask) { 2827; CHECK-LABEL: stack_fold_pmaddubsw_zmm_maskz: 2828; CHECK: # %bb.0: 2829; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2830; CHECK-NEXT: #APP 2831; CHECK-NEXT: nop 2832; CHECK-NEXT: #NO_APP 2833; CHECK-NEXT: kmovd %edi, %k1 2834; CHECK-NEXT: vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 2835; CHECK-NEXT: retq 2836 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2837 %2 = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> 
%a0, <64 x i8> %a1) 2838 %3 = bitcast i32 %mask to <32 x i1> 2839 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer 2840 ret <32 x i16> %4 2841} 2842 2843define <16 x i32> @stack_fold_pmaddwd_zmm(<32 x i16> %a0, <32 x i16> %a1) { 2844; CHECK-LABEL: stack_fold_pmaddwd_zmm: 2845; CHECK: # %bb.0: 2846; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2847; CHECK-NEXT: #APP 2848; CHECK-NEXT: nop 2849; CHECK-NEXT: #NO_APP 2850; CHECK-NEXT: vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 2851; CHECK-NEXT: retq 2852 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2853 %2 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a0, <32 x i16> %a1) 2854 ret <16 x i32> %2 2855} 2856declare <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16>, <32 x i16>) nounwind readnone 2857 2858define <16 x i32> @stack_fold_pmaddwd_zmm_commuted(<32 x i16> %a0, <32 x i16> %a1) { 2859; CHECK-LABEL: stack_fold_pmaddwd_zmm_commuted: 2860; CHECK: # %bb.0: 2861; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2862; CHECK-NEXT: #APP 2863; CHECK-NEXT: nop 2864; CHECK-NEXT: #NO_APP 2865; CHECK-NEXT: vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 2866; CHECK-NEXT: retq 2867 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2868 %2 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a1, <32 x i16> %a0) 2869 ret <16 x i32> %2 2870} 2871 2872define <16 x i32> @stack_fold_pmaddwd_zmm_mask(<16 x i32>* %passthru, <32 x i16> %a0, <32 x i16> %a1, i16 %mask) { 2873; CHECK-LABEL: stack_fold_pmaddwd_zmm_mask: 2874; CHECK: # %bb.0: 2875; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2876; CHECK-NEXT: #APP 2877; CHECK-NEXT: nop 2878; CHECK-NEXT: #NO_APP 2879; CHECK-NEXT: kmovd %esi, %k1 2880; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 2881; CHECK-NEXT: vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 2882; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 2883; CHECK-NEXT: retq 2884 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2885 %2 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a0, <32 x i16> %a1) 2886 %3 = bitcast i16 %mask to <16 x i1> 2887 ; load needed to keep the operation from being scheduled about the asm block 2888 %4 = load <16 x i32>, <16 x i32>* %passthru 2889 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4 2890 ret <16 x i32> %5 2891} 2892 2893define <16 x i32> @stack_fold_pmaddwd_zmm_mask_commuted(<16 x i32>* %passthru, <32 x i16> %a0, <32 x i16> %a1, i16 %mask) { 2894; CHECK-LABEL: stack_fold_pmaddwd_zmm_mask_commuted: 2895; CHECK: # %bb.0: 2896; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2897; CHECK-NEXT: #APP 2898; 
CHECK-NEXT: nop 2899; CHECK-NEXT: #NO_APP 2900; CHECK-NEXT: kmovd %esi, %k1 2901; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 2902; CHECK-NEXT: vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 2903; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 2904; CHECK-NEXT: retq 2905 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2906 %2 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a1, <32 x i16> %a0) 2907 %3 = bitcast i16 %mask to <16 x i1> 2908 ; load needed to keep the operation from being scheduled above the asm block 2909 %4 = load <16 x i32>, <16 x i32>* %passthru 2910 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4 2911 ret <16 x i32> %5 2912} 2913 2914define <16 x i32> @stack_fold_pmaddwd_zmm_maskz(<16 x i32>* %passthru, <32 x i16> %a0, <32 x i16> %a1, i16 %mask) { 2915; CHECK-LABEL: stack_fold_pmaddwd_zmm_maskz: 2916; CHECK: # %bb.0: 2917; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2918; CHECK-NEXT: #APP 2919; CHECK-NEXT: nop 2920; CHECK-NEXT: #NO_APP 2921; CHECK-NEXT: kmovd %esi, %k1 2922; CHECK-NEXT: vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 2923; CHECK-NEXT: retq 2924 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2925 %2 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a0, <32 x i16> %a1) 2926 %3 = bitcast i16 %mask to <16 x i1> 2927 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer 2928 ret <16 x i32> %4 2929} 2930 2931define <16 x i32> @stack_fold_pmaddwd_zmm_maskz_commuted(<16 x i32>* %passthru, <32 x i16> %a0, <32 x i16> %a1, i16 %mask) { 2932; CHECK-LABEL: stack_fold_pmaddwd_zmm_maskz_commuted: 2933; CHECK: # %bb.0: 2934; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2935; CHECK-NEXT: #APP 2936; CHECK-NEXT: nop 2937; CHECK-NEXT: #NO_APP 2938; CHECK-NEXT: kmovd %esi, %k1 2939; CHECK-NEXT: vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 2940; CHECK-NEXT: retq 2941 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2942 %2 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a1, <32 x i16> %a0) 2943 %3 = bitcast i16 %mask to <16 x i1> 2944 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer 2945 ret <16 x i32> %4 2946} 2947 2948define <64 x i8> @stack_fold_pmaxsb(<64 x i8> %a0, <64 x i8> %a1) { 2949; CHECK-LABEL: stack_fold_pmaxsb: 2950; CHECK: # %bb.0: 2951; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2952; CHECK-NEXT: #APP 2953; CHECK-NEXT: nop 2954; CHECK-NEXT: #NO_APP 2955; CHECK-NEXT: vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 2956; CHECK-NEXT: retq 2957 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2958 %2 = icmp sgt <64 x i8> %a0, %a1 2959 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1 2960 ret <64 x i8> %3 2961} 2962 2963define <64 x i8> @stack_fold_pmaxsb_commuted(<64 x i8> %a0, <64 x i8> %a1) { 2964; CHECK-LABEL: stack_fold_pmaxsb_commuted: 2965; CHECK: # %bb.0: 2966; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2967; CHECK-NEXT: #APP 2968; CHECK-NEXT: nop 2969; CHECK-NEXT: #NO_APP 2970; CHECK-NEXT: vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 2971; CHECK-NEXT: retq 2972 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2973 %2 = icmp sgt <64 x i8> %a1, %a0 2974 %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0 2975 ret <64 x i8> %3 2976} 2977 2978define <64 x i8> @stack_fold_pmaxsb_mask(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, <64 x i8>* %passthru) { 2979; CHECK-LABEL: stack_fold_pmaxsb_mask: 2980; CHECK: # %bb.0: 2981; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2982; CHECK-NEXT: #APP 2983; CHECK-NEXT: nop 2984; CHECK-NEXT: #NO_APP 2985; CHECK-NEXT: kmovq %rdi, %k1 2986; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 2987; CHECK-NEXT: vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 2988; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 2989; CHECK-NEXT: retq 2990 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2991 %2 = icmp sgt <64 x i8> %a0, %a1 2992 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1 2993 %4 = bitcast i64 %mask to <64 x i1> 2994 ; load needed to keep the operation from being scheduled about the asm block 2995 %5 = load <64 x i8>, <64 x i8>* %passthru 2996 %6 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> %5 2997 ret <64 x i8> %6 2998} 2999 3000define <64 x i8> @stack_fold_pmaxsb_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, <64 x i8>* %passthru) { 3001; CHECK-LABEL: stack_fold_pmaxsb_mask_commuted: 3002; CHECK: # %bb.0: 3003; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3004; CHECK-NEXT: #APP 3005; CHECK-NEXT: nop 3006; CHECK-NEXT: #NO_APP 3007; CHECK-NEXT: kmovq %rdi, %k1 3008; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 3009; CHECK-NEXT: vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 3010; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 3011; CHECK-NEXT: retq 3012 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3013 %2 = icmp sgt <64 x i8> %a1, %a0 3014 %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0 3015 %4 = bitcast i64 %mask to 
<64 x i1> 3016 ; load needed to keep the operation from being scheduled about the asm block 3017 %5 = load <64 x i8>, <64 x i8>* %passthru 3018 %6 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> %5 3019 ret <64 x i8> %6 3020} 3021 3022define <64 x i8> @stack_fold_pmaxsb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { 3023; CHECK-LABEL: stack_fold_pmaxsb_maskz: 3024; CHECK: # %bb.0: 3025; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3026; CHECK-NEXT: #APP 3027; CHECK-NEXT: nop 3028; CHECK-NEXT: #NO_APP 3029; CHECK-NEXT: kmovq %rdi, %k1 3030; CHECK-NEXT: vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 3031; CHECK-NEXT: retq 3032 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3033 %2 = icmp sgt <64 x i8> %a0, %a1 3034 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1 3035 %4 = bitcast i64 %mask to <64 x i1> 3036 %5 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> zeroinitializer 3037 ret <64 x i8> %5 3038} 3039 3040define <64 x i8> @stack_fold_pmaxsb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { 3041; CHECK-LABEL: stack_fold_pmaxsb_maskz_commuted: 3042; CHECK: # %bb.0: 3043; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3044; CHECK-NEXT: #APP 3045; CHECK-NEXT: nop 3046; CHECK-NEXT: #NO_APP 3047; CHECK-NEXT: kmovq %rdi, %k1 3048; CHECK-NEXT: vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 3049; CHECK-NEXT: retq 3050 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3051 %2 = icmp sgt <64 x i8> %a1, %a0 3052 %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0 3053 %4 = bitcast i64 %mask to <64 x i1> 3054 %5 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> zeroinitializer 3055 ret <64 x i8> %5 3056} 3057 3058define <16 x i32> @stack_fold_pmaxsd(<16 x i32> %a0, <16 x i32> %a1) { 3059; CHECK-LABEL: stack_fold_pmaxsd: 3060; CHECK: # %bb.0: 3061; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3062; CHECK-NEXT: #APP 3063; CHECK-NEXT: nop 3064; CHECK-NEXT: #NO_APP 3065; CHECK-NEXT: vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 3066; CHECK-NEXT: retq 3067 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3068 %2 = icmp sgt <16 x i32> %a0, %a1 3069 %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1 3070 ret <16 x i32> %3 3071} 3072 3073define <16 x i32> @stack_fold_pmaxsd_commuted(<16 x i32> %a0, <16 x i32> %a1) { 3074; CHECK-LABEL: stack_fold_pmaxsd_commuted: 3075; CHECK: # %bb.0: 3076; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3077; CHECK-NEXT: #APP 3078; CHECK-NEXT: nop 3079; CHECK-NEXT: #NO_APP 3080; CHECK-NEXT: vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 
3081; CHECK-NEXT: retq 3082 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3083 %2 = icmp sgt <16 x i32> %a1, %a0 3084 %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0 3085 ret <16 x i32> %3 3086} 3087 3088define <16 x i32> @stack_fold_pmaxsd_mask(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, <16 x i32>* %passthru) { 3089; CHECK-LABEL: stack_fold_pmaxsd_mask: 3090; CHECK: # %bb.0: 3091; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3092; CHECK-NEXT: #APP 3093; CHECK-NEXT: nop 3094; CHECK-NEXT: #NO_APP 3095; CHECK-NEXT: kmovd %edi, %k1 3096; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 3097; CHECK-NEXT: vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 3098; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 3099; CHECK-NEXT: retq 3100 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3101 %2 = icmp sgt <16 x i32> %a0, %a1 3102 %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1 3103 %4 = bitcast i16 %mask to <16 x i1> 3104 ; load needed to keep the operation from being scheduled about the asm block 3105 %5 = load <16 x i32>, <16 x i32>* %passthru 3106 %6 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %5 3107 ret <16 x i32> %6 3108} 3109 3110define <16 x i32> @stack_fold_pmaxsd_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, <16 x i32>* %passthru) { 3111; CHECK-LABEL: stack_fold_pmaxsd_mask_commuted: 3112; CHECK: # %bb.0: 3113; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3114; CHECK-NEXT: #APP 3115; CHECK-NEXT: nop 3116; CHECK-NEXT: #NO_APP 3117; CHECK-NEXT: kmovd %edi, %k1 3118; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 3119; CHECK-NEXT: vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 3120; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 3121; CHECK-NEXT: retq 3122 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3123 %2 = icmp sgt <16 x i32> %a1, %a0 3124 %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0 3125 %4 = bitcast i16 %mask to <16 x i1> 3126 ; load needed to keep the operation from being scheduled about the asm block 3127 %5 = load <16 x i32>, <16 x i32>* %passthru 3128 %6 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %5 3129 ret <16 x i32> %6 3130} 3131 3132define <16 x i32> @stack_fold_pmaxsd_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { 3133; CHECK-LABEL: stack_fold_pmaxsd_maskz: 3134; CHECK: # %bb.0: 3135; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3136; CHECK-NEXT: #APP 3137; CHECK-NEXT: nop 3138; CHECK-NEXT: #NO_APP 3139; CHECK-NEXT: kmovd %edi, %k1 3140; CHECK-NEXT: vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 3141; CHECK-NEXT: retq 3142 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3143 %2 = icmp sgt <16 x i32> %a0, %a1 3144 %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1 3145 %4 = bitcast i16 %mask to <16 x i1> 3146 %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer 3147 ret <16 x i32> %5 3148} 3149 3150define <16 x i32> @stack_fold_pmaxsd_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { 3151; CHECK-LABEL: stack_fold_pmaxsd_maskz_commuted: 3152; CHECK: # %bb.0: 3153; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3154; CHECK-NEXT: #APP 3155; CHECK-NEXT: nop 3156; CHECK-NEXT: #NO_APP 3157; CHECK-NEXT: kmovd %edi, %k1 3158; CHECK-NEXT: vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 3159; CHECK-NEXT: retq 3160 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3161 %2 = icmp sgt <16 x i32> %a1, %a0 3162 %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0 3163 %4 = bitcast i16 %mask to <16 x i1> 3164 %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer 3165 ret <16 x i32> %5 3166} 3167 3168define <8 x i64> @stack_fold_pmaxsq(<8 x i64> %a0, <8 x i64> %a1) { 3169; CHECK-LABEL: stack_fold_pmaxsq: 3170; CHECK: # %bb.0: 3171; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3172; CHECK-NEXT: #APP 3173; CHECK-NEXT: nop 3174; CHECK-NEXT: #NO_APP 3175; CHECK-NEXT: vpmaxsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 3176; CHECK-NEXT: retq 3177 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3178 %2 = icmp sgt <8 x i64> %a0, %a1 3179 %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1 3180 ret <8 x i64> %3 3181} 3182 3183define <8 x i64> @stack_fold_pmaxsq_commuted(<8 x i64> %a0, <8 x i64> %a1) { 3184; CHECK-LABEL: stack_fold_pmaxsq_commuted: 3185; CHECK: # %bb.0: 3186; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3187; CHECK-NEXT: #APP 3188; CHECK-NEXT: nop 3189; CHECK-NEXT: #NO_APP 3190; CHECK-NEXT: vpmaxsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 3191; CHECK-NEXT: retq 3192 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3193 %2 = icmp sgt <8 x i64> %a1, %a0 3194 %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0 3195 ret <8 x i64> %3 3196} 3197 3198define <8 x i64> @stack_fold_pmaxsq_mask(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, <8 x i64>* %passthru) { 3199; CHECK-LABEL: stack_fold_pmaxsq_mask: 3200; CHECK: # %bb.0: 3201; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3202; 
CHECK-NEXT: #APP 3203; CHECK-NEXT: nop 3204; CHECK-NEXT: #NO_APP 3205; CHECK-NEXT: kmovd %edi, %k1 3206; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 3207; CHECK-NEXT: vpmaxsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 3208; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 3209; CHECK-NEXT: retq 3210 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3211 %2 = icmp sgt <8 x i64> %a0, %a1 3212 %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1 3213 %4 = bitcast i8 %mask to <8 x i1> 3214 ; load needed to keep the operation from being scheduled about the asm block 3215 %5 = load <8 x i64>, <8 x i64>* %passthru 3216 %6 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %5 3217 ret <8 x i64> %6 3218} 3219 3220define <8 x i64> @stack_fold_pmaxsq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, <8 x i64>* %passthru) { 3221; CHECK-LABEL: stack_fold_pmaxsq_mask_commuted: 3222; CHECK: # %bb.0: 3223; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3224; CHECK-NEXT: #APP 3225; CHECK-NEXT: nop 3226; CHECK-NEXT: #NO_APP 3227; CHECK-NEXT: kmovd %edi, %k1 3228; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 3229; CHECK-NEXT: vpmaxsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 3230; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 3231; CHECK-NEXT: retq 3232 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3233 %2 = icmp sgt <8 x i64> %a1, %a0 3234 %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0 3235 %4 = bitcast i8 %mask to <8 x i1> 3236 ; load needed to keep the operation from being scheduled about the asm block 3237 %5 = load <8 x i64>, <8 x i64>* %passthru 3238 %6 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %5 3239 ret <8 x i64> %6 3240} 3241 3242define <8 x i64> @stack_fold_pmaxsq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { 3243; CHECK-LABEL: stack_fold_pmaxsq_maskz: 3244; CHECK: # %bb.0: 3245; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3246; CHECK-NEXT: #APP 3247; CHECK-NEXT: nop 3248; CHECK-NEXT: #NO_APP 3249; CHECK-NEXT: kmovd %edi, %k1 3250; CHECK-NEXT: vpmaxsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 3251; CHECK-NEXT: retq 3252 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3253 %2 = icmp sgt <8 x i64> %a0, %a1 3254 %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1 3255 %4 = bitcast i8 %mask to <8 x i1> 3256 %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> zeroinitializer 3257 ret <8 x i64> %5 3258} 3259 3260define <8 x i64> @stack_fold_pmaxsq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { 3261; CHECK-LABEL: stack_fold_pmaxsq_maskz_commuted: 3262; CHECK: # %bb.0: 3263; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3264; CHECK-NEXT: #APP 3265; CHECK-NEXT: 
nop 3266; CHECK-NEXT: #NO_APP 3267; CHECK-NEXT: kmovd %edi, %k1 3268; CHECK-NEXT: vpmaxsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 3269; CHECK-NEXT: retq 3270 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3271 %2 = icmp sgt <8 x i64> %a1, %a0 3272 %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0 3273 %4 = bitcast i8 %mask to <8 x i1> 3274 %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> zeroinitializer 3275 ret <8 x i64> %5 3276} 3277 3278define <32 x i16> @stack_fold_pmaxsw(<32 x i16> %a0, <32 x i16> %a1) { 3279; CHECK-LABEL: stack_fold_pmaxsw: 3280; CHECK: # %bb.0: 3281; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3282; CHECK-NEXT: #APP 3283; CHECK-NEXT: nop 3284; CHECK-NEXT: #NO_APP 3285; CHECK-NEXT: vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 3286; CHECK-NEXT: retq 3287 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3288 %2 = icmp sgt <32 x i16> %a0, %a1 3289 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1 3290 ret <32 x i16> %3 3291} 3292 3293define <32 x i16> @stack_fold_pmaxsw_commuted(<32 x i16> %a0, <32 x i16> %a1) { 3294; CHECK-LABEL: stack_fold_pmaxsw_commuted: 3295; CHECK: # %bb.0: 3296; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3297; CHECK-NEXT: #APP 3298; CHECK-NEXT: nop 3299; CHECK-NEXT: #NO_APP 3300; CHECK-NEXT: vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 3301; CHECK-NEXT: retq 3302 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3303 %2 = icmp sgt <32 x i16> %a1, %a0 3304 %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0 3305 ret <32 x i16> %3 3306} 3307 3308define <32 x i16> @stack_fold_pmaxsw_mask(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, <32 x i16>* %passthru) { 3309; CHECK-LABEL: stack_fold_pmaxsw_mask: 3310; CHECK: # %bb.0: 3311; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3312; CHECK-NEXT: #APP 3313; CHECK-NEXT: nop 3314; CHECK-NEXT: #NO_APP 3315; CHECK-NEXT: kmovd %edi, %k1 3316; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 3317; CHECK-NEXT: vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 3318; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 3319; CHECK-NEXT: retq 3320 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3321 %2 = icmp sgt <32 x i16> %a0, %a1 3322 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1 3323 %4 = bitcast i32 %mask to <32 x i1> 3324 ; load needed to keep the 
operation from being scheduled about the asm block 3325 %5 = load <32 x i16>, <32 x i16>* %passthru 3326 %6 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %5 3327 ret <32 x i16> %6 3328} 3329 3330define <32 x i16> @stack_fold_pmaxsw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, <32 x i16>* %passthru) { 3331; CHECK-LABEL: stack_fold_pmaxsw_mask_commuted: 3332; CHECK: # %bb.0: 3333; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3334; CHECK-NEXT: #APP 3335; CHECK-NEXT: nop 3336; CHECK-NEXT: #NO_APP 3337; CHECK-NEXT: kmovd %edi, %k1 3338; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 3339; CHECK-NEXT: vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 3340; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 3341; CHECK-NEXT: retq 3342 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3343 %2 = icmp sgt <32 x i16> %a1, %a0 3344 %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0 3345 %4 = bitcast i32 %mask to <32 x i1> 3346 ; load needed to keep the operation from being scheduled about the asm block 3347 %5 = load <32 x i16>, <32 x i16>* %passthru 3348 %6 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %5 3349 ret <32 x i16> %6 3350} 3351 3352define <32 x i16> @stack_fold_pmaxsw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { 3353; CHECK-LABEL: stack_fold_pmaxsw_maskz: 3354; CHECK: # %bb.0: 3355; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3356; CHECK-NEXT: #APP 3357; CHECK-NEXT: nop 3358; CHECK-NEXT: #NO_APP 3359; CHECK-NEXT: kmovd %edi, %k1 3360; CHECK-NEXT: vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 3361; CHECK-NEXT: retq 3362 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3363 %2 = icmp sgt <32 x i16> %a0, %a1 3364 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1 3365 %4 = bitcast i32 %mask to <32 x i1> 3366 %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer 3367 ret <32 x i16> %5 3368} 3369 3370define <32 x i16> @stack_fold_pmaxsw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { 3371; CHECK-LABEL: stack_fold_pmaxsw_maskz_commuted: 3372; CHECK: # %bb.0: 3373; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3374; CHECK-NEXT: #APP 3375; CHECK-NEXT: nop 3376; CHECK-NEXT: #NO_APP 3377; CHECK-NEXT: kmovd %edi, %k1 3378; CHECK-NEXT: vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 3379; CHECK-NEXT: retq 3380 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3381 %2 = icmp sgt <32 x i16> %a1, %a0 3382 %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0 3383 %4 = bitcast i32 %mask to <32 x i1> 3384 %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer 3385 ret <32 x i16> %5 
3386} 3387 3388define <64 x i8> @stack_fold_pmaxub(<64 x i8> %a0, <64 x i8> %a1) { 3389; CHECK-LABEL: stack_fold_pmaxub: 3390; CHECK: # %bb.0: 3391; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3392; CHECK-NEXT: #APP 3393; CHECK-NEXT: nop 3394; CHECK-NEXT: #NO_APP 3395; CHECK-NEXT: vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 3396; CHECK-NEXT: retq 3397 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3398 %2 = icmp ugt <64 x i8> %a0, %a1 3399 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1 3400 ret <64 x i8> %3 3401} 3402 3403define <64 x i8> @stack_fold_pmaxub_commuted(<64 x i8> %a0, <64 x i8> %a1) { 3404; CHECK-LABEL: stack_fold_pmaxub_commuted: 3405; CHECK: # %bb.0: 3406; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3407; CHECK-NEXT: #APP 3408; CHECK-NEXT: nop 3409; CHECK-NEXT: #NO_APP 3410; CHECK-NEXT: vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 3411; CHECK-NEXT: retq 3412 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3413 %2 = icmp ugt <64 x i8> %a1, %a0 3414 %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0 3415 ret <64 x i8> %3 3416} 3417 3418define <64 x i8> @stack_fold_pmaxub_mask(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, <64 x i8>* %passthru) { 3419; CHECK-LABEL: stack_fold_pmaxub_mask: 3420; CHECK: # %bb.0: 3421; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3422; CHECK-NEXT: #APP 3423; CHECK-NEXT: nop 3424; CHECK-NEXT: #NO_APP 3425; CHECK-NEXT: kmovq %rdi, %k1 3426; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 3427; CHECK-NEXT: vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 3428; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 3429; CHECK-NEXT: retq 3430 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3431 %2 = icmp ugt <64 x i8> %a0, %a1 3432 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1 3433 %4 = bitcast i64 %mask to <64 x i1> 3434 ; load needed to keep the operation from being scheduled about the asm block 3435 %5 = load <64 x i8>, <64 x i8>* %passthru 3436 %6 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> %5 3437 ret <64 x i8> %6 3438} 3439 3440define <64 x i8> @stack_fold_pmaxub_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, <64 x i8>* %passthru) { 3441; CHECK-LABEL: stack_fold_pmaxub_mask_commuted: 3442; CHECK: # %bb.0: 3443; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3444; CHECK-NEXT: #APP 3445; CHECK-NEXT: nop 3446; CHECK-NEXT: #NO_APP 3447; CHECK-NEXT: kmovq %rdi, %k1 3448; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 3449; CHECK-NEXT: vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 3450; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 3451; CHECK-NEXT: retq 3452 %1 
= tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3453 %2 = icmp ugt <64 x i8> %a1, %a0 3454 %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0 3455 %4 = bitcast i64 %mask to <64 x i1> 3456 ; load needed to keep the operation from being scheduled about the asm block 3457 %5 = load <64 x i8>, <64 x i8>* %passthru 3458 %6 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> %5 3459 ret <64 x i8> %6 3460} 3461 3462define <64 x i8> @stack_fold_pmaxub_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { 3463; CHECK-LABEL: stack_fold_pmaxub_maskz: 3464; CHECK: # %bb.0: 3465; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3466; CHECK-NEXT: #APP 3467; CHECK-NEXT: nop 3468; CHECK-NEXT: #NO_APP 3469; CHECK-NEXT: kmovq %rdi, %k1 3470; CHECK-NEXT: vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 3471; CHECK-NEXT: retq 3472 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3473 %2 = icmp ugt <64 x i8> %a0, %a1 3474 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1 3475 %4 = bitcast i64 %mask to <64 x i1> 3476 %5 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> zeroinitializer 3477 ret <64 x i8> %5 3478} 3479 3480define <64 x i8> @stack_fold_pmaxub_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { 3481; CHECK-LABEL: stack_fold_pmaxub_maskz_commuted: 3482; CHECK: # %bb.0: 3483; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3484; CHECK-NEXT: #APP 3485; CHECK-NEXT: nop 3486; CHECK-NEXT: #NO_APP 3487; CHECK-NEXT: kmovq %rdi, %k1 3488; CHECK-NEXT: vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 3489; CHECK-NEXT: retq 3490 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3491 %2 = icmp ugt <64 x i8> %a1, %a0 3492 %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0 3493 %4 = bitcast i64 %mask to <64 x i1> 3494 %5 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> zeroinitializer 3495 ret <64 x i8> %5 3496} 3497 3498define <16 x i32> @stack_fold_pmaxud(<16 x i32> %a0, <16 x i32> %a1) { 3499; CHECK-LABEL: stack_fold_pmaxud: 3500; CHECK: # %bb.0: 3501; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3502; CHECK-NEXT: #APP 3503; CHECK-NEXT: nop 3504; CHECK-NEXT: #NO_APP 3505; CHECK-NEXT: vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 3506; CHECK-NEXT: retq 3507 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3508 %2 = icmp ugt <16 x i32> %a0, %a1 3509 %3 = select <16 x i1> %2, <16 x 
i32> %a0, <16 x i32> %a1 3510 ret <16 x i32> %3 3511} 3512 3513define <16 x i32> @stack_fold_pmaxud_commuted(<16 x i32> %a0, <16 x i32> %a1) { 3514; CHECK-LABEL: stack_fold_pmaxud_commuted: 3515; CHECK: # %bb.0: 3516; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3517; CHECK-NEXT: #APP 3518; CHECK-NEXT: nop 3519; CHECK-NEXT: #NO_APP 3520; CHECK-NEXT: vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 3521; CHECK-NEXT: retq 3522 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3523 %2 = icmp ugt <16 x i32> %a1, %a0 3524 %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0 3525 ret <16 x i32> %3 3526} 3527 3528define <16 x i32> @stack_fold_pmaxud_mask(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, <16 x i32>* %passthru) { 3529; CHECK-LABEL: stack_fold_pmaxud_mask: 3530; CHECK: # %bb.0: 3531; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3532; CHECK-NEXT: #APP 3533; CHECK-NEXT: nop 3534; CHECK-NEXT: #NO_APP 3535; CHECK-NEXT: kmovd %edi, %k1 3536; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 3537; CHECK-NEXT: vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 3538; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 3539; CHECK-NEXT: retq 3540 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3541 %2 = icmp ugt <16 x i32> %a0, %a1 3542 %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1 3543 %4 = bitcast i16 %mask to <16 x i1> 3544 ; load needed to keep the operation from being scheduled about the asm block 3545 %5 = load <16 x i32>, <16 x i32>* %passthru 3546 %6 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %5 3547 ret <16 x i32> %6 3548} 3549 3550define <16 x i32> @stack_fold_pmaxud_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, <16 x i32>* %passthru) { 3551; CHECK-LABEL: stack_fold_pmaxud_mask_commuted: 3552; CHECK: # %bb.0: 3553; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3554; CHECK-NEXT: #APP 3555; CHECK-NEXT: nop 3556; CHECK-NEXT: #NO_APP 3557; CHECK-NEXT: kmovd %edi, %k1 3558; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 3559; CHECK-NEXT: vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 3560; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 3561; CHECK-NEXT: retq 3562 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3563 %2 = icmp ugt <16 x i32> %a1, %a0 3564 %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0 3565 %4 = bitcast i16 %mask to <16 x i1> 3566 ; load needed to keep the operation from being scheduled about the asm block 3567 %5 = load <16 x i32>, <16 x i32>* %passthru 3568 %6 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %5 3569 ret <16 x i32> %6 3570} 3571 3572define <16 x i32> @stack_fold_pmaxud_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) 
{ 3573; CHECK-LABEL: stack_fold_pmaxud_maskz: 3574; CHECK: # %bb.0: 3575; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3576; CHECK-NEXT: #APP 3577; CHECK-NEXT: nop 3578; CHECK-NEXT: #NO_APP 3579; CHECK-NEXT: kmovd %edi, %k1 3580; CHECK-NEXT: vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 3581; CHECK-NEXT: retq 3582 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3583 %2 = icmp ugt <16 x i32> %a0, %a1 3584 %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1 3585 %4 = bitcast i16 %mask to <16 x i1> 3586 %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer 3587 ret <16 x i32> %5 3588} 3589 3590define <16 x i32> @stack_fold_pmaxud_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { 3591; CHECK-LABEL: stack_fold_pmaxud_maskz_commuted: 3592; CHECK: # %bb.0: 3593; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3594; CHECK-NEXT: #APP 3595; CHECK-NEXT: nop 3596; CHECK-NEXT: #NO_APP 3597; CHECK-NEXT: kmovd %edi, %k1 3598; CHECK-NEXT: vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 3599; CHECK-NEXT: retq 3600 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3601 %2 = icmp ugt <16 x i32> %a1, %a0 3602 %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0 3603 %4 = bitcast i16 %mask to <16 x i1> 3604 %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer 3605 ret <16 x i32> %5 3606} 3607 3608define <8 x i64> @stack_fold_pmaxuq(<8 x i64> %a0, <8 x i64> %a1) { 3609; CHECK-LABEL: stack_fold_pmaxuq: 3610; CHECK: # %bb.0: 3611; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3612; CHECK-NEXT: #APP 3613; CHECK-NEXT: nop 3614; CHECK-NEXT: #NO_APP 3615; CHECK-NEXT: vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 3616; CHECK-NEXT: retq 3617 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3618 %2 = icmp ugt <8 x i64> %a0, %a1 3619 %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1 3620 ret <8 x i64> %3 3621} 3622 3623define <8 x i64> @stack_fold_pmaxuq_commuted(<8 x i64> %a0, <8 x i64> %a1) { 3624; CHECK-LABEL: stack_fold_pmaxuq_commuted: 3625; CHECK: # %bb.0: 3626; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3627; CHECK-NEXT: #APP 3628; CHECK-NEXT: nop 3629; CHECK-NEXT: #NO_APP 3630; CHECK-NEXT: vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 3631; CHECK-NEXT: retq 3632 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3633 %2 = icmp ugt <8 x i64> %a1, %a0 3634 %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0 3635 ret <8 x i64> %3 3636} 3637 3638define <8 x i64> @stack_fold_pmaxuq_mask(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, <8 x i64>* %passthru) { 3639; CHECK-LABEL: stack_fold_pmaxuq_mask: 3640; CHECK: # %bb.0: 3641; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3642; CHECK-NEXT: #APP 3643; CHECK-NEXT: nop 3644; CHECK-NEXT: #NO_APP 3645; CHECK-NEXT: kmovd %edi, %k1 3646; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 3647; CHECK-NEXT: vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 3648; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 3649; CHECK-NEXT: retq 3650 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3651 %2 = icmp ugt <8 x i64> %a0, %a1 3652 %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1 3653 %4 = bitcast i8 %mask to <8 x i1> 3654 ; load needed to keep the operation from being scheduled about the asm block 3655 %5 = load <8 x i64>, <8 x i64>* %passthru 3656 %6 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %5 3657 ret <8 x i64> %6 3658} 3659 3660define <8 x i64> @stack_fold_pmaxuq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, <8 x i64>* %passthru) { 3661; CHECK-LABEL: stack_fold_pmaxuq_mask_commuted: 3662; CHECK: # %bb.0: 3663; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3664; CHECK-NEXT: #APP 3665; CHECK-NEXT: nop 3666; CHECK-NEXT: #NO_APP 3667; CHECK-NEXT: kmovd %edi, %k1 3668; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 3669; CHECK-NEXT: vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 3670; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 3671; CHECK-NEXT: retq 3672 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3673 %2 = icmp ugt <8 x i64> %a1, %a0 3674 %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0 3675 %4 = bitcast i8 %mask to <8 x i1> 3676 ; load needed to keep the operation from being scheduled about the asm block 3677 %5 = load <8 x i64>, <8 x i64>* %passthru 3678 %6 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %5 3679 ret <8 x i64> %6 3680} 3681 3682define <8 x i64> @stack_fold_pmaxuq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { 3683; CHECK-LABEL: stack_fold_pmaxuq_maskz: 3684; CHECK: # %bb.0: 3685; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3686; CHECK-NEXT: #APP 3687; CHECK-NEXT: nop 3688; CHECK-NEXT: #NO_APP 3689; CHECK-NEXT: kmovd %edi, %k1 3690; CHECK-NEXT: vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 3691; CHECK-NEXT: retq 3692 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3693 %2 = icmp ugt <8 x i64> %a0, %a1 3694 %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1 3695 %4 = bitcast i8 %mask to <8 x i1> 3696 %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> zeroinitializer 3697 ret <8 x i64> %5 3698} 3699 3700define <8 x i64> @stack_fold_pmaxuq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { 3701; CHECK-LABEL: stack_fold_pmaxuq_maskz_commuted: 3702; CHECK: # %bb.0: 3703; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3704; CHECK-NEXT: #APP 3705; CHECK-NEXT: nop 3706; CHECK-NEXT: #NO_APP 3707; CHECK-NEXT: kmovd %edi, %k1 3708; CHECK-NEXT: vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 3709; CHECK-NEXT: retq 3710 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3711 %2 = icmp ugt <8 x i64> %a1, %a0 3712 %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0 3713 %4 = bitcast i8 %mask to <8 x i1> 3714 %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> zeroinitializer 3715 ret <8 x i64> %5 3716} 3717 3718define <32 x i16> @stack_fold_pmaxuw(<32 x i16> %a0, <32 x i16> %a1) { 3719; CHECK-LABEL: stack_fold_pmaxuw: 3720; CHECK: # %bb.0: 3721; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3722; CHECK-NEXT: #APP 3723; CHECK-NEXT: nop 3724; CHECK-NEXT: #NO_APP 3725; CHECK-NEXT: vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 3726; CHECK-NEXT: retq 3727 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3728 %2 = icmp ugt <32 x i16> %a0, %a1 3729 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1 3730 ret <32 x i16> %3 3731} 3732 3733define <32 x i16> @stack_fold_pmaxuw_commuted(<32 x i16> %a0, <32 x i16> %a1) { 3734; CHECK-LABEL: stack_fold_pmaxuw_commuted: 3735; CHECK: # %bb.0: 3736; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3737; CHECK-NEXT: #APP 3738; CHECK-NEXT: nop 3739; CHECK-NEXT: #NO_APP 3740; CHECK-NEXT: vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 3741; CHECK-NEXT: retq 3742 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3743 %2 = icmp ugt <32 x i16> %a1, %a0 3744 %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0 3745 ret <32 x i16> %3 3746} 3747 3748define <32 x i16> @stack_fold_pmaxuw_mask(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, <32 x i16>* %passthru) { 3749; CHECK-LABEL: stack_fold_pmaxuw_mask: 3750; CHECK: # %bb.0: 3751; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3752; 
CHECK-NEXT: #APP 3753; CHECK-NEXT: nop 3754; CHECK-NEXT: #NO_APP 3755; CHECK-NEXT: kmovd %edi, %k1 3756; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 3757; CHECK-NEXT: vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 3758; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 3759; CHECK-NEXT: retq 3760 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3761 %2 = icmp ugt <32 x i16> %a0, %a1 3762 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1 3763 %4 = bitcast i32 %mask to <32 x i1> 3764 ; load needed to keep the operation from being scheduled about the asm block 3765 %5 = load <32 x i16>, <32 x i16>* %passthru 3766 %6 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %5 3767 ret <32 x i16> %6 3768} 3769 3770define <32 x i16> @stack_fold_pmaxuw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, <32 x i16>* %passthru) { 3771; CHECK-LABEL: stack_fold_pmaxuw_mask_commuted: 3772; CHECK: # %bb.0: 3773; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3774; CHECK-NEXT: #APP 3775; CHECK-NEXT: nop 3776; CHECK-NEXT: #NO_APP 3777; CHECK-NEXT: kmovd %edi, %k1 3778; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 3779; CHECK-NEXT: vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 3780; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 3781; CHECK-NEXT: retq 3782 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3783 %2 = icmp ugt <32 x i16> %a1, %a0 3784 %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0 3785 %4 = bitcast i32 %mask to <32 x i1> 3786 ; load needed to keep the operation from being scheduled about the asm block 3787 %5 = load <32 x i16>, <32 x i16>* %passthru 3788 %6 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %5 3789 ret <32 x i16> %6 3790} 3791 3792define <32 x i16> @stack_fold_pmaxuw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { 3793; CHECK-LABEL: stack_fold_pmaxuw_maskz: 3794; CHECK: # %bb.0: 3795; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3796; CHECK-NEXT: #APP 3797; CHECK-NEXT: nop 3798; CHECK-NEXT: #NO_APP 3799; CHECK-NEXT: kmovd %edi, %k1 3800; CHECK-NEXT: vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 3801; CHECK-NEXT: retq 3802 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3803 %2 = icmp ugt <32 x i16> %a0, %a1 3804 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1 3805 %4 = bitcast i32 %mask to <32 x i1> 3806 %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer 3807 ret <32 x i16> %5 3808} 3809 3810define <32 x i16> @stack_fold_pmaxuw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { 3811; CHECK-LABEL: stack_fold_pmaxuw_maskz_commuted: 3812; CHECK: # %bb.0: 3813; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill 3814; CHECK-NEXT: #APP 3815; CHECK-NEXT: nop 3816; CHECK-NEXT: #NO_APP 3817; CHECK-NEXT: kmovd %edi, %k1 3818; CHECK-NEXT: vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 3819; CHECK-NEXT: retq 3820 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3821 %2 = icmp ugt <32 x i16> %a1, %a0 3822 %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0 3823 %4 = bitcast i32 %mask to <32 x i1> 3824 %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer 3825 ret <32 x i16> %5 3826} 3827 3828define <64 x i8> @stack_fold_pminsb(<64 x i8> %a0, <64 x i8> %a1) { 3829; CHECK-LABEL: stack_fold_pminsb: 3830; CHECK: # %bb.0: 3831; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3832; CHECK-NEXT: #APP 3833; CHECK-NEXT: nop 3834; CHECK-NEXT: #NO_APP 3835; CHECK-NEXT: vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 3836; CHECK-NEXT: retq 3837 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3838 %2 = icmp slt <64 x i8> %a0, %a1 3839 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1 3840 ret <64 x i8> %3 3841} 3842 3843define <64 x i8> @stack_fold_pminsb_commuted(<64 x i8> %a0, <64 x i8> %a1) { 3844; CHECK-LABEL: stack_fold_pminsb_commuted: 3845; CHECK: # %bb.0: 3846; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3847; CHECK-NEXT: #APP 3848; CHECK-NEXT: nop 3849; CHECK-NEXT: #NO_APP 3850; CHECK-NEXT: vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 3851; CHECK-NEXT: retq 3852 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3853 %2 = icmp slt <64 x i8> %a1, %a0 3854 %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0 3855 ret <64 x i8> %3 3856} 3857 3858define <64 x i8> @stack_fold_pminsb_mask(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, <64 x i8>* %passthru) { 3859; CHECK-LABEL: stack_fold_pminsb_mask: 3860; CHECK: # %bb.0: 3861; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3862; CHECK-NEXT: #APP 3863; CHECK-NEXT: nop 3864; CHECK-NEXT: #NO_APP 3865; CHECK-NEXT: kmovq %rdi, %k1 3866; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 3867; CHECK-NEXT: vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 3868; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 3869; CHECK-NEXT: retq 3870 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3871 %2 = icmp slt <64 x i8> %a0, %a1 3872 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1 3873 %4 = bitcast i64 %mask to <64 x i1> 
  ; load needed to keep the operation from being scheduled about the asm block
  %5 = load <64 x i8>, <64 x i8>* %passthru
  %6 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> %5
  ret <64 x i8> %6
}

define <64 x i8> @stack_fold_pminsb_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, <64 x i8>* %passthru) {
; CHECK-LABEL: stack_fold_pminsb_mask_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovq %rdi, %k1
; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
; CHECK-NEXT:    vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp slt <64 x i8> %a1, %a0
  %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
  %4 = bitcast i64 %mask to <64 x i1>
  ; load needed to keep the operation from being scheduled about the asm block
  %5 = load <64 x i8>, <64 x i8>* %passthru
  %6 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> %5
  ret <64 x i8> %6
}

define <64 x i8> @stack_fold_pminsb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; CHECK-LABEL: stack_fold_pminsb_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovq %rdi, %k1
; CHECK-NEXT:    vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp slt <64 x i8> %a0, %a1
  %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
  %4 = bitcast i64 %mask to <64 x i1>
  %5 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> zeroinitializer
  ret <64 x i8> %5
}

define <64 x i8> @stack_fold_pminsb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; CHECK-LABEL: stack_fold_pminsb_maskz_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovq %rdi, %k1
; CHECK-NEXT:    vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp slt <64 x i8> %a1, %a0
  %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
  %4 = bitcast i64 %mask to <64 x i1>
  %5 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> zeroinitializer
  ret <64 x i8> %5
}
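
; Signed vector min tests (vpminsd, vpminsq) follow, in the same order as the
; max tests: plain, commuted, merge-masked (_mask) and zero-masked (_maskz).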

define <16 x i32> @stack_fold_pminsd(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: stack_fold_pminsd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp slt <16 x i32> %a0, %a1
  %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
  ret <16 x i32> %3
}

define <16 x i32> @stack_fold_pminsd_commuted(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: stack_fold_pminsd_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp slt <16 x i32> %a1, %a0
  %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
  ret <16 x i32> %3
}

define <16 x i32> @stack_fold_pminsd_mask(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, <16 x i32>* %passthru) {
; CHECK-LABEL: stack_fold_pminsd_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
; CHECK-NEXT:    vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp slt <16 x i32> %a0, %a1
  %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
  %4 = bitcast i16 %mask to <16 x i1>
  ; load needed to keep the operation from being scheduled about the asm block
  %5 = load <16 x i32>, <16 x i32>* %passthru
  %6 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %5
  ret <16 x i32> %6
}

define <16 x i32> @stack_fold_pminsd_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, <16 x i32>* %passthru) {
; CHECK-LABEL: stack_fold_pminsd_mask_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
; CHECK-NEXT:    vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp slt <16 x i32> %a1, %a0
  %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
  %4 = bitcast i16 %mask to <16 x i1>
  ; load needed to keep the operation from being scheduled about the asm block
  %5 = load <16 x i32>, <16 x i32>* %passthru
  %6 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %5
  ret <16 x i32> %6
}

define <16 x i32> @stack_fold_pminsd_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
; CHECK-LABEL: stack_fold_pminsd_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp slt <16 x i32> %a0, %a1
  %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
  %4 = bitcast i16 %mask to <16 x i1>
  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer
  ret <16 x i32> %5
}

define <16 x i32> @stack_fold_pminsd_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
; CHECK-LABEL: stack_fold_pminsd_maskz_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp slt <16 x i32> %a1, %a0
  %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
  %4 = bitcast i16 %mask to <16 x i1>
  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer
  ret <16 x i32> %5
}

define <8 x i64> @stack_fold_pminsq(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: stack_fold_pminsq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpminsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp slt <8 x i64> %a0, %a1
  %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
  ret <8 x i64> %3
}

define <8 x i64> @stack_fold_pminsq_commuted(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: stack_fold_pminsq_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpminsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp slt <8 x i64> %a1, %a0
  %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
  ret <8 x i64> %3
}

define <8 x i64> @stack_fold_pminsq_mask(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, <8 x i64>* %passthru) {
; CHECK-LABEL: stack_fold_pminsq_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
; CHECK-NEXT:    vpminsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp slt <8 x i64> %a0, %a1
  %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
  %4 = bitcast i8 %mask to <8 x i1>
  ; load needed to keep the operation from being scheduled about the asm block
  %5 = load <8 x i64>, <8 x i64>* %passthru
  %6 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %5
  ret <8 x i64> %6
}

define <8 x i64> @stack_fold_pminsq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, <8 x i64>* %passthru) {
; CHECK-LABEL: stack_fold_pminsq_mask_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
; CHECK-NEXT:    vpminsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp slt <8 x i64> %a1, %a0
  %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
  %4 = bitcast i8 %mask to <8 x i1>
  ; load needed to keep the operation from being scheduled about the asm block
  %5 = load <8 x i64>, <8 x i64>* %passthru
  %6 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %5
  ret <8 x i64> %6
}

define <8 x i64> @stack_fold_pminsq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask)
{ 4123; CHECK-LABEL: stack_fold_pminsq_maskz: 4124; CHECK: # %bb.0: 4125; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4126; CHECK-NEXT: #APP 4127; CHECK-NEXT: nop 4128; CHECK-NEXT: #NO_APP 4129; CHECK-NEXT: kmovd %edi, %k1 4130; CHECK-NEXT: vpminsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 4131; CHECK-NEXT: retq 4132 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4133 %2 = icmp slt <8 x i64> %a0, %a1 4134 %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1 4135 %4 = bitcast i8 %mask to <8 x i1> 4136 %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> zeroinitializer 4137 ret <8 x i64> %5 4138} 4139 4140define <8 x i64> @stack_fold_pminsq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { 4141; CHECK-LABEL: stack_fold_pminsq_maskz_commuted: 4142; CHECK: # %bb.0: 4143; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4144; CHECK-NEXT: #APP 4145; CHECK-NEXT: nop 4146; CHECK-NEXT: #NO_APP 4147; CHECK-NEXT: kmovd %edi, %k1 4148; CHECK-NEXT: vpminsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 4149; CHECK-NEXT: retq 4150 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4151 %2 = icmp slt <8 x i64> %a1, %a0 4152 %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0 4153 %4 = bitcast i8 %mask to <8 x i1> 4154 %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> zeroinitializer 4155 ret <8 x i64> %5 4156} 4157 4158define <32 x i16> @stack_fold_pminsw(<32 x i16> %a0, <32 x i16> %a1) { 4159; CHECK-LABEL: stack_fold_pminsw: 4160; CHECK: # %bb.0: 4161; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4162; CHECK-NEXT: #APP 4163; CHECK-NEXT: nop 4164; CHECK-NEXT: #NO_APP 4165; CHECK-NEXT: vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 4166; CHECK-NEXT: retq 4167 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4168 %2 = icmp slt <32 x i16> %a0, %a1 4169 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1 4170 ret <32 x i16> %3 4171} 4172 4173define <32 x i16> @stack_fold_pminsw_commuted(<32 x i16> %a0, <32 x i16> %a1) { 4174; CHECK-LABEL: stack_fold_pminsw_commuted: 4175; CHECK: # %bb.0: 4176; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4177; CHECK-NEXT: #APP 4178; CHECK-NEXT: nop 4179; CHECK-NEXT: #NO_APP 4180; CHECK-NEXT: vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 4181; CHECK-NEXT: retq 4182 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4183 %2 = icmp slt <32 x i16> %a1, %a0 4184 %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0 4185 ret <32 x i16> %3 4186} 4187 4188define <32 x i16> @stack_fold_pminsw_mask(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, <32 x i16>* %passthru) { 4189; CHECK-LABEL: stack_fold_pminsw_mask: 4190; CHECK: # %bb.0: 4191; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4192; CHECK-NEXT: #APP 4193; CHECK-NEXT: nop 4194; CHECK-NEXT: #NO_APP 4195; CHECK-NEXT: kmovd %edi, %k1 4196; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 4197; CHECK-NEXT: vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 4198; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 4199; CHECK-NEXT: retq 4200 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4201 %2 = icmp slt <32 x i16> %a0, %a1 4202 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1 4203 %4 = bitcast i32 %mask to <32 x i1> 4204 ; load needed to keep the operation from being scheduled about the asm block 4205 %5 = load <32 x i16>, <32 x i16>* %passthru 4206 %6 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %5 4207 ret <32 x i16> %6 4208} 4209 4210define <32 x i16> @stack_fold_pminsw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, <32 x i16>* %passthru) { 4211; CHECK-LABEL: stack_fold_pminsw_mask_commuted: 4212; CHECK: # %bb.0: 4213; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4214; CHECK-NEXT: #APP 4215; CHECK-NEXT: nop 4216; CHECK-NEXT: #NO_APP 4217; CHECK-NEXT: kmovd %edi, %k1 4218; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 4219; CHECK-NEXT: vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 4220; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 4221; CHECK-NEXT: retq 4222 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4223 %2 = icmp slt <32 x i16> %a1, %a0 4224 %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0 4225 %4 = bitcast i32 %mask to <32 x i1> 4226 ; load needed to keep the operation from being scheduled about the asm block 4227 %5 = load <32 x i16>, <32 x i16>* %passthru 4228 %6 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %5 4229 ret <32 x i16> %6 4230} 4231 4232define <32 x i16> @stack_fold_pminsw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { 4233; CHECK-LABEL: stack_fold_pminsw_maskz: 4234; CHECK: # %bb.0: 4235; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4236; CHECK-NEXT: #APP 4237; CHECK-NEXT: nop 4238; CHECK-NEXT: #NO_APP 4239; CHECK-NEXT: kmovd %edi, %k1 4240; CHECK-NEXT: vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 4241; CHECK-NEXT: retq 4242 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4243 %2 = icmp slt <32 x i16> %a0, %a1 4244 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1 4245 %4 = bitcast i32 %mask to <32 x i1> 4246 %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer 4247 ret <32 x i16> %5 4248} 4249 4250define <32 x i16> @stack_fold_pminsw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { 4251; CHECK-LABEL: stack_fold_pminsw_maskz_commuted: 4252; CHECK: # %bb.0: 4253; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4254; CHECK-NEXT: #APP 4255; CHECK-NEXT: nop 4256; CHECK-NEXT: #NO_APP 4257; CHECK-NEXT: kmovd %edi, %k1 4258; CHECK-NEXT: vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 4259; CHECK-NEXT: retq 4260 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4261 %2 = icmp slt <32 x i16> %a1, %a0 4262 %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0 4263 %4 = bitcast i32 %mask to <32 x i1> 4264 %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer 4265 ret <32 x i16> %5 4266} 4267 4268define <64 x i8> @stack_fold_pminub(<64 x i8> %a0, <64 x i8> %a1) { 4269; CHECK-LABEL: stack_fold_pminub: 4270; CHECK: # %bb.0: 4271; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4272; CHECK-NEXT: #APP 4273; CHECK-NEXT: nop 4274; CHECK-NEXT: #NO_APP 4275; CHECK-NEXT: vpminub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 4276; CHECK-NEXT: retq 4277 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4278 %2 = icmp ult <64 x i8> %a0, %a1 4279 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1 4280 ret <64 x i8> %3 4281} 4282 4283define <64 x i8> @stack_fold_pminub_commuted(<64 x i8> %a0, <64 x i8> %a1) { 4284; CHECK-LABEL: stack_fold_pminub_commuted: 4285; CHECK: # %bb.0: 4286; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4287; CHECK-NEXT: #APP 4288; CHECK-NEXT: nop 4289; CHECK-NEXT: #NO_APP 4290; CHECK-NEXT: vpminub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 4291; CHECK-NEXT: retq 4292 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4293 %2 = icmp ult <64 x i8> %a1, %a0 4294 %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0 4295 ret <64 x i8> %3 4296} 4297 4298define <64 x i8> @stack_fold_pminub_mask(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, <64 x i8>* %passthru) { 4299; CHECK-LABEL: stack_fold_pminub_mask: 4300; CHECK: # %bb.0: 4301; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
4302; CHECK-NEXT: #APP 4303; CHECK-NEXT: nop 4304; CHECK-NEXT: #NO_APP 4305; CHECK-NEXT: kmovq %rdi, %k1 4306; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 4307; CHECK-NEXT: vpminub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 4308; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 4309; CHECK-NEXT: retq 4310 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4311 %2 = icmp ult <64 x i8> %a0, %a1 4312 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1 4313 %4 = bitcast i64 %mask to <64 x i1> 4314 ; load needed to keep the operation from being scheduled about the asm block 4315 %5 = load <64 x i8>, <64 x i8>* %passthru 4316 %6 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> %5 4317 ret <64 x i8> %6 4318} 4319 4320define <64 x i8> @stack_fold_pminub_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, <64 x i8>* %passthru) { 4321; CHECK-LABEL: stack_fold_pminub_mask_commuted: 4322; CHECK: # %bb.0: 4323; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4324; CHECK-NEXT: #APP 4325; CHECK-NEXT: nop 4326; CHECK-NEXT: #NO_APP 4327; CHECK-NEXT: kmovq %rdi, %k1 4328; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 4329; CHECK-NEXT: vpminub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 4330; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 4331; CHECK-NEXT: retq 4332 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4333 %2 = icmp ult <64 x i8> %a1, %a0 4334 %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0 4335 %4 = bitcast i64 %mask to <64 x i1> 4336 ; load needed to keep the operation from being scheduled about the asm block 4337 %5 = load <64 x i8>, <64 x i8>* %passthru 4338 %6 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> %5 4339 ret <64 x i8> %6 4340} 4341 4342define <64 x i8> @stack_fold_pminub_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { 4343; CHECK-LABEL: stack_fold_pminub_maskz: 4344; CHECK: # %bb.0: 4345; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4346; CHECK-NEXT: #APP 4347; CHECK-NEXT: nop 4348; CHECK-NEXT: #NO_APP 4349; CHECK-NEXT: kmovq %rdi, %k1 4350; CHECK-NEXT: vpminub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 4351; CHECK-NEXT: retq 4352 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4353 %2 = icmp ult <64 x i8> %a0, %a1 4354 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1 4355 %4 = bitcast i64 %mask to <64 x i1> 4356 %5 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> zeroinitializer 4357 ret <64 x i8> %5 4358} 4359 4360define <64 x i8> @stack_fold_pminub_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { 4361; CHECK-LABEL: stack_fold_pminub_maskz_commuted: 4362; CHECK: # %bb.0: 4363; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4364; CHECK-NEXT: 
#APP 4365; CHECK-NEXT: nop 4366; CHECK-NEXT: #NO_APP 4367; CHECK-NEXT: kmovq %rdi, %k1 4368; CHECK-NEXT: vpminub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 4369; CHECK-NEXT: retq 4370 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4371 %2 = icmp ult <64 x i8> %a1, %a0 4372 %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0 4373 %4 = bitcast i64 %mask to <64 x i1> 4374 %5 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> zeroinitializer 4375 ret <64 x i8> %5 4376} 4377 4378define <16 x i32> @stack_fold_pminud(<16 x i32> %a0, <16 x i32> %a1) { 4379; CHECK-LABEL: stack_fold_pminud: 4380; CHECK: # %bb.0: 4381; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4382; CHECK-NEXT: #APP 4383; CHECK-NEXT: nop 4384; CHECK-NEXT: #NO_APP 4385; CHECK-NEXT: vpminud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 4386; CHECK-NEXT: retq 4387 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4388 %2 = icmp ult <16 x i32> %a0, %a1 4389 %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1 4390 ret <16 x i32> %3 4391} 4392 4393define <16 x i32> @stack_fold_pminud_commuted(<16 x i32> %a0, <16 x i32> %a1) { 4394; CHECK-LABEL: stack_fold_pminud_commuted: 4395; CHECK: # %bb.0: 4396; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4397; CHECK-NEXT: #APP 4398; CHECK-NEXT: nop 4399; CHECK-NEXT: #NO_APP 4400; CHECK-NEXT: vpminud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 4401; CHECK-NEXT: retq 4402 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4403 %2 = icmp ult <16 x i32> %a1, %a0 4404 %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0 4405 ret <16 x i32> %3 4406} 4407 4408define <16 x i32> @stack_fold_pminud_mask(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, <16 x i32>* %passthru) { 4409; CHECK-LABEL: stack_fold_pminud_mask: 4410; CHECK: # %bb.0: 4411; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4412; CHECK-NEXT: #APP 4413; CHECK-NEXT: nop 4414; CHECK-NEXT: #NO_APP 4415; CHECK-NEXT: kmovd %edi, %k1 4416; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 4417; CHECK-NEXT: vpminud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 4418; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 4419; CHECK-NEXT: retq 4420 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4421 %2 = icmp ult <16 x i32> %a0, %a1 4422 %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1 4423 %4 = bitcast i16 %mask to <16 x i1> 4424 ; 
load needed to keep the operation from being scheduled about the asm block 4425 %5 = load <16 x i32>, <16 x i32>* %passthru 4426 %6 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %5 4427 ret <16 x i32> %6 4428} 4429 4430define <16 x i32> @stack_fold_pminud_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, <16 x i32>* %passthru) { 4431; CHECK-LABEL: stack_fold_pminud_mask_commuted: 4432; CHECK: # %bb.0: 4433; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4434; CHECK-NEXT: #APP 4435; CHECK-NEXT: nop 4436; CHECK-NEXT: #NO_APP 4437; CHECK-NEXT: kmovd %edi, %k1 4438; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 4439; CHECK-NEXT: vpminud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 4440; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 4441; CHECK-NEXT: retq 4442 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4443 %2 = icmp ult <16 x i32> %a1, %a0 4444 %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0 4445 %4 = bitcast i16 %mask to <16 x i1> 4446 ; load needed to keep the operation from being scheduled about the asm block 4447 %5 = load <16 x i32>, <16 x i32>* %passthru 4448 %6 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %5 4449 ret <16 x i32> %6 4450} 4451 4452define <16 x i32> @stack_fold_pminud_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { 4453; CHECK-LABEL: stack_fold_pminud_maskz: 4454; CHECK: # %bb.0: 4455; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4456; CHECK-NEXT: #APP 4457; CHECK-NEXT: nop 4458; CHECK-NEXT: #NO_APP 4459; CHECK-NEXT: kmovd %edi, %k1 4460; CHECK-NEXT: vpminud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 4461; CHECK-NEXT: retq 4462 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4463 %2 = icmp ult <16 x i32> %a0, %a1 4464 %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1 4465 %4 = bitcast i16 %mask to <16 x i1> 4466 %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer 4467 ret <16 x i32> %5 4468} 4469 4470define <16 x i32> @stack_fold_pminud_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { 4471; CHECK-LABEL: stack_fold_pminud_maskz_commuted: 4472; CHECK: # %bb.0: 4473; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4474; CHECK-NEXT: #APP 4475; CHECK-NEXT: nop 4476; CHECK-NEXT: #NO_APP 4477; CHECK-NEXT: kmovd %edi, %k1 4478; CHECK-NEXT: vpminud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 4479; CHECK-NEXT: retq 4480 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4481 %2 = icmp ult <16 x i32> %a1, %a0 4482 %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0 4483 %4 = bitcast i16 %mask to <16 x i1> 4484 %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer 
4485 ret <16 x i32> %5 4486} 4487 4488define <8 x i64> @stack_fold_pminuq(<8 x i64> %a0, <8 x i64> %a1) { 4489; CHECK-LABEL: stack_fold_pminuq: 4490; CHECK: # %bb.0: 4491; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4492; CHECK-NEXT: #APP 4493; CHECK-NEXT: nop 4494; CHECK-NEXT: #NO_APP 4495; CHECK-NEXT: vpminuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 4496; CHECK-NEXT: retq 4497 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4498 %2 = icmp ult <8 x i64> %a0, %a1 4499 %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1 4500 ret <8 x i64> %3 4501} 4502 4503define <8 x i64> @stack_fold_pminuq_commuted(<8 x i64> %a0, <8 x i64> %a1) { 4504; CHECK-LABEL: stack_fold_pminuq_commuted: 4505; CHECK: # %bb.0: 4506; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4507; CHECK-NEXT: #APP 4508; CHECK-NEXT: nop 4509; CHECK-NEXT: #NO_APP 4510; CHECK-NEXT: vpminuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 4511; CHECK-NEXT: retq 4512 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4513 %2 = icmp ult <8 x i64> %a1, %a0 4514 %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0 4515 ret <8 x i64> %3 4516} 4517 4518define <8 x i64> @stack_fold_pminuq_mask(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, <8 x i64>* %passthru) { 4519; CHECK-LABEL: stack_fold_pminuq_mask: 4520; CHECK: # %bb.0: 4521; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4522; CHECK-NEXT: #APP 4523; CHECK-NEXT: nop 4524; CHECK-NEXT: #NO_APP 4525; CHECK-NEXT: kmovd %edi, %k1 4526; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 4527; CHECK-NEXT: vpminuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 4528; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 4529; CHECK-NEXT: retq 4530 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4531 %2 = icmp ult <8 x i64> %a0, %a1 4532 %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1 4533 %4 = bitcast i8 %mask to <8 x i1> 4534 ; load needed to keep the operation from being scheduled about the asm block 4535 %5 = load <8 x i64>, <8 x i64>* %passthru 4536 %6 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %5 4537 ret <8 x i64> %6 4538} 4539 4540define <8 x i64> @stack_fold_pminuq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, <8 x i64>* %passthru) { 4541; CHECK-LABEL: stack_fold_pminuq_mask_commuted: 4542; CHECK: # %bb.0: 4543; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4544; CHECK-NEXT: #APP 4545; CHECK-NEXT: nop 4546; CHECK-NEXT: #NO_APP 4547; CHECK-NEXT: kmovd %edi, %k1 4548; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 4549; CHECK-NEXT: vpminuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 4550; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 4551; 
CHECK-NEXT: retq 4552 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4553 %2 = icmp ult <8 x i64> %a1, %a0 4554 %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0 4555 %4 = bitcast i8 %mask to <8 x i1> 4556 ; load needed to keep the operation from being scheduled about the asm block 4557 %5 = load <8 x i64>, <8 x i64>* %passthru 4558 %6 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %5 4559 ret <8 x i64> %6 4560} 4561 4562define <8 x i64> @stack_fold_pminuq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { 4563; CHECK-LABEL: stack_fold_pminuq_maskz: 4564; CHECK: # %bb.0: 4565; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4566; CHECK-NEXT: #APP 4567; CHECK-NEXT: nop 4568; CHECK-NEXT: #NO_APP 4569; CHECK-NEXT: kmovd %edi, %k1 4570; CHECK-NEXT: vpminuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 4571; CHECK-NEXT: retq 4572 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4573 %2 = icmp ult <8 x i64> %a0, %a1 4574 %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1 4575 %4 = bitcast i8 %mask to <8 x i1> 4576 %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> zeroinitializer 4577 ret <8 x i64> %5 4578} 4579 4580define <8 x i64> @stack_fold_pminuq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { 4581; CHECK-LABEL: stack_fold_pminuq_maskz_commuted: 4582; CHECK: # %bb.0: 4583; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4584; CHECK-NEXT: #APP 4585; CHECK-NEXT: nop 4586; CHECK-NEXT: #NO_APP 4587; CHECK-NEXT: kmovd %edi, %k1 4588; CHECK-NEXT: vpminuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 4589; CHECK-NEXT: retq 4590 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4591 %2 = icmp ult <8 x i64> %a1, %a0 4592 %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0 4593 %4 = bitcast i8 %mask to <8 x i1> 4594 %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> zeroinitializer 4595 ret <8 x i64> %5 4596} 4597 4598define <32 x i16> @stack_fold_pminuw(<32 x i16> %a0, <32 x i16> %a1) { 4599; CHECK-LABEL: stack_fold_pminuw: 4600; CHECK: # %bb.0: 4601; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4602; CHECK-NEXT: #APP 4603; CHECK-NEXT: nop 4604; CHECK-NEXT: #NO_APP 4605; CHECK-NEXT: vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 4606; CHECK-NEXT: retq 4607 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4608 %2 = icmp ult <32 x i16> %a0, %a1 4609 %3 = select <32 x 
i1> %2, <32 x i16> %a0, <32 x i16> %a1 4610 ret <32 x i16> %3 4611} 4612 4613define <32 x i16> @stack_fold_pminuw_commuted(<32 x i16> %a0, <32 x i16> %a1) { 4614; CHECK-LABEL: stack_fold_pminuw_commuted: 4615; CHECK: # %bb.0: 4616; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4617; CHECK-NEXT: #APP 4618; CHECK-NEXT: nop 4619; CHECK-NEXT: #NO_APP 4620; CHECK-NEXT: vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 4621; CHECK-NEXT: retq 4622 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4623 %2 = icmp ult <32 x i16> %a1, %a0 4624 %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0 4625 ret <32 x i16> %3 4626} 4627 4628define <32 x i16> @stack_fold_pminuw_mask(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, <32 x i16>* %passthru) { 4629; CHECK-LABEL: stack_fold_pminuw_mask: 4630; CHECK: # %bb.0: 4631; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4632; CHECK-NEXT: #APP 4633; CHECK-NEXT: nop 4634; CHECK-NEXT: #NO_APP 4635; CHECK-NEXT: kmovd %edi, %k1 4636; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 4637; CHECK-NEXT: vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 4638; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 4639; CHECK-NEXT: retq 4640 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4641 %2 = icmp ult <32 x i16> %a0, %a1 4642 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1 4643 %4 = bitcast i32 %mask to <32 x i1> 4644 ; load needed to keep the operation from being scheduled about the asm block 4645 %5 = load <32 x i16>, <32 x i16>* %passthru 4646 %6 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %5 4647 ret <32 x i16> %6 4648} 4649 4650define <32 x i16> @stack_fold_pminuw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, <32 x i16>* %passthru) { 4651; CHECK-LABEL: stack_fold_pminuw_mask_commuted: 4652; CHECK: # %bb.0: 4653; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4654; CHECK-NEXT: #APP 4655; CHECK-NEXT: nop 4656; CHECK-NEXT: #NO_APP 4657; CHECK-NEXT: kmovd %edi, %k1 4658; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 4659; CHECK-NEXT: vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 4660; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 4661; CHECK-NEXT: retq 4662 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4663 %2 = icmp ult <32 x i16> %a1, %a0 4664 %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0 4665 %4 = bitcast i32 %mask to <32 x i1> 4666 ; load needed to keep the operation from being scheduled about the asm block 4667 %5 = load <32 x i16>, <32 x i16>* %passthru 4668 %6 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %5 4669 ret <32 x i16> %6 4670} 4671 4672define <32 x i16> @stack_fold_pminuw_maskz(<32 x i16> %a0, <32 x i16> 
%a1, i32 %mask) { 4673; CHECK-LABEL: stack_fold_pminuw_maskz: 4674; CHECK: # %bb.0: 4675; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4676; CHECK-NEXT: #APP 4677; CHECK-NEXT: nop 4678; CHECK-NEXT: #NO_APP 4679; CHECK-NEXT: kmovd %edi, %k1 4680; CHECK-NEXT: vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 4681; CHECK-NEXT: retq 4682 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4683 %2 = icmp ult <32 x i16> %a0, %a1 4684 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1 4685 %4 = bitcast i32 %mask to <32 x i1> 4686 %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer 4687 ret <32 x i16> %5 4688} 4689 4690define <32 x i16> @stack_fold_pminuw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { 4691; CHECK-LABEL: stack_fold_pminuw_maskz_commuted: 4692; CHECK: # %bb.0: 4693; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4694; CHECK-NEXT: #APP 4695; CHECK-NEXT: nop 4696; CHECK-NEXT: #NO_APP 4697; CHECK-NEXT: kmovd %edi, %k1 4698; CHECK-NEXT: vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 4699; CHECK-NEXT: retq 4700 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4701 %2 = icmp ult <32 x i16> %a1, %a0 4702 %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0 4703 %4 = bitcast i32 %mask to <32 x i1> 4704 %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer 4705 ret <32 x i16> %5 4706} 4707 4708define <16 x i8> @stack_fold_vpmovdb(<16 x i32> %a0) { 4709; CHECK-LABEL: stack_fold_vpmovdb: 4710; CHECK: # %bb.0: 4711; CHECK-NEXT: vpmovdb %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill 4712; CHECK-NEXT: #APP 4713; CHECK-NEXT: nop 4714; CHECK-NEXT: #NO_APP 4715; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4716; CHECK-NEXT: vzeroupper 4717; CHECK-NEXT: retq 4718 %1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %a0, <16 x i8> undef, i16 -1) 4719 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4720 ret <16 x i8> %1 4721} 4722declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32>, <16 x i8>, i16) 4723 4724define <16 x i16> @stack_fold_vpmovdw(<16 x i32> %a0) { 4725; CHECK-LABEL: stack_fold_vpmovdw: 4726; CHECK: # %bb.0: 4727; CHECK-NEXT: vpmovdw %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill 4728; CHECK-NEXT: #APP 4729; CHECK-NEXT: nop 4730; CHECK-NEXT: #NO_APP 4731; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4732; CHECK-NEXT: retq 4733 %1 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %a0, <16 x i16> undef, i16 -1) 4734 %2 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4735 ret <16 x i16> %1 4736} 4737declare <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32>, <16 x i16>, i16) 4738 4739define <2 x i64> @stack_fold_movq_load(<2 x i64> %a0) { 4740; CHECK-LABEL: stack_fold_movq_load: 4741; CHECK: # %bb.0: 4742; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4743; CHECK-NEXT: #APP 4744; CHECK-NEXT: nop 4745; CHECK-NEXT: #NO_APP 4746; CHECK-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4747; CHECK-NEXT: # xmm0 = mem[0],zero 4748; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 4749; CHECK-NEXT: vpsubq %xmm1, %xmm0, %xmm0 4750; CHECK-NEXT: retq 4751 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4752 %2 = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 2> 4753 ; add forces execution domain 4754 %3 = add <2 x i64> %2, <i64 1, i64 1> 4755 ret <2 x i64> %3 4756} 4757 4758define <8 x i32> @stack_fold_vpmovqd(<8 x i64> %a0) { 4759; CHECK-LABEL: stack_fold_vpmovqd: 4760; CHECK: # %bb.0: 4761; CHECK-NEXT: vpmovqd %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill 4762; CHECK-NEXT: #APP 4763; CHECK-NEXT: nop 4764; CHECK-NEXT: #NO_APP 4765; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4766; CHECK-NEXT: retq 4767 %1 = trunc <8 x i64> %a0 to <8 x i32> 4768 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4769 ret <8 x i32> %1 4770} 4771declare <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64>, <8 x i32>, i8) 4772 4773define <8 x i16> @stack_fold_vpmovqw(<8 x i64> %a0) { 4774; CHECK-LABEL: stack_fold_vpmovqw: 4775; CHECK: # %bb.0: 4776; CHECK-NEXT: vpmovqw %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill 4777; CHECK-NEXT: #APP 4778; CHECK-NEXT: nop 4779; CHECK-NEXT: #NO_APP 4780; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4781; CHECK-NEXT: vzeroupper 4782; CHECK-NEXT: retq 4783 %1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %a0, <8 x i16> undef, i8 -1) 4784 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4785 ret <8 x i16> %1 4786} 4787declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64>, <8 x i16>, i8) 4788 4789define <32 x i8> @stack_fold_vpmovwb(<32 x i16> %a0) { 4790; CHECK-LABEL: stack_fold_vpmovwb: 4791; CHECK: # %bb.0: 4792; CHECK-NEXT: vpmovwb %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill 4793; CHECK-NEXT: #APP 4794; CHECK-NEXT: nop 4795; CHECK-NEXT: #NO_APP 4796; CHECK-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4797; CHECK-NEXT: retq 4798 %1 = trunc <32 x i16> %a0 to <32 x i8> 4799 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4800 ret <32 x i8> %1 4801} 4802declare <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16>, <32 x i8>, i32) 4803 4804define <16 x i8> @stack_fold_vpmovsdb(<16 x i32> %a0) { 4805; CHECK-LABEL: stack_fold_vpmovsdb: 4806; CHECK: # %bb.0: 4807; CHECK-NEXT: vpmovsdb %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill 4808; CHECK-NEXT: #APP 4809; CHECK-NEXT: nop 4810; CHECK-NEXT: #NO_APP 4811; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4812; CHECK-NEXT: vzeroupper 4813; CHECK-NEXT: retq 4814 %1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %a0, <16 x i8> undef, i16 -1) 4815 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4816 ret <16 x i8> %1 4817} 4818declare <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32>, <16 x i8>, i16) 4819 4820define <16 x i16> @stack_fold_vpmovsdw(<16 x i32> %a0) { 4821; CHECK-LABEL: stack_fold_vpmovsdw: 4822; CHECK: # %bb.0: 4823; CHECK-NEXT: vpmovsdw %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill 4824; CHECK-NEXT: #APP 4825; CHECK-NEXT: nop 4826; CHECK-NEXT: #NO_APP 4827; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4828; CHECK-NEXT: retq 4829 %1 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %a0, <16 x i16> undef, i16 -1) 4830 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4831 ret <16 x i16> %1 4832} 4833declare <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32>, <16 x i16>, i16) 4834 4835define <8 x i32> @stack_fold_vpmovsqd(<8 x i64> %a0) { 4836; CHECK-LABEL: stack_fold_vpmovsqd: 4837; CHECK: # %bb.0: 4838; CHECK-NEXT: vpmovsqd %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill 4839; CHECK-NEXT: #APP 4840; CHECK-NEXT: nop 4841; CHECK-NEXT: #NO_APP 4842; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4843; CHECK-NEXT: retq 4844 %1 = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %a0, <8 x i32> undef, i8 -1) 4845 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4846 ret <8 x i32> %1 4847} 4848declare <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64>, <8 x i32>, i8) 4849 4850define <8 x i16> @stack_fold_vpmovsqw(<8 x i64> %a0) { 4851; CHECK-LABEL: stack_fold_vpmovsqw: 4852; CHECK: # %bb.0: 4853; CHECK-NEXT: vpmovsqw %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill 4854; CHECK-NEXT: #APP 4855; CHECK-NEXT: nop 4856; CHECK-NEXT: #NO_APP 4857; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4858; CHECK-NEXT: vzeroupper 4859; CHECK-NEXT: retq 4860 %1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %a0, <8 x i16> undef, i8 -1) 4861 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4862 ret <8 x i16> %1 4863} 4864declare <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64>, <8 x i16>, i8) 4865 4866define <32 x i8> @stack_fold_vpmovswb(<32 x i16> %a0) { 4867; CHECK-LABEL: stack_fold_vpmovswb: 4868; CHECK: # %bb.0: 4869; CHECK-NEXT: vpmovswb %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill 4870; CHECK-NEXT: #APP 4871; CHECK-NEXT: nop 4872; CHECK-NEXT: #NO_APP 4873; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4874; CHECK-NEXT: retq 4875 %1 = call <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16> %a0, <32 x i8> undef, i32 -1) 4876 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4877 ret <32 x i8> %1 4878} 4879declare <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16>, <32 x i8>, i32) 4880 4881define <16 x i32> @stack_fold_pmovsxbd_zmm(<16 x i8> %a0) { 4882; CHECK-LABEL: stack_fold_pmovsxbd_zmm: 4883; CHECK: # %bb.0: 4884; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4885; CHECK-NEXT: #APP 4886; CHECK-NEXT: nop 4887; CHECK-NEXT: #NO_APP 4888; CHECK-NEXT: vpmovsxbd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 16-byte Folded Reload 4889; CHECK-NEXT: retq 4890 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4891 %2 = sext <16 x i8> %a0 to <16 x i32> 4892 ret <16 x i32> %2 4893} 4894 4895define <8 x i64> @stack_fold_pmovsxbq_zmm(<16 x i8> %a0) { 4896; CHECK-LABEL: stack_fold_pmovsxbq_zmm: 4897; CHECK: # %bb.0: 4898; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4899; CHECK-NEXT: #APP 4900; CHECK-NEXT: nop 4901; CHECK-NEXT: #NO_APP 4902; CHECK-NEXT: vpmovsxbq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 16-byte Folded Reload 4903; CHECK-NEXT: retq 4904 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4905 %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 4906 %3 = sext <8 x i8> %2 to <8 x i64> 4907 ret <8 x i64> %3 4908} 4909 4910define <32 x i16> @stack_fold_pmovsxbw_zmm(<32 x i8> %a0) { 4911; CHECK-LABEL: stack_fold_pmovsxbw_zmm: 4912; 
CHECK: # %bb.0: 4913; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4914; CHECK-NEXT: #APP 4915; CHECK-NEXT: nop 4916; CHECK-NEXT: #NO_APP 4917; CHECK-NEXT: vpmovsxbw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload 4918; CHECK-NEXT: retq 4919 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4920 %2 = sext <32 x i8> %a0 to <32 x i16> 4921 ret <32 x i16> %2 4922} 4923 4924define <8 x i64> @stack_fold_pmovsxdq_zmm(<8 x i32> %a0) { 4925; CHECK-LABEL: stack_fold_pmovsxdq_zmm: 4926; CHECK: # %bb.0: 4927; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4928; CHECK-NEXT: #APP 4929; CHECK-NEXT: nop 4930; CHECK-NEXT: #NO_APP 4931; CHECK-NEXT: vpmovsxdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload 4932; CHECK-NEXT: retq 4933 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4934 %2 = sext <8 x i32> %a0 to <8 x i64> 4935 ret <8 x i64> %2 4936} 4937 4938define <16 x i32> @stack_fold_pmovsxwd_zmm(<16 x i16> %a0) { 4939; CHECK-LABEL: stack_fold_pmovsxwd_zmm: 4940; CHECK: # %bb.0: 4941; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4942; CHECK-NEXT: #APP 4943; CHECK-NEXT: nop 4944; CHECK-NEXT: #NO_APP 4945; CHECK-NEXT: vpmovsxwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload 4946; CHECK-NEXT: retq 4947 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4948 %2 = sext <16 x i16> %a0 to <16 x i32> 4949 ret <16 x i32> %2 4950} 4951 4952define <8 x i64> @stack_fold_pmovsxwq_zmm(<8 x i16> %a0) { 4953; CHECK-LABEL: stack_fold_pmovsxwq_zmm: 4954; CHECK: # %bb.0: 4955; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4956; CHECK-NEXT: #APP 4957; CHECK-NEXT: nop 4958; CHECK-NEXT: #NO_APP 4959; CHECK-NEXT: vpmovsxwq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 16-byte Folded Reload 4960; CHECK-NEXT: retq 4961 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4962 %2 = sext <8 x i16> %a0 to <8 x i64> 4963 ret <8 x i64> %2 4964} 4965 4966define <8 x i64> @stack_fold_pmovsxwq_mask_zmm(<8 x i64> %passthru, <8 x i16> %a0, i8 %mask) { 4967; CHECK-LABEL: stack_fold_pmovsxwq_mask_zmm: 4968; CHECK: # %bb.0: 4969; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4970; CHECK-NEXT: #APP 4971; CHECK-NEXT: nop 4972; CHECK-NEXT: #NO_APP 4973; CHECK-NEXT: kmovd %edi, %k1 4974; CHECK-NEXT: vpmovsxwq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 16-byte Folded Reload 4975; CHECK-NEXT: retq 4976 %1 = tail call <2 
x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4977 %2 = sext <8 x i16> %a0 to <8 x i64> 4978 %3 = bitcast i8 %mask to <8 x i1> 4979 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %passthru 4980 ret <8 x i64> %4 4981} 4982 4983define <8 x i64> @stack_fold_pmovsxwq_maskz_zmm(<8 x i16> %a0, i8 %mask) { 4984; CHECK-LABEL: stack_fold_pmovsxwq_maskz_zmm: 4985; CHECK: # %bb.0: 4986; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4987; CHECK-NEXT: #APP 4988; CHECK-NEXT: nop 4989; CHECK-NEXT: #NO_APP 4990; CHECK-NEXT: kmovd %edi, %k1 4991; CHECK-NEXT: vpmovsxwq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 16-byte Folded Reload 4992; CHECK-NEXT: retq 4993 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4994 %2 = sext <8 x i16> %a0 to <8 x i64> 4995 %3 = bitcast i8 %mask to <8 x i1> 4996 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer 4997 ret <8 x i64> %4 4998} 4999 5000define <16 x i8> @stack_fold_vpmovusdb(<16 x i32> %a0) { 5001; CHECK-LABEL: stack_fold_vpmovusdb: 5002; CHECK: # %bb.0: 5003; CHECK-NEXT: vpmovusdb %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill 5004; CHECK-NEXT: #APP 5005; CHECK-NEXT: nop 5006; CHECK-NEXT: #NO_APP 5007; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5008; CHECK-NEXT: vzeroupper 5009; CHECK-NEXT: retq 5010 %1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %a0, <16 x i8> undef, i16 -1) 5011 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5012 ret <16 x i8> %1 5013} 5014declare <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32>, <16 x i8>, i16) 5015 5016define <16 x i16> @stack_fold_vpmovusdw(<16 x i32> %a0) { 5017; CHECK-LABEL: stack_fold_vpmovusdw: 5018; CHECK: # %bb.0: 5019; CHECK-NEXT: vpmovusdw %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill 5020; CHECK-NEXT: #APP 5021; CHECK-NEXT: nop 5022; CHECK-NEXT: #NO_APP 5023; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 5024; CHECK-NEXT: retq 5025 %1 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %a0, <16 x i16> undef, i16 -1) 5026 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5027 ret <16 x i16> %1 5028} 5029declare <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32>, <16 x i16>, i16) 5030 5031define <8 x i32> @stack_fold_vpmovusqd(<8 x i64> %a0) { 5032; CHECK-LABEL: stack_fold_vpmovusqd: 5033; CHECK: # %bb.0: 5034; CHECK-NEXT: vpmovusqd %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Folded Spill 5035; CHECK-NEXT: #APP 5036; CHECK-NEXT: nop 5037; CHECK-NEXT: #NO_APP 5038; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 5039; CHECK-NEXT: retq 5040 %1 = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %a0, <8 x i32> undef, i8 -1) 5041 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5042 ret <8 x i32> %1 5043} 5044declare <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64>, <8 x i32>, i8) 5045 5046define <8 x i16> @stack_fold_vpmovusqw(<8 x i64> %a0) { 5047; CHECK-LABEL: stack_fold_vpmovusqw: 5048; CHECK: # %bb.0: 5049; CHECK-NEXT: vpmovusqw %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill 5050; CHECK-NEXT: #APP 5051; CHECK-NEXT: nop 5052; CHECK-NEXT: #NO_APP 5053; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5054; CHECK-NEXT: vzeroupper 5055; CHECK-NEXT: retq 5056 %1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %a0, <8 x i16> undef, i8 -1) 5057 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5058 ret <8 x i16> %1 5059} 5060declare <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64>, <8 x i16>, i8) 5061 5062define <32 x i8> @stack_fold_vpmovuswb(<32 x i16> %a0) { 5063; CHECK-LABEL: stack_fold_vpmovuswb: 5064; CHECK: # %bb.0: 5065; CHECK-NEXT: vpmovuswb %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill 5066; CHECK-NEXT: #APP 5067; CHECK-NEXT: nop 5068; CHECK-NEXT: #NO_APP 5069; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 5070; CHECK-NEXT: retq 5071 %1 = call <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16> %a0, <32 x i8> undef, i32 -1) 5072 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5073 ret <32 x i8> %1 5074} 5075declare <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16>, <32 x i8>, i32) 5076 5077define <16 x i32> @stack_fold_pmovzxbd_zmm(<16 x i8> %a0) { 5078; CHECK-LABEL: stack_fold_pmovzxbd_zmm: 5079; CHECK: # %bb.0: 5080; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5081; CHECK-NEXT: #APP 5082; CHECK-NEXT: nop 5083; CHECK-NEXT: #NO_APP 5084; CHECK-NEXT: vpmovzxbd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 16-byte Folded Reload 5085; CHECK-NEXT: # zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero 5086; CHECK-NEXT: retq 5087 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5088 %2 = zext <16 x i8> %a0 to <16 x i32> 5089 ret <16 x i32> %2 5090} 5091 5092define <8 x i64> @stack_fold_pmovzxbq_zmm(<16 x i8> %a0) { 5093; CHECK-LABEL: stack_fold_pmovzxbq_zmm: 5094; CHECK: # %bb.0: 5095; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5096; CHECK-NEXT: #APP 5097; CHECK-NEXT: nop 5098; CHECK-NEXT: #NO_APP 5099; CHECK-NEXT: vpmovzxbq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 16-byte Folded Reload 5100; CHECK-NEXT: # zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero 5101; CHECK-NEXT: retq 5102 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5103 %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 5104 %3 = zext <8 x i8> %2 to <8 x i64> 5105 ret <8 x i64> %3 5106} 5107 5108define <32 x i16> @stack_fold_pmovzxbw_zmm(<32 x i8> %a0) { 5109; CHECK-LABEL: stack_fold_pmovzxbw_zmm: 5110; CHECK: # %bb.0: 5111; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5112; CHECK-NEXT: #APP 5113; CHECK-NEXT: nop 5114; CHECK-NEXT: #NO_APP 5115; CHECK-NEXT: vpmovzxbw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload 5116; CHECK-NEXT: # zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero,mem[16],zero,mem[17],zero,mem[18],zero,mem[19],zero,mem[20],zero,mem[21],zero,mem[22],zero,mem[23],zero,mem[24],zero,mem[25],zero,mem[26],zero,mem[27],zero,mem[28],zero,mem[29],zero,mem[30],zero,mem[31],zero 5117; CHECK-NEXT: retq 5118 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5119 %2 = zext <32 x i8> %a0 to <32 x i16> 5120 ret <32 x i16> %2 5121} 5122 5123define <8 x i64> @stack_fold_pmovzxdq_zmm(<8 x i32> %a0) { 5124; CHECK-LABEL: stack_fold_pmovzxdq_zmm: 5125; CHECK: # %bb.0: 5126; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5127; CHECK-NEXT: #APP 5128; CHECK-NEXT: nop 5129; CHECK-NEXT: #NO_APP 5130; CHECK-NEXT: vpmovzxdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload 5131; CHECK-NEXT: # zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 5132; CHECK-NEXT: retq 5133 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5134 %2 = zext <8 x i32> %a0 to <8 x i64> 5135 ret <8 x i64> %2 5136} 5137 5138define <16 x i32> @stack_fold_pmovzxwd_zmm(<16 x i16> %a0) { 5139; CHECK-LABEL: stack_fold_pmovzxwd_zmm: 5140; CHECK: # %bb.0: 5141; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5142; CHECK-NEXT: #APP 5143; CHECK-NEXT: nop 5144; CHECK-NEXT: #NO_APP 5145; CHECK-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload 5146; CHECK-NEXT: # zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero 5147; CHECK-NEXT: retq 5148 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5149 %2 = zext <16 x i16> %a0 to <16 x i32> 5150 ret <16 x i32> %2 5151} 5152 5153define <8 x i64> @stack_fold_pmovzxwq_zmm(<8 x i16> %a0) { 5154; CHECK-LABEL: stack_fold_pmovzxwq_zmm: 5155; CHECK: # %bb.0: 5156; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5157; CHECK-NEXT: #APP 5158; CHECK-NEXT: nop 5159; CHECK-NEXT: #NO_APP 5160; CHECK-NEXT: vpmovzxwq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 16-byte Folded Reload 5161; CHECK-NEXT: # zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero 5162; CHECK-NEXT: retq 5163 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5164 %2 = zext <8 x i16> %a0 to <8 x i64> 5165 ret <8 x i64> %2 5166} 5167 5168define <8 x i64> @stack_fold_pmovzxwq_mask_zmm(<8 x i64> %passthru, <8 x i16> %a0, i8 %mask) { 5169; CHECK-LABEL: stack_fold_pmovzxwq_mask_zmm: 5170; CHECK: # %bb.0: 5171; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5172; CHECK-NEXT: #APP 5173; CHECK-NEXT: nop 5174; CHECK-NEXT: #NO_APP 5175; CHECK-NEXT: kmovd %edi, %k1 5176; CHECK-NEXT: vpmovzxwq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 16-byte Folded Reload 5177; CHECK-NEXT: # zmm0 {%k1} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero 5178; CHECK-NEXT: retq 5179 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5180 %2 = zext <8 x i16> %a0 to <8 x i64> 5181 %3 = bitcast i8 %mask to <8 x i1> 5182 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %passthru 5183 ret <8 x i64> %4 
5184} 5185 5186define <8 x i64> @stack_fold_pmovzxwq_maskz_zmm(<8 x i16> %a0, i8 %mask) { 5187; CHECK-LABEL: stack_fold_pmovzxwq_maskz_zmm: 5188; CHECK: # %bb.0: 5189; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5190; CHECK-NEXT: #APP 5191; CHECK-NEXT: nop 5192; CHECK-NEXT: #NO_APP 5193; CHECK-NEXT: kmovd %edi, %k1 5194; CHECK-NEXT: vpmovzxwq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 16-byte Folded Reload 5195; CHECK-NEXT: # zmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero 5196; CHECK-NEXT: retq 5197 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5198 %2 = zext <8 x i16> %a0 to <8 x i64> 5199 %3 = bitcast i8 %mask to <8 x i1> 5200 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer 5201 ret <8 x i64> %4 5202} 5203 5204define <16 x i32> @stack_fold_pmulld(<16 x i32> %a0, <16 x i32> %a1) { 5205; CHECK-LABEL: stack_fold_pmulld: 5206; CHECK: # %bb.0: 5207; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5208; CHECK-NEXT: #APP 5209; CHECK-NEXT: nop 5210; CHECK-NEXT: #NO_APP 5211; CHECK-NEXT: vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 5212; CHECK-NEXT: retq 5213 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5214 %2 = mul <16 x i32> %a0, %a1 5215 ret <16 x i32> %2 5216} 5217 5218define <16 x i32> @stack_fold_pmulld_commuted(<16 x i32> %a0, <16 x i32> %a1) { 5219; CHECK-LABEL: stack_fold_pmulld_commuted: 5220; CHECK: # %bb.0: 5221; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5222; CHECK-NEXT: #APP 5223; CHECK-NEXT: nop 5224; CHECK-NEXT: #NO_APP 5225; CHECK-NEXT: vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 5226; CHECK-NEXT: retq 5227 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5228 %2 = mul <16 x i32> %a1, %a0 5229 ret <16 x i32> %2 5230} 5231 5232define <16 x i32> @stack_fold_pmulld_mask(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) { 5233; CHECK-LABEL: stack_fold_pmulld_mask: 5234; CHECK: # %bb.0: 5235; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5236; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 5237; CHECK-NEXT: #APP 5238; CHECK-NEXT: nop 5239; CHECK-NEXT: #NO_APP 5240; CHECK-NEXT: kmovd %esi, %k1 5241; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 5242; CHECK-NEXT: vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 5243; CHECK-NEXT: retq 5244 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5245 %2 = mul <16 x i32> %a0, %a1 5246 %3 = bitcast i16 %mask to <16 x i1> 5247 ; load needed to keep the operation from being scheduled about the asm block 5248 %4 = load <16 x i32>, <16 x i32>* %a2 5249 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4 5250 ret <16 x i32> %5 5251} 5252 5253define <16 x i32> @stack_fold_pmulld_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) { 5254; CHECK-LABEL: stack_fold_pmulld_mask_commuted: 5255; CHECK: # %bb.0: 5256; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5257; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 5258; CHECK-NEXT: #APP 5259; CHECK-NEXT: nop 5260; CHECK-NEXT: #NO_APP 5261; CHECK-NEXT: kmovd %esi, %k1 5262; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 5263; CHECK-NEXT: vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 5264; CHECK-NEXT: retq 5265 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5266 %2 = mul <16 x i32> %a1, %a0 5267 %3 = bitcast i16 %mask to <16 x i1> 5268 ; load needed to keep the operation from being scheduled about the asm block 5269 %4 = load <16 x i32>, <16 x i32>* %a2 5270 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4 5271 ret <16 x i32> %5 5272} 5273 5274define <16 x i32> @stack_fold_pmulld_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { 5275; CHECK-LABEL: stack_fold_pmulld_maskz: 5276; CHECK: # %bb.0: 5277; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5278; CHECK-NEXT: #APP 5279; CHECK-NEXT: nop 5280; CHECK-NEXT: #NO_APP 5281; CHECK-NEXT: kmovd %edi, %k1 5282; CHECK-NEXT: vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 5283; CHECK-NEXT: retq 5284 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5285 %2 = mul <16 x i32> %a0, %a1 5286 %3 = bitcast i16 %mask to <16 x i1> 5287 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer 5288 ret <16 x i32> %4 5289} 5290 5291define <16 x i32> @stack_fold_pmulld_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { 5292; CHECK-LABEL: stack_fold_pmulld_maskz_commuted: 5293; CHECK: # %bb.0: 5294; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5295; CHECK-NEXT: #APP 5296; CHECK-NEXT: nop 5297; CHECK-NEXT: #NO_APP 5298; CHECK-NEXT: kmovd %edi, %k1 5299; CHECK-NEXT: vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 5300; CHECK-NEXT: retq 5301 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5302 %2 = 
mul <16 x i32> %a1, %a0 5303 %3 = bitcast i16 %mask to <16 x i1> 5304 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer 5305 ret <16 x i32> %4 5306} 5307 5308define <8 x i64> @stack_fold_pmullq(<8 x i64> %a0, <8 x i64> %a1) { 5309; CHECK-LABEL: stack_fold_pmullq: 5310; CHECK: # %bb.0: 5311; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5312; CHECK-NEXT: #APP 5313; CHECK-NEXT: nop 5314; CHECK-NEXT: #NO_APP 5315; CHECK-NEXT: vpmullq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 5316; CHECK-NEXT: retq 5317 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5318 %2 = mul <8 x i64> %a0, %a1 5319 ret <8 x i64> %2 5320} 5321 5322define <8 x i64> @stack_fold_pmullq_commuted(<8 x i64> %a0, <8 x i64> %a1) { 5323; CHECK-LABEL: stack_fold_pmullq_commuted: 5324; CHECK: # %bb.0: 5325; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5326; CHECK-NEXT: #APP 5327; CHECK-NEXT: nop 5328; CHECK-NEXT: #NO_APP 5329; CHECK-NEXT: vpmullq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 5330; CHECK-NEXT: retq 5331 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5332 %2 = mul <8 x i64> %a1, %a0 5333 ret <8 x i64> %2 5334} 5335 5336define <8 x i64> @stack_fold_pmullq_mask(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) { 5337; CHECK-LABEL: stack_fold_pmullq_mask: 5338; CHECK: # %bb.0: 5339; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5340; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 5341; CHECK-NEXT: #APP 5342; CHECK-NEXT: nop 5343; CHECK-NEXT: #NO_APP 5344; CHECK-NEXT: kmovd %esi, %k1 5345; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 5346; CHECK-NEXT: vpmullq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 5347; CHECK-NEXT: retq 5348 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5349 %2 = mul <8 x i64> %a0, %a1 5350 %3 = bitcast i8 %mask to <8 x i1> 5351 ; load needed to keep the operation from being scheduled about the asm block 5352 %4 = load <8 x i64>, <8 x i64>* %a2 5353 %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4 5354 ret <8 x i64> %5 5355} 5356 5357define <8 x i64> @stack_fold_pmullq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) { 5358; CHECK-LABEL: stack_fold_pmullq_mask_commuted: 5359; CHECK: # %bb.0: 5360; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5361; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 5362; CHECK-NEXT: #APP 5363; CHECK-NEXT: nop 5364; CHECK-NEXT: #NO_APP 5365; CHECK-NEXT: kmovd %esi, %k1 5366; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 5367; CHECK-NEXT: vpmullq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 5368; CHECK-NEXT: retq 5369 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5370 %2 = mul <8 x i64> %a1, %a0 5371 %3 = bitcast i8 %mask to <8 x i1> 5372 ; load needed to keep the operation from being scheduled about the asm block 5373 %4 = load <8 x i64>, <8 x i64>* %a2 5374 %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4 5375 ret <8 x i64> %5 5376} 5377 5378define <8 x i64> @stack_fold_pmullq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { 5379; CHECK-LABEL: stack_fold_pmullq_maskz: 5380; CHECK: # %bb.0: 5381; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5382; CHECK-NEXT: #APP 5383; CHECK-NEXT: nop 5384; CHECK-NEXT: #NO_APP 5385; CHECK-NEXT: kmovd %edi, %k1 5386; CHECK-NEXT: vpmullq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 5387; CHECK-NEXT: retq 5388 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5389 %2 = mul <8 x i64> %a0, %a1 5390 %3 = bitcast i8 %mask to <8 x i1> 5391 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer 5392 ret <8 x i64> %4 5393} 5394 5395define <8 x i64> @stack_fold_pmullq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { 5396; CHECK-LABEL: stack_fold_pmullq_maskz_commuted: 5397; CHECK: # %bb.0: 5398; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5399; CHECK-NEXT: #APP 5400; CHECK-NEXT: nop 5401; CHECK-NEXT: #NO_APP 5402; CHECK-NEXT: kmovd %edi, %k1 5403; CHECK-NEXT: vpmullq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 5404; CHECK-NEXT: retq 5405 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5406 %2 = mul <8 x i64> %a1, %a0 5407 %3 = bitcast i8 %mask to <8 x i1> 5408 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer 5409 ret <8 x i64> %4 5410} 5411 5412define <32 x i16> @stack_fold_pmullw(<32 x i16> %a0, <32 x i16> %a1) { 5413; CHECK-LABEL: stack_fold_pmullw: 5414; CHECK: # %bb.0: 5415; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5416; CHECK-NEXT: #APP 5417; CHECK-NEXT: nop 5418; CHECK-NEXT: #NO_APP 5419; CHECK-NEXT: vpmullw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 5420; CHECK-NEXT: retq 5421 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5422 %2 = mul <32 x i16> %a0, %a1 5423 ret <32 x i16> %2 5424} 5425 5426define <32 x i16> @stack_fold_pmullw_commuted(<32 x i16> %a0, <32 x i16> %a1) { 5427; CHECK-LABEL: stack_fold_pmullw_commuted: 5428; CHECK: # %bb.0: 5429; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5430; CHECK-NEXT: #APP 5431; 
CHECK-NEXT: nop 5432; CHECK-NEXT: #NO_APP 5433; CHECK-NEXT: vpmullw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 5434; CHECK-NEXT: retq 5435 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5436 %2 = mul <32 x i16> %a1, %a0 5437 ret <32 x i16> %2 5438} 5439 5440define <32 x i16> @stack_fold_pmullw_mask(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) { 5441; CHECK-LABEL: stack_fold_pmullw_mask: 5442; CHECK: # %bb.0: 5443; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5444; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 5445; CHECK-NEXT: #APP 5446; CHECK-NEXT: nop 5447; CHECK-NEXT: #NO_APP 5448; CHECK-NEXT: kmovd %esi, %k1 5449; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 5450; CHECK-NEXT: vpmullw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 5451; CHECK-NEXT: retq 5452 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5453 %2 = mul <32 x i16> %a0, %a1 5454 %3 = bitcast i32 %mask to <32 x i1> 5455 ; load needed to keep the operation from being scheduled about the asm block 5456 %4 = load <32 x i16>, <32 x i16>* %a2 5457 %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4 5458 ret <32 x i16> %5 5459} 5460 5461define <32 x i16> @stack_fold_pmullw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) { 5462; CHECK-LABEL: stack_fold_pmullw_mask_commuted: 5463; CHECK: # %bb.0: 5464; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5465; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 5466; CHECK-NEXT: #APP 5467; CHECK-NEXT: nop 5468; CHECK-NEXT: #NO_APP 5469; CHECK-NEXT: kmovd %esi, %k1 5470; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 5471; CHECK-NEXT: vpmullw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 5472; CHECK-NEXT: retq 5473 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5474 %2 = mul <32 x i16> %a1, %a0 5475 %3 = bitcast i32 %mask to <32 x i1> 5476 ; load needed to keep the operation from being scheduled about the asm block 5477 %4 = load <32 x i16>, <32 x i16>* %a2 5478 %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4 5479 ret <32 x i16> %5 5480} 5481 5482define <32 x i16> @stack_fold_pmullw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { 5483; CHECK-LABEL: stack_fold_pmullw_maskz: 5484; CHECK: # %bb.0: 5485; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5486; CHECK-NEXT: #APP 5487; CHECK-NEXT: nop 5488; CHECK-NEXT: #NO_APP 5489; CHECK-NEXT: kmovd %edi, %k1 5490; CHECK-NEXT: vpmullw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 5491; CHECK-NEXT: retq 5492 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5493 %2 = mul <32 x i16> %a0, %a1 5494 %3 = bitcast i32 %mask to <32 x i1> 5495 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer 5496 ret <32 x i16> %4 5497} 5498 5499define <32 x i16> @stack_fold_pmullw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { 5500; CHECK-LABEL: stack_fold_pmullw_maskz_commuted: 5501; CHECK: # %bb.0: 5502; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5503; CHECK-NEXT: #APP 5504; CHECK-NEXT: nop 5505; CHECK-NEXT: #NO_APP 5506; CHECK-NEXT: kmovd %edi, %k1 5507; CHECK-NEXT: vpmullw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 5508; CHECK-NEXT: retq 5509 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5510 %2 = mul <32 x i16> %a1, %a0 5511 %3 = bitcast i32 %mask to <32 x i1> 5512 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer 5513 ret <32 x i16> %4 5514} 5515 5516define <8 x i64> @stack_fold_pmuldq(<8 x i64> %a0, <8 x i64> %a1) { 5517; CHECK-LABEL: stack_fold_pmuldq: 5518; CHECK: # %bb.0: 5519; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5520; CHECK-NEXT: #APP 5521; CHECK-NEXT: nop 5522; CHECK-NEXT: #NO_APP 5523; CHECK-NEXT: vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 5524; CHECK-NEXT: retq 5525 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5526 %2 = shl <8 x i64> %a0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 5527 %3 = ashr <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 5528 %4 = shl <8 x i64> %a1, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 5529 %5 = ashr <8 x i64> %4, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 5530 %6 = mul <8 x i64> %3, %5 5531 ret <8 x i64> %6 5532} 5533 5534define <8 x i64> @stack_fold_pmuldq_commuted(<8 x i64> %a0, <8 x i64> %a1) { 5535; CHECK-LABEL: stack_fold_pmuldq_commuted: 5536; CHECK: # %bb.0: 5537; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5538; CHECK-NEXT: #APP 5539; CHECK-NEXT: nop 5540; CHECK-NEXT: #NO_APP 5541; CHECK-NEXT: vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 5542; CHECK-NEXT: retq 5543 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5544 %2 = shl <8 x i64> %a0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 5545 %3 = ashr <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 5546 %4 = shl <8 x 
i64> %a1, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 5547 %5 = ashr <8 x i64> %4, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 5548 %6 = mul <8 x i64> %5, %3 5549 ret <8 x i64> %6 5550} 5551 5552define <8 x i64> @stack_fold_pmuldq_mask(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) { 5553; CHECK-LABEL: stack_fold_pmuldq_mask: 5554; CHECK: # %bb.0: 5555; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5556; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 5557; CHECK-NEXT: #APP 5558; CHECK-NEXT: nop 5559; CHECK-NEXT: #NO_APP 5560; CHECK-NEXT: kmovd %esi, %k1 5561; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 5562; CHECK-NEXT: vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 5563; CHECK-NEXT: retq 5564 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5565 %2 = shl <8 x i64> %a0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 5566 %3 = ashr <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 5567 %4 = shl <8 x i64> %a1, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 5568 %5 = ashr <8 x i64> %4, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 5569 %6 = mul <8 x i64> %3, %5 5570 %7 = bitcast i8 %mask to <8 x i1> 5571 ; load needed to keep the operation from being scheduled about the asm block 5572 %8 = load <8 x i64>, <8 x i64>* %a2 5573 %9 = select <8 x i1> %7, <8 x i64> %6, <8 x i64> %8 5574 ret <8 x i64> %9 5575} 5576 5577define <8 x i64> @stack_fold_pmuldq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) { 5578; CHECK-LABEL: stack_fold_pmuldq_mask_commuted: 5579; CHECK: # %bb.0: 5580; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5581; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 5582; CHECK-NEXT: #APP 5583; CHECK-NEXT: nop 5584; CHECK-NEXT: #NO_APP 5585; CHECK-NEXT: kmovd %esi, %k1 5586; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 5587; CHECK-NEXT: vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 5588; CHECK-NEXT: retq 5589 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5590 %2 = shl <8 x i64> %a0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 5591 %3 = ashr <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 5592 %4 = shl <8 x i64> %a1, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 5593 %5 = ashr <8 x i64> %4, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 5594 %6 = mul <8 x i64> %5, %3 5595 %7 = bitcast i8 %mask to <8 x i1> 5596 ; load needed to keep the operation from being scheduled about the asm block 5597 %8 = load <8 x i64>, <8 x i64>* %a2 5598 %9 = select <8 x i1> %7, <8 x i64> %6, <8 x i64> %8 5599 ret <8 x i64> %9 5600} 5601 5602define <8 x i64> @stack_fold_pmuldq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { 5603; CHECK-LABEL: stack_fold_pmuldq_maskz: 5604; CHECK: # %bb.0: 5605; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
5606; CHECK-NEXT: #APP 5607; CHECK-NEXT: nop 5608; CHECK-NEXT: #NO_APP 5609; CHECK-NEXT: kmovd %edi, %k1 5610; CHECK-NEXT: vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 5611; CHECK-NEXT: retq 5612 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5613 %2 = shl <8 x i64> %a0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 5614 %3 = ashr <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 5615 %4 = shl <8 x i64> %a1, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 5616 %5 = ashr <8 x i64> %4, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 5617 %6 = mul <8 x i64> %3, %5 5618 %7 = bitcast i8 %mask to <8 x i1> 5619 %8 = select <8 x i1> %7, <8 x i64> %6, <8 x i64> zeroinitializer 5620 ret <8 x i64> %8 5621} 5622 5623define <8 x i64> @stack_fold_pmuldq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { 5624; CHECK-LABEL: stack_fold_pmuldq_maskz_commuted: 5625; CHECK: # %bb.0: 5626; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5627; CHECK-NEXT: #APP 5628; CHECK-NEXT: nop 5629; CHECK-NEXT: #NO_APP 5630; CHECK-NEXT: kmovd %edi, %k1 5631; CHECK-NEXT: vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 5632; CHECK-NEXT: retq 5633 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5634 %2 = shl <8 x i64> %a0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 5635 %3 = ashr <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 5636 %4 = shl <8 x i64> %a1, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 5637 %5 = ashr <8 x i64> %4, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 5638 %6 = mul <8 x i64> %5, %3 5639 %7 = bitcast i8 %mask to <8 x i1> 5640 %8 = select <8 x i1> %7, <8 x i64> %6, <8 x i64> zeroinitializer 5641 ret <8 x i64> %8 5642} 5643 5644 5645 5646 5647define <8 x i64> @stack_fold_pmuludq(<8 x i64> %a0, <8 x i64> %a1) { 5648; CHECK-LABEL: stack_fold_pmuludq: 5649; CHECK: # %bb.0: 5650; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5651; CHECK-NEXT: #APP 5652; CHECK-NEXT: nop 5653; CHECK-NEXT: #NO_APP 5654; CHECK-NEXT: vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 5655; CHECK-NEXT: retq 5656 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5657 %2 = and <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> 5658 %3 = and <8 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> 5659 %4 = mul <8 x i64> %2, %3 5660 ret <8 
x i64> %4 5661} 5662 5663define <8 x i64> @stack_fold_pmuludq_commuted(<8 x i64> %a0, <8 x i64> %a1) { 5664; CHECK-LABEL: stack_fold_pmuludq_commuted: 5665; CHECK: # %bb.0: 5666; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5667; CHECK-NEXT: #APP 5668; CHECK-NEXT: nop 5669; CHECK-NEXT: #NO_APP 5670; CHECK-NEXT: vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 5671; CHECK-NEXT: retq 5672 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5673 %2 = and <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> 5674 %3 = and <8 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> 5675 %4 = mul <8 x i64> %3, %2 5676 ret <8 x i64> %4 5677} 5678 5679define <8 x i64> @stack_fold_pmuludq_mask(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) { 5680; CHECK-LABEL: stack_fold_pmuludq_mask: 5681; CHECK: # %bb.0: 5682; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5683; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 5684; CHECK-NEXT: #APP 5685; CHECK-NEXT: nop 5686; CHECK-NEXT: #NO_APP 5687; CHECK-NEXT: kmovd %esi, %k1 5688; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 5689; CHECK-NEXT: vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 5690; CHECK-NEXT: retq 5691 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5692 %2 = and <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> 5693 %3 = and <8 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> 5694 %4 = mul <8 x i64> %2, %3 5695 %5 = bitcast i8 %mask to <8 x i1> 5696 ; load needed to keep the operation from being scheduled about the asm block 5697 %6 = load <8 x i64>, <8 x i64>* %a2 5698 %7 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> %6 5699 ret <8 x i64> %7 5700} 5701 5702define <8 x i64> @stack_fold_pmuludq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) { 5703; CHECK-LABEL: stack_fold_pmuludq_mask_commuted: 5704; CHECK: # %bb.0: 5705; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5706; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 5707; CHECK-NEXT: #APP 5708; CHECK-NEXT: nop 5709; CHECK-NEXT: #NO_APP 5710; CHECK-NEXT: kmovd %esi, %k1 5711; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 5712; CHECK-NEXT: vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 5713; CHECK-NEXT: retq 5714 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5715 %2 = and <8 x i64> %a0, <i64 
4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> 5716 %3 = and <8 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> 5717 %4 = mul <8 x i64> %3, %2 5718 %5 = bitcast i8 %mask to <8 x i1> 5719 ; load needed to keep the operation from being scheduled about the asm block 5720 %6 = load <8 x i64>, <8 x i64>* %a2 5721 %7 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> %6 5722 ret <8 x i64> %7 5723} 5724 5725define <8 x i64> @stack_fold_pmuludq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { 5726; CHECK-LABEL: stack_fold_pmuludq_maskz: 5727; CHECK: # %bb.0: 5728; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5729; CHECK-NEXT: #APP 5730; CHECK-NEXT: nop 5731; CHECK-NEXT: #NO_APP 5732; CHECK-NEXT: kmovd %edi, %k1 5733; CHECK-NEXT: vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 5734; CHECK-NEXT: retq 5735 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5736 %2 = and <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> 5737 %3 = and <8 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> 5738 %4 = mul <8 x i64> %2, %3 5739 %5 = bitcast i8 %mask to <8 x i1> 5740 %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> zeroinitializer 5741 ret <8 x i64> %6 5742} 5743 5744define <8 x i64> @stack_fold_pmuludq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { 5745; CHECK-LABEL: stack_fold_pmuludq_maskz_commuted: 5746; CHECK: # %bb.0: 5747; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5748; CHECK-NEXT: #APP 5749; CHECK-NEXT: nop 5750; CHECK-NEXT: #NO_APP 5751; CHECK-NEXT: kmovd %edi, %k1 5752; CHECK-NEXT: vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 5753; CHECK-NEXT: retq 5754 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5755 %2 = and <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> 5756 %3 = and <8 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> 5757 %4 = mul <8 x i64> %3, %2 5758 %5 = bitcast i8 %mask to <8 x i1> 5759 %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> zeroinitializer 5760 ret <8 x i64> %6 5761} 5762 5763define <16 x i32> @stack_fold_vpopcntd(<16 x i32> %a0) { 5764; CHECK-LABEL: stack_fold_vpopcntd: 5765; CHECK: # %bb.0: 5766; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5767; CHECK-NEXT: #APP 5768; CHECK-NEXT: nop 5769; CHECK-NEXT: #NO_APP 5770; CHECK-NEXT: vpopcntd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload 5771; CHECK-NEXT: retq 5772 %1 = tail call <2 x i64> asm 
sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5773 %2 = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %a0) 5774 ret <16 x i32> %2 5775} 5776declare <16 x i32> @llvm.ctpop.v16i32(<16 x i32>) nounwind readonly 5777 5778define <8 x i64> @stack_fold_vpopcntq(<8 x i64> %a0) { 5779; CHECK-LABEL: stack_fold_vpopcntq: 5780; CHECK: # %bb.0: 5781; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5782; CHECK-NEXT: #APP 5783; CHECK-NEXT: nop 5784; CHECK-NEXT: #NO_APP 5785; CHECK-NEXT: vpopcntq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload 5786; CHECK-NEXT: retq 5787 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5788 %2 = call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %a0) 5789 ret <8 x i64> %2 5790} 5791declare <8 x i64> @llvm.ctpop.v8i64(<8 x i64>) nounwind readnone 5792 5793define <16 x i32> @stack_fold_pord(<16 x i32> %a0, <16 x i32> %a1) { 5794; CHECK-LABEL: stack_fold_pord: 5795; CHECK: # %bb.0: 5796; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5797; CHECK-NEXT: #APP 5798; CHECK-NEXT: nop 5799; CHECK-NEXT: #NO_APP 5800; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 5801; CHECK-NEXT: retq 5802 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5803 %2 = or <16 x i32> %a0, %a1 5804 ret <16 x i32> %2 5805} 5806 5807define <16 x i32> @stack_fold_pord_commuted(<16 x i32> %a0, <16 x i32> %a1) { 5808; CHECK-LABEL: stack_fold_pord_commuted: 5809; CHECK: # %bb.0: 5810; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5811; CHECK-NEXT: #APP 5812; CHECK-NEXT: nop 5813; CHECK-NEXT: #NO_APP 5814; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 5815; CHECK-NEXT: retq 5816 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5817 %2 = or <16 x i32> %a1, %a0 5818 ret <16 x i32> %2 5819} 5820 5821define <16 x i32> @stack_fold_pord_mask(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) { 5822; CHECK-LABEL: stack_fold_pord_mask: 5823; CHECK: # %bb.0: 5824; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5825; CHECK-NEXT: vmovaps %zmm0, %zmm1 5826; CHECK-NEXT: #APP 5827; CHECK-NEXT: nop 5828; CHECK-NEXT: #NO_APP 5829; CHECK-NEXT: kmovd %esi, %k1 5830; CHECK-NEXT: vmovaps (%rdi), %zmm0 5831; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 5832; CHECK-NEXT: retq 5833 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5834 %2 = or <16 x i32> %a0, %a1 5835 %3 = bitcast i16 %mask to <16 x i1> 5836 ; load needed to keep the operation from being scheduled about the asm block 5837 %4 = load <16 x i32>, <16 x i32>* %a2 5838 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4 5839 ret <16 x i32> %5 5840} 5841 5842define <16 x i32> @stack_fold_pord_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) { 5843; CHECK-LABEL: stack_fold_pord_mask_commuted: 5844; CHECK: # %bb.0: 5845; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5846; CHECK-NEXT: vmovaps %zmm0, %zmm1 5847; CHECK-NEXT: #APP 5848; CHECK-NEXT: nop 5849; CHECK-NEXT: #NO_APP 5850; CHECK-NEXT: kmovd %esi, %k1 5851; CHECK-NEXT: vmovaps (%rdi), %zmm0 5852; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 5853; CHECK-NEXT: retq 5854 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5855 %2 = or <16 x i32> %a1, %a0 5856 %3 = bitcast i16 %mask to <16 x i1> 5857 ; load needed to keep the operation from being scheduled about the asm block 5858 %4 = load <16 x i32>, <16 x i32>* %a2 5859 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4 5860 ret <16 x i32> %5 5861} 5862 5863define <16 x i32> @stack_fold_pord_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { 5864; CHECK-LABEL: stack_fold_pord_maskz: 5865; CHECK: # %bb.0: 5866; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5867; CHECK-NEXT: #APP 5868; CHECK-NEXT: nop 5869; CHECK-NEXT: #NO_APP 5870; CHECK-NEXT: kmovd %edi, %k1 5871; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 5872; CHECK-NEXT: retq 5873 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5874 %2 = or <16 x i32> %a0, %a1 5875 %3 = bitcast i16 %mask to <16 x i1> 5876 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer 5877 ret <16 x i32> %4 5878} 5879 5880define <16 x i32> @stack_fold_pord_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { 5881; CHECK-LABEL: stack_fold_pord_maskz_commuted: 5882; CHECK: # %bb.0: 5883; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5884; CHECK-NEXT: #APP 5885; CHECK-NEXT: nop 5886; CHECK-NEXT: #NO_APP 5887; CHECK-NEXT: kmovd %edi, %k1 5888; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 5889; CHECK-NEXT: retq 5890 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5891 %2 = or <16 x i32> %a1, %a0 
5892 %3 = bitcast i16 %mask to <16 x i1> 5893 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer 5894 ret <16 x i32> %4 5895} 5896 5897define <8 x i64> @stack_fold_porq(<8 x i64> %a0, <8 x i64> %a1) { 5898; CHECK-LABEL: stack_fold_porq: 5899; CHECK: # %bb.0: 5900; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5901; CHECK-NEXT: #APP 5902; CHECK-NEXT: nop 5903; CHECK-NEXT: #NO_APP 5904; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 5905; CHECK-NEXT: retq 5906 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5907 %2 = or <8 x i64> %a0, %a1 5908 ret <8 x i64> %2 5909} 5910 5911define <8 x i64> @stack_fold_porq_commuted(<8 x i64> %a0, <8 x i64> %a1) { 5912; CHECK-LABEL: stack_fold_porq_commuted: 5913; CHECK: # %bb.0: 5914; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5915; CHECK-NEXT: #APP 5916; CHECK-NEXT: nop 5917; CHECK-NEXT: #NO_APP 5918; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 5919; CHECK-NEXT: retq 5920 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5921 %2 = or <8 x i64> %a1, %a0 5922 ret <8 x i64> %2 5923} 5924 5925define <8 x i64> @stack_fold_porq_mask(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) { 5926; CHECK-LABEL: stack_fold_porq_mask: 5927; CHECK: # %bb.0: 5928; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5929; CHECK-NEXT: vmovapd %zmm0, %zmm1 5930; CHECK-NEXT: #APP 5931; CHECK-NEXT: nop 5932; CHECK-NEXT: #NO_APP 5933; CHECK-NEXT: kmovd %esi, %k1 5934; CHECK-NEXT: vmovapd (%rdi), %zmm0 5935; CHECK-NEXT: vorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 5936; CHECK-NEXT: retq 5937 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5938 %2 = or <8 x i64> %a0, %a1 5939 %3 = bitcast i8 %mask to <8 x i1> 5940 ; load needed to keep the operation from being scheduled about the asm block 5941 %4 = load <8 x i64>, <8 x i64>* %a2 5942 %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4 5943 ret <8 x i64> %5 5944} 5945 5946define <8 x i64> @stack_fold_porq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) { 5947; CHECK-LABEL: stack_fold_porq_mask_commuted: 5948; CHECK: # %bb.0: 5949; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5950; CHECK-NEXT: vmovapd %zmm0, %zmm1 5951; CHECK-NEXT: #APP 5952; CHECK-NEXT: nop 5953; CHECK-NEXT: #NO_APP 5954; CHECK-NEXT: kmovd %esi, %k1 5955; CHECK-NEXT: vmovapd (%rdi), %zmm0 5956; CHECK-NEXT: vorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 5957; CHECK-NEXT: retq 5958 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5959 %2 = or <8 x i64> %a1, %a0 5960 %3 = bitcast i8 %mask to <8 x i1> 5961 ; load needed to keep the operation from being scheduled about the asm block 5962 %4 = load <8 x i64>, <8 x i64>* %a2 5963 %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4 5964 ret <8 x i64> %5 5965} 5966 5967define <8 x i64> @stack_fold_porq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { 5968; CHECK-LABEL: stack_fold_porq_maskz: 5969; CHECK: # %bb.0: 5970; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5971; CHECK-NEXT: #APP 5972; CHECK-NEXT: nop 5973; CHECK-NEXT: #NO_APP 5974; CHECK-NEXT: kmovd %edi, %k1 5975; CHECK-NEXT: vorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 5976; CHECK-NEXT: retq 5977 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5978 %2 = or <8 x i64> %a0, %a1 5979 %3 = bitcast i8 %mask to <8 x i1> 5980 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer 5981 ret <8 x i64> %4 5982} 5983 5984define <8 x i64> @stack_fold_porq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { 5985; CHECK-LABEL: stack_fold_porq_maskz_commuted: 5986; CHECK: # %bb.0: 5987; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5988; CHECK-NEXT: #APP 5989; CHECK-NEXT: nop 5990; CHECK-NEXT: #NO_APP 5991; CHECK-NEXT: kmovd %edi, %k1 5992; CHECK-NEXT: vorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 5993; CHECK-NEXT: retq 5994 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5995 %2 = or <8 x i64> %a1, %a0 5996 %3 = bitcast i8 %mask to <8 x i1> 5997 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer 5998 ret <8 x i64> %4 5999} 6000 6001define <8 x i64> @stack_fold_psadbw(<64 x i8> %a0, <64 x i8> %a1) { 6002; CHECK-LABEL: stack_fold_psadbw: 6003; CHECK: # %bb.0: 6004; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6005; CHECK-NEXT: #APP 6006; CHECK-NEXT: nop 6007; CHECK-NEXT: #NO_APP 6008; CHECK-NEXT: vpsadbw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 6009; CHECK-NEXT: retq 6010 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6011 %2 = call <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8> %a0, <64 x i8> %a1) 6012 ret <8 x i64> %2 6013} 6014declare <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8>, <64 x i8>) nounwind readnone 6015 6016define <8 x i64> @stack_fold_psadbw_commute(<64 x i8> %a0, <64 x i8> %a1) { 6017; CHECK-LABEL: stack_fold_psadbw_commute: 6018; 
CHECK: # %bb.0: 6019; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6020; CHECK-NEXT: #APP 6021; CHECK-NEXT: nop 6022; CHECK-NEXT: #NO_APP 6023; CHECK-NEXT: vpsadbw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 6024; CHECK-NEXT: retq 6025 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6026 %2 = call <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8> %a1, <64 x i8> %a0) 6027 ret <8 x i64> %2 6028} 6029 6030define <64 x i8> @stack_fold_pshufb_zmm(<64 x i8> %a0, <64 x i8> %a1) { 6031; CHECK-LABEL: stack_fold_pshufb_zmm: 6032; CHECK: # %bb.0: 6033; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6034; CHECK-NEXT: #APP 6035; CHECK-NEXT: nop 6036; CHECK-NEXT: #NO_APP 6037; CHECK-NEXT: vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 6038; CHECK-NEXT: retq 6039 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6040 %2 = call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %a0, <64 x i8> %a1) 6041 ret <64 x i8> %2 6042} 6043declare <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8>, <64 x i8>) 6044 6045define <64 x i8> @stack_fold_pshufb_zmm_mask(<64 x i8>* %passthru, <64 x i8> %a0, <64 x i8> %a1, i64 %mask) { 6046; CHECK-LABEL: stack_fold_pshufb_zmm_mask: 6047; CHECK: # %bb.0: 6048; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6049; CHECK-NEXT: #APP 6050; CHECK-NEXT: nop 6051; CHECK-NEXT: #NO_APP 6052; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 6053; CHECK-NEXT: kmovq %rsi, %k1 6054; CHECK-NEXT: vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 6055; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 6056; CHECK-NEXT: retq 6057 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6058 %2 = load <64 x i8>, <64 x i8>* %passthru 6059 %3 = call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %a0, <64 x i8> %a1) 6060 %4 = bitcast i64 %mask to <64 x i1> 6061 %5 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> %2 6062 ret <64 x i8> %5 6063} 6064 6065define <64 x i8> @stack_fold_pshufb_zmm_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { 6066; CHECK-LABEL: stack_fold_pshufb_zmm_maskz: 6067; CHECK: # %bb.0: 6068; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6069; CHECK-NEXT: #APP 6070; CHECK-NEXT: nop 6071; CHECK-NEXT: #NO_APP 6072; CHECK-NEXT: kmovq %rdi, %k1 6073; CHECK-NEXT: vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 6074; CHECK-NEXT: retq 6075 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6076 %2 = call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %a0, <64 x i8> %a1) 6077 %3 = bitcast i64 %mask to <64 x i1> 6078 %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer 6079 ret <64 x i8> %4 6080} 6081 6082define <16 x i32> @stack_fold_pshufd_zmm(<16 x i32> %a0) { 6083; CHECK-LABEL: stack_fold_pshufd_zmm: 6084; CHECK: # %bb.0: 6085; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6086; CHECK-NEXT: #APP 6087; CHECK-NEXT: nop 6088; CHECK-NEXT: #NO_APP 6089; CHECK-NEXT: vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload 6090; CHECK-NEXT: # zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 6091; CHECK-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 6092; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm0 6093; CHECK-NEXT: retq 6094 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6095 %2 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12> 6096 %3 = add <16 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 6097 ret <16 x i32> %3 6098} 6099 6100define <16 x i32> @stack_fold_pshufd_zmm_mask(<16 x i32> %passthru, <16 x i32> %a0, i16 %mask) { 6101; CHECK-LABEL: stack_fold_pshufd_zmm_mask: 6102; CHECK: # %bb.0: 6103; CHECK-NEXT: subq $56, %rsp 6104; CHECK-NEXT: .cfi_def_cfa_offset 64 6105; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6106; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6107; CHECK-NEXT: #APP 6108; CHECK-NEXT: nop 6109; CHECK-NEXT: #NO_APP 6110; CHECK-NEXT: kmovd %edi, %k1 6111; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 6112; CHECK-NEXT: vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload 6113; CHECK-NEXT: # zmm0 {%k1} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 6114; CHECK-NEXT: addq $56, %rsp 6115; CHECK-NEXT: .cfi_def_cfa_offset 8 6116; CHECK-NEXT: retq 6117 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6118 %2 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12> 6119 %3 = bitcast i16 %mask to <16 x i1> 6120 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %passthru 6121 ret <16 x i32> %4 6122} 6123 6124define <16 x i32> @stack_fold_pshufd_zmm_maskz(<16 x i32> %a0, i16 %mask) { 6125; CHECK-LABEL: stack_fold_pshufd_zmm_maskz: 6126; CHECK: # %bb.0: 6127; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6128; CHECK-NEXT: #APP 6129; CHECK-NEXT: nop 6130; CHECK-NEXT: 
#NO_APP 6131; CHECK-NEXT: kmovd %edi, %k1 6132; CHECK-NEXT: vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload 6133; CHECK-NEXT: # zmm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 6134; CHECK-NEXT: retq 6135 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6136 %2 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12> 6137 %3 = bitcast i16 %mask to <16 x i1> 6138 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer 6139 ret <16 x i32> %4 6140} 6141 6142define <32 x i16> @stack_fold_pshufhw_zmm(<32 x i16> %a0) { 6143; CHECK-LABEL: stack_fold_pshufhw_zmm: 6144; CHECK: # %bb.0: 6145; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6146; CHECK-NEXT: #APP 6147; CHECK-NEXT: nop 6148; CHECK-NEXT: #NO_APP 6149; CHECK-NEXT: vpshufhw $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload 6150; CHECK-NEXT: # zmm0 = mem[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12,16,17,18,19,23,22,21,20,24,25,26,27,31,30,29,28] 6151; CHECK-NEXT: retq 6152 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6153 %2 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12, i32 16, i32 17, i32 18, i32 19, i32 23, i32 22, i32 21, i32 20, i32 24, i32 25, i32 26, i32 27, i32 31, i32 30, i32 29, i32 28> 6154 ret <32 x i16> %2 6155} 6156 6157define <32 x i16> @stack_fold_pshufhw_zmm_mask(<32 x i16> %passthru, <32 x i16> %a0, i32 %mask) { 6158; CHECK-LABEL: stack_fold_pshufhw_zmm_mask: 6159; CHECK: # %bb.0: 6160; CHECK-NEXT: subq $56, %rsp 6161; CHECK-NEXT: .cfi_def_cfa_offset 64 6162; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6163; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6164; CHECK-NEXT: #APP 6165; CHECK-NEXT: nop 6166; CHECK-NEXT: #NO_APP 6167; CHECK-NEXT: kmovd %edi, %k1 6168; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 6169; CHECK-NEXT: vpshufhw $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload 6170; CHECK-NEXT: # zmm0 {%k1} = mem[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12,16,17,18,19,23,22,21,20,24,25,26,27,31,30,29,28] 6171; CHECK-NEXT: addq $56, %rsp 6172; CHECK-NEXT: .cfi_def_cfa_offset 8 6173; CHECK-NEXT: retq 6174 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6175 %2 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12, i32 16, i32 17, i32 18, i32 19, 
i32 23, i32 22, i32 21, i32 20, i32 24, i32 25, i32 26, i32 27, i32 31, i32 30, i32 29, i32 28> 6176 %3 = bitcast i32 %mask to <32 x i1> 6177 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %passthru 6178 ret <32 x i16> %4 6179} 6180 6181define <32 x i16> @stack_fold_pshufhw_zmm_maskz(<32 x i16> %a0, i32 %mask) { 6182; CHECK-LABEL: stack_fold_pshufhw_zmm_maskz: 6183; CHECK: # %bb.0: 6184; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6185; CHECK-NEXT: #APP 6186; CHECK-NEXT: nop 6187; CHECK-NEXT: #NO_APP 6188; CHECK-NEXT: kmovd %edi, %k1 6189; CHECK-NEXT: vpshufhw $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload 6190; CHECK-NEXT: # zmm0 {%k1} {z} = mem[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12,16,17,18,19,23,22,21,20,24,25,26,27,31,30,29,28] 6191; CHECK-NEXT: retq 6192 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6193 %2 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12, i32 16, i32 17, i32 18, i32 19, i32 23, i32 22, i32 21, i32 20, i32 24, i32 25, i32 26, i32 27, i32 31, i32 30, i32 29, i32 28> 6194 %3 = bitcast i32 %mask to <32 x i1> 6195 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer 6196 ret <32 x i16> %4 6197} 6198 6199define <32 x i16> @stack_fold_pshuflw_zmm(<32 x i16> %a0) { 6200; CHECK-LABEL: stack_fold_pshuflw_zmm: 6201; CHECK: # %bb.0: 6202; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6203; CHECK-NEXT: #APP 6204; CHECK-NEXT: nop 6205; CHECK-NEXT: #NO_APP 6206; CHECK-NEXT: vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload 6207; CHECK-NEXT: # zmm0 = mem[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15,19,18,17,16,20,21,22,23,27,26,25,24,28,29,30,31] 6208; CHECK-NEXT: retq 6209 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6210 %2 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 18, i32 17, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 26, i32 25, i32 24, i32 28, i32 29, i32 30, i32 31> 6211 ret <32 x i16> %2 6212} 6213 6214define <32 x i16> @stack_fold_pshuflw_zmm_mask(<32 x i16> %passthru, <32 x i16> %a0, i32 %mask) { 6215; CHECK-LABEL: stack_fold_pshuflw_zmm_mask: 6216; CHECK: # %bb.0: 6217; CHECK-NEXT: subq $56, %rsp 6218; CHECK-NEXT: .cfi_def_cfa_offset 64 6219; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6220; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6221; CHECK-NEXT: #APP 6222; CHECK-NEXT: nop 6223; CHECK-NEXT: #NO_APP 6224; CHECK-NEXT: kmovd %edi, %k1 6225; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 6226; CHECK-NEXT: vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload 6227; CHECK-NEXT: # zmm0 {%k1} = 
mem[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15,19,18,17,16,20,21,22,23,27,26,25,24,28,29,30,31] 6228; CHECK-NEXT: addq $56, %rsp 6229; CHECK-NEXT: .cfi_def_cfa_offset 8 6230; CHECK-NEXT: retq 6231 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6232 %2 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 18, i32 17, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 26, i32 25, i32 24, i32 28, i32 29, i32 30, i32 31> 6233 %3 = bitcast i32 %mask to <32 x i1> 6234 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %passthru 6235 ret <32 x i16> %4 6236} 6237 6238define <32 x i16> @stack_fold_pshuflw_zmm_maskz(<32 x i16> %a0, i32 %mask) { 6239; CHECK-LABEL: stack_fold_pshuflw_zmm_maskz: 6240; CHECK: # %bb.0: 6241; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6242; CHECK-NEXT: #APP 6243; CHECK-NEXT: nop 6244; CHECK-NEXT: #NO_APP 6245; CHECK-NEXT: kmovd %edi, %k1 6246; CHECK-NEXT: vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload 6247; CHECK-NEXT: # zmm0 {%k1} {z} = mem[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15,19,18,17,16,20,21,22,23,27,26,25,24,28,29,30,31] 6248; CHECK-NEXT: retq 6249 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6250 %2 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 18, i32 17, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 26, i32 25, i32 24, i32 28, i32 29, i32 30, i32 31> 6251 %3 = bitcast i32 %mask to <32 x i1> 6252 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer 6253 ret <32 x i16> %4 6254} 6255 6256define <16 x i32> @stack_fold_pslld(<16 x i32> %a0, <4 x i32> %a1) { 6257; CHECK-LABEL: stack_fold_pslld: 6258; CHECK: # %bb.0: 6259; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6260; CHECK-NEXT: #APP 6261; CHECK-NEXT: nop 6262; CHECK-NEXT: #NO_APP 6263; CHECK-NEXT: vpslld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload 6264; CHECK-NEXT: retq 6265 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6266 %2 = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %a0, <4 x i32> %a1) 6267 ret <16 x i32> %2 6268} 6269declare <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32>, <4 x i32>) nounwind readnone 6270 6271define <16 x i32> @stack_fold_pslld_mask(<16 x i32>* %passthru, <16 x i32> %a0, <4 x i32> %a1, i16 %mask) { 6272; CHECK-LABEL: stack_fold_pslld_mask: 6273; CHECK: # %bb.0: 6274; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6275; 
CHECK-NEXT: #APP 6276; CHECK-NEXT: nop 6277; CHECK-NEXT: #NO_APP 6278; CHECK-NEXT: kmovd %esi, %k1 6279; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 6280; CHECK-NEXT: vpslld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 16-byte Folded Reload 6281; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 6282; CHECK-NEXT: retq 6283 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6284 %2 = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %a0, <4 x i32> %a1) 6285 %3 = bitcast i16 %mask to <16 x i1> 6286 %4 = load <16 x i32>, <16 x i32>* %passthru 6287 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4 6288 ret <16 x i32> %5 6289} 6290 6291define <16 x i32> @stack_fold_pslld_maskz(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) { 6292; CHECK-LABEL: stack_fold_pslld_maskz: 6293; CHECK: # %bb.0: 6294; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6295; CHECK-NEXT: #APP 6296; CHECK-NEXT: nop 6297; CHECK-NEXT: #NO_APP 6298; CHECK-NEXT: kmovd %edi, %k1 6299; CHECK-NEXT: vpslld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 16-byte Folded Reload 6300; CHECK-NEXT: retq 6301 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6302 %2 = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %a0, <4 x i32> %a1) 6303 %3 = bitcast i16 %mask to <16 x i1> 6304 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer 6305 ret <16 x i32> %4 6306} 6307 6308define <16 x i32> @stack_fold_pslldi(<16 x i32> %a0) { 6309; CHECK-LABEL: stack_fold_pslldi: 6310; CHECK: # %bb.0: 6311; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6312; CHECK-NEXT: #APP 6313; CHECK-NEXT: nop 6314; CHECK-NEXT: #NO_APP 6315; CHECK-NEXT: vpslld $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload 6316; CHECK-NEXT: retq 6317 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6318 %2 = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %a0, i32 1) 6319 ret <16 x i32> %2 6320} 6321declare <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32>, i32) nounwind readnone 6322 6323define <16 x i32> @stack_fold_pslldi_mask(<16 x i32>* %passthru, <16 x i32> %a0, i16 %mask) { 6324; CHECK-LABEL: stack_fold_pslldi_mask: 6325; CHECK: # %bb.0: 6326; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6327; CHECK-NEXT: #APP 6328; CHECK-NEXT: nop 6329; CHECK-NEXT: #NO_APP 6330; CHECK-NEXT: kmovd %esi, %k1 6331; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1 6332; CHECK-NEXT: vpslld $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 {%k1} # 64-byte Folded Reload 6333; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 6334; CHECK-NEXT: retq 6335 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6336 %2 = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %a0, i32 1) 6337 %3 = bitcast i16 %mask to <16 x i1> 6338 %4 = load <16 x i32>, <16 x i32>* %passthru 6339 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4 6340 ret <16 x i32> %5 6341} 6342 6343define <16 x i32> @stack_fold_pslldi_maskz(<16 x i32> %a0, i16 %mask) { 6344; CHECK-LABEL: stack_fold_pslldi_maskz: 6345; CHECK: # %bb.0: 6346; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6347; CHECK-NEXT: #APP 6348; CHECK-NEXT: nop 6349; CHECK-NEXT: #NO_APP 6350; CHECK-NEXT: kmovd %edi, %k1 6351; CHECK-NEXT: vpslld $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload 6352; CHECK-NEXT: retq 6353 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6354 %2 = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %a0, i32 1) 6355 %3 = bitcast i16 %mask to <16 x i1> 6356 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer 6357 ret <16 x i32> %4 6358} 6359 6360define <64 x i8> @stack_fold_pslldq(<64 x i8> %a, <64 x i8> %b) { 6361; CHECK-LABEL: stack_fold_pslldq: 6362; CHECK: # %bb.0: 6363; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6364; CHECK-NEXT: #APP 6365; CHECK-NEXT: nop 6366; CHECK-NEXT: #NO_APP 6367; CHECK-NEXT: vpslldq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload 6368; CHECK-NEXT: # zmm0 = zero,mem[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,mem[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30],zero,mem[32,33,34,35,36,37,38,39,40,41,42,43,44,45,46],zero,mem[48,49,50,51,52,53,54,55,56,57,58,59,60,61,62] 6369; CHECK-NEXT: retq 6370 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6371 %2 = shufflevector <64 x i8> %a, <64 x i8> zeroinitializer, <64 x i32> <i32 79, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 95, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 111, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 127, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62> 6372 ret <64 x i8> %2 6373} 6374 6375define <8 x i64> @stack_fold_psllq(<8 x i64> %a0, <2 x i64> %a1) { 6376; CHECK-LABEL: stack_fold_psllq: 6377; CHECK: # %bb.0: 6378; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6379; CHECK-NEXT: #APP 6380; CHECK-NEXT: nop 6381; CHECK-NEXT: #NO_APP 6382; CHECK-NEXT: vpsllq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload 6383; CHECK-NEXT: retq 6384 %1 = tail call <2 x i64> asm sideeffect 
"nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6385 %2 = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> %a0, <2 x i64> %a1) 6386 ret <8 x i64> %2 6387} 6388declare <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64>, <2 x i64>) nounwind readnone 6389 6390define <8 x i64> @stack_fold_psllqi(<8 x i64> %a0) { 6391; CHECK-LABEL: stack_fold_psllqi: 6392; CHECK: # %bb.0: 6393; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6394; CHECK-NEXT: #APP 6395; CHECK-NEXT: nop 6396; CHECK-NEXT: #NO_APP 6397; CHECK-NEXT: vpsllq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload 6398; CHECK-NEXT: retq 6399 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6400 %2 = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> %a0, i32 1) 6401 ret <8 x i64> %2 6402} 6403declare <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64>, i32) nounwind readnone 6404 6405define <16 x i32> @stack_fold_psllvd(<16 x i32> %a0, <16 x i32> %a1) { 6406; CHECK-LABEL: stack_fold_psllvd: 6407; CHECK: # %bb.0: 6408; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6409; CHECK-NEXT: #APP 6410; CHECK-NEXT: nop 6411; CHECK-NEXT: #NO_APP 6412; CHECK-NEXT: vpsllvd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 6413; CHECK-NEXT: retq 6414 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6415 %2 = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> %a0, <16 x i32> %a1) 6416 ret <16 x i32> %2 6417} 6418declare <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32>, <16 x i32>) nounwind readnone 6419 6420define <16 x i32> @stack_fold_psllvd_mask(<16 x i32>* %passthru, <16 x i32> %a0, <16 x i32> %a1, i16 %mask) { 6421; CHECK-LABEL: stack_fold_psllvd_mask: 6422; CHECK: # %bb.0: 6423; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6424; CHECK-NEXT: #APP 6425; CHECK-NEXT: nop 6426; CHECK-NEXT: #NO_APP 6427; CHECK-NEXT: kmovd %esi, %k1 6428; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 6429; CHECK-NEXT: vpsllvd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 6430; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 6431; CHECK-NEXT: retq 6432 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6433 %2 = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> %a0, <16 x i32> %a1) 6434 %3 = bitcast i16 %mask to <16 x i1> 6435 %4 = load <16 x i32>, <16 x i32>* %passthru 6436 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4 6437 ret <16 x i32> %5 6438} 6439 6440define <16 x i32> @stack_fold_psllvd_maskz(<16 x i32> %a0, 
<16 x i32> %a1, i16 %mask) { 6441; CHECK-LABEL: stack_fold_psllvd_maskz: 6442; CHECK: # %bb.0: 6443; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6444; CHECK-NEXT: #APP 6445; CHECK-NEXT: nop 6446; CHECK-NEXT: #NO_APP 6447; CHECK-NEXT: kmovd %edi, %k1 6448; CHECK-NEXT: vpsllvd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 6449; CHECK-NEXT: retq 6450 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6451 %2 = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> %a0, <16 x i32> %a1) 6452 %3 = bitcast i16 %mask to <16 x i1> 6453 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer 6454 ret <16 x i32> %4 6455} 6456 6457define <8 x i64> @stack_fold_psllvq(<8 x i64> %a0, <8 x i64> %a1) { 6458; CHECK-LABEL: stack_fold_psllvq: 6459; CHECK: # %bb.0: 6460; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6461; CHECK-NEXT: #APP 6462; CHECK-NEXT: nop 6463; CHECK-NEXT: #NO_APP 6464; CHECK-NEXT: vpsllvq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 6465; CHECK-NEXT: retq 6466 %1 = tail call <8 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6467 %2 = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> %a0, <8 x i64> %a1) 6468 ret <8 x i64> %2 6469} 6470declare <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64>, <8 x i64>) nounwind readnone 6471 6472define <32 x i16> @stack_fold_psllvw(<32 x i16> %a0, <32 x i16> %a1) { 6473; CHECK-LABEL: stack_fold_psllvw: 6474; CHECK: # %bb.0: 6475; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6476; CHECK-NEXT: #APP 6477; CHECK-NEXT: nop 6478; CHECK-NEXT: #NO_APP 6479; CHECK-NEXT: vpsllvw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 6480; CHECK-NEXT: retq 6481 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6482 %2 = call <32 x i16> @llvm.x86.avx512.psllv.w.512(<32 x i16> %a0, <32 x i16> %a1) 6483 ret <32 x i16> %2 6484} 6485declare <32 x i16> @llvm.x86.avx512.psllv.w.512(<32 x i16>, <32 x i16>) nounwind readnone 6486 6487define <32 x i16> @stack_fold_psllw(<32 x i16> %a0, <8 x i16> %a1) { 6488; CHECK-LABEL: stack_fold_psllw: 6489; CHECK: # %bb.0: 6490; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6491; CHECK-NEXT: #APP 6492; CHECK-NEXT: nop 6493; CHECK-NEXT: #NO_APP 6494; CHECK-NEXT: vpsllw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload 6495; CHECK-NEXT: retq 6496 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6497 %2 = 
call <32 x i16> @llvm.x86.avx512.psll.w.512(<32 x i16> %a0, <8 x i16> %a1) 6498 ret <32 x i16> %2 6499} 6500declare <32 x i16> @llvm.x86.avx512.psll.w.512(<32 x i16>, <8 x i16>) nounwind readnone 6501 6502define <32 x i16> @stack_fold_psllwi(<32 x i16> %a0) { 6503; CHECK-LABEL: stack_fold_psllwi: 6504; CHECK: # %bb.0: 6505; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6506; CHECK-NEXT: #APP 6507; CHECK-NEXT: nop 6508; CHECK-NEXT: #NO_APP 6509; CHECK-NEXT: vpsllw $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload 6510; CHECK-NEXT: retq 6511 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6512 %2 = call <32 x i16> @llvm.x86.avx512.pslli.w.512(<32 x i16> %a0, i32 1) 6513 ret <32 x i16> %2 6514} 6515declare <32 x i16> @llvm.x86.avx512.pslli.w.512(<32 x i16>, i32) nounwind readnone 6516 6517define <16 x i32> @stack_fold_psrad(<16 x i32> %a0, <4 x i32> %a1) { 6518; CHECK-LABEL: stack_fold_psrad: 6519; CHECK: # %bb.0: 6520; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6521; CHECK-NEXT: #APP 6522; CHECK-NEXT: nop 6523; CHECK-NEXT: #NO_APP 6524; CHECK-NEXT: vpsrad {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload 6525; CHECK-NEXT: retq 6526 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6527 %2 = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> %a0, <4 x i32> %a1) 6528 ret <16 x i32> %2 6529} 6530declare <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32>, <4 x i32>) nounwind readnone 6531 6532define <16 x i32> @stack_fold_psradi(<16 x i32> %a0) { 6533; CHECK-LABEL: stack_fold_psradi: 6534; CHECK: # %bb.0: 6535; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6536; CHECK-NEXT: #APP 6537; CHECK-NEXT: nop 6538; CHECK-NEXT: #NO_APP 6539; CHECK-NEXT: vpsrad $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload 6540; CHECK-NEXT: retq 6541 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6542 %2 = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> %a0, i32 1) 6543 ret <16 x i32> %2 6544} 6545declare <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32>, i32) nounwind readnone 6546 6547define <8 x i64> @stack_fold_psraq(<8 x i64> %a0, <2 x i64> %a1) { 6548; CHECK-LABEL: stack_fold_psraq: 6549; CHECK: # %bb.0: 6550; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6551; CHECK-NEXT: #APP 6552; CHECK-NEXT: nop 6553; CHECK-NEXT: #NO_APP 6554; CHECK-NEXT: vpsraq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload 6555; CHECK-NEXT: retq 6556 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6557 %2 = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> %a0, <2 x i64> %a1) 6558 ret <8 x i64> %2 6559} 6560declare <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64>, <2 x i64>) nounwind readnone 6561 6562define <8 x i64> @stack_fold_psraqi(<8 x i64> %a0) { 6563; CHECK-LABEL: stack_fold_psraqi: 6564; CHECK: # %bb.0: 6565; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6566; CHECK-NEXT: #APP 6567; CHECK-NEXT: nop 6568; CHECK-NEXT: #NO_APP 6569; CHECK-NEXT: vpsraq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload 6570; CHECK-NEXT: retq 6571 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6572 %2 = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> %a0, i32 1) 6573 ret <8 x i64> %2 6574} 6575declare <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64>, i32) nounwind readnone 6576 6577define <16 x i32> @stack_fold_psravd(<16 x i32> %a0, <16 x i32> %a1) { 6578; CHECK-LABEL: stack_fold_psravd: 6579; CHECK: # %bb.0: 6580; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6581; CHECK-NEXT: #APP 6582; CHECK-NEXT: nop 6583; CHECK-NEXT: #NO_APP 6584; CHECK-NEXT: vpsravd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 6585; CHECK-NEXT: retq 6586 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6587 %2 = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> %a0, <16 x i32> %a1) 6588 ret <16 x i32> %2 6589} 6590declare <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32>, <16 x i32>) nounwind readnone 6591 6592define <8 x i64> @stack_fold_psravq(<8 x i64> %a0, <8 x i64> %a1) { 6593; CHECK-LABEL: stack_fold_psravq: 6594; CHECK: # %bb.0: 6595; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6596; CHECK-NEXT: #APP 6597; CHECK-NEXT: nop 6598; CHECK-NEXT: #NO_APP 6599; CHECK-NEXT: vpsravq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 6600; CHECK-NEXT: retq 6601 %1 = tail call <8 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6602 %2 = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> %a0, <8 x i64> %a1) 6603 ret <8 x i64> %2 6604} 6605declare <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64>, <8 x i64>) nounwind readnone 6606 6607define <32 x i16> @stack_fold_psravw(<32 x i16> %a0, <32 x i16> %a1) { 6608; CHECK-LABEL: stack_fold_psravw: 6609; CHECK: # %bb.0: 6610; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6611; CHECK-NEXT: #APP 6612; CHECK-NEXT: nop 6613; CHECK-NEXT: #NO_APP 6614; CHECK-NEXT: 
vpsravw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 6615; CHECK-NEXT: retq 6616 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6617 %2 = call <32 x i16> @llvm.x86.avx512.psrav.w.512(<32 x i16> %a0, <32 x i16> %a1) 6618 ret <32 x i16> %2 6619} 6620declare <32 x i16> @llvm.x86.avx512.psrav.w.512(<32 x i16>, <32 x i16>) nounwind readnone 6621 6622define <32 x i16> @stack_fold_psraw(<32 x i16> %a0, <8 x i16> %a1) { 6623; CHECK-LABEL: stack_fold_psraw: 6624; CHECK: # %bb.0: 6625; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6626; CHECK-NEXT: #APP 6627; CHECK-NEXT: nop 6628; CHECK-NEXT: #NO_APP 6629; CHECK-NEXT: vpsraw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload 6630; CHECK-NEXT: retq 6631 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6632 %2 = call <32 x i16> @llvm.x86.avx512.psra.w.512(<32 x i16> %a0, <8 x i16> %a1) 6633 ret <32 x i16> %2 6634} 6635declare <32 x i16> @llvm.x86.avx512.psra.w.512(<32 x i16>, <8 x i16>) nounwind readnone 6636 6637define <32 x i16> @stack_fold_psrawi(<32 x i16> %a0) { 6638; CHECK-LABEL: stack_fold_psrawi: 6639; CHECK: # %bb.0: 6640; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6641; CHECK-NEXT: #APP 6642; CHECK-NEXT: nop 6643; CHECK-NEXT: #NO_APP 6644; CHECK-NEXT: vpsraw $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload 6645; CHECK-NEXT: retq 6646 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6647 %2 = call <32 x i16> @llvm.x86.avx512.psrai.w.512(<32 x i16> %a0, i32 1) 6648 ret <32 x i16> %2 6649} 6650declare <32 x i16> @llvm.x86.avx512.psrai.w.512(<32 x i16>, i32) nounwind readnone 6651 6652define <16 x i32> @stack_fold_psrld(<16 x i32> %a0, <4 x i32> %a1) { 6653; CHECK-LABEL: stack_fold_psrld: 6654; CHECK: # %bb.0: 6655; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6656; CHECK-NEXT: #APP 6657; CHECK-NEXT: nop 6658; CHECK-NEXT: #NO_APP 6659; CHECK-NEXT: vpsrld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload 6660; CHECK-NEXT: retq 6661 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6662 %2 = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> %a0, <4 x i32> %a1) 6663 ret <16 x i32> %2 6664} 6665declare <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32>, <4 x i32>) nounwind readnone 6666 6667define <16 x i32> @stack_fold_psrldi(<16 x i32> %a0) { 6668; CHECK-LABEL: stack_fold_psrldi: 6669; CHECK: # %bb.0: 6670; CHECK-NEXT: vmovups 
%zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6671; CHECK-NEXT: #APP 6672; CHECK-NEXT: nop 6673; CHECK-NEXT: #NO_APP 6674; CHECK-NEXT: vpsrld $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload 6675; CHECK-NEXT: retq 6676 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6677 %2 = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> %a0, i32 1) 6678 ret <16 x i32> %2 6679} 6680declare <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32>, i32) nounwind readnone 6681 6682define <64 x i8> @stack_fold_psrldq(<64 x i8> %a, <64 x i8> %b) { 6683; CHECK-LABEL: stack_fold_psrldq: 6684; CHECK: # %bb.0: 6685; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6686; CHECK-NEXT: #APP 6687; CHECK-NEXT: nop 6688; CHECK-NEXT: #NO_APP 6689; CHECK-NEXT: vpsrldq $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload 6690; CHECK-NEXT: # zmm0 = mem[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,mem[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,mem[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zero,zero,mem[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zero,zero 6691; CHECK-NEXT: retq 6692 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6693 %2 = shufflevector <64 x i8> %a, <64 x i8> zeroinitializer, <64 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 64, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 64, i32 64, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 64, i32 64, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 64> 6694 ret <64 x i8> %2 6695} 6696 6697define <8 x i64> @stack_fold_psrlq(<8 x i64> %a0, <2 x i64> %a1) { 6698; CHECK-LABEL: stack_fold_psrlq: 6699; CHECK: # %bb.0: 6700; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6701; CHECK-NEXT: #APP 6702; CHECK-NEXT: nop 6703; CHECK-NEXT: #NO_APP 6704; CHECK-NEXT: vpsrlq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload 6705; CHECK-NEXT: retq 6706 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6707 %2 = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> %a0, <2 x i64> %a1) 6708 ret <8 x i64> %2 6709} 6710declare <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64>, <2 x i64>) nounwind readnone 6711 6712define <8 x i64> @stack_fold_psrlqi(<8 x i64> %a0) { 6713; CHECK-LABEL: stack_fold_psrlqi: 6714; CHECK: # %bb.0: 6715; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6716; CHECK-NEXT: #APP 6717; CHECK-NEXT: nop 6718; CHECK-NEXT: #NO_APP 6719; CHECK-NEXT: 
vpsrlq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload 6720; CHECK-NEXT: retq 6721 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6722 %2 = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> %a0, i32 1) 6723 ret <8 x i64> %2 6724} 6725declare <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64>, i32) nounwind readnone 6726 6727define <16 x i32> @stack_fold_psrlvd(<16 x i32> %a0, <16 x i32> %a1) { 6728; CHECK-LABEL: stack_fold_psrlvd: 6729; CHECK: # %bb.0: 6730; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6731; CHECK-NEXT: #APP 6732; CHECK-NEXT: nop 6733; CHECK-NEXT: #NO_APP 6734; CHECK-NEXT: vpsrlvd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 6735; CHECK-NEXT: retq 6736 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6737 %2 = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> %a0, <16 x i32> %a1) 6738 ret <16 x i32> %2 6739} 6740declare <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32>, <16 x i32>) nounwind readnone 6741 6742define <8 x i64> @stack_fold_psrlvq(<8 x i64> %a0, <8 x i64> %a1) { 6743; CHECK-LABEL: stack_fold_psrlvq: 6744; CHECK: # %bb.0: 6745; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6746; CHECK-NEXT: #APP 6747; CHECK-NEXT: nop 6748; CHECK-NEXT: #NO_APP 6749; CHECK-NEXT: vpsrlvq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 6750; CHECK-NEXT: retq 6751 %1 = tail call <8 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6752 %2 = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> %a0, <8 x i64> %a1) 6753 ret <8 x i64> %2 6754} 6755declare <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64>, <8 x i64>) nounwind readnone 6756 6757define <32 x i16> @stack_fold_psrlvw(<32 x i16> %a0, <32 x i16> %a1) { 6758; CHECK-LABEL: stack_fold_psrlvw: 6759; CHECK: # %bb.0: 6760; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6761; CHECK-NEXT: #APP 6762; CHECK-NEXT: nop 6763; CHECK-NEXT: #NO_APP 6764; CHECK-NEXT: vpsrlvw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 6765; CHECK-NEXT: retq 6766 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6767 %2 = call <32 x i16> @llvm.x86.avx512.psrlv.w.512(<32 x i16> %a0, <32 x i16> %a1) 6768 ret <32 x i16> %2 6769} 6770declare <32 x i16> @llvm.x86.avx512.psrlv.w.512(<32 x i16>, <32 x i16>) nounwind readnone 6771 6772define <32 x i16> @stack_fold_psrlw(<32 x i16> %a0, <8 x i16> %a1) { 6773; CHECK-LABEL: stack_fold_psrlw: 6774; CHECK: # %bb.0: 
6775; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6776; CHECK-NEXT: #APP 6777; CHECK-NEXT: nop 6778; CHECK-NEXT: #NO_APP 6779; CHECK-NEXT: vpsrlw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload 6780; CHECK-NEXT: retq 6781 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6782 %2 = call <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16> %a0, <8 x i16> %a1) 6783 ret <32 x i16> %2 6784} 6785declare <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16>, <8 x i16>) nounwind readnone 6786 6787define <32 x i16> @stack_fold_psrlwi(<32 x i16> %a0) { 6788; CHECK-LABEL: stack_fold_psrlwi: 6789; CHECK: # %bb.0: 6790; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6791; CHECK-NEXT: #APP 6792; CHECK-NEXT: nop 6793; CHECK-NEXT: #NO_APP 6794; CHECK-NEXT: vpsrlw $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload 6795; CHECK-NEXT: retq 6796 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6797 %2 = call <32 x i16> @llvm.x86.avx512.psrli.w.512(<32 x i16> %a0, i32 1) 6798 ret <32 x i16> %2 6799} 6800declare <32 x i16> @llvm.x86.avx512.psrli.w.512(<32 x i16>, i32) nounwind readnone 6801 6802define <64 x i8> @stack_fold_psubb(<64 x i8> %a0, <64 x i8> %a1) { 6803; CHECK-LABEL: stack_fold_psubb: 6804; CHECK: # %bb.0: 6805; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6806; CHECK-NEXT: #APP 6807; CHECK-NEXT: nop 6808; CHECK-NEXT: #NO_APP 6809; CHECK-NEXT: vpsubb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 6810; CHECK-NEXT: retq 6811 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6812 %2 = sub <64 x i8> %a0, %a1 6813 ret <64 x i8> %2 6814} 6815 6816define <16 x i32> @stack_fold_psubd(<16 x i32> %a0, <16 x i32> %a1) { 6817; CHECK-LABEL: stack_fold_psubd: 6818; CHECK: # %bb.0: 6819; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6820; CHECK-NEXT: #APP 6821; CHECK-NEXT: nop 6822; CHECK-NEXT: #NO_APP 6823; CHECK-NEXT: vpsubd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 6824; CHECK-NEXT: retq 6825 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6826 %2 = sub <16 x i32> %a0, %a1 6827 ret <16 x i32> %2 6828} 6829 6830define <8 x i64> @stack_fold_psubq(<8 x i64> %a0, <8 x i64> %a1) { 6831; CHECK-LABEL: stack_fold_psubq: 6832; CHECK: # %bb.0: 6833; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6834; CHECK-NEXT: #APP 6835; CHECK-NEXT: nop 6836; CHECK-NEXT: #NO_APP 
6837; CHECK-NEXT: vpsubq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 6838; CHECK-NEXT: retq 6839 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6840 %2 = sub <8 x i64> %a0, %a1 6841 ret <8 x i64> %2 6842} 6843 6844define <64 x i8> @stack_fold_psubsb(<64 x i8> %a0, <64 x i8> %a1) { 6845; CHECK-LABEL: stack_fold_psubsb: 6846; CHECK: # %bb.0: 6847; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6848; CHECK-NEXT: #APP 6849; CHECK-NEXT: nop 6850; CHECK-NEXT: #NO_APP 6851; CHECK-NEXT: vpsubsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 6852; CHECK-NEXT: retq 6853 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6854 %2 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1) 6855 ret <64 x i8> %2 6856} 6857 6858define <32 x i16> @stack_fold_psubsw(<32 x i16> %a0, <32 x i16> %a1) { 6859; CHECK-LABEL: stack_fold_psubsw: 6860; CHECK: # %bb.0: 6861; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6862; CHECK-NEXT: #APP 6863; CHECK-NEXT: nop 6864; CHECK-NEXT: #NO_APP 6865; CHECK-NEXT: vpsubsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 6866; CHECK-NEXT: retq 6867 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6868 %2 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1) 6869 ret <32 x i16> %2 6870} 6871 6872define <64 x i8> @stack_fold_psubusb(<64 x i8> %a0, <64 x i8> %a1) { 6873; CHECK-LABEL: stack_fold_psubusb: 6874; CHECK: # %bb.0: 6875; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6876; CHECK-NEXT: #APP 6877; CHECK-NEXT: nop 6878; CHECK-NEXT: #NO_APP 6879; CHECK-NEXT: vpsubusb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 6880; CHECK-NEXT: retq 6881 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6882 %2 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1) 6883 ret <64 x i8> %2 6884} 6885 6886define <32 x i16> @stack_fold_psubusw(<32 x i16> %a0, <32 x i16> %a1) { 6887; CHECK-LABEL: stack_fold_psubusw: 6888; CHECK: # %bb.0: 6889; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6890; CHECK-NEXT: #APP 6891; CHECK-NEXT: nop 6892; CHECK-NEXT: #NO_APP 6893; CHECK-NEXT: vpsubusw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 6894; CHECK-NEXT: retq 6895 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6896 %2 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1) 6897 ret <32 x i16> %2 6898} 6899 6900define <32 x i16> @stack_fold_psubw(<32 x i16> %a0, <32 x i16> %a1) { 6901; CHECK-LABEL: stack_fold_psubw: 6902; CHECK: # %bb.0: 6903; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6904; CHECK-NEXT: #APP 6905; CHECK-NEXT: nop 6906; CHECK-NEXT: #NO_APP 6907; CHECK-NEXT: vpsubw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 6908; CHECK-NEXT: retq 6909 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6910 %2 = sub <32 x i16> %a0, %a1 6911 ret <32 x i16> %2 6912} 6913 6914define <8 x i64> @stack_fold_shufi64x2(<8 x i64> %a, <8 x i64> %b) { 6915; CHECK-LABEL: stack_fold_shufi64x2: 6916; CHECK: # %bb.0: 6917; CHECK-NEXT: subq $56, %rsp 6918; CHECK-NEXT: .cfi_def_cfa_offset 64 6919; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6920; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6921; CHECK-NEXT: #APP 6922; CHECK-NEXT: nop 6923; CHECK-NEXT: #NO_APP 6924; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 6925; CHECK-NEXT: vshufi64x2 $24, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 6926; CHECK-NEXT: # zmm0 = zmm0[0,1,4,5],mem[2,3,0,1] 6927; CHECK-NEXT: addq $56, %rsp 6928; CHECK-NEXT: .cfi_def_cfa_offset 8 6929; CHECK-NEXT: retq 6930 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6931 %2 = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9> 6932 ret <8 x i64> %2 6933} 6934 6935define <8 x i64> @stack_fold_shufi64x2_mask(<8 x i64> %a, <8 x i64> %b, i8 %mask, <8 x i64>* %passthru) { 6936; CHECK-LABEL: stack_fold_shufi64x2_mask: 6937; CHECK: # %bb.0: 6938; CHECK-NEXT: subq $56, %rsp 6939; CHECK-NEXT: .cfi_def_cfa_offset 64 6940; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6941; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6942; CHECK-NEXT: #APP 6943; CHECK-NEXT: nop 6944; CHECK-NEXT: #NO_APP 6945; CHECK-NEXT: kmovd %edi, %k1 6946; CHECK-NEXT: vmovdqa64 (%rsi), %zmm1 6947; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 6948; CHECK-NEXT: vshufi64x2 $24, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload 6949; CHECK-NEXT: # zmm1 {%k1} = zmm0[0,1,4,5],mem[2,3,0,1] 6950; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 6951; CHECK-NEXT: addq $56, %rsp 6952; CHECK-NEXT: .cfi_def_cfa_offset 8 6953; CHECK-NEXT: retq 6954 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6955 %2 = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9> 6956 %3 = bitcast i8 %mask to <8 x i1> 6957 ; load needed to keep the operation from being scheduled above the asm block 6958 %4 = load <8 x i64>, <8 x i64>* %passthru 6959 %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4 6960 ret <8 x i64> %5 6961} 6962 6963define <8 x i64> @stack_fold_shufi64x2_maskz(<8 x i64> %a, <8 x i64> %b, i8 %mask, <8 x i64>* %passthru) { 6964; CHECK-LABEL: stack_fold_shufi64x2_maskz: 6965; CHECK: # %bb.0: 6966; CHECK-NEXT: subq $56, %rsp 6967; CHECK-NEXT: .cfi_def_cfa_offset 64 6968; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6969; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6970; CHECK-NEXT: #APP 6971; CHECK-NEXT: nop 6972; CHECK-NEXT: #NO_APP 6973; CHECK-NEXT: kmovd %edi, %k1 6974; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 6975; CHECK-NEXT: vshufi64x2 $24, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 6976; CHECK-NEXT: # zmm0 {%k1} {z} = zmm0[0,1,4,5],mem[2,3,0,1] 6977; CHECK-NEXT: addq $56, %rsp 6978; CHECK-NEXT: .cfi_def_cfa_offset 8 6979; CHECK-NEXT: retq 6980 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6981 %2 = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9> 6982 %3 = bitcast i8 %mask to <8 x i1> 6983 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer 6984 ret <8 x i64> %4 6985} 6986 6987define <16 x i32> @stack_fold_shufi32x4_mask(<16 x i32> %a, <16 x i32> %b, i16 %mask, <16 x i32>* %passthru) { 6988; CHECK-LABEL: stack_fold_shufi32x4_mask: 6989; CHECK: # %bb.0: 6990; CHECK-NEXT: subq $56, %rsp 6991; CHECK-NEXT: .cfi_def_cfa_offset 64 6992; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6993; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6994; CHECK-NEXT: #APP 6995; CHECK-NEXT: nop 6996; CHECK-NEXT: #NO_APP 6997; CHECK-NEXT: kmovd %edi, %k1 6998; CHECK-NEXT: vmovdqa64 (%rsi), %zmm1 6999; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 7000; CHECK-NEXT: vshufi32x4 $20, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload 7001; CHECK-NEXT: # zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,7],mem[4,5,6,7,0,1,2,3] 7002; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 7003; CHECK-NEXT: addq $56, %rsp 7004; CHECK-NEXT: .cfi_def_cfa_offset 8 7005; CHECK-NEXT: retq 7006 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 7007 %2 = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, 
i32 16, i32 17, i32 18, i32 19> 7008 %3 = bitcast i16 %mask to <16 x i1> 7009 ; load needed to keep the operation from being scheduled above the asm block 7010 %4 = load <16 x i32>, <16 x i32>* %passthru 7011 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4 7012 ret <16 x i32> %5 7013} 7014 7015define <16 x i32> @stack_fold_shufi32x4_maskz(<16 x i32> %a, <16 x i32> %b, i16 %mask) { 7016; CHECK-LABEL: stack_fold_shufi32x4_maskz: 7017; CHECK: # %bb.0: 7018; CHECK-NEXT: subq $56, %rsp 7019; CHECK-NEXT: .cfi_def_cfa_offset 64 7020; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 7021; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 7022; CHECK-NEXT: #APP 7023; CHECK-NEXT: nop 7024; CHECK-NEXT: #NO_APP 7025; CHECK-NEXT: kmovd %edi, %k1 7026; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 7027; CHECK-NEXT: vshufi32x4 $20, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 7028; CHECK-NEXT: # zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],mem[4,5,6,7,0,1,2,3] 7029; CHECK-NEXT: addq $56, %rsp 7030; CHECK-NEXT: .cfi_def_cfa_offset 8 7031; CHECK-NEXT: retq 7032 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 7033 %2 = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19> 7034 %3 = bitcast i16 %mask to <16 x i1> 7035 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer 7036 ret <16 x i32> %4 7037} 7038 7039define <16 x i32> @stack_fold_ternlogd(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { 7040; CHECK-LABEL: stack_fold_ternlogd: 7041; CHECK: # %bb.0: 7042; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 7043; CHECK-NEXT: #APP 7044; CHECK-NEXT: nop 7045; CHECK-NEXT: #NO_APP 7046; CHECK-NEXT: vpternlogd $33, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload 7047; CHECK-NEXT: retq 7048 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 7049 %2 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33) 7050 ret <16 x i32> %2 7051} 7052declare <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32) 7053 7054define <8 x i64> @stack_fold_ternlogq(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) { 7055; CHECK-LABEL: stack_fold_ternlogq: 7056; CHECK: # %bb.0: 7057; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 7058; CHECK-NEXT: #APP 7059; CHECK-NEXT: nop 7060; CHECK-NEXT: #NO_APP 7061; CHECK-NEXT: vpternlogq $33, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload 7062; CHECK-NEXT: retq 7063 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 7064 %2 = 
call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33) 7065 ret <8 x i64> %2 7066} 7067 7068declare <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32) 7069 7070define <64 x i8> @stack_fold_punpckhbw_zmm(<64 x i8> %a0, <64 x i8> %a1) { 7071; CHECK-LABEL: stack_fold_punpckhbw_zmm: 7072; CHECK: # %bb.0: 7073; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 7074; CHECK-NEXT: #APP 7075; CHECK-NEXT: nop 7076; CHECK-NEXT: #NO_APP 7077; CHECK-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 7078; CHECK-NEXT: # zmm0 = zmm0[8],mem[8],zmm0[9],mem[9],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[12],mem[12],zmm0[13],mem[13],zmm0[14],mem[14],zmm0[15],mem[15],zmm0[24],mem[24],zmm0[25],mem[25],zmm0[26],mem[26],zmm0[27],mem[27],zmm0[28],mem[28],zmm0[29],mem[29],zmm0[30],mem[30],zmm0[31],mem[31],zmm0[40],mem[40],zmm0[41],mem[41],zmm0[42],mem[42],zmm0[43],mem[43],zmm0[44],mem[44],zmm0[45],mem[45],zmm0[46],mem[46],zmm0[47],mem[47],zmm0[56],mem[56],zmm0[57],mem[57],zmm0[58],mem[58],zmm0[59],mem[59],zmm0[60],mem[60],zmm0[61],mem[61],zmm0[62],mem[62],zmm0[63],mem[63] 7079; CHECK-NEXT: retq 7080 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 7081 %2 = shufflevector <64 x i8> %a0, <64 x i8> %a1, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127> 7082 ret <64 x i8> %2 7083} 7084 7085define <64 x i8> @stack_fold_punpckhbw_mask_zmm(<64 x i8>* %passthru, <64 x i8> %a0, <64 x i8> %a1, i64 %mask) { 7086; CHECK-LABEL: stack_fold_punpckhbw_mask_zmm: 7087; CHECK: # %bb.0: 7088; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 7089; CHECK-NEXT: #APP 7090; CHECK-NEXT: nop 7091; CHECK-NEXT: #NO_APP 7092; CHECK-NEXT: kmovq %rsi, %k1 7093; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 7094; CHECK-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 7095; CHECK-NEXT: # zmm2 {%k1} = zmm0[8],mem[8],zmm0[9],mem[9],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[12],mem[12],zmm0[13],mem[13],zmm0[14],mem[14],zmm0[15],mem[15],zmm0[24],mem[24],zmm0[25],mem[25],zmm0[26],mem[26],zmm0[27],mem[27],zmm0[28],mem[28],zmm0[29],mem[29],zmm0[30],mem[30],zmm0[31],mem[31],zmm0[40],mem[40],zmm0[41],mem[41],zmm0[42],mem[42],zmm0[43],mem[43],zmm0[44],mem[44],zmm0[45],mem[45],zmm0[46],mem[46],zmm0[47],mem[47],zmm0[56],mem[56],zmm0[57],mem[57],zmm0[58],mem[58],zmm0[59],mem[59],zmm0[60],mem[60],zmm0[61],mem[61],zmm0[62],mem[62],zmm0[63],mem[63] 7096; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 7097; CHECK-NEXT: retq 7098 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 7099 %2 = shufflevector <64 x i8> %a0, <64 x i8> %a1, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127> 7100 %3 = bitcast i64 %mask to <64 x i1> 7101 ; load needed to keep the operation from being scheduled about the asm block 7102 %4 = load <64 x i8>, <64 x i8>* %passthru 7103 %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4 7104 ret <64 x i8> %5 7105} 7106 7107define <64 x i8> @stack_fold_punpckhbw_maskz_zmm(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { 7108; CHECK-LABEL: stack_fold_punpckhbw_maskz_zmm: 7109; CHECK: # %bb.0: 7110; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 7111; CHECK-NEXT: #APP 7112; CHECK-NEXT: nop 7113; CHECK-NEXT: #NO_APP 7114; CHECK-NEXT: kmovq %rdi, %k1 7115; CHECK-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 7116; CHECK-NEXT: # zmm0 {%k1} {z} = zmm0[8],mem[8],zmm0[9],mem[9],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[12],mem[12],zmm0[13],mem[13],zmm0[14],mem[14],zmm0[15],mem[15],zmm0[24],mem[24],zmm0[25],mem[25],zmm0[26],mem[26],zmm0[27],mem[27],zmm0[28],mem[28],zmm0[29],mem[29],zmm0[30],mem[30],zmm0[31],mem[31],zmm0[40],mem[40],zmm0[41],mem[41],zmm0[42],mem[42],zmm0[43],mem[43],zmm0[44],mem[44],zmm0[45],mem[45],zmm0[46],mem[46],zmm0[47],mem[47],zmm0[56],mem[56],zmm0[57],mem[57],zmm0[58],mem[58],zmm0[59],mem[59],zmm0[60],mem[60],zmm0[61],mem[61],zmm0[62],mem[62],zmm0[63],mem[63] 7117; CHECK-NEXT: retq 7118 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 7119 %2 = shufflevector <64 x i8> %a0, <64 x i8> %a1, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127> 7120 %3 = bitcast i64 %mask to <64 x i1> 7121 %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer 7122 ret <64 x i8> %4 7123} 7124 7125define <16 x i32> @stack_fold_pxord(<16 x i32> %a0, <16 x i32> %a1) { 7126; CHECK-LABEL: stack_fold_pxord: 7127; CHECK: # %bb.0: 7128; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 7129; CHECK-NEXT: #APP 7130; CHECK-NEXT: nop 7131; CHECK-NEXT: #NO_APP 7132; CHECK-NEXT: vxorps 
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = xor <16 x i32> %a0, %a1
  ret <16 x i32> %2
}

define <16 x i32> @stack_fold_pxord_commuted(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: stack_fold_pxord_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = xor <16 x i32> %a1, %a0
  ret <16 x i32> %2
}

define <16 x i32> @stack_fold_pxord_mask(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) {
; CHECK-LABEL: stack_fold_pxord_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: vmovaps %zmm0, %zmm1
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vmovaps (%rdi), %zmm0
; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = xor <16 x i32> %a0, %a1
  %3 = bitcast i16 %mask to <16 x i1>
  ; load needed to keep the operation from being scheduled above the asm block
  %4 = load <16 x i32>, <16 x i32>* %a2
  %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
  ret <16 x i32> %5
}

define <16 x i32> @stack_fold_pxord_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) {
; CHECK-LABEL: stack_fold_pxord_mask_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: vmovaps %zmm0, %zmm1
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vmovaps (%rdi), %zmm0
; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = xor <16 x i32> %a1, %a0
  %3 = bitcast i16 %mask to <16 x i1>
  ; load needed to keep the operation from being scheduled above the asm block
  %4 = load <16 x i32>, <16 x i32>* %a2
  %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
  ret <16 x i32> %5
}

define <16 x i32> @stack_fold_pxord_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
; CHECK-LABEL: stack_fold_pxord_maskz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = xor <16 x i32> %a0, %a1
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
  ret <16 x i32> %4
}

define <16 x i32> @stack_fold_pxord_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
; CHECK-LABEL: stack_fold_pxord_maskz_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = xor <16 x i32> %a1, %a0
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
  ret <16 x i32> %4
}

define <8 x i64> @stack_fold_pxorq(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: stack_fold_pxorq:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = xor <8 x i64> %a0, %a1
  ret <8 x i64> %2
}

define <8 x i64> @stack_fold_pxorq_commuted(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: stack_fold_pxorq_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 7253 %2 = xor <8 x i64> %a1, %a0 7254 ret <8 x i64> %2 7255} 7256 7257define <8 x i64> @stack_fold_pxorq_mask(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) { 7258; CHECK-LABEL: stack_fold_pxorq_mask: 7259; CHECK: # %bb.0: 7260; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 7261; CHECK-NEXT: vmovapd %zmm0, %zmm1 7262; CHECK-NEXT: #APP 7263; CHECK-NEXT: nop 7264; CHECK-NEXT: #NO_APP 7265; CHECK-NEXT: kmovd %esi, %k1 7266; CHECK-NEXT: vmovapd (%rdi), %zmm0 7267; CHECK-NEXT: vxorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 7268; CHECK-NEXT: retq 7269 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 7270 %2 = xor <8 x i64> %a0, %a1 7271 %3 = bitcast i8 %mask to <8 x i1> 7272 ; load needed to keep the operation from being scheduled about the asm block 7273 %4 = load <8 x i64>, <8 x i64>* %a2 7274 %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4 7275 ret <8 x i64> %5 7276} 7277 7278define <8 x i64> @stack_fold_pxorq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) { 7279; CHECK-LABEL: stack_fold_pxorq_mask_commuted: 7280; CHECK: # %bb.0: 7281; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 7282; CHECK-NEXT: vmovapd %zmm0, %zmm1 7283; CHECK-NEXT: #APP 7284; CHECK-NEXT: nop 7285; CHECK-NEXT: #NO_APP 7286; CHECK-NEXT: kmovd %esi, %k1 7287; CHECK-NEXT: vmovapd (%rdi), %zmm0 7288; CHECK-NEXT: vxorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 7289; CHECK-NEXT: retq 7290 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 7291 %2 = xor <8 x i64> %a1, %a0 7292 %3 = bitcast i8 %mask to <8 x i1> 7293 ; load needed to keep the operation from being scheduled about the asm block 7294 %4 = load <8 x i64>, <8 x i64>* %a2 7295 %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4 7296 ret <8 x i64> %5 7297} 7298 7299define <8 x i64> @stack_fold_pxorq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { 7300; CHECK-LABEL: stack_fold_pxorq_maskz: 7301; CHECK: # %bb.0: 7302; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 7303; CHECK-NEXT: #APP 7304; CHECK-NEXT: nop 7305; CHECK-NEXT: #NO_APP 7306; CHECK-NEXT: kmovd %edi, %k1 7307; CHECK-NEXT: vxorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 7308; CHECK-NEXT: retq 7309 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 7310 %2 = xor <8 x i64> %a0, %a1 7311 %3 = bitcast i8 %mask to <8 x i1> 7312 %4 = select <8 x i1> 
  ret <8 x i64> %4
}

define <8 x i64> @stack_fold_pxorq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: stack_fold_pxorq_maskz_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vxorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = xor <8 x i64> %a1, %a0
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
  ret <8 x i64> %4
}

declare <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8>, <64 x i8>)
declare <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16>, <32 x i16>)
declare <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8>, <64 x i8>)
declare <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16>, <32 x i16>)
declare <16 x i32> @llvm.x86.avx512.conflict.d.512(<16 x i32>)
declare <8 x i64> @llvm.x86.avx512.conflict.q.512(<8 x i64>)
declare <16 x i32> @llvm.ctlz.v16i32(<16 x i32>, i1)
declare <8 x i64> @llvm.ctlz.v8i64(<8 x i64>, i1)
declare <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8>, <64 x i8>)
declare <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16>, <32 x i16>)
declare <64 x i8> @llvm.usub.sat.v64i8(<64 x i8>, <64 x i8>)
declare <32 x i16> @llvm.usub.sat.v32i16(<32 x i16>, <32 x i16>)