; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512F-32

; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512bw-builtins.c

;
; Signed Saturation
;

define <32 x i16> @test_mask_adds_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
; AVX512BW-LABEL: test_mask_adds_epi16_rr_512:
; AVX512BW:       ## %bb.0:
; AVX512BW-NEXT:    vpaddsw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512F-32-LABEL: test_mask_adds_epi16_rr_512:
; AVX512F-32:       # %bb.0:
; AVX512F-32-NEXT:    vpaddsw %zmm1, %zmm0, %zmm0
; AVX512F-32-NEXT:    retl
  %res = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a, <32 x i16> %b)
  ret <32 x i16> %res
}
declare <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16>, <32 x i16>)

define <32 x i16> @test_mask_adds_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) {
; AVX512BW-LABEL: test_mask_adds_epi16_rrk_512:
; AVX512BW:       ## %bb.0:
; AVX512BW-NEXT:    kmovd %edi, %k1
; AVX512BW-NEXT:    vpaddsw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512F-32-LABEL: test_mask_adds_epi16_rrk_512:
; AVX512F-32:       # %bb.0:
; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT:    vpaddsw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT:    vmovdqa64 %zmm2, %zmm0
; AVX512F-32-NEXT:    retl
  %1 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a, <32 x i16> %b)
  %2 = bitcast i32 %mask to <32 x i1>
  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru
  ret <32 x i16> %3
}

define <32 x i16> @test_mask_adds_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
; AVX512BW-LABEL: test_mask_adds_epi16_rrkz_512:
; AVX512BW:       ## %bb.0:
; AVX512BW-NEXT:    kmovd %edi, %k1
; AVX512BW-NEXT:    vpaddsw %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT:    retq
;
; AVX512F-32-LABEL: test_mask_adds_epi16_rrkz_512:
; AVX512F-32:       # %bb.0:
; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT:    vpaddsw %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT:    retl
  %1 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a, <32 x i16> %b)
  %2 = bitcast i32 %mask to <32 x i1>
  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer
  ret <32 x i16> %3
}

define <32 x i16> @test_mask_adds_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
; AVX512BW-LABEL: test_mask_adds_epi16_rm_512:
; AVX512BW:       ## %bb.0:
; AVX512BW-NEXT:    vpaddsw (%rdi), %zmm0, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512F-32-LABEL: test_mask_adds_epi16_rm_512:
; AVX512F-32:       # %bb.0:
; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT:    vpaddsw (%eax), %zmm0, %zmm0
; AVX512F-32-NEXT:    retl
  %b = load <32 x i16>, <32 x i16>* %ptr_b
  %1 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a, <32 x i16> %b)
  ret <32 x i16> %1
}

define <32 x i16> @test_mask_adds_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
; AVX512BW-LABEL: test_mask_adds_epi16_rmk_512:
; AVX512BW:       ## %bb.0:
; AVX512BW-NEXT:    kmovd %esi, %k1
; AVX512BW-NEXT:    vpaddsw (%rdi), %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512F-32-LABEL: test_mask_adds_epi16_rmk_512:
; AVX512F-32:       # %bb.0:
; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT:    vpaddsw (%eax), %zmm0, %zmm1 {%k1}
; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm0
; AVX512F-32-NEXT:    retl
  %b = load <32 x i16>, <32 x i16>* %ptr_b
  %1 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a, <32 x i16> %b)
  %2 = bitcast i32 %mask to <32 x i1>
  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru
  ret <32 x i16> %3
}

define <32 x i16> @test_mask_adds_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) {
; AVX512BW-LABEL: test_mask_adds_epi16_rmkz_512:
; AVX512BW:       ## %bb.0:
; AVX512BW-NEXT:    kmovd %esi, %k1
; AVX512BW-NEXT:    vpaddsw (%rdi), %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT:    retq
;
; AVX512F-32-LABEL: test_mask_adds_epi16_rmkz_512:
; AVX512F-32:       # %bb.0:
; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT:    vpaddsw (%eax), %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT:    retl
  %b = load <32 x i16>, <32 x i16>* %ptr_b
  %1 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a, <32 x i16> %b)
  %2 = bitcast i32 %mask to <32 x i1>
  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer
  ret <32 x i16> %3
}

define <32 x i16> @test_mask_subs_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
; AVX512BW-LABEL: test_mask_subs_epi16_rr_512:
; AVX512BW:       ## %bb.0:
; AVX512BW-NEXT:    vpsubsw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512F-32-LABEL: test_mask_subs_epi16_rr_512:
; AVX512F-32:       # %bb.0:
; AVX512F-32-NEXT:    vpsubsw %zmm1, %zmm0, %zmm0
; AVX512F-32-NEXT:    retl
  %sub = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> %a, <32 x i16> %b)
  ret <32 x i16> %sub
}
declare <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16>, <32 x i16>)

define <32 x i16> @test_mask_subs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) {
; AVX512BW-LABEL: test_mask_subs_epi16_rrk_512:
; AVX512BW:       ## %bb.0:
; AVX512BW-NEXT:    kmovd %edi, %k1
; AVX512BW-NEXT:    vpsubsw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512F-32-LABEL: test_mask_subs_epi16_rrk_512:
; AVX512F-32:       # %bb.0:
; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT:    vpsubsw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT:    vmovdqa64 %zmm2, %zmm0
; AVX512F-32-NEXT:    retl
  %sub = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> %a, <32 x i16> %b)
  %bc = bitcast i32 %mask to <32 x i1>
  %res = select <32 x i1> %bc, <32 x i16> %sub, <32 x i16> %passThru
  ret <32 x i16> %res
}

define <32 x i16> @test_mask_subs_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
; AVX512BW-LABEL: test_mask_subs_epi16_rrkz_512:
; AVX512BW:       ## %bb.0:
; AVX512BW-NEXT:    kmovd %edi, %k1
; AVX512BW-NEXT:    vpsubsw %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT:    retq
;
; AVX512F-32-LABEL: test_mask_subs_epi16_rrkz_512:
; AVX512F-32:       # %bb.0:
; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT:    vpsubsw %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT:    retl
  %sub = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> %a, <32 x i16> %b)
  %bc = bitcast i32 %mask to <32 x i1>
  %res = select <32 x i1> %bc, <32 x i16> %sub, <32 x i16> zeroinitializer
  ret <32 x i16> %res
}

define <32 x i16> @test_mask_subs_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
; AVX512BW-LABEL: test_mask_subs_epi16_rm_512:
; AVX512BW:       ## %bb.0:
; AVX512BW-NEXT:    vpsubsw (%rdi), %zmm0, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512F-32-LABEL: test_mask_subs_epi16_rm_512:
; AVX512F-32:       # %bb.0:
; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT:    vpsubsw (%eax), %zmm0, %zmm0
; AVX512F-32-NEXT:    retl
  %b = load <32 x i16>, <32 x i16>* %ptr_b
  %sub = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> %a, <32 x i16> %b)
  ret <32 x i16> %sub
}

define <32 x i16> @test_mask_subs_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
; AVX512BW-LABEL: test_mask_subs_epi16_rmk_512:
; AVX512BW:       ## %bb.0:
; AVX512BW-NEXT:    kmovd %esi, %k1
; AVX512BW-NEXT:    vpsubsw (%rdi), %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512F-32-LABEL: test_mask_subs_epi16_rmk_512:
; AVX512F-32:       # %bb.0:
; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT:    vpsubsw (%eax), %zmm0, %zmm1 {%k1}
; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm0
; AVX512F-32-NEXT:    retl
  %b = load <32 x i16>, <32 x i16>* %ptr_b
  %sub = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> %a, <32 x i16> %b)
  %bc = bitcast i32 %mask to <32 x i1>
  %res = select <32 x i1> %bc, <32 x i16> %sub, <32 x i16> %passThru
  ret <32 x i16> %res
}

define <32 x i16> @test_mask_subs_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) {
; AVX512BW-LABEL: test_mask_subs_epi16_rmkz_512:
; AVX512BW:       ## %bb.0:
; AVX512BW-NEXT:    kmovd %esi, %k1
; AVX512BW-NEXT:    vpsubsw (%rdi), %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT:    retq
;
; AVX512F-32-LABEL: test_mask_subs_epi16_rmkz_512:
; AVX512F-32:       # %bb.0:
; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT:    vpsubsw (%eax), %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT:    retl
  %b = load <32 x i16>, <32 x i16>* %ptr_b
  %sub = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> %a, <32 x i16> %b)
  %bc = bitcast i32 %mask to <32 x i1>
  %res = select <32 x i1> %bc, <32 x i16> %sub, <32 x i16> zeroinitializer
  ret <32 x i16> %res
}


define <64 x i16> @test_mask_adds_epi16_rr_1024(<64 x i16> %a, <64 x i16> %b) {
; AVX512BW-LABEL: test_mask_adds_epi16_rr_1024:
; AVX512BW:       ## %bb.0:
; AVX512BW-NEXT:    vpaddsw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vpaddsw %zmm3, %zmm1, %zmm1
; AVX512BW-NEXT:    retq
;
; AVX512F-32-LABEL: test_mask_adds_epi16_rr_1024:
; AVX512F-32:       # %bb.0:
; AVX512F-32-NEXT:    pushl %ebp
; AVX512F-32-NEXT:    .cfi_def_cfa_offset 8
; AVX512F-32-NEXT:    .cfi_offset %ebp, -8
; AVX512F-32-NEXT:    movl %esp, %ebp
; AVX512F-32-NEXT:    .cfi_def_cfa_register %ebp
; AVX512F-32-NEXT:    andl $-64, %esp
; AVX512F-32-NEXT:    subl $64, %esp
; AVX512F-32-NEXT:    vpaddsw %zmm2, %zmm0, %zmm0
; AVX512F-32-NEXT:    vpaddsw 8(%ebp), %zmm1, %zmm1
; AVX512F-32-NEXT:    movl %ebp, %esp
; AVX512F-32-NEXT:    popl %ebp
; AVX512F-32-NEXT:    .cfi_def_cfa %esp, 4
; AVX512F-32-NEXT:    retl
  %1 = call <64 x i16> @llvm.sadd.sat.v64i16(<64 x i16> %a, <64 x i16> %b)
  ret <64 x i16> %1
}
declare <64 x i16> @llvm.sadd.sat.v64i16(<64 x i16>, <64 x i16>)

define <64 x i16> @test_mask_subs_epi16_rr_1024(<64 x i16> %a, <64 x i16> %b) {
; AVX512BW-LABEL: test_mask_subs_epi16_rr_1024:
; AVX512BW:       ## %bb.0:
; AVX512BW-NEXT:    vpsubsw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vpsubsw %zmm3, %zmm1, %zmm1
; AVX512BW-NEXT:    retq
;
; AVX512F-32-LABEL: test_mask_subs_epi16_rr_1024:
; AVX512F-32:       # %bb.0:
; AVX512F-32-NEXT:    pushl %ebp
; AVX512F-32-NEXT:    .cfi_def_cfa_offset 8
; AVX512F-32-NEXT:    .cfi_offset %ebp, -8
; AVX512F-32-NEXT:    movl %esp, %ebp
; AVX512F-32-NEXT:    .cfi_def_cfa_register %ebp
; AVX512F-32-NEXT:    andl $-64, %esp
; AVX512F-32-NEXT:    subl $64, %esp
; AVX512F-32-NEXT:    vpsubsw %zmm2, %zmm0, %zmm0
; AVX512F-32-NEXT:    vpsubsw 8(%ebp), %zmm1, %zmm1
; AVX512F-32-NEXT:    movl %ebp, %esp
; AVX512F-32-NEXT:    popl %ebp
; AVX512F-32-NEXT:    .cfi_def_cfa %esp, 4
; AVX512F-32-NEXT:    retl
  %sub = call <64 x i16> @llvm.ssub.sat.v64i16(<64 x i16> %a, <64 x i16> %b)
  ret <64 x i16> %sub
}
declare <64 x i16> @llvm.ssub.sat.v64i16(<64 x i16>, <64 x i16>)

;
; Unsigned Saturation
;

define <32 x i16> @test_mask_adds_epu16_rr_512(<32 x i16> %a, <32 x i16> %b) {
; AVX512BW-LABEL: test_mask_adds_epu16_rr_512:
; AVX512BW:       ## %bb.0:
; AVX512BW-NEXT:    vpaddusw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512F-32-LABEL: test_mask_adds_epu16_rr_512:
; AVX512F-32:       # %bb.0:
; AVX512F-32-NEXT:    vpaddusw %zmm1, %zmm0, %zmm0
; AVX512F-32-NEXT:    retl
  %res = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %a, <32 x i16> %b)
  ret <32 x i16> %res
}
declare <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16>, <32 x i16>)

define <32 x i16> @test_mask_adds_epu16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) {
; AVX512BW-LABEL: test_mask_adds_epu16_rrk_512:
; AVX512BW:       ## %bb.0:
; AVX512BW-NEXT:    kmovd %edi, %k1
; AVX512BW-NEXT:    vpaddusw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512F-32-LABEL: test_mask_adds_epu16_rrk_512:
; AVX512F-32:       # %bb.0:
; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT:    vpaddusw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT:    vmovdqa64 %zmm2, %zmm0
; AVX512F-32-NEXT:    retl
  %1 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %a, <32 x i16> %b)
  %2 = bitcast i32 %mask to <32 x i1>
  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru
  ret <32 x i16> %3
}

define <32 x i16> @test_mask_adds_epu16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
; AVX512BW-LABEL: test_mask_adds_epu16_rrkz_512:
; AVX512BW:       ## %bb.0:
; AVX512BW-NEXT:    kmovd %edi, %k1
; AVX512BW-NEXT:    vpaddusw %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT:    retq
;
; AVX512F-32-LABEL: test_mask_adds_epu16_rrkz_512:
; AVX512F-32:       # %bb.0:
; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT:    vpaddusw %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT:    retl
  %1 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %a, <32 x i16> %b)
  %2 = bitcast i32 %mask to <32 x i1>
  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer
  ret <32 x i16> %3
}

define <32 x i16> @test_mask_adds_epu16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
; AVX512BW-LABEL: test_mask_adds_epu16_rm_512:
; AVX512BW:       ## %bb.0:
; AVX512BW-NEXT:    vpaddusw (%rdi), %zmm0, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512F-32-LABEL: test_mask_adds_epu16_rm_512:
; AVX512F-32:       # %bb.0:
; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT:    vpaddusw (%eax), %zmm0, %zmm0
; AVX512F-32-NEXT:    retl
  %b = load <32 x i16>, <32 x i16>* %ptr_b
  %1 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %a, <32 x i16> %b)
  ret <32 x i16> %1
}

define <32 x i16> @test_mask_adds_epu16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
; AVX512BW-LABEL: test_mask_adds_epu16_rmk_512:
; AVX512BW:       ## %bb.0:
; AVX512BW-NEXT:    kmovd %esi, %k1
; AVX512BW-NEXT:    vpaddusw (%rdi), %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512F-32-LABEL: test_mask_adds_epu16_rmk_512:
; AVX512F-32:       # %bb.0:
; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT:    vpaddusw (%eax), %zmm0, %zmm1 {%k1}
; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm0
; AVX512F-32-NEXT:    retl
  %b = load <32 x i16>, <32 x i16>* %ptr_b
  %1 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %a, <32 x i16> %b)
  %2 = bitcast i32 %mask to <32 x i1>
  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru
  ret <32 x i16> %3
}

define <32 x i16> @test_mask_adds_epu16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) {
; AVX512BW-LABEL: test_mask_adds_epu16_rmkz_512:
; AVX512BW:       ## %bb.0:
; AVX512BW-NEXT:    kmovd %esi, %k1
; AVX512BW-NEXT:    vpaddusw (%rdi), %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT:    retq
;
; AVX512F-32-LABEL: test_mask_adds_epu16_rmkz_512:
; AVX512F-32:       # %bb.0:
; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT:    vpaddusw (%eax), %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT:    retl
  %b = load <32 x i16>, <32 x i16>* %ptr_b
  %1 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %a, <32 x i16> %b)
  %2 = bitcast i32 %mask to <32 x i1>
  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer
  ret <32 x i16> %3
}

define <32 x i16> @test_mask_subs_epu16_rr_512(<32 x i16> %a, <32 x i16> %b) {
; AVX512BW-LABEL: test_mask_subs_epu16_rr_512:
; AVX512BW:       ## %bb.0:
; AVX512BW-NEXT:    vpsubusw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512F-32-LABEL: test_mask_subs_epu16_rr_512:
; AVX512F-32:       # %bb.0:
; AVX512F-32-NEXT:    vpsubusw %zmm1, %zmm0, %zmm0
; AVX512F-32-NEXT:    retl
  %sub = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> %a, <32 x i16> %b)
  ret <32 x i16> %sub
}
declare <32 x i16> @llvm.usub.sat.v32i16(<32 x i16>, <32 x i16>)

define <32 x i16> @test_mask_subs_epu16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) {
; AVX512BW-LABEL: test_mask_subs_epu16_rrk_512:
; AVX512BW:       ## %bb.0:
; AVX512BW-NEXT:    kmovd %edi, %k1
; AVX512BW-NEXT:    vpsubusw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512F-32-LABEL: test_mask_subs_epu16_rrk_512:
; AVX512F-32:       # %bb.0:
; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT:    vpsubusw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT:    vmovdqa64 %zmm2, %zmm0
; AVX512F-32-NEXT:    retl
  %sub = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> %a, <32 x i16> %b)
  %bc = bitcast i32 %mask to <32 x i1>
  %res = select <32 x i1> %bc, <32 x i16> %sub, <32 x i16> %passThru
  ret <32 x i16> %res
}

define <32 x i16> @test_mask_subs_epu16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
; AVX512BW-LABEL: test_mask_subs_epu16_rrkz_512:
; AVX512BW:       ## %bb.0:
; AVX512BW-NEXT:    kmovd %edi, %k1
; AVX512BW-NEXT:    vpsubusw %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT:    retq
;
; AVX512F-32-LABEL: test_mask_subs_epu16_rrkz_512:
; AVX512F-32:       # %bb.0:
; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT:    vpsubusw %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT:    retl
  %sub = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> %a, <32 x i16> %b)
  %bc = bitcast i32 %mask to <32 x i1>
  %res = select <32 x i1> %bc, <32 x i16> %sub, <32 x i16> zeroinitializer
  ret <32 x i16> %res
}

define <32 x i16> @test_mask_subs_epu16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
; AVX512BW-LABEL: test_mask_subs_epu16_rm_512:
; AVX512BW:       ## %bb.0:
; AVX512BW-NEXT:    vpsubusw (%rdi), %zmm0, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512F-32-LABEL: test_mask_subs_epu16_rm_512:
; AVX512F-32:       # %bb.0:
; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT:    vpsubusw (%eax), %zmm0, %zmm0
; AVX512F-32-NEXT:    retl
  %b = load <32 x i16>, <32 x i16>* %ptr_b
  %sub = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> %a, <32 x i16> %b)
  ret <32 x i16> %sub
}

define <32 x i16> @test_mask_subs_epu16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
; AVX512BW-LABEL: test_mask_subs_epu16_rmk_512:
; AVX512BW:       ## %bb.0:
; AVX512BW-NEXT:    kmovd %esi, %k1
; AVX512BW-NEXT:    vpsubusw (%rdi), %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512F-32-LABEL: test_mask_subs_epu16_rmk_512:
; AVX512F-32:       # %bb.0:
; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT:    vpsubusw (%eax), %zmm0, %zmm1 {%k1}
; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm0
; AVX512F-32-NEXT:    retl
  %b = load <32 x i16>, <32 x i16>* %ptr_b
  %sub = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> %a, <32 x i16> %b)
  %bc = bitcast i32 %mask to <32 x i1>
  %res = select <32 x i1> %bc, <32 x i16> %sub, <32 x i16> %passThru
  ret <32 x i16> %res
}

define <32 x i16> @test_mask_subs_epu16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) {
; AVX512BW-LABEL: test_mask_subs_epu16_rmkz_512:
; AVX512BW:       ## %bb.0:
; AVX512BW-NEXT:    kmovd %esi, %k1
; AVX512BW-NEXT:    vpsubusw (%rdi), %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT:    retq
;
; AVX512F-32-LABEL: test_mask_subs_epu16_rmkz_512:
; AVX512F-32:       # %bb.0:
; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT:    vpsubusw (%eax), %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT:    retl
  %b = load <32 x i16>, <32 x i16>* %ptr_b
  %sub = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> %a, <32 x i16> %b)
  %bc = bitcast i32 %mask to <32 x i1>
  %res = select <32 x i1> %bc, <32 x i16> %sub, <32 x i16> zeroinitializer
  ret <32 x i16> %res
}


define <64 x i16> @test_mask_adds_epu16_rr_1024(<64 x i16> %a, <64 x i16> %b) {
; AVX512BW-LABEL: test_mask_adds_epu16_rr_1024:
; AVX512BW:       ## %bb.0:
; AVX512BW-NEXT:    vpaddusw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vpaddusw %zmm3, %zmm1, %zmm1
; AVX512BW-NEXT:    retq
;
; AVX512F-32-LABEL: test_mask_adds_epu16_rr_1024:
; AVX512F-32:       # %bb.0:
; AVX512F-32-NEXT:    pushl %ebp
; AVX512F-32-NEXT:    .cfi_def_cfa_offset 8
; AVX512F-32-NEXT:    .cfi_offset %ebp, -8
; AVX512F-32-NEXT:    movl %esp, %ebp
; AVX512F-32-NEXT:    .cfi_def_cfa_register %ebp
; AVX512F-32-NEXT:    andl $-64, %esp
; AVX512F-32-NEXT:    subl $64, %esp
; AVX512F-32-NEXT:    vpaddusw %zmm2, %zmm0, %zmm0
; AVX512F-32-NEXT:    vpaddusw 8(%ebp), %zmm1, %zmm1
; AVX512F-32-NEXT:    movl %ebp, %esp
; AVX512F-32-NEXT:    popl %ebp
; AVX512F-32-NEXT:    .cfi_def_cfa %esp, 4
; AVX512F-32-NEXT:    retl
  %1 = call <64 x i16> @llvm.uadd.sat.v64i16(<64 x i16> %a, <64 x i16> %b)
  ret <64 x i16> %1
}
declare <64 x i16> @llvm.uadd.sat.v64i16(<64 x i16>, <64 x i16>)

define <64 x i16> @test_mask_subs_epu16_rr_1024(<64 x i16> %a, <64 x i16> %b) {
; AVX512BW-LABEL: test_mask_subs_epu16_rr_1024:
; AVX512BW:       ## %bb.0:
; AVX512BW-NEXT:    vpsubusw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vpsubusw %zmm3, %zmm1, %zmm1
; AVX512BW-NEXT:    retq
;
; AVX512F-32-LABEL: test_mask_subs_epu16_rr_1024:
; AVX512F-32:       # %bb.0:
; AVX512F-32-NEXT:    pushl %ebp
; AVX512F-32-NEXT:    .cfi_def_cfa_offset 8
; AVX512F-32-NEXT:    .cfi_offset %ebp, -8
; AVX512F-32-NEXT:    movl %esp, %ebp
; AVX512F-32-NEXT:    .cfi_def_cfa_register %ebp
; AVX512F-32-NEXT:    andl $-64, %esp
; AVX512F-32-NEXT:    subl $64, %esp
; AVX512F-32-NEXT:    vpsubusw %zmm2, %zmm0, %zmm0
; AVX512F-32-NEXT:    vpsubusw 8(%ebp), %zmm1, %zmm1
; AVX512F-32-NEXT:    movl %ebp, %esp
; AVX512F-32-NEXT:    popl %ebp
; AVX512F-32-NEXT:    .cfi_def_cfa %esp, 4
; AVX512F-32-NEXT:    retl
  %sub = call <64 x i16> @llvm.usub.sat.v64i16(<64 x i16> %a, <64 x i16> %b)
  ret <64 x i16> %sub
}
declare <64 x i16> @llvm.usub.sat.v64i16(<64 x i16>, <64 x i16>)