; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=AVX
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL
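
; A note on the SSE4.1/AVX round-control immediates used throughout: bits 1:0
; pick the rounding mode (00 = nearest, 01 = down, 10 = up, 11 = truncate),
; bit 2 selects the current MXCSR rounding mode instead, and bit 3 suppresses
; the precision (inexact) exception. Hence $9 = floor, $10 = ceil, $11 = trunc,
; $4 = rint (MXCSR mode, inexact reported) and $12 = nearbyint (MXCSR mode,
; inexact suppressed). AVX512 lowers the 512-bit cases to vrndscaleps/vrndscalepd,
; whose low four immediate bits have the same meaning.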

define <2 x double> @floor_v2f64(<2 x double> %p) {
; SSE41-LABEL: floor_v2f64:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundpd $9, %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: floor_v2f64:
; AVX: ## %bb.0:
; AVX-NEXT: vroundpd $9, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: floor_v2f64:
; AVX512: ## %bb.0:
; AVX512-NEXT: vroundpd $9, %xmm0, %xmm0
; AVX512-NEXT: retq
  %t = call <2 x double> @llvm.floor.v2f64(<2 x double> %p)
  ret <2 x double> %t
}
declare <2 x double> @llvm.floor.v2f64(<2 x double> %p)

define <4 x float> @floor_v4f32(<4 x float> %p) {
; SSE41-LABEL: floor_v4f32:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundps $9, %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: floor_v4f32:
; AVX: ## %bb.0:
; AVX-NEXT: vroundps $9, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: floor_v4f32:
; AVX512: ## %bb.0:
; AVX512-NEXT: vroundps $9, %xmm0, %xmm0
; AVX512-NEXT: retq
  %t = call <4 x float> @llvm.floor.v4f32(<4 x float> %p)
  ret <4 x float> %t
}
declare <4 x float> @llvm.floor.v4f32(<4 x float> %p)

define <4 x double> @floor_v4f64(<4 x double> %p){
; SSE41-LABEL: floor_v4f64:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundpd $9, %xmm0, %xmm0
; SSE41-NEXT: roundpd $9, %xmm1, %xmm1
; SSE41-NEXT: retq
;
; AVX-LABEL: floor_v4f64:
; AVX: ## %bb.0:
; AVX-NEXT: vroundpd $9, %ymm0, %ymm0
; AVX-NEXT: retq
;
; AVX512-LABEL: floor_v4f64:
; AVX512: ## %bb.0:
; AVX512-NEXT: vroundpd $9, %ymm0, %ymm0
; AVX512-NEXT: retq
  %t = call <4 x double> @llvm.floor.v4f64(<4 x double> %p)
  ret <4 x double> %t
}
declare <4 x double> @llvm.floor.v4f64(<4 x double> %p)

define <8 x float> @floor_v8f32(<8 x float> %p) {
; SSE41-LABEL: floor_v8f32:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundps $9, %xmm0, %xmm0
; SSE41-NEXT: roundps $9, %xmm1, %xmm1
; SSE41-NEXT: retq
;
; AVX-LABEL: floor_v8f32:
; AVX: ## %bb.0:
; AVX-NEXT: vroundps $9, %ymm0, %ymm0
; AVX-NEXT: retq
;
; AVX512-LABEL: floor_v8f32:
; AVX512: ## %bb.0:
; AVX512-NEXT: vroundps $9, %ymm0, %ymm0
; AVX512-NEXT: retq
  %t = call <8 x float> @llvm.floor.v8f32(<8 x float> %p)
  ret <8 x float> %t
}
declare <8 x float> @llvm.floor.v8f32(<8 x float> %p)

define <8 x double> @floor_v8f64(<8 x double> %p){
; SSE41-LABEL: floor_v8f64:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundpd $9, %xmm0, %xmm0
; SSE41-NEXT: roundpd $9, %xmm1, %xmm1
; SSE41-NEXT: roundpd $9, %xmm2, %xmm2
; SSE41-NEXT: roundpd $9, %xmm3, %xmm3
; SSE41-NEXT: retq
;
; AVX-LABEL: floor_v8f64:
; AVX: ## %bb.0:
; AVX-NEXT: vroundpd $9, %ymm0, %ymm0
; AVX-NEXT: vroundpd $9, %ymm1, %ymm1
; AVX-NEXT: retq
;
; AVX512-LABEL: floor_v8f64:
; AVX512: ## %bb.0:
; AVX512-NEXT: vrndscalepd $9, %zmm0, %zmm0
; AVX512-NEXT: retq
  %t = call <8 x double> @llvm.floor.v8f64(<8 x double> %p)
  ret <8 x double> %t
}
declare <8 x double> @llvm.floor.v8f64(<8 x double> %p)

define <16 x float> @floor_v16f32(<16 x float> %p) {
; SSE41-LABEL: floor_v16f32:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundps $9, %xmm0, %xmm0
; SSE41-NEXT: roundps $9, %xmm1, %xmm1
; SSE41-NEXT: roundps $9, %xmm2, %xmm2
; SSE41-NEXT: roundps $9, %xmm3, %xmm3
; SSE41-NEXT: retq
;
; AVX-LABEL: floor_v16f32:
; AVX: ## %bb.0:
; AVX-NEXT: vroundps $9, %ymm0, %ymm0
; AVX-NEXT: vroundps $9, %ymm1, %ymm1
; AVX-NEXT: retq
;
; AVX512-LABEL: floor_v16f32:
; AVX512: ## %bb.0:
; AVX512-NEXT: vrndscaleps $9, %zmm0, %zmm0
; AVX512-NEXT: retq
  %t = call <16 x float> @llvm.floor.v16f32(<16 x float> %p)
  ret <16 x float> %t
}
declare <16 x float> @llvm.floor.v16f32(<16 x float> %p)

define <2 x double> @ceil_v2f64(<2 x double> %p) {
; SSE41-LABEL: ceil_v2f64:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundpd $10, %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: ceil_v2f64:
; AVX: ## %bb.0:
; AVX-NEXT: vroundpd $10, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: ceil_v2f64:
; AVX512: ## %bb.0:
; AVX512-NEXT: vroundpd $10, %xmm0, %xmm0
; AVX512-NEXT: retq
  %t = call <2 x double> @llvm.ceil.v2f64(<2 x double> %p)
  ret <2 x double> %t
}
declare <2 x double> @llvm.ceil.v2f64(<2 x double> %p)
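
; The _load variants check that the unaligned vector load is folded into the
; rounding instruction's memory operand on AVX targets, while SSE4.1 needs a
; separate movupd/movups first.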

define <2 x double> @ceil_v2f64_load(<2 x double>* %ptr) {
; SSE41-LABEL: ceil_v2f64_load:
; SSE41: ## %bb.0:
; SSE41-NEXT: movupd (%rdi), %xmm0
; SSE41-NEXT: roundpd $10, %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: ceil_v2f64_load:
; AVX: ## %bb.0:
; AVX-NEXT: vroundpd $10, (%rdi), %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: ceil_v2f64_load:
; AVX512: ## %bb.0:
; AVX512-NEXT: vroundpd $10, (%rdi), %xmm0
; AVX512-NEXT: retq
  %p = load <2 x double>, <2 x double>* %ptr, align 1
  %t = call <2 x double> @llvm.ceil.v2f64(<2 x double> %p)
  ret <2 x double> %t
}

define <4 x float> @ceil_v4f32(<4 x float> %p) {
; SSE41-LABEL: ceil_v4f32:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundps $10, %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: ceil_v4f32:
; AVX: ## %bb.0:
; AVX-NEXT: vroundps $10, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: ceil_v4f32:
; AVX512: ## %bb.0:
; AVX512-NEXT: vroundps $10, %xmm0, %xmm0
; AVX512-NEXT: retq
  %t = call <4 x float> @llvm.ceil.v4f32(<4 x float> %p)
  ret <4 x float> %t
}
declare <4 x float> @llvm.ceil.v4f32(<4 x float> %p)

define <4 x float> @ceil_v4f32_load(<4 x float>* %ptr) {
; SSE41-LABEL: ceil_v4f32_load:
; SSE41: ## %bb.0:
; SSE41-NEXT: movups (%rdi), %xmm0
; SSE41-NEXT: roundps $10, %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: ceil_v4f32_load:
; AVX: ## %bb.0:
; AVX-NEXT: vroundps $10, (%rdi), %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: ceil_v4f32_load:
; AVX512: ## %bb.0:
; AVX512-NEXT: vroundps $10, (%rdi), %xmm0
; AVX512-NEXT: retq
  %p = load <4 x float>, <4 x float>* %ptr, align 1
  %t = call <4 x float> @llvm.ceil.v4f32(<4 x float> %p)
  ret <4 x float> %t
}

define <4 x double> @ceil_v4f64(<4 x double> %p) {
; SSE41-LABEL: ceil_v4f64:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundpd $10, %xmm0, %xmm0
; SSE41-NEXT: roundpd $10, %xmm1, %xmm1
; SSE41-NEXT: retq
;
; AVX-LABEL: ceil_v4f64:
; AVX: ## %bb.0:
; AVX-NEXT: vroundpd $10, %ymm0, %ymm0
; AVX-NEXT: retq
;
; AVX512-LABEL: ceil_v4f64:
; AVX512: ## %bb.0:
; AVX512-NEXT: vroundpd $10, %ymm0, %ymm0
; AVX512-NEXT: retq
  %t = call <4 x double> @llvm.ceil.v4f64(<4 x double> %p)
  ret <4 x double> %t
}
declare <4 x double> @llvm.ceil.v4f64(<4 x double> %p)

define <8 x float> @ceil_v8f32(<8 x float> %p) {
; SSE41-LABEL: ceil_v8f32:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundps $10, %xmm0, %xmm0
; SSE41-NEXT: roundps $10, %xmm1, %xmm1
; SSE41-NEXT: retq
;
; AVX-LABEL: ceil_v8f32:
; AVX: ## %bb.0:
; AVX-NEXT: vroundps $10, %ymm0, %ymm0
; AVX-NEXT: retq
;
; AVX512-LABEL: ceil_v8f32:
; AVX512: ## %bb.0:
; AVX512-NEXT: vroundps $10, %ymm0, %ymm0
; AVX512-NEXT: retq
  %t = call <8 x float> @llvm.ceil.v8f32(<8 x float> %p)
  ret <8 x float> %t
}
declare <8 x float> @llvm.ceil.v8f32(<8 x float> %p)

define <8 x double> @ceil_v8f64(<8 x double> %p){
; SSE41-LABEL: ceil_v8f64:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundpd $10, %xmm0, %xmm0
; SSE41-NEXT: roundpd $10, %xmm1, %xmm1
; SSE41-NEXT: roundpd $10, %xmm2, %xmm2
; SSE41-NEXT: roundpd $10, %xmm3, %xmm3
; SSE41-NEXT: retq
;
; AVX-LABEL: ceil_v8f64:
; AVX: ## %bb.0:
; AVX-NEXT: vroundpd $10, %ymm0, %ymm0
; AVX-NEXT: vroundpd $10, %ymm1, %ymm1
; AVX-NEXT: retq
;
; AVX512-LABEL: ceil_v8f64:
; AVX512: ## %bb.0:
; AVX512-NEXT: vrndscalepd $10, %zmm0, %zmm0
; AVX512-NEXT: retq
  %t = call <8 x double> @llvm.ceil.v8f64(<8 x double> %p)
  ret <8 x double> %t
}
declare <8 x double> @llvm.ceil.v8f64(<8 x double> %p)

define <16 x float> @ceil_v16f32(<16 x float> %p) {
; SSE41-LABEL: ceil_v16f32:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundps $10, %xmm0, %xmm0
; SSE41-NEXT: roundps $10, %xmm1, %xmm1
; SSE41-NEXT: roundps $10, %xmm2, %xmm2
; SSE41-NEXT: roundps $10, %xmm3, %xmm3
; SSE41-NEXT: retq
;
; AVX-LABEL: ceil_v16f32:
; AVX: ## %bb.0:
; AVX-NEXT: vroundps $10, %ymm0, %ymm0
; AVX-NEXT: vroundps $10, %ymm1, %ymm1
; AVX-NEXT: retq
;
; AVX512-LABEL: ceil_v16f32:
; AVX512: ## %bb.0:
; AVX512-NEXT: vrndscaleps $10, %zmm0, %zmm0
; AVX512-NEXT: retq
  %t = call <16 x float> @llvm.ceil.v16f32(<16 x float> %p)
  ret <16 x float> %t
}
declare <16 x float> @llvm.ceil.v16f32(<16 x float> %p)

define <2 x double> @trunc_v2f64(<2 x double> %p) {
; SSE41-LABEL: trunc_v2f64:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundpd $11, %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: trunc_v2f64:
; AVX: ## %bb.0:
; AVX-NEXT: vroundpd $11, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: trunc_v2f64:
; AVX512: ## %bb.0:
; AVX512-NEXT: vroundpd $11, %xmm0, %xmm0
; AVX512-NEXT: retq
  %t = call <2 x double> @llvm.trunc.v2f64(<2 x double> %p)
  ret <2 x double> %t
}
declare <2 x double> @llvm.trunc.v2f64(<2 x double> %p)

define <4 x float> @trunc_v4f32(<4 x float> %p) {
; SSE41-LABEL: trunc_v4f32:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundps $11, %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: trunc_v4f32:
; AVX: ## %bb.0:
; AVX-NEXT: vroundps $11, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: trunc_v4f32:
; AVX512: ## %bb.0:
; AVX512-NEXT: vroundps $11, %xmm0, %xmm0
; AVX512-NEXT: retq
  %t = call <4 x float> @llvm.trunc.v4f32(<4 x float> %p)
  ret <4 x float> %t
}
declare <4 x float> @llvm.trunc.v4f32(<4 x float> %p)

define <4 x double> @trunc_v4f64(<4 x double> %p) {
; SSE41-LABEL: trunc_v4f64:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundpd $11, %xmm0, %xmm0
; SSE41-NEXT: roundpd $11, %xmm1, %xmm1
; SSE41-NEXT: retq
;
; AVX-LABEL: trunc_v4f64:
; AVX: ## %bb.0:
; AVX-NEXT: vroundpd $11, %ymm0, %ymm0
; AVX-NEXT: retq
;
; AVX512-LABEL: trunc_v4f64:
; AVX512: ## %bb.0:
; AVX512-NEXT: vroundpd $11, %ymm0, %ymm0
; AVX512-NEXT: retq
  %t = call <4 x double> @llvm.trunc.v4f64(<4 x double> %p)
  ret <4 x double> %t
}
declare <4 x double> @llvm.trunc.v4f64(<4 x double> %p)

define <8 x float> @trunc_v8f32(<8 x float> %p) {
; SSE41-LABEL: trunc_v8f32:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundps $11, %xmm0, %xmm0
; SSE41-NEXT: roundps $11, %xmm1, %xmm1
; SSE41-NEXT: retq
;
; AVX-LABEL: trunc_v8f32:
; AVX: ## %bb.0:
; AVX-NEXT: vroundps $11, %ymm0, %ymm0
; AVX-NEXT: retq
;
; AVX512-LABEL: trunc_v8f32:
; AVX512: ## %bb.0:
; AVX512-NEXT: vroundps $11, %ymm0, %ymm0
; AVX512-NEXT: retq
  %t = call <8 x float> @llvm.trunc.v8f32(<8 x float> %p)
  ret <8 x float> %t
}
declare <8 x float> @llvm.trunc.v8f32(<8 x float> %p)

define <8 x double> @trunc_v8f64(<8 x double> %p){
; SSE41-LABEL: trunc_v8f64:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundpd $11, %xmm0, %xmm0
; SSE41-NEXT: roundpd $11, %xmm1, %xmm1
; SSE41-NEXT: roundpd $11, %xmm2, %xmm2
; SSE41-NEXT: roundpd $11, %xmm3, %xmm3
; SSE41-NEXT: retq
;
; AVX-LABEL: trunc_v8f64:
; AVX: ## %bb.0:
; AVX-NEXT: vroundpd $11, %ymm0, %ymm0
; AVX-NEXT: vroundpd $11, %ymm1, %ymm1
; AVX-NEXT: retq
;
; AVX512-LABEL: trunc_v8f64:
; AVX512: ## %bb.0:
; AVX512-NEXT: vrndscalepd $11, %zmm0, %zmm0
; AVX512-NEXT: retq
  %t = call <8 x double> @llvm.trunc.v8f64(<8 x double> %p)
  ret <8 x double> %t
}
declare <8 x double> @llvm.trunc.v8f64(<8 x double> %p)

define <16 x float> @trunc_v16f32(<16 x float> %p) {
; SSE41-LABEL: trunc_v16f32:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundps $11, %xmm0, %xmm0
; SSE41-NEXT: roundps $11, %xmm1, %xmm1
; SSE41-NEXT: roundps $11, %xmm2, %xmm2
; SSE41-NEXT: roundps $11, %xmm3, %xmm3
; SSE41-NEXT: retq
;
; AVX-LABEL: trunc_v16f32:
; AVX: ## %bb.0:
; AVX-NEXT: vroundps $11, %ymm0, %ymm0
; AVX-NEXT: vroundps $11, %ymm1, %ymm1
; AVX-NEXT: retq
;
; AVX512-LABEL: trunc_v16f32:
; AVX512: ## %bb.0:
; AVX512-NEXT: vrndscaleps $11, %zmm0, %zmm0
; AVX512-NEXT: retq
  %t = call <16 x float> @llvm.trunc.v16f32(<16 x float> %p)
  ret <16 x float> %t
}
declare <16 x float> @llvm.trunc.v16f32(<16 x float> %p)

define <2 x double> @rint_v2f64(<2 x double> %p) {
; SSE41-LABEL: rint_v2f64:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundpd $4, %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: rint_v2f64:
; AVX: ## %bb.0:
; AVX-NEXT: vroundpd $4, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: rint_v2f64:
; AVX512: ## %bb.0:
; AVX512-NEXT: vroundpd $4, %xmm0, %xmm0
; AVX512-NEXT: retq
  %t = call <2 x double> @llvm.rint.v2f64(<2 x double> %p)
  ret <2 x double> %t
}
declare <2 x double> @llvm.rint.v2f64(<2 x double> %p)

define <4 x float> @rint_v4f32(<4 x float> %p) {
; SSE41-LABEL: rint_v4f32:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundps $4, %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: rint_v4f32:
; AVX: ## %bb.0:
; AVX-NEXT: vroundps $4, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: rint_v4f32:
; AVX512: ## %bb.0:
; AVX512-NEXT: vroundps $4, %xmm0, %xmm0
; AVX512-NEXT: retq
  %t = call <4 x float> @llvm.rint.v4f32(<4 x float> %p)
  ret <4 x float> %t
}
declare <4 x float> @llvm.rint.v4f32(<4 x float> %p)

define <4 x double> @rint_v4f64(<4 x double> %p) {
; SSE41-LABEL: rint_v4f64:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundpd $4, %xmm0, %xmm0
; SSE41-NEXT: roundpd $4, %xmm1, %xmm1
; SSE41-NEXT: retq
;
; AVX-LABEL: rint_v4f64:
; AVX: ## %bb.0:
; AVX-NEXT: vroundpd $4, %ymm0, %ymm0
; AVX-NEXT: retq
;
; AVX512-LABEL: rint_v4f64:
; AVX512: ## %bb.0:
; AVX512-NEXT: vroundpd $4, %ymm0, %ymm0
; AVX512-NEXT: retq
  %t = call <4 x double> @llvm.rint.v4f64(<4 x double> %p)
  ret <4 x double> %t
}
declare <4 x double> @llvm.rint.v4f64(<4 x double> %p)

define <8 x float> @rint_v8f32(<8 x float> %p) {
; SSE41-LABEL: rint_v8f32:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundps $4, %xmm0, %xmm0
; SSE41-NEXT: roundps $4, %xmm1, %xmm1
; SSE41-NEXT: retq
;
; AVX-LABEL: rint_v8f32:
; AVX: ## %bb.0:
; AVX-NEXT: vroundps $4, %ymm0, %ymm0
; AVX-NEXT: retq
;
; AVX512-LABEL: rint_v8f32:
; AVX512: ## %bb.0:
; AVX512-NEXT: vroundps $4, %ymm0, %ymm0
; AVX512-NEXT: retq
  %t = call <8 x float> @llvm.rint.v8f32(<8 x float> %p)
  ret <8 x float> %t
}
declare <8 x float> @llvm.rint.v8f32(<8 x float> %p)

define <8 x double> @rint_v8f64(<8 x double> %p){
; SSE41-LABEL: rint_v8f64:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundpd $4, %xmm0, %xmm0
; SSE41-NEXT: roundpd $4, %xmm1, %xmm1
; SSE41-NEXT: roundpd $4, %xmm2, %xmm2
; SSE41-NEXT: roundpd $4, %xmm3, %xmm3
; SSE41-NEXT: retq
;
; AVX-LABEL: rint_v8f64:
; AVX: ## %bb.0:
; AVX-NEXT: vroundpd $4, %ymm0, %ymm0
; AVX-NEXT: vroundpd $4, %ymm1, %ymm1
; AVX-NEXT: retq
;
; AVX512-LABEL: rint_v8f64:
; AVX512: ## %bb.0:
; AVX512-NEXT: vrndscalepd $4, %zmm0, %zmm0
; AVX512-NEXT: retq
  %t = call <8 x double> @llvm.rint.v8f64(<8 x double> %p)
  ret <8 x double> %t
}
declare <8 x double> @llvm.rint.v8f64(<8 x double> %p)

define <16 x float> @rint_v16f32(<16 x float> %p) {
; SSE41-LABEL: rint_v16f32:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundps $4, %xmm0, %xmm0
; SSE41-NEXT: roundps $4, %xmm1, %xmm1
; SSE41-NEXT: roundps $4, %xmm2, %xmm2
; SSE41-NEXT: roundps $4, %xmm3, %xmm3
; SSE41-NEXT: retq
;
; AVX-LABEL: rint_v16f32:
; AVX: ## %bb.0:
; AVX-NEXT: vroundps $4, %ymm0, %ymm0
; AVX-NEXT: vroundps $4, %ymm1, %ymm1
; AVX-NEXT: retq
;
; AVX512-LABEL: rint_v16f32:
; AVX512: ## %bb.0:
; AVX512-NEXT: vrndscaleps $4, %zmm0, %zmm0
; AVX512-NEXT: retq
  %t = call <16 x float> @llvm.rint.v16f32(<16 x float> %p)
  ret <16 x float> %t
}
declare <16 x float> @llvm.rint.v16f32(<16 x float> %p)

define <2 x double> @nearbyint_v2f64(<2 x double> %p) {
; SSE41-LABEL: nearbyint_v2f64:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundpd $12, %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: nearbyint_v2f64:
; AVX: ## %bb.0:
; AVX-NEXT: vroundpd $12, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: nearbyint_v2f64:
; AVX512: ## %bb.0:
; AVX512-NEXT: vroundpd $12, %xmm0, %xmm0
; AVX512-NEXT: retq
  %t = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %p)
  ret <2 x double> %t
}
declare <2 x double> @llvm.nearbyint.v2f64(<2 x double> %p)

define <4 x float> @nearbyint_v4f32(<4 x float> %p) {
; SSE41-LABEL: nearbyint_v4f32:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundps $12, %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: nearbyint_v4f32:
; AVX: ## %bb.0:
; AVX-NEXT: vroundps $12, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: nearbyint_v4f32:
; AVX512: ## %bb.0:
; AVX512-NEXT: vroundps $12, %xmm0, %xmm0
; AVX512-NEXT: retq
  %t = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %p)
  ret <4 x float> %t
}
declare <4 x float> @llvm.nearbyint.v4f32(<4 x float> %p)

define <4 x double> @nearbyint_v4f64(<4 x double> %p) {
; SSE41-LABEL: nearbyint_v4f64:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundpd $12, %xmm0, %xmm0
; SSE41-NEXT: roundpd $12, %xmm1, %xmm1
; SSE41-NEXT: retq
;
; AVX-LABEL: nearbyint_v4f64:
; AVX: ## %bb.0:
; AVX-NEXT: vroundpd $12, %ymm0, %ymm0
; AVX-NEXT: retq
;
; AVX512-LABEL: nearbyint_v4f64:
; AVX512: ## %bb.0:
; AVX512-NEXT: vroundpd $12, %ymm0, %ymm0
; AVX512-NEXT: retq
  %t = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %p)
  ret <4 x double> %t
}
declare <4 x double> @llvm.nearbyint.v4f64(<4 x double> %p)

define <8 x float> @nearbyint_v8f32(<8 x float> %p) {
; SSE41-LABEL: nearbyint_v8f32:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundps $12, %xmm0, %xmm0
; SSE41-NEXT: roundps $12, %xmm1, %xmm1
; SSE41-NEXT: retq
;
; AVX-LABEL: nearbyint_v8f32:
; AVX: ## %bb.0:
; AVX-NEXT: vroundps $12, %ymm0, %ymm0
; AVX-NEXT: retq
;
; AVX512-LABEL: nearbyint_v8f32:
; AVX512: ## %bb.0:
; AVX512-NEXT: vroundps $12, %ymm0, %ymm0
; AVX512-NEXT: retq
  %t = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> %p)
  ret <8 x float> %t
}
declare <8 x float> @llvm.nearbyint.v8f32(<8 x float> %p)

define <8 x double> @nearbyint_v8f64(<8 x double> %p){
; SSE41-LABEL: nearbyint_v8f64:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundpd $12, %xmm0, %xmm0
; SSE41-NEXT: roundpd $12, %xmm1, %xmm1
; SSE41-NEXT: roundpd $12, %xmm2, %xmm2
; SSE41-NEXT: roundpd $12, %xmm3, %xmm3
; SSE41-NEXT: retq
;
; AVX-LABEL: nearbyint_v8f64:
; AVX: ## %bb.0:
; AVX-NEXT: vroundpd $12, %ymm0, %ymm0
; AVX-NEXT: vroundpd $12, %ymm1, %ymm1
; AVX-NEXT: retq
;
; AVX512-LABEL: nearbyint_v8f64:
; AVX512: ## %bb.0:
; AVX512-NEXT: vrndscalepd $12, %zmm0, %zmm0
; AVX512-NEXT: retq
  %t = call <8 x double> @llvm.nearbyint.v8f64(<8 x double> %p)
  ret <8 x double> %t
}
declare <8 x double> @llvm.nearbyint.v8f64(<8 x double> %p)

define <16 x float> @nearbyint_v16f32(<16 x float> %p) {
; SSE41-LABEL: nearbyint_v16f32:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundps $12, %xmm0, %xmm0
; SSE41-NEXT: roundps $12, %xmm1, %xmm1
; SSE41-NEXT: roundps $12, %xmm2, %xmm2
; SSE41-NEXT: roundps $12, %xmm3, %xmm3
; SSE41-NEXT: retq
;
; AVX-LABEL: nearbyint_v16f32:
; AVX: ## %bb.0:
; AVX-NEXT: vroundps $12, %ymm0, %ymm0
; AVX-NEXT: vroundps $12, %ymm1, %ymm1
; AVX-NEXT: retq
;
; AVX512-LABEL: nearbyint_v16f32:
; AVX512: ## %bb.0:
; AVX512-NEXT: vrndscaleps $12, %zmm0, %zmm0
; AVX512-NEXT: retq
  %t = call <16 x float> @llvm.nearbyint.v16f32(<16 x float> %p)
  ret <16 x float> %t
}
declare <16 x float> @llvm.nearbyint.v16f32(<16 x float> %p)

;
; Constant Folding
;
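
; All of these calls are folded away at compile time, so each function should
; reduce to loading the pre-rounded constant vector.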

define <2 x double> @const_floor_v2f64() {
; SSE41-LABEL: const_floor_v2f64:
; SSE41: ## %bb.0:
; SSE41-NEXT: movaps {{.*#+}} xmm0 = [-2.0E+0,2.0E+0]
; SSE41-NEXT: retq
;
; AVX-LABEL: const_floor_v2f64:
; AVX: ## %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [-2.0E+0,2.0E+0]
; AVX-NEXT: retq
;
; AVX512-LABEL: const_floor_v2f64:
; AVX512: ## %bb.0:
; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [-2.0E+0,2.0E+0]
; AVX512-NEXT: retq
  %t = call <2 x double> @llvm.floor.v2f64(<2 x double> <double -1.5, double 2.5>)
  ret <2 x double> %t
}

define <4 x float> @const_floor_v4f32() {
; SSE41-LABEL: const_floor_v4f32:
; SSE41: ## %bb.0:
; SSE41-NEXT: movaps {{.*#+}} xmm0 = [-4.0E+0,6.0E+0,-9.0E+0,2.0E+0]
; SSE41-NEXT: retq
;
; AVX-LABEL: const_floor_v4f32:
; AVX: ## %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [-4.0E+0,6.0E+0,-9.0E+0,2.0E+0]
; AVX-NEXT: retq
;
; AVX512-LABEL: const_floor_v4f32:
; AVX512: ## %bb.0:
; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [-4.0E+0,6.0E+0,-9.0E+0,2.0E+0]
; AVX512-NEXT: retq
  %t = call <4 x float> @llvm.floor.v4f32(<4 x float> <float -3.5, float 6.0, float -9.0, float 2.5>)
  ret <4 x float> %t
}

define <2 x double> @const_ceil_v2f64() {
; SSE41-LABEL: const_ceil_v2f64:
; SSE41: ## %bb.0:
; SSE41-NEXT: movaps {{.*#+}} xmm0 = [-1.0E+0,3.0E+0]
; SSE41-NEXT: retq
;
; AVX-LABEL: const_ceil_v2f64:
; AVX: ## %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [-1.0E+0,3.0E+0]
; AVX-NEXT: retq
;
; AVX512-LABEL: const_ceil_v2f64:
; AVX512: ## %bb.0:
; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [-1.0E+0,3.0E+0]
; AVX512-NEXT: retq
  %t = call <2 x double> @llvm.ceil.v2f64(<2 x double> <double -1.5, double 2.5>)
  ret <2 x double> %t
}

define <4 x float> @const_ceil_v4f32() {
; SSE41-LABEL: const_ceil_v4f32:
; SSE41: ## %bb.0:
; SSE41-NEXT: movaps {{.*#+}} xmm0 = [-3.0E+0,6.0E+0,-9.0E+0,3.0E+0]
; SSE41-NEXT: retq
;
; AVX-LABEL: const_ceil_v4f32:
; AVX: ## %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [-3.0E+0,6.0E+0,-9.0E+0,3.0E+0]
; AVX-NEXT: retq
;
; AVX512-LABEL: const_ceil_v4f32:
; AVX512: ## %bb.0:
; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [-3.0E+0,6.0E+0,-9.0E+0,3.0E+0]
; AVX512-NEXT: retq
  %t = call <4 x float> @llvm.ceil.v4f32(<4 x float> <float -3.5, float 6.0, float -9.0, float 2.5>)
  ret <4 x float> %t
}

define <2 x double> @const_trunc_v2f64() {
; SSE41-LABEL: const_trunc_v2f64:
; SSE41: ## %bb.0:
; SSE41-NEXT: movaps {{.*#+}} xmm0 = [-1.0E+0,2.0E+0]
; SSE41-NEXT: retq
;
; AVX-LABEL: const_trunc_v2f64:
; AVX: ## %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [-1.0E+0,2.0E+0]
; AVX-NEXT: retq
;
; AVX512-LABEL: const_trunc_v2f64:
; AVX512: ## %bb.0:
; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [-1.0E+0,2.0E+0]
; AVX512-NEXT: retq
  %t = call <2 x double> @llvm.trunc.v2f64(<2 x double> <double -1.5, double 2.5>)
  ret <2 x double> %t
}

define <4 x float> @const_trunc_v4f32() {
; SSE41-LABEL: const_trunc_v4f32:
; SSE41: ## %bb.0:
; SSE41-NEXT: movaps {{.*#+}} xmm0 = [-3.0E+0,6.0E+0,-9.0E+0,2.0E+0]
; SSE41-NEXT: retq
;
; AVX-LABEL: const_trunc_v4f32:
; AVX: ## %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [-3.0E+0,6.0E+0,-9.0E+0,2.0E+0]
; AVX-NEXT: retq
;
; AVX512-LABEL: const_trunc_v4f32:
; AVX512: ## %bb.0:
; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [-3.0E+0,6.0E+0,-9.0E+0,2.0E+0]
; AVX512-NEXT: retq
  %t = call <4 x float> @llvm.trunc.v4f32(<4 x float> <float -3.5, float 6.0, float -9.0, float 2.5>)
  ret <4 x float> %t
}

;
; Scalar and masked instructions
;
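
; Each mask/maskz test pairs a rounding call with a compare-and-select.
; AVX512VL folds the pattern into a masked (or zero-masked) vrndscale,
; AVX512F without VL widens 128/256-bit operands to zmm so it can use a
; k-register, and SSE4.1/AVX fall back to cmp plus blend/and sequences.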

define <4 x float> @floor_ss(<4 x float> %x, <4 x float> %y) nounwind {
; SSE41-LABEL: floor_ss:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundss $9, %xmm0, %xmm0
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT: retq
;
; AVX-LABEL: floor_ss:
; AVX: ## %bb.0:
; AVX-NEXT: vroundss $9, %xmm0, %xmm0, %xmm0
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: retq
;
; AVX512-LABEL: floor_ss:
; AVX512: ## %bb.0:
; AVX512-NEXT: vroundss $9, %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX512-NEXT: retq
  %s = extractelement <4 x float> %x, i32 0
  %call = call float @llvm.floor.f32(float %s)
  %res = insertelement <4 x float> %y, float %call, i32 0
  ret <4 x float> %res
}
declare float @llvm.floor.f32(float %s)

define <2 x double> @floor_sd(<2 x double> %x, <2 x double> %y) nounwind {
; SSE41-LABEL: floor_sd:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundsd $9, %xmm0, %xmm0
; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; SSE41-NEXT: retq
;
; AVX-LABEL: floor_sd:
; AVX: ## %bb.0:
; AVX-NEXT: vroundsd $9, %xmm0, %xmm0, %xmm0
; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT: retq
;
; AVX512-LABEL: floor_sd:
; AVX512: ## %bb.0:
; AVX512-NEXT: vroundsd $9, %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX512-NEXT: retq
  %s = extractelement <2 x double> %x, i32 0
  %call = call double @llvm.floor.f64(double %s)
  %res = insertelement <2 x double> %y, double %call, i32 0
  ret <2 x double> %res
}
declare double @llvm.floor.f64(double %s)

define <4 x float> @floor_mask_128_ps(<4 x float> %x, <4 x float> %y) nounwind {
; SSE41-LABEL: floor_mask_128_ps:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundps $9, %xmm0, %xmm2
; SSE41-NEXT: cmpeqps %xmm1, %xmm0
; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: floor_mask_128_ps:
; AVX: ## %bb.0:
; AVX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm2
; AVX-NEXT: vroundps $9, %xmm0, %xmm0
; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; AVX512F-LABEL: floor_mask_128_ps:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vcmpeqps %zmm1, %zmm0, %k1
; AVX512F-NEXT: vroundps $9, %xmm0, %xmm0
; AVX512F-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: floor_mask_128_ps:
; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vcmpeqps %xmm1, %xmm0, %k1
; AVX512VL-NEXT: vrndscaleps $9, %xmm0, %xmm1 {%k1}
; AVX512VL-NEXT: vmovaps %xmm1, %xmm0
; AVX512VL-NEXT: retq
  %k = fcmp oeq <4 x float> %x, %y
  %call = call <4 x float> @llvm.floor.v4f32(<4 x float> %x)
  %res = select <4 x i1> %k, <4 x float> %call, <4 x float> %y
  ret <4 x float> %res
}

define <4 x float> @floor_maskz_128_ps(<4 x float> %x, <4 x float> %y) nounwind {
; SSE41-LABEL: floor_maskz_128_ps:
; SSE41: ## %bb.0:
; SSE41-NEXT: cmpeqps %xmm0, %xmm1
; SSE41-NEXT: roundps $9, %xmm0, %xmm0
; SSE41-NEXT: andps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: floor_maskz_128_ps:
; AVX: ## %bb.0:
; AVX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm1
; AVX-NEXT: vroundps $9, %xmm0, %xmm0
; AVX-NEXT: vandps %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; AVX512F-LABEL: floor_maskz_128_ps:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vcmpeqps %zmm1, %zmm0, %k1
; AVX512F-NEXT: vroundps $9, %xmm0, %xmm0
; AVX512F-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: floor_maskz_128_ps:
; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vcmpeqps %xmm1, %xmm0, %k1
; AVX512VL-NEXT: vrndscaleps $9, %xmm0, %xmm0 {%k1} {z}
; AVX512VL-NEXT: retq
  %k = fcmp oeq <4 x float> %x, %y
  %call = call <4 x float> @llvm.floor.v4f32(<4 x float> %x)
  %res = select <4 x i1> %k, <4 x float> %call, <4 x float> zeroinitializer
  ret <4 x float> %res
}

define <2 x double> @floor_mask_128_pd(<2 x double> %x, <2 x double> %y) nounwind {
; SSE41-LABEL: floor_mask_128_pd:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundpd $9, %xmm0, %xmm2
; SSE41-NEXT: cmpeqpd %xmm1, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: floor_mask_128_pd:
; AVX: ## %bb.0:
; AVX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm2
; AVX-NEXT: vroundpd $9, %xmm0, %xmm0
; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; AVX512F-LABEL: floor_mask_128_pd:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
; AVX512F-NEXT: vroundpd $9, %xmm0, %xmm0
; AVX512F-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: floor_mask_128_pd:
; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vcmpeqpd %xmm1, %xmm0, %k1
; AVX512VL-NEXT: vrndscalepd $9, %xmm0, %xmm1 {%k1}
; AVX512VL-NEXT: vmovapd %xmm1, %xmm0
; AVX512VL-NEXT: retq
  %k = fcmp oeq <2 x double> %x, %y
  %call = call <2 x double> @llvm.floor.v2f64(<2 x double> %x)
  %res = select <2 x i1> %k, <2 x double> %call, <2 x double> %y
  ret <2 x double> %res
}

define <2 x double> @floor_maskz_128_pd(<2 x double> %x, <2 x double> %y) nounwind {
; SSE41-LABEL: floor_maskz_128_pd:
; SSE41: ## %bb.0:
; SSE41-NEXT: cmpeqpd %xmm0, %xmm1
; SSE41-NEXT: roundpd $9, %xmm0, %xmm0
; SSE41-NEXT: andpd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: floor_maskz_128_pd:
; AVX: ## %bb.0:
; AVX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm1
; AVX-NEXT: vroundpd $9, %xmm0, %xmm0
; AVX-NEXT: vandpd %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; AVX512F-LABEL: floor_maskz_128_pd:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
; AVX512F-NEXT: vroundpd $9, %xmm0, %xmm0
; AVX512F-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: floor_maskz_128_pd:
; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vcmpeqpd %xmm1, %xmm0, %k1
; AVX512VL-NEXT: vrndscalepd $9, %xmm0, %xmm0 {%k1} {z}
; AVX512VL-NEXT: retq
  %k = fcmp oeq <2 x double> %x, %y
  %call = call <2 x double> @llvm.floor.v2f64(<2 x double> %x)
  %res = select <2 x i1> %k, <2 x double> %call, <2 x double> zeroinitializer
  ret <2 x double> %res
}

define <8 x float> @floor_mask_256_ps(<8 x float> %x, <8 x float> %y) nounwind {
; SSE41-LABEL: floor_mask_256_ps:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundps $9, %xmm1, %xmm4
; SSE41-NEXT: cmpeqps %xmm3, %xmm1
; SSE41-NEXT: roundps $9, %xmm0, %xmm5
; SSE41-NEXT: cmpeqps %xmm2, %xmm0
; SSE41-NEXT: blendvps %xmm0, %xmm5, %xmm2
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: blendvps %xmm0, %xmm4, %xmm3
; SSE41-NEXT: movaps %xmm2, %xmm0
; SSE41-NEXT: movaps %xmm3, %xmm1
; SSE41-NEXT: retq
;
; AVX-LABEL: floor_mask_256_ps:
; AVX: ## %bb.0:
; AVX-NEXT: vcmpeqps %ymm1, %ymm0, %ymm2
; AVX-NEXT: vroundps $9, %ymm0, %ymm0
; AVX-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0
; AVX-NEXT: retq
;
; AVX512F-LABEL: floor_mask_256_ps:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vcmpeqps %zmm1, %zmm0, %k1
; AVX512F-NEXT: vroundps $9, %ymm0, %ymm0
; AVX512F-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: floor_mask_256_ps:
; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vcmpeqps %ymm1, %ymm0, %k1
; AVX512VL-NEXT: vrndscaleps $9, %ymm0, %ymm1 {%k1}
; AVX512VL-NEXT: vmovaps %ymm1, %ymm0
; AVX512VL-NEXT: retq
  %k = fcmp oeq <8 x float> %x, %y
  %call = call <8 x float> @llvm.floor.v8f32(<8 x float> %x)
  %res = select <8 x i1> %k, <8 x float> %call, <8 x float> %y
  ret <8 x float> %res
}

define <8 x float> @floor_maskz_256_ps(<8 x float> %x, <8 x float> %y) nounwind {
; SSE41-LABEL: floor_maskz_256_ps:
; SSE41: ## %bb.0:
; SSE41-NEXT: cmpeqps %xmm1, %xmm3
; SSE41-NEXT: cmpeqps %xmm0, %xmm2
; SSE41-NEXT: roundps $9, %xmm1, %xmm1
; SSE41-NEXT: andps %xmm3, %xmm1
; SSE41-NEXT: roundps $9, %xmm0, %xmm0
; SSE41-NEXT: andps %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: floor_maskz_256_ps:
; AVX: ## %bb.0:
; AVX-NEXT: vcmpeqps %ymm1, %ymm0, %ymm1
; AVX-NEXT: vroundps $9, %ymm0, %ymm0
; AVX-NEXT: vandps %ymm0, %ymm1, %ymm0
; AVX-NEXT: retq
;
; AVX512F-LABEL: floor_maskz_256_ps:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vcmpeqps %zmm1, %zmm0, %k1
; AVX512F-NEXT: vroundps $9, %ymm0, %ymm0
; AVX512F-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: floor_maskz_256_ps:
; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vcmpeqps %ymm1, %ymm0, %k1
; AVX512VL-NEXT: vrndscaleps $9, %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT: retq
  %k = fcmp oeq <8 x float> %x, %y
  %call = call <8 x float> @llvm.floor.v8f32(<8 x float> %x)
  %res = select <8 x i1> %k, <8 x float> %call, <8 x float> zeroinitializer
  ret <8 x float> %res
}

define <4 x double> @floor_mask_256_pd(<4 x double> %x, <4 x double> %y) nounwind {
; SSE41-LABEL: floor_mask_256_pd:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundpd $9, %xmm1, %xmm4
; SSE41-NEXT: cmpeqpd %xmm3, %xmm1
; SSE41-NEXT: roundpd $9, %xmm0, %xmm5
; SSE41-NEXT: cmpeqpd %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2
; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3
; SSE41-NEXT: movapd %xmm2, %xmm0
; SSE41-NEXT: movapd %xmm3, %xmm1
; SSE41-NEXT: retq
;
; AVX-LABEL: floor_mask_256_pd:
; AVX: ## %bb.0:
; AVX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm2
; AVX-NEXT: vroundpd $9, %ymm0, %ymm0
; AVX-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; AVX-NEXT: retq
;
; AVX512F-LABEL: floor_mask_256_pd:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
; AVX512F-NEXT: vroundpd $9, %ymm0, %ymm0
; AVX512F-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: floor_mask_256_pd:
; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vcmpeqpd %ymm1, %ymm0, %k1
; AVX512VL-NEXT: vrndscalepd $9, %ymm0, %ymm1 {%k1}
; AVX512VL-NEXT: vmovapd %ymm1, %ymm0
; AVX512VL-NEXT: retq
  %k = fcmp oeq <4 x double> %x, %y
  %call = call <4 x double> @llvm.floor.v4f64(<4 x double> %x)
  %res = select <4 x i1> %k, <4 x double> %call, <4 x double> %y
  ret <4 x double> %res
}

define <4 x double> @floor_maskz_256_pd(<4 x double> %x, <4 x double> %y) nounwind {
; SSE41-LABEL: floor_maskz_256_pd:
; SSE41: ## %bb.0:
; SSE41-NEXT: cmpeqpd %xmm1, %xmm3
; SSE41-NEXT: cmpeqpd %xmm0, %xmm2
; SSE41-NEXT: roundpd $9, %xmm1, %xmm1
; SSE41-NEXT: andpd %xmm3, %xmm1
; SSE41-NEXT: roundpd $9, %xmm0, %xmm0
; SSE41-NEXT: andpd %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: floor_maskz_256_pd:
; AVX: ## %bb.0:
; AVX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm1
; AVX-NEXT: vroundpd $9, %ymm0, %ymm0
; AVX-NEXT: vandpd %ymm0, %ymm1, %ymm0
; AVX-NEXT: retq
;
; AVX512F-LABEL: floor_maskz_256_pd:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
; AVX512F-NEXT: vroundpd $9, %ymm0, %ymm0
; AVX512F-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: floor_maskz_256_pd:
; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vcmpeqpd %ymm1, %ymm0, %k1
; AVX512VL-NEXT: vrndscalepd $9, %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT: retq
  %k = fcmp oeq <4 x double> %x, %y
  %call = call <4 x double> @llvm.floor.v4f64(<4 x double> %x)
  %res = select <4 x i1> %k, <4 x double> %call, <4 x double> zeroinitializer
  ret <4 x double> %res
}

define <16 x float> @floor_mask_512_ps(<16 x float> %x, <16 x float> %y) nounwind {
; SSE41-LABEL: floor_mask_512_ps:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundps $9, %xmm3, %xmm8
; SSE41-NEXT: cmpeqps %xmm7, %xmm3
; SSE41-NEXT: roundps $9, %xmm2, %xmm9
; SSE41-NEXT: cmpeqps %xmm6, %xmm2
; SSE41-NEXT: roundps $9, %xmm1, %xmm10
; SSE41-NEXT: cmpeqps %xmm5, %xmm1
; SSE41-NEXT: roundps $9, %xmm0, %xmm11
; SSE41-NEXT: cmpeqps %xmm4, %xmm0
; SSE41-NEXT: blendvps %xmm0, %xmm11, %xmm4
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: blendvps %xmm0, %xmm10, %xmm5
; SSE41-NEXT: movaps %xmm2, %xmm0
; SSE41-NEXT: blendvps %xmm0, %xmm9, %xmm6
; SSE41-NEXT: movaps %xmm3, %xmm0
; SSE41-NEXT: blendvps %xmm0, %xmm8, %xmm7
; SSE41-NEXT: movaps %xmm4, %xmm0
; SSE41-NEXT: movaps %xmm5, %xmm1
; SSE41-NEXT: movaps %xmm6, %xmm2
; SSE41-NEXT: movaps %xmm7, %xmm3
; SSE41-NEXT: retq
;
; AVX-LABEL: floor_mask_512_ps:
; AVX: ## %bb.0:
; AVX-NEXT: vcmpeqps %ymm3, %ymm1, %ymm4
; AVX-NEXT: vcmpeqps %ymm2, %ymm0, %ymm5
; AVX-NEXT: vroundps $9, %ymm1, %ymm1
; AVX-NEXT: vroundps $9, %ymm0, %ymm0
; AVX-NEXT: vblendvps %ymm5, %ymm0, %ymm2, %ymm0
; AVX-NEXT: vblendvps %ymm4, %ymm1, %ymm3, %ymm1
; AVX-NEXT: retq
;
; AVX512-LABEL: floor_mask_512_ps:
; AVX512: ## %bb.0:
; AVX512-NEXT: vcmpeqps %zmm1, %zmm0, %k1
; AVX512-NEXT: vrndscaleps $9, %zmm0, %zmm1 {%k1}
; AVX512-NEXT: vmovaps %zmm1, %zmm0
; AVX512-NEXT: retq
  %k = fcmp oeq <16 x float> %x, %y
  %call = call <16 x float> @llvm.floor.v16f32(<16 x float> %x)
  %res = select <16 x i1> %k, <16 x float> %call, <16 x float> %y
  ret <16 x float> %res
}

define <16 x float> @floor_maskz_512_ps(<16 x float> %x, <16 x float> %y) nounwind {
; SSE41-LABEL: floor_maskz_512_ps:
; SSE41: ## %bb.0:
; SSE41-NEXT: cmpeqps %xmm3, %xmm7
; SSE41-NEXT: cmpeqps %xmm2, %xmm6
; SSE41-NEXT: cmpeqps %xmm1, %xmm5
; SSE41-NEXT: cmpeqps %xmm0, %xmm4
; SSE41-NEXT: roundps $9, %xmm3, %xmm3
; SSE41-NEXT: andps %xmm7, %xmm3
; SSE41-NEXT: roundps $9, %xmm2, %xmm2
; SSE41-NEXT: andps %xmm6, %xmm2
; SSE41-NEXT: roundps $9, %xmm1, %xmm1
; SSE41-NEXT: andps %xmm5, %xmm1
; SSE41-NEXT: roundps $9, %xmm0, %xmm0
; SSE41-NEXT: andps %xmm4, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: floor_maskz_512_ps:
; AVX: ## %bb.0:
; AVX-NEXT: vcmpeqps %ymm3, %ymm1, %ymm3
; AVX-NEXT: vcmpeqps %ymm2, %ymm0, %ymm2
; AVX-NEXT: vroundps $9, %ymm1, %ymm1
; AVX-NEXT: vandps %ymm1, %ymm3, %ymm1
; AVX-NEXT: vroundps $9, %ymm0, %ymm0
; AVX-NEXT: vandps %ymm0, %ymm2, %ymm0
; AVX-NEXT: retq
;
; AVX512-LABEL: floor_maskz_512_ps:
; AVX512: ## %bb.0:
; AVX512-NEXT: vcmpeqps %zmm1, %zmm0, %k1
; AVX512-NEXT: vrndscaleps $9, %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT: retq
  %k = fcmp oeq <16 x float> %x, %y
  %call = call <16 x float> @llvm.floor.v16f32(<16 x float> %x)
  %res = select <16 x i1> %k, <16 x float> %call, <16 x float> zeroinitializer
  ret <16 x float> %res
}

define <8 x double> @floor_mask_512_pd(<8 x double> %x, <8 x double> %y) nounwind {
; SSE41-LABEL: floor_mask_512_pd:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundpd $9, %xmm3, %xmm8
; SSE41-NEXT: cmpeqpd %xmm7, %xmm3
; SSE41-NEXT: roundpd $9, %xmm2, %xmm9
; SSE41-NEXT: cmpeqpd %xmm6, %xmm2
; SSE41-NEXT: roundpd $9, %xmm1, %xmm10
; SSE41-NEXT: cmpeqpd %xmm5, %xmm1
; SSE41-NEXT: roundpd $9, %xmm0, %xmm11
; SSE41-NEXT: cmpeqpd %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm4
; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm5
; SSE41-NEXT: movapd %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm6
; SSE41-NEXT: movapd %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm7
; SSE41-NEXT: movapd %xmm4, %xmm0
; SSE41-NEXT: movapd %xmm5, %xmm1
; SSE41-NEXT: movapd %xmm6, %xmm2
; SSE41-NEXT: movapd %xmm7, %xmm3
; SSE41-NEXT: retq
;
; AVX-LABEL: floor_mask_512_pd:
; AVX: ## %bb.0:
; AVX-NEXT: vcmpeqpd %ymm3, %ymm1, %ymm4
; AVX-NEXT: vcmpeqpd %ymm2, %ymm0, %ymm5
; AVX-NEXT: vroundpd $9, %ymm1, %ymm1
; AVX-NEXT: vroundpd $9, %ymm0, %ymm0
; AVX-NEXT: vblendvpd %ymm5, %ymm0, %ymm2, %ymm0
; AVX-NEXT: vblendvpd %ymm4, %ymm1, %ymm3, %ymm1
; AVX-NEXT: retq
;
; AVX512-LABEL: floor_mask_512_pd:
; AVX512: ## %bb.0:
; AVX512-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
; AVX512-NEXT: vrndscalepd $9, %zmm0, %zmm1 {%k1}
; AVX512-NEXT: vmovapd %zmm1, %zmm0
; AVX512-NEXT: retq
  %k = fcmp oeq <8 x double> %x, %y
  %call = call <8 x double> @llvm.floor.v8f64(<8 x double> %x)
  %res = select <8 x i1> %k, <8 x double> %call, <8 x double> %y
  ret <8 x double> %res
}

define <8 x double> @floor_maskz_512_pd(<8 x double> %x, <8 x double> %y) nounwind {
; SSE41-LABEL: floor_maskz_512_pd:
; SSE41: ## %bb.0:
; SSE41-NEXT: cmpeqpd %xmm3, %xmm7
; SSE41-NEXT: cmpeqpd %xmm2, %xmm6
; SSE41-NEXT: cmpeqpd %xmm1, %xmm5
; SSE41-NEXT: cmpeqpd %xmm0, %xmm4
; SSE41-NEXT: roundpd $9, %xmm3, %xmm3
; SSE41-NEXT: andpd %xmm7, %xmm3
; SSE41-NEXT: roundpd $9, %xmm2, %xmm2
; SSE41-NEXT: andpd %xmm6, %xmm2
; SSE41-NEXT: roundpd $9, %xmm1, %xmm1
; SSE41-NEXT: andpd %xmm5, %xmm1
; SSE41-NEXT: roundpd $9, %xmm0, %xmm0
; SSE41-NEXT: andpd %xmm4, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: floor_maskz_512_pd:
; AVX: ## %bb.0:
; AVX-NEXT: vcmpeqpd %ymm3, %ymm1, %ymm3
; AVX-NEXT: vcmpeqpd %ymm2, %ymm0, %ymm2
; AVX-NEXT: vroundpd $9, %ymm1, %ymm1
; AVX-NEXT: vandpd %ymm1, %ymm3, %ymm1
; AVX-NEXT: vroundpd $9, %ymm0, %ymm0
; AVX-NEXT: vandpd %ymm0, %ymm2, %ymm0
; AVX-NEXT: retq
;
; AVX512-LABEL: floor_maskz_512_pd:
; AVX512: ## %bb.0:
; AVX512-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
; AVX512-NEXT: vrndscalepd $9, %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT: retq
  %k = fcmp oeq <8 x double> %x, %y
  %call = call <8 x double> @llvm.floor.v8f64(<8 x double> %x)
  %res = select <8 x i1> %k, <8 x double> %call, <8 x double> zeroinitializer
  ret <8 x double> %res
}
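
; The scalar variants mask only element 0 of the result. AVX512 moves the GPR
; mask into a k-register (kmovw) and merges with vmovss/vmovsd {%k1}; SSE4.1
; and AVX instead test the mask bit and branch around the rounding.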

define <4 x float> @floor_mask_ss(<4 x float> %x, <4 x float> %y, <4 x float> %w, i8 %k) nounwind {
; SSE41-LABEL: floor_mask_ss:
; SSE41: ## %bb.0:
; SSE41-NEXT: testb $1, %dil
; SSE41-NEXT: je LBB52_2
; SSE41-NEXT: ## %bb.1:
; SSE41-NEXT: xorps %xmm2, %xmm2
; SSE41-NEXT: roundss $9, %xmm0, %xmm2
; SSE41-NEXT: LBB52_2:
; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: floor_mask_ss:
; AVX: ## %bb.0:
; AVX-NEXT: testb $1, %dil
; AVX-NEXT: je LBB52_2
; AVX-NEXT: ## %bb.1:
; AVX-NEXT: vroundss $9, %xmm0, %xmm0, %xmm2
; AVX-NEXT: LBB52_2:
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
; AVX-NEXT: retq
;
; AVX512-LABEL: floor_mask_ss:
; AVX512: ## %bb.0:
; AVX512-NEXT: vroundss $9, %xmm0, %xmm0, %xmm0
; AVX512-NEXT: kmovw %edi, %k1
; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm2 {%k1}
; AVX512-NEXT: vmovaps %xmm2, %xmm0
; AVX512-NEXT: retq
  %mask = and i8 %k, 1
  %nmask = icmp eq i8 %mask, 0
  %s = extractelement <4 x float> %x, i64 0
  %call = tail call float @llvm.floor.f32(float %s)
  %dst = extractelement <4 x float> %w, i64 0
  %low = select i1 %nmask, float %dst, float %call
  %res = insertelement <4 x float> %y, float %low, i64 0
  ret <4 x float> %res
}

define <4 x float> @floor_maskz_ss(<4 x float> %x, <4 x float> %y, i8 %k) nounwind {
; SSE41-LABEL: floor_maskz_ss:
; SSE41: ## %bb.0:
; SSE41-NEXT: testb $1, %dil
; SSE41-NEXT: xorps %xmm2, %xmm2
; SSE41-NEXT: je LBB53_2
; SSE41-NEXT: ## %bb.1:
; SSE41-NEXT: xorps %xmm2, %xmm2
; SSE41-NEXT: roundss $9, %xmm0, %xmm2
; SSE41-NEXT: LBB53_2:
; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: floor_maskz_ss:
; AVX: ## %bb.0:
; AVX-NEXT: testb $1, %dil
; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX-NEXT: je LBB53_2
; AVX-NEXT: ## %bb.1:
; AVX-NEXT: vroundss $9, %xmm0, %xmm0, %xmm2
; AVX-NEXT: LBB53_2:
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
; AVX-NEXT: retq
;
; AVX512-LABEL: floor_maskz_ss:
; AVX512: ## %bb.0:
; AVX512-NEXT: vroundss $9, %xmm0, %xmm0, %xmm0
; AVX512-NEXT: kmovw %edi, %k1
; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm0 {%k1} {z}
; AVX512-NEXT: retq
  %mask = and i8 %k, 1
  %nmask = icmp eq i8 %mask, 0
  %s = extractelement <4 x float> %x, i64 0
  %call = tail call float @llvm.floor.f32(float %s)
  %low = select i1 %nmask, float zeroinitializer, float %call
  %res = insertelement <4 x float> %y, float %low, i64 0
  ret <4 x float> %res
}

define <2 x double> @floor_mask_sd(<2 x double> %x, <2 x double> %y, <2 x double> %w, i8 %k) nounwind {
; SSE41-LABEL: floor_mask_sd:
; SSE41: ## %bb.0:
; SSE41-NEXT: testb $1, %dil
; SSE41-NEXT: je LBB54_2
; SSE41-NEXT: ## %bb.1:
; SSE41-NEXT: xorps %xmm2, %xmm2
; SSE41-NEXT: roundsd $9, %xmm0, %xmm2
; SSE41-NEXT: LBB54_2:
; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: floor_mask_sd:
; AVX: ## %bb.0:
; AVX-NEXT: testb $1, %dil
; AVX-NEXT: je LBB54_2
; AVX-NEXT: ## %bb.1:
; AVX-NEXT: vroundsd $9, %xmm0, %xmm0, %xmm2
; AVX-NEXT: LBB54_2:
; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
; AVX-NEXT: retq
;
; AVX512-LABEL: floor_mask_sd:
; AVX512: ## %bb.0:
; AVX512-NEXT: vroundsd $9, %xmm0, %xmm0, %xmm0
; AVX512-NEXT: kmovw %edi, %k1
; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm2 {%k1}
; AVX512-NEXT: vmovapd %xmm2, %xmm0
; AVX512-NEXT: retq
  %mask = and i8 %k, 1
  %nmask = icmp eq i8 %mask, 0
  %s = extractelement <2 x double> %x, i64 0
  %call = tail call double @llvm.floor.f64(double %s)
  %dst = extractelement <2 x double> %w, i64 0
  %low = select i1 %nmask, double %dst, double %call
  %res = insertelement <2 x double> %y, double %low, i64 0
  ret <2 x double> %res
}

define <2 x double> @floor_maskz_sd(<2 x double> %x, <2 x double> %y, i8 %k) nounwind {
; SSE41-LABEL: floor_maskz_sd:
; SSE41: ## %bb.0:
; SSE41-NEXT: testb $1, %dil
; SSE41-NEXT: xorpd %xmm2, %xmm2
; SSE41-NEXT: je LBB55_2
; SSE41-NEXT: ## %bb.1:
; SSE41-NEXT: xorps %xmm2, %xmm2
; SSE41-NEXT: roundsd $9, %xmm0, %xmm2
; SSE41-NEXT: LBB55_2:
; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: floor_maskz_sd:
; AVX: ## %bb.0:
; AVX-NEXT: testb $1, %dil
; AVX-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; AVX-NEXT: je LBB55_2
; AVX-NEXT: ## %bb.1:
; AVX-NEXT: vroundsd $9, %xmm0, %xmm0, %xmm2
; AVX-NEXT: LBB55_2:
; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
; AVX-NEXT: retq
;
; AVX512-LABEL: floor_maskz_sd:
; AVX512: ## %bb.0:
; AVX512-NEXT: vroundsd $9, %xmm0, %xmm0, %xmm0
; AVX512-NEXT: kmovw %edi, %k1
; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm0 {%k1} {z}
; AVX512-NEXT: retq
  %mask = and i8 %k, 1
  %nmask = icmp eq i8 %mask, 0
  %s = extractelement <2 x double> %x, i64 0
  %call = tail call double @llvm.floor.f64(double %s)
  %low = select i1 %nmask, double zeroinitializer, double %call
  %res = insertelement <2 x double> %y, double %low, i64 0
  ret <2 x double> %res
}

define <4 x float> @floor_mask_ss_trunc(<4 x float> %x, <4 x float> %y, <4 x float> %w, i16 %k) nounwind {
; SSE41-LABEL: floor_mask_ss_trunc:
; SSE41: ## %bb.0:
; SSE41-NEXT: testb $1, %dil
; SSE41-NEXT: je LBB56_2
; SSE41-NEXT: ## %bb.1:
; SSE41-NEXT: xorps %xmm2, %xmm2
; SSE41-NEXT: roundss $9, %xmm0, %xmm2
; SSE41-NEXT: LBB56_2:
; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: floor_mask_ss_trunc:
; AVX: ## %bb.0:
; AVX-NEXT: testb $1, %dil
; AVX-NEXT: je LBB56_2
; AVX-NEXT: ## %bb.1:
; AVX-NEXT: vroundss $9, %xmm0, %xmm0, %xmm2
; AVX-NEXT: LBB56_2:
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
; AVX-NEXT: retq
;
; AVX512-LABEL: floor_mask_ss_trunc:
; AVX512: ## %bb.0:
; AVX512-NEXT: vroundss $9, %xmm0, %xmm0, %xmm0
; AVX512-NEXT: kmovw %edi, %k1
; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm2 {%k1}
; AVX512-NEXT: vmovaps %xmm2, %xmm0
; AVX512-NEXT: retq
  %mask = trunc i16 %k to i1
  %s = extractelement <4 x float> %x, i64 0
  %call = tail call float @llvm.floor.f32(float %s)
  %dst = extractelement <4 x float> %w, i64 0
  %low = select i1 %mask, float %call, float %dst
  %res = insertelement <4 x float> %y, float %low, i64 0
  ret <4 x float> %res
}

define <4 x float> @floor_maskz_ss_trunc(<4 x float> %x, <4 x float> %y, i16 %k) nounwind {
; SSE41-LABEL: floor_maskz_ss_trunc:
; SSE41: ## %bb.0:
; SSE41-NEXT: testb $1, %dil
; SSE41-NEXT: jne LBB57_1
; SSE41-NEXT: ## %bb.2:
; SSE41-NEXT: xorps %xmm0, %xmm0
; SSE41-NEXT: jmp LBB57_3
; SSE41-NEXT: LBB57_1:
; SSE41-NEXT: roundss $9, %xmm0, %xmm0
; SSE41-NEXT: LBB57_3:
; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: floor_maskz_ss_trunc:
; AVX: ## %bb.0:
; AVX-NEXT: testb $1, %dil
; AVX-NEXT: jne LBB57_1
; AVX-NEXT: ## %bb.2:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: retq
; AVX-NEXT: LBB57_1:
; AVX-NEXT: vroundss $9, %xmm0, %xmm0, %xmm0
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: retq
;
; AVX512-LABEL: floor_maskz_ss_trunc:
; AVX512: ## %bb.0:
; AVX512-NEXT: vroundss $9, %xmm0, %xmm0, %xmm0
; AVX512-NEXT: kmovw %edi, %k1
; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm0 {%k1} {z}
; AVX512-NEXT: retq
  %mask = trunc i16 %k to i1
  %s = extractelement <4 x float> %x, i64 0
  %call = tail call float @llvm.floor.f32(float %s)
  %low = select i1 %mask, float %call, float zeroinitializer
  %res = insertelement <4 x float> %y, float %low, i64 0
  ret <4 x float> %res
}

define <2 x double> @floor_mask_sd_trunc(<2 x double> %x, <2 x double> %y, <2 x double> %w, i16 %k) nounwind {
; SSE41-LABEL: floor_mask_sd_trunc:
; SSE41: ## %bb.0:
; SSE41-NEXT: testb $1, %dil
; SSE41-NEXT: je LBB58_2
; SSE41-NEXT: ## %bb.1:
; SSE41-NEXT: xorps %xmm2, %xmm2
; SSE41-NEXT: roundsd $9, %xmm0, %xmm2
; SSE41-NEXT: LBB58_2:
; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: floor_mask_sd_trunc:
; AVX: ## %bb.0:
; AVX-NEXT: testb $1, %dil
; AVX-NEXT: je LBB58_2
; AVX-NEXT: ## %bb.1:
; AVX-NEXT: vroundsd $9, %xmm0, %xmm0, %xmm2
; AVX-NEXT: LBB58_2:
; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
; AVX-NEXT: retq
;
; AVX512-LABEL: floor_mask_sd_trunc:
; AVX512: ## %bb.0:
; AVX512-NEXT: vroundsd $9, %xmm0, %xmm0, %xmm0
; AVX512-NEXT: kmovw %edi, %k1
; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm2 {%k1}
; AVX512-NEXT: vmovapd %xmm2, %xmm0
; AVX512-NEXT: retq
  %mask = trunc i16 %k to i1
  %s = extractelement <2 x double> %x, i64 0
  %call = tail call double @llvm.floor.f64(double %s)
  %dst = extractelement <2 x double> %w, i64 0
  %low = select i1 %mask, double %call, double %dst
  %res = insertelement <2 x double> %y, double %low, i64 0
  ret <2 x double> %res
}

define <2 x double> @floor_maskz_sd_trunc(<2 x double> %x, <2 x double> %y, i16 %k) nounwind {
; SSE41-LABEL: floor_maskz_sd_trunc:
; SSE41: ## %bb.0:
; SSE41-NEXT: testb $1, %dil
; SSE41-NEXT: jne LBB59_1
; SSE41-NEXT: ## %bb.2:
; SSE41-NEXT: xorpd %xmm0, %xmm0
; SSE41-NEXT: jmp LBB59_3
; SSE41-NEXT: LBB59_1:
; SSE41-NEXT: roundsd $9, %xmm0, %xmm0
; SSE41-NEXT: LBB59_3:
; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: floor_maskz_sd_trunc:
; AVX: ## %bb.0:
; AVX-NEXT: testb $1, %dil
; AVX-NEXT: jne LBB59_1
; AVX-NEXT: ## %bb.2:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX-NEXT: retq
; AVX-NEXT: LBB59_1:
; AVX-NEXT: vroundsd $9, %xmm0, %xmm0, %xmm0
; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT: retq
;
; AVX512-LABEL: floor_maskz_sd_trunc:
; AVX512: ## %bb.0:
; AVX512-NEXT: vroundsd $9, %xmm0, %xmm0, %xmm0
; AVX512-NEXT: kmovw %edi, %k1
; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm0 {%k1} {z}
; AVX512-NEXT: retq
  %mask = trunc i16 %k to i1
  %s = extractelement <2 x double> %x, i64 0
  %call = tail call double @llvm.floor.f64(double %s)
  %low = select i1 %mask, double %call, double zeroinitializer
  %res = insertelement <2 x double> %y, double %low, i64 0
  ret <2 x double> %res
}
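
; The _mask8 variants take the scalar mask from lane 0 of a vector fcmp, which
; lets AVX512 compare directly into a k-register with vcmpeqss/vcmpeqsd.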

define <4 x float> @floor_mask_ss_mask8(<4 x float> %x, <4 x float> %y, <4 x float> %w) nounwind {
; SSE41-LABEL: floor_mask_ss_mask8:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundss $9, %xmm0, %xmm3
; SSE41-NEXT: cmpeqss %xmm1, %xmm0
; SSE41-NEXT: andps %xmm0, %xmm3
; SSE41-NEXT: andnps %xmm2, %xmm0
; SSE41-NEXT: orps %xmm3, %xmm0
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT: retq
;
; AVX-LABEL: floor_mask_ss_mask8:
; AVX: ## %bb.0:
; AVX-NEXT: vroundss $9, %xmm0, %xmm0, %xmm3
; AVX-NEXT: vcmpeqss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vblendvps %xmm0, %xmm3, %xmm2, %xmm0
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: retq
;
; AVX512-LABEL: floor_mask_ss_mask8:
; AVX512: ## %bb.0:
; AVX512-NEXT: vroundss $9, %xmm0, %xmm0, %xmm3
; AVX512-NEXT: vcmpeqss %xmm1, %xmm0, %k1
; AVX512-NEXT: vmovss %xmm3, %xmm1, %xmm2 {%k1}
; AVX512-NEXT: vmovaps %xmm2, %xmm0
; AVX512-NEXT: retq
  %mask1 = fcmp oeq <4 x float> %x, %y
  %mask = extractelement <4 x i1> %mask1, i64 0
  %s = extractelement <4 x float> %x, i64 0
  %call = tail call float @llvm.floor.f32(float %s)
  %dst = extractelement <4 x float> %w, i64 0
  %low = select i1 %mask, float %call, float %dst
  %res = insertelement <4 x float> %y, float %low, i64 0
  ret <4 x float> %res
}

define <4 x float> @floor_maskz_ss_mask8(<4 x float> %x, <4 x float> %y) nounwind {
; SSE41-LABEL: floor_maskz_ss_mask8:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundss $9, %xmm0, %xmm2
; SSE41-NEXT: cmpeqss %xmm1, %xmm0
; SSE41-NEXT: andps %xmm2, %xmm0
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT: retq
;
; AVX-LABEL: floor_maskz_ss_mask8:
; AVX: ## %bb.0:
; AVX-NEXT: vroundss $9, %xmm0, %xmm0, %xmm2
; AVX-NEXT: vcmpeqss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vandps %xmm2, %xmm0, %xmm0
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: retq
;
; AVX512-LABEL: floor_maskz_ss_mask8:
; AVX512: ## %bb.0:
; AVX512-NEXT: vroundss $9, %xmm0, %xmm0, %xmm2
; AVX512-NEXT: vcmpeqss %xmm1, %xmm0, %k1
; AVX512-NEXT: vmovss %xmm2, %xmm1, %xmm0 {%k1} {z}
; AVX512-NEXT: retq
  %mask1 = fcmp oeq <4 x float> %x, %y
  %mask = extractelement <4 x i1> %mask1, i64 0
  %s = extractelement <4 x float> %x, i64 0
  %call = tail call float @llvm.floor.f32(float %s)
  %low = select i1 %mask, float %call, float zeroinitializer
  %res = insertelement <4 x float> %y, float %low, i64 0
  ret <4 x float> %res
}

define <2 x double> @floor_mask_sd_mask8(<2 x double> %x, <2 x double> %y, <2 x double> %w) nounwind {
; SSE41-LABEL: floor_mask_sd_mask8:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundsd $9, %xmm0, %xmm3
; SSE41-NEXT: cmpeqsd %xmm1, %xmm0
; SSE41-NEXT: andpd %xmm0, %xmm3
; SSE41-NEXT: andnpd %xmm2, %xmm0
; SSE41-NEXT: orpd %xmm3, %xmm0
; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; SSE41-NEXT: retq
;
; AVX-LABEL: floor_mask_sd_mask8:
; AVX: ## %bb.0:
; AVX-NEXT: vroundsd $9, %xmm0, %xmm0, %xmm3
; AVX-NEXT: vcmpeqsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vblendvpd %xmm0, %xmm3, %xmm2, %xmm0
; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT: retq
;
; AVX512-LABEL: floor_mask_sd_mask8:
; AVX512: ## %bb.0:
; AVX512-NEXT: vroundsd $9, %xmm0, %xmm0, %xmm3
; AVX512-NEXT: vcmpeqsd %xmm1, %xmm0, %k1
; AVX512-NEXT: vmovsd %xmm3, %xmm1, %xmm2 {%k1}
; AVX512-NEXT: vmovapd %xmm2, %xmm0
; AVX512-NEXT: retq
  %mask1 = fcmp oeq <2 x double> %x, %y
  %mask = extractelement <2 x i1> %mask1, i64 0
  %s = extractelement <2 x double> %x, i64 0
  %call = tail call double @llvm.floor.f64(double %s)
  %dst = extractelement <2 x double> %w, i64 0
  %low = select i1 %mask, double %call, double %dst
  %res = insertelement <2 x double> %y, double %low, i64 0
  ret <2 x double> %res
}

define <2 x double> @floor_maskz_sd_mask8(<2 x double> %x, <2 x double> %y) nounwind {
define <4 x float> @ceil_ss(<4 x float> %x, <4 x float> %y) nounwind {
; SSE41-LABEL: ceil_ss:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundss $10, %xmm0, %xmm0
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT: retq
;
; AVX-LABEL: ceil_ss:
; AVX: ## %bb.0:
; AVX-NEXT: vroundss $10, %xmm0, %xmm0, %xmm0
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: retq
;
; AVX512-LABEL: ceil_ss:
; AVX512: ## %bb.0:
; AVX512-NEXT: vroundss $10, %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX512-NEXT: retq
  %s = extractelement <4 x float> %x, i32 0
  %call = call float @llvm.ceil.f32(float %s)
  %res = insertelement <4 x float> %y, float %call, i32 0
  ret <4 x float> %res
}
declare float @llvm.ceil.f32(float %s)

define <2 x double> @ceil_sd(<2 x double> %x, <2 x double> %y) nounwind {
; SSE41-LABEL: ceil_sd:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundsd $10, %xmm0, %xmm0
; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; SSE41-NEXT: retq
;
; AVX-LABEL: ceil_sd:
; AVX: ## %bb.0:
; AVX-NEXT: vroundsd $10, %xmm0, %xmm0, %xmm0
; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT: retq
;
; AVX512-LABEL: ceil_sd:
; AVX512: ## %bb.0:
; AVX512-NEXT: vroundsd $10, %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX512-NEXT: retq
  %s = extractelement <2 x double> %x, i32 0
  %call = call double @llvm.ceil.f64(double %s)
  %res = insertelement <2 x double> %y, double %call, i32 0
  ret <2 x double> %res
}
declare double @llvm.ceil.f64(double %s)
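; Masked packed ceil, 128-bit: AVX512VL uses masked vrndscaleps/pd directly
; on xmm; plain AVX512F must widen to zmm for the masked compare and select
; (hence the kill comments and vzeroupper), and SSE4.1/AVX fall back to
; blendv (merge) or andps/andpd (zero) sequences.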
define <4 x float> @ceil_mask_128_ps(<4 x float> %x, <4 x float> %y) nounwind {
; SSE41-LABEL: ceil_mask_128_ps:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundps $10, %xmm0, %xmm2
; SSE41-NEXT: cmpeqps %xmm1, %xmm0
; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: ceil_mask_128_ps:
; AVX: ## %bb.0:
; AVX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm2
; AVX-NEXT: vroundps $10, %xmm0, %xmm0
; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; AVX512F-LABEL: ceil_mask_128_ps:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vcmpeqps %zmm1, %zmm0, %k1
; AVX512F-NEXT: vroundps $10, %xmm0, %xmm0
; AVX512F-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: ceil_mask_128_ps:
; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vcmpeqps %xmm1, %xmm0, %k1
; AVX512VL-NEXT: vrndscaleps $10, %xmm0, %xmm1 {%k1}
; AVX512VL-NEXT: vmovaps %xmm1, %xmm0
; AVX512VL-NEXT: retq
  %k = fcmp oeq <4 x float> %x, %y
  %call = call <4 x float> @llvm.ceil.v4f32(<4 x float> %x)
  %res = select <4 x i1> %k, <4 x float> %call, <4 x float> %y
  ret <4 x float> %res
}

define <4 x float> @ceil_maskz_128_ps(<4 x float> %x, <4 x float> %y) nounwind {
; SSE41-LABEL: ceil_maskz_128_ps:
; SSE41: ## %bb.0:
; SSE41-NEXT: cmpeqps %xmm0, %xmm1
; SSE41-NEXT: roundps $10, %xmm0, %xmm0
; SSE41-NEXT: andps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: ceil_maskz_128_ps:
; AVX: ## %bb.0:
; AVX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm1
; AVX-NEXT: vroundps $10, %xmm0, %xmm0
; AVX-NEXT: vandps %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; AVX512F-LABEL: ceil_maskz_128_ps:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vcmpeqps %zmm1, %zmm0, %k1
; AVX512F-NEXT: vroundps $10, %xmm0, %xmm0
; AVX512F-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: ceil_maskz_128_ps:
; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vcmpeqps %xmm1, %xmm0, %k1
; AVX512VL-NEXT: vrndscaleps $10, %xmm0, %xmm0 {%k1} {z}
; AVX512VL-NEXT: retq
  %k = fcmp oeq <4 x float> %x, %y
  %call = call <4 x float> @llvm.ceil.v4f32(<4 x float> %x)
  %res = select <4 x i1> %k, <4 x float> %call, <4 x float> zeroinitializer
  ret <4 x float> %res
}

define <2 x double> @ceil_mask_128_pd(<2 x double> %x, <2 x double> %y) nounwind {
; SSE41-LABEL: ceil_mask_128_pd:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundpd $10, %xmm0, %xmm2
; SSE41-NEXT: cmpeqpd %xmm1, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: ceil_mask_128_pd:
; AVX: ## %bb.0:
; AVX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm2
; AVX-NEXT: vroundpd $10, %xmm0, %xmm0
; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; AVX512F-LABEL: ceil_mask_128_pd:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
; AVX512F-NEXT: vroundpd $10, %xmm0, %xmm0
; AVX512F-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: ceil_mask_128_pd:
; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vcmpeqpd %xmm1, %xmm0, %k1
; AVX512VL-NEXT: vrndscalepd $10, %xmm0, %xmm1 {%k1}
; AVX512VL-NEXT: vmovapd %xmm1, %xmm0
; AVX512VL-NEXT: retq
  %k = fcmp oeq <2 x double> %x, %y
  %call = call <2 x double> @llvm.ceil.v2f64(<2 x double> %x)
  %res = select <2 x i1> %k, <2 x double> %call, <2 x double> %y
  ret <2 x double> %res
}

define <2 x double> @ceil_maskz_128_pd(<2 x double> %x, <2 x double> %y) nounwind {
; SSE41-LABEL: ceil_maskz_128_pd:
; SSE41: ## %bb.0:
; SSE41-NEXT: cmpeqpd %xmm0, %xmm1
; SSE41-NEXT: roundpd $10, %xmm0, %xmm0
; SSE41-NEXT: andpd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: ceil_maskz_128_pd:
; AVX: ## %bb.0:
; AVX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm1
; AVX-NEXT: vroundpd $10, %xmm0, %xmm0
; AVX-NEXT: vandpd %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; AVX512F-LABEL: ceil_maskz_128_pd:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
; AVX512F-NEXT: vroundpd $10, %xmm0, %xmm0
; AVX512F-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: ceil_maskz_128_pd:
; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vcmpeqpd %xmm1, %xmm0, %k1
; AVX512VL-NEXT: vrndscalepd $10, %xmm0, %xmm0 {%k1} {z}
; AVX512VL-NEXT: retq
  %k = fcmp oeq <2 x double> %x, %y
  %call = call <2 x double> @llvm.ceil.v2f64(<2 x double> %x)
  %res = select <2 x i1> %k, <2 x double> %call, <2 x double> zeroinitializer
  ret <2 x double> %res
}
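; Masked packed ceil, 256-bit: SSE4.1 splits each ymm operation into two xmm
; round/compare/blend pairs, AVX512F again widens to zmm, and AVX512VL uses
; the masked vrndscale forms at ymm width.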
define <8 x float> @ceil_mask_256_ps(<8 x float> %x, <8 x float> %y) nounwind {
; SSE41-LABEL: ceil_mask_256_ps:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundps $10, %xmm1, %xmm4
; SSE41-NEXT: cmpeqps %xmm3, %xmm1
; SSE41-NEXT: roundps $10, %xmm0, %xmm5
; SSE41-NEXT: cmpeqps %xmm2, %xmm0
; SSE41-NEXT: blendvps %xmm0, %xmm5, %xmm2
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: blendvps %xmm0, %xmm4, %xmm3
; SSE41-NEXT: movaps %xmm2, %xmm0
; SSE41-NEXT: movaps %xmm3, %xmm1
; SSE41-NEXT: retq
;
; AVX-LABEL: ceil_mask_256_ps:
; AVX: ## %bb.0:
; AVX-NEXT: vcmpeqps %ymm1, %ymm0, %ymm2
; AVX-NEXT: vroundps $10, %ymm0, %ymm0
; AVX-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0
; AVX-NEXT: retq
;
; AVX512F-LABEL: ceil_mask_256_ps:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vcmpeqps %zmm1, %zmm0, %k1
; AVX512F-NEXT: vroundps $10, %ymm0, %ymm0
; AVX512F-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: ceil_mask_256_ps:
; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vcmpeqps %ymm1, %ymm0, %k1
; AVX512VL-NEXT: vrndscaleps $10, %ymm0, %ymm1 {%k1}
; AVX512VL-NEXT: vmovaps %ymm1, %ymm0
; AVX512VL-NEXT: retq
  %k = fcmp oeq <8 x float> %x, %y
  %call = call <8 x float> @llvm.ceil.v8f32(<8 x float> %x)
  %res = select <8 x i1> %k, <8 x float> %call, <8 x float> %y
  ret <8 x float> %res
}

define <8 x float> @ceil_maskz_256_ps(<8 x float> %x, <8 x float> %y) nounwind {
; SSE41-LABEL: ceil_maskz_256_ps:
; SSE41: ## %bb.0:
; SSE41-NEXT: cmpeqps %xmm1, %xmm3
; SSE41-NEXT: cmpeqps %xmm0, %xmm2
; SSE41-NEXT: roundps $10, %xmm1, %xmm1
; SSE41-NEXT: andps %xmm3, %xmm1
; SSE41-NEXT: roundps $10, %xmm0, %xmm0
; SSE41-NEXT: andps %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: ceil_maskz_256_ps:
; AVX: ## %bb.0:
; AVX-NEXT: vcmpeqps %ymm1, %ymm0, %ymm1
; AVX-NEXT: vroundps $10, %ymm0, %ymm0
; AVX-NEXT: vandps %ymm0, %ymm1, %ymm0
; AVX-NEXT: retq
;
; AVX512F-LABEL: ceil_maskz_256_ps:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vcmpeqps %zmm1, %zmm0, %k1
; AVX512F-NEXT: vroundps $10, %ymm0, %ymm0
; AVX512F-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: ceil_maskz_256_ps:
; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vcmpeqps %ymm1, %ymm0, %k1
; AVX512VL-NEXT: vrndscaleps $10, %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT: retq
  %k = fcmp oeq <8 x float> %x, %y
  %call = call <8 x float> @llvm.ceil.v8f32(<8 x float> %x)
  %res = select <8 x i1> %k, <8 x float> %call, <8 x float> zeroinitializer
  ret <8 x float> %res
}

define <4 x double> @ceil_mask_256_pd(<4 x double> %x, <4 x double> %y) nounwind {
; SSE41-LABEL: ceil_mask_256_pd:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundpd $10, %xmm1, %xmm4
; SSE41-NEXT: cmpeqpd %xmm3, %xmm1
; SSE41-NEXT: roundpd $10, %xmm0, %xmm5
; SSE41-NEXT: cmpeqpd %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2
; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3
; SSE41-NEXT: movapd %xmm2, %xmm0
; SSE41-NEXT: movapd %xmm3, %xmm1
; SSE41-NEXT: retq
;
; AVX-LABEL: ceil_mask_256_pd:
; AVX: ## %bb.0:
; AVX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm2
; AVX-NEXT: vroundpd $10, %ymm0, %ymm0
; AVX-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; AVX-NEXT: retq
;
; AVX512F-LABEL: ceil_mask_256_pd:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
; AVX512F-NEXT: vroundpd $10, %ymm0, %ymm0
; AVX512F-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: ceil_mask_256_pd:
; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vcmpeqpd %ymm1, %ymm0, %k1
; AVX512VL-NEXT: vrndscalepd $10, %ymm0, %ymm1 {%k1}
; AVX512VL-NEXT: vmovapd %ymm1, %ymm0
; AVX512VL-NEXT: retq
  %k = fcmp oeq <4 x double> %x, %y
  %call = call <4 x double> @llvm.ceil.v4f64(<4 x double> %x)
  %res = select <4 x i1> %k, <4 x double> %call, <4 x double> %y
  ret <4 x double> %res
}

define <4 x double> @ceil_maskz_256_pd(<4 x double> %x, <4 x double> %y) nounwind {
; SSE41-LABEL: ceil_maskz_256_pd:
; SSE41: ## %bb.0:
; SSE41-NEXT: cmpeqpd %xmm1, %xmm3
; SSE41-NEXT: cmpeqpd %xmm0, %xmm2
; SSE41-NEXT: roundpd $10, %xmm1, %xmm1
; SSE41-NEXT: andpd %xmm3, %xmm1
; SSE41-NEXT: roundpd $10, %xmm0, %xmm0
; SSE41-NEXT: andpd %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: ceil_maskz_256_pd:
; AVX: ## %bb.0:
; AVX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm1
; AVX-NEXT: vroundpd $10, %ymm0, %ymm0
; AVX-NEXT: vandpd %ymm0, %ymm1, %ymm0
; AVX-NEXT: retq
;
; AVX512F-LABEL: ceil_maskz_256_pd:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
; AVX512F-NEXT: vroundpd $10, %ymm0, %ymm0
; AVX512F-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: ceil_maskz_256_pd:
; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vcmpeqpd %ymm1, %ymm0, %k1
; AVX512VL-NEXT: vrndscalepd $10, %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT: retq
  %k = fcmp oeq <4 x double> %x, %y
  %call = call <4 x double> @llvm.ceil.v4f64(<4 x double> %x)
  %res = select <4 x i1> %k, <4 x double> %call, <4 x double> zeroinitializer
  ret <4 x double> %res
}
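; Masked packed ceil, 512-bit: both AVX512 configurations handle zmm
; natively with masked vrndscaleps/pd, so a single AVX512 prefix suffices;
; SSE4.1 needs four xmm pieces and AVX two ymm pieces.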
define <16 x float> @ceil_mask_512_ps(<16 x float> %x, <16 x float> %y) nounwind {
; SSE41-LABEL: ceil_mask_512_ps:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundps $10, %xmm3, %xmm8
; SSE41-NEXT: cmpeqps %xmm7, %xmm3
; SSE41-NEXT: roundps $10, %xmm2, %xmm9
; SSE41-NEXT: cmpeqps %xmm6, %xmm2
; SSE41-NEXT: roundps $10, %xmm1, %xmm10
; SSE41-NEXT: cmpeqps %xmm5, %xmm1
; SSE41-NEXT: roundps $10, %xmm0, %xmm11
; SSE41-NEXT: cmpeqps %xmm4, %xmm0
; SSE41-NEXT: blendvps %xmm0, %xmm11, %xmm4
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: blendvps %xmm0, %xmm10, %xmm5
; SSE41-NEXT: movaps %xmm2, %xmm0
; SSE41-NEXT: blendvps %xmm0, %xmm9, %xmm6
; SSE41-NEXT: movaps %xmm3, %xmm0
; SSE41-NEXT: blendvps %xmm0, %xmm8, %xmm7
; SSE41-NEXT: movaps %xmm4, %xmm0
; SSE41-NEXT: movaps %xmm5, %xmm1
; SSE41-NEXT: movaps %xmm6, %xmm2
; SSE41-NEXT: movaps %xmm7, %xmm3
; SSE41-NEXT: retq
;
; AVX-LABEL: ceil_mask_512_ps:
; AVX: ## %bb.0:
; AVX-NEXT: vcmpeqps %ymm3, %ymm1, %ymm4
; AVX-NEXT: vcmpeqps %ymm2, %ymm0, %ymm5
; AVX-NEXT: vroundps $10, %ymm1, %ymm1
; AVX-NEXT: vroundps $10, %ymm0, %ymm0
; AVX-NEXT: vblendvps %ymm5, %ymm0, %ymm2, %ymm0
; AVX-NEXT: vblendvps %ymm4, %ymm1, %ymm3, %ymm1
; AVX-NEXT: retq
;
; AVX512-LABEL: ceil_mask_512_ps:
; AVX512: ## %bb.0:
; AVX512-NEXT: vcmpeqps %zmm1, %zmm0, %k1
; AVX512-NEXT: vrndscaleps $10, %zmm0, %zmm1 {%k1}
; AVX512-NEXT: vmovaps %zmm1, %zmm0
; AVX512-NEXT: retq
  %k = fcmp oeq <16 x float> %x, %y
  %call = call <16 x float> @llvm.ceil.v16f32(<16 x float> %x)
  %res = select <16 x i1> %k, <16 x float> %call, <16 x float> %y
  ret <16 x float> %res
}

define <16 x float> @ceil_maskz_512_ps(<16 x float> %x, <16 x float> %y) nounwind {
; SSE41-LABEL: ceil_maskz_512_ps:
; SSE41: ## %bb.0:
; SSE41-NEXT: cmpeqps %xmm3, %xmm7
; SSE41-NEXT: cmpeqps %xmm2, %xmm6
; SSE41-NEXT: cmpeqps %xmm1, %xmm5
; SSE41-NEXT: cmpeqps %xmm0, %xmm4
; SSE41-NEXT: roundps $10, %xmm3, %xmm3
; SSE41-NEXT: andps %xmm7, %xmm3
; SSE41-NEXT: roundps $10, %xmm2, %xmm2
; SSE41-NEXT: andps %xmm6, %xmm2
; SSE41-NEXT: roundps $10, %xmm1, %xmm1
; SSE41-NEXT: andps %xmm5, %xmm1
; SSE41-NEXT: roundps $10, %xmm0, %xmm0
; SSE41-NEXT: andps %xmm4, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: ceil_maskz_512_ps:
; AVX: ## %bb.0:
; AVX-NEXT: vcmpeqps %ymm3, %ymm1, %ymm3
; AVX-NEXT: vcmpeqps %ymm2, %ymm0, %ymm2
; AVX-NEXT: vroundps $10, %ymm1, %ymm1
; AVX-NEXT: vandps %ymm1, %ymm3, %ymm1
; AVX-NEXT: vroundps $10, %ymm0, %ymm0
; AVX-NEXT: vandps %ymm0, %ymm2, %ymm0
; AVX-NEXT: retq
;
; AVX512-LABEL: ceil_maskz_512_ps:
; AVX512: ## %bb.0:
; AVX512-NEXT: vcmpeqps %zmm1, %zmm0, %k1
; AVX512-NEXT: vrndscaleps $10, %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT: retq
  %k = fcmp oeq <16 x float> %x, %y
  %call = call <16 x float> @llvm.ceil.v16f32(<16 x float> %x)
  %res = select <16 x i1> %k, <16 x float> %call, <16 x float> zeroinitializer
  ret <16 x float> %res
}

define <8 x double> @ceil_mask_512_pd(<8 x double> %x, <8 x double> %y) nounwind {
; SSE41-LABEL: ceil_mask_512_pd:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundpd $10, %xmm3, %xmm8
; SSE41-NEXT: cmpeqpd %xmm7, %xmm3
; SSE41-NEXT: roundpd $10, %xmm2, %xmm9
; SSE41-NEXT: cmpeqpd %xmm6, %xmm2
; SSE41-NEXT: roundpd $10, %xmm1, %xmm10
; SSE41-NEXT: cmpeqpd %xmm5, %xmm1
; SSE41-NEXT: roundpd $10, %xmm0, %xmm11
; SSE41-NEXT: cmpeqpd %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm4
; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm5
; SSE41-NEXT: movapd %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm6
; SSE41-NEXT: movapd %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm7
; SSE41-NEXT: movapd %xmm4, %xmm0
; SSE41-NEXT: movapd %xmm5, %xmm1
; SSE41-NEXT: movapd %xmm6, %xmm2
; SSE41-NEXT: movapd %xmm7, %xmm3
; SSE41-NEXT: retq
;
; AVX-LABEL: ceil_mask_512_pd:
; AVX: ## %bb.0:
; AVX-NEXT: vcmpeqpd %ymm3, %ymm1, %ymm4
; AVX-NEXT: vcmpeqpd %ymm2, %ymm0, %ymm5
; AVX-NEXT: vroundpd $10, %ymm1, %ymm1
; AVX-NEXT: vroundpd $10, %ymm0, %ymm0
; AVX-NEXT: vblendvpd %ymm5, %ymm0, %ymm2, %ymm0
; AVX-NEXT: vblendvpd %ymm4, %ymm1, %ymm3, %ymm1
; AVX-NEXT: retq
;
; AVX512-LABEL: ceil_mask_512_pd:
; AVX512: ## %bb.0:
; AVX512-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
; AVX512-NEXT: vrndscalepd $10, %zmm0, %zmm1 {%k1}
; AVX512-NEXT: vmovapd %zmm1, %zmm0
; AVX512-NEXT: retq
  %k = fcmp oeq <8 x double> %x, %y
  %call = call <8 x double> @llvm.ceil.v8f64(<8 x double> %x)
  %res = select <8 x i1> %k, <8 x double> %call, <8 x double> %y
  ret <8 x double> %res
}

define <8 x double> @ceil_maskz_512_pd(<8 x double> %x, <8 x double> %y) nounwind {
; SSE41-LABEL: ceil_maskz_512_pd:
; SSE41: ## %bb.0:
; SSE41-NEXT: cmpeqpd %xmm3, %xmm7
; SSE41-NEXT: cmpeqpd %xmm2, %xmm6
; SSE41-NEXT: cmpeqpd %xmm1, %xmm5
; SSE41-NEXT: cmpeqpd %xmm0, %xmm4
; SSE41-NEXT: roundpd $10, %xmm3, %xmm3
; SSE41-NEXT: andpd %xmm7, %xmm3
; SSE41-NEXT: roundpd $10, %xmm2, %xmm2
; SSE41-NEXT: andpd %xmm6, %xmm2
; SSE41-NEXT: roundpd $10, %xmm1, %xmm1
; SSE41-NEXT: andpd %xmm5, %xmm1
; SSE41-NEXT: roundpd $10, %xmm0, %xmm0
; SSE41-NEXT: andpd %xmm4, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: ceil_maskz_512_pd:
; AVX: ## %bb.0:
; AVX-NEXT: vcmpeqpd %ymm3, %ymm1, %ymm3
; AVX-NEXT: vcmpeqpd %ymm2, %ymm0, %ymm2
; AVX-NEXT: vroundpd $10, %ymm1, %ymm1
; AVX-NEXT: vandpd %ymm1, %ymm3, %ymm1
; AVX-NEXT: vroundpd $10, %ymm0, %ymm0
; AVX-NEXT: vandpd %ymm0, %ymm2, %ymm0
; AVX-NEXT: retq
;
; AVX512-LABEL: ceil_maskz_512_pd:
; AVX512: ## %bb.0:
; AVX512-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
; AVX512-NEXT: vrndscalepd $10, %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT: retq
  %k = fcmp oeq <8 x double> %x, %y
  %call = call <8 x double> @llvm.ceil.v8f64(<8 x double> %x)
  %res = select <8 x i1> %k, <8 x double> %call, <8 x double> zeroinitializer
  ret <8 x double> %res
}
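; Scalar ceil where the low bit of an i8 mask selects between the rounded
; value and a passthrough (or zero): AVX512 uses kmovw plus a masked
; vmovss/vmovsd, while SSE4.1/AVX test the bit and branch.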
define <4 x float> @ceil_mask_ss(<4 x float> %x, <4 x float> %y, <4 x float> %w, i8 %k) nounwind {
; SSE41-LABEL: ceil_mask_ss:
; SSE41: ## %bb.0:
; SSE41-NEXT: testb $1, %dil
; SSE41-NEXT: je LBB78_2
; SSE41-NEXT: ## %bb.1:
; SSE41-NEXT: xorps %xmm2, %xmm2
; SSE41-NEXT: roundss $10, %xmm0, %xmm2
; SSE41-NEXT: LBB78_2:
; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: ceil_mask_ss:
; AVX: ## %bb.0:
; AVX-NEXT: testb $1, %dil
; AVX-NEXT: je LBB78_2
; AVX-NEXT: ## %bb.1:
; AVX-NEXT: vroundss $10, %xmm0, %xmm0, %xmm2
; AVX-NEXT: LBB78_2:
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
; AVX-NEXT: retq
;
; AVX512-LABEL: ceil_mask_ss:
; AVX512: ## %bb.0:
; AVX512-NEXT: vroundss $10, %xmm0, %xmm0, %xmm0
; AVX512-NEXT: kmovw %edi, %k1
; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm2 {%k1}
; AVX512-NEXT: vmovaps %xmm2, %xmm0
; AVX512-NEXT: retq
  %mask = and i8 %k, 1
  %nmask = icmp eq i8 %mask, 0
  %s = extractelement <4 x float> %x, i64 0
  %call = tail call float @llvm.ceil.f32(float %s)
  %dst = extractelement <4 x float> %w, i64 0
  %low = select i1 %nmask, float %dst, float %call
  %res = insertelement <4 x float> %y, float %low, i64 0
  ret <4 x float> %res
}

define <4 x float> @ceil_maskz_ss(<4 x float> %x, <4 x float> %y, i8 %k) nounwind {
; SSE41-LABEL: ceil_maskz_ss:
; SSE41: ## %bb.0:
; SSE41-NEXT: testb $1, %dil
; SSE41-NEXT: xorps %xmm2, %xmm2
; SSE41-NEXT: je LBB79_2
; SSE41-NEXT: ## %bb.1:
; SSE41-NEXT: xorps %xmm2, %xmm2
; SSE41-NEXT: roundss $10, %xmm0, %xmm2
; SSE41-NEXT: LBB79_2:
; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: ceil_maskz_ss:
; AVX: ## %bb.0:
; AVX-NEXT: testb $1, %dil
; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX-NEXT: je LBB79_2
; AVX-NEXT: ## %bb.1:
; AVX-NEXT: vroundss $10, %xmm0, %xmm0, %xmm2
; AVX-NEXT: LBB79_2:
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
; AVX-NEXT: retq
;
; AVX512-LABEL: ceil_maskz_ss:
; AVX512: ## %bb.0:
; AVX512-NEXT: vroundss $10, %xmm0, %xmm0, %xmm0
; AVX512-NEXT: kmovw %edi, %k1
; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm0 {%k1} {z}
; AVX512-NEXT: retq
  %mask = and i8 %k, 1
  %nmask = icmp eq i8 %mask, 0
  %s = extractelement <4 x float> %x, i64 0
  %call = tail call float @llvm.ceil.f32(float %s)
  %low = select i1 %nmask, float zeroinitializer, float %call
  %res = insertelement <4 x float> %y, float %low, i64 0
  ret <4 x float> %res
}

define <2 x double> @ceil_mask_sd(<2 x double> %x, <2 x double> %y, <2 x double> %w, i8 %k) nounwind {
; SSE41-LABEL: ceil_mask_sd:
; SSE41: ## %bb.0:
; SSE41-NEXT: testb $1, %dil
; SSE41-NEXT: je LBB80_2
; SSE41-NEXT: ## %bb.1:
; SSE41-NEXT: xorps %xmm2, %xmm2
; SSE41-NEXT: roundsd $10, %xmm0, %xmm2
; SSE41-NEXT: LBB80_2:
; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: ceil_mask_sd:
; AVX: ## %bb.0:
; AVX-NEXT: testb $1, %dil
; AVX-NEXT: je LBB80_2
; AVX-NEXT: ## %bb.1:
; AVX-NEXT: vroundsd $10, %xmm0, %xmm0, %xmm2
; AVX-NEXT: LBB80_2:
; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
; AVX-NEXT: retq
;
; AVX512-LABEL: ceil_mask_sd:
; AVX512: ## %bb.0:
; AVX512-NEXT: vroundsd $10, %xmm0, %xmm0, %xmm0
; AVX512-NEXT: kmovw %edi, %k1
; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm2 {%k1}
; AVX512-NEXT: vmovapd %xmm2, %xmm0
; AVX512-NEXT: retq
  %mask = and i8 %k, 1
  %nmask = icmp eq i8 %mask, 0
  %s = extractelement <2 x double> %x, i64 0
  %call = tail call double @llvm.ceil.f64(double %s)
  %dst = extractelement <2 x double> %w, i64 0
  %low = select i1 %nmask, double %dst, double %call
  %res = insertelement <2 x double> %y, double %low, i64 0
  ret <2 x double> %res
}

define <2 x double> @ceil_maskz_sd(<2 x double> %x, <2 x double> %y, i8 %k) nounwind {
; SSE41-LABEL: ceil_maskz_sd:
; SSE41: ## %bb.0:
; SSE41-NEXT: testb $1, %dil
; SSE41-NEXT: xorpd %xmm2, %xmm2
; SSE41-NEXT: je LBB81_2
; SSE41-NEXT: ## %bb.1:
; SSE41-NEXT: xorps %xmm2, %xmm2
; SSE41-NEXT: roundsd $10, %xmm0, %xmm2
; SSE41-NEXT: LBB81_2:
; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: ceil_maskz_sd:
; AVX: ## %bb.0:
; AVX-NEXT: testb $1, %dil
; AVX-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; AVX-NEXT: je LBB81_2
; AVX-NEXT: ## %bb.1:
; AVX-NEXT: vroundsd $10, %xmm0, %xmm0, %xmm2
; AVX-NEXT: LBB81_2:
; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
; AVX-NEXT: retq
;
; AVX512-LABEL: ceil_maskz_sd:
; AVX512: ## %bb.0:
; AVX512-NEXT: vroundsd $10, %xmm0, %xmm0, %xmm0
; AVX512-NEXT: kmovw %edi, %k1
; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm0 {%k1} {z}
; AVX512-NEXT: retq
  %mask = and i8 %k, 1
  %nmask = icmp eq i8 %mask, 0
  %s = extractelement <2 x double> %x, i64 0
  %call = tail call double @llvm.ceil.f64(double %s)
  %low = select i1 %nmask, double zeroinitializer, double %call
  %res = insertelement <2 x double> %y, double %low, i64 0
  ret <2 x double> %res
}
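; Same scalar ceil selects, but with the i1 mask produced by truncating an
; i16 argument; codegen still reduces to a testb of the low bit on
; pre-AVX512 targets and a kmovw-masked move on AVX512.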
define <4 x float> @ceil_mask_ss_trunc(<4 x float> %x, <4 x float> %y, <4 x float> %w, i16 %k) nounwind {
; SSE41-LABEL: ceil_mask_ss_trunc:
; SSE41: ## %bb.0:
; SSE41-NEXT: testb $1, %dil
; SSE41-NEXT: je LBB82_2
; SSE41-NEXT: ## %bb.1:
; SSE41-NEXT: xorps %xmm2, %xmm2
; SSE41-NEXT: roundss $10, %xmm0, %xmm2
; SSE41-NEXT: LBB82_2:
; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: ceil_mask_ss_trunc:
; AVX: ## %bb.0:
; AVX-NEXT: testb $1, %dil
; AVX-NEXT: je LBB82_2
; AVX-NEXT: ## %bb.1:
; AVX-NEXT: vroundss $10, %xmm0, %xmm0, %xmm2
; AVX-NEXT: LBB82_2:
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
; AVX-NEXT: retq
;
; AVX512-LABEL: ceil_mask_ss_trunc:
; AVX512: ## %bb.0:
; AVX512-NEXT: vroundss $10, %xmm0, %xmm0, %xmm0
; AVX512-NEXT: kmovw %edi, %k1
; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm2 {%k1}
; AVX512-NEXT: vmovaps %xmm2, %xmm0
; AVX512-NEXT: retq
  %mask = trunc i16 %k to i1
  %s = extractelement <4 x float> %x, i64 0
  %call = tail call float @llvm.ceil.f32(float %s)
  %dst = extractelement <4 x float> %w, i64 0
  %low = select i1 %mask, float %call, float %dst
  %res = insertelement <4 x float> %y, float %low, i64 0
  ret <4 x float> %res
}

define <4 x float> @ceil_maskz_ss_trunc(<4 x float> %x, <4 x float> %y, i16 %k) nounwind {
; SSE41-LABEL: ceil_maskz_ss_trunc:
; SSE41: ## %bb.0:
; SSE41-NEXT: testb $1, %dil
; SSE41-NEXT: jne LBB83_1
; SSE41-NEXT: ## %bb.2:
; SSE41-NEXT: xorps %xmm0, %xmm0
; SSE41-NEXT: jmp LBB83_3
; SSE41-NEXT: LBB83_1:
; SSE41-NEXT: roundss $10, %xmm0, %xmm0
; SSE41-NEXT: LBB83_3:
; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: ceil_maskz_ss_trunc:
; AVX: ## %bb.0:
; AVX-NEXT: testb $1, %dil
; AVX-NEXT: jne LBB83_1
; AVX-NEXT: ## %bb.2:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: retq
; AVX-NEXT: LBB83_1:
; AVX-NEXT: vroundss $10, %xmm0, %xmm0, %xmm0
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: retq
;
; AVX512-LABEL: ceil_maskz_ss_trunc:
; AVX512: ## %bb.0:
; AVX512-NEXT: vroundss $10, %xmm0, %xmm0, %xmm0
; AVX512-NEXT: kmovw %edi, %k1
; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm0 {%k1} {z}
; AVX512-NEXT: retq
  %mask = trunc i16 %k to i1
  %s = extractelement <4 x float> %x, i64 0
  %call = tail call float @llvm.ceil.f32(float %s)
  %low = select i1 %mask, float %call, float zeroinitializer
  %res = insertelement <4 x float> %y, float %low, i64 0
  ret <4 x float> %res
}

define <2 x double> @ceil_mask_sd_trunc(<2 x double> %x, <2 x double> %y, <2 x double> %w, i16 %k) nounwind {
; SSE41-LABEL: ceil_mask_sd_trunc:
; SSE41: ## %bb.0:
; SSE41-NEXT: testb $1, %dil
; SSE41-NEXT: je LBB84_2
; SSE41-NEXT: ## %bb.1:
; SSE41-NEXT: xorps %xmm2, %xmm2
; SSE41-NEXT: roundsd $10, %xmm0, %xmm2
; SSE41-NEXT: LBB84_2:
; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: ceil_mask_sd_trunc:
; AVX: ## %bb.0:
; AVX-NEXT: testb $1, %dil
; AVX-NEXT: je LBB84_2
; AVX-NEXT: ## %bb.1:
; AVX-NEXT: vroundsd $10, %xmm0, %xmm0, %xmm2
; AVX-NEXT: LBB84_2:
; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
; AVX-NEXT: retq
;
; AVX512-LABEL: ceil_mask_sd_trunc:
; AVX512: ## %bb.0:
; AVX512-NEXT: vroundsd $10, %xmm0, %xmm0, %xmm0
; AVX512-NEXT: kmovw %edi, %k1
; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm2 {%k1}
; AVX512-NEXT: vmovapd %xmm2, %xmm0
; AVX512-NEXT: retq
  %mask = trunc i16 %k to i1
  %s = extractelement <2 x double> %x, i64 0
  %call = tail call double @llvm.ceil.f64(double %s)
  %dst = extractelement <2 x double> %w, i64 0
  %low = select i1 %mask, double %call, double %dst
  %res = insertelement <2 x double> %y, double %low, i64 0
  ret <2 x double> %res
}

define <2 x double> @ceil_maskz_sd_trunc(<2 x double> %x, <2 x double> %y, i16 %k) nounwind {
; SSE41-LABEL: ceil_maskz_sd_trunc:
; SSE41: ## %bb.0:
; SSE41-NEXT: testb $1, %dil
; SSE41-NEXT: jne LBB85_1
; SSE41-NEXT: ## %bb.2:
; SSE41-NEXT: xorpd %xmm0, %xmm0
; SSE41-NEXT: jmp LBB85_3
; SSE41-NEXT: LBB85_1:
; SSE41-NEXT: roundsd $10, %xmm0, %xmm0
; SSE41-NEXT: LBB85_3:
; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: ceil_maskz_sd_trunc:
; AVX: ## %bb.0:
; AVX-NEXT: testb $1, %dil
; AVX-NEXT: jne LBB85_1
; AVX-NEXT: ## %bb.2:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX-NEXT: retq
; AVX-NEXT: LBB85_1:
; AVX-NEXT: vroundsd $10, %xmm0, %xmm0, %xmm0
; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT: retq
;
; AVX512-LABEL: ceil_maskz_sd_trunc:
; AVX512: ## %bb.0:
; AVX512-NEXT: vroundsd $10, %xmm0, %xmm0, %xmm0
; AVX512-NEXT: kmovw %edi, %k1
; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm0 {%k1} {z}
; AVX512-NEXT: retq
  %mask = trunc i16 %k to i1
  %s = extractelement <2 x double> %x, i64 0
  %call = tail call double @llvm.ceil.f64(double %s)
  %low = select i1 %mask, double %call, double zeroinitializer
  %res = insertelement <2 x double> %y, double %low, i64 0
  ret <2 x double> %res
}
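; Finally, the ceil counterparts of the compare-derived scalar masks: lane 0
; of the fcmp result selects the rounded value, with AVX512 folding the
; compare into %k1 for a masked (or zero-masked) vmovss/vmovsd.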
define <4 x float> @ceil_mask_ss_mask8(<4 x float> %x, <4 x float> %y, <4 x float> %w) nounwind {
; SSE41-LABEL: ceil_mask_ss_mask8:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundss $10, %xmm0, %xmm3
; SSE41-NEXT: cmpeqss %xmm1, %xmm0
; SSE41-NEXT: andps %xmm0, %xmm3
; SSE41-NEXT: andnps %xmm2, %xmm0
; SSE41-NEXT: orps %xmm3, %xmm0
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT: retq
;
; AVX-LABEL: ceil_mask_ss_mask8:
; AVX: ## %bb.0:
; AVX-NEXT: vroundss $10, %xmm0, %xmm0, %xmm3
; AVX-NEXT: vcmpeqss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vblendvps %xmm0, %xmm3, %xmm2, %xmm0
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: retq
;
; AVX512-LABEL: ceil_mask_ss_mask8:
; AVX512: ## %bb.0:
; AVX512-NEXT: vroundss $10, %xmm0, %xmm0, %xmm3
; AVX512-NEXT: vcmpeqss %xmm1, %xmm0, %k1
; AVX512-NEXT: vmovss %xmm3, %xmm1, %xmm2 {%k1}
; AVX512-NEXT: vmovaps %xmm2, %xmm0
; AVX512-NEXT: retq
  %mask1 = fcmp oeq <4 x float> %x, %y
  %mask = extractelement <4 x i1> %mask1, i64 0
  %s = extractelement <4 x float> %x, i64 0
  %call = tail call float @llvm.ceil.f32(float %s)
  %dst = extractelement <4 x float> %w, i64 0
  %low = select i1 %mask, float %call, float %dst
  %res = insertelement <4 x float> %y, float %low, i64 0
  ret <4 x float> %res
}

define <4 x float> @ceil_maskz_ss_mask8(<4 x float> %x, <4 x float> %y) nounwind {
; SSE41-LABEL: ceil_maskz_ss_mask8:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundss $10, %xmm0, %xmm2
; SSE41-NEXT: cmpeqss %xmm1, %xmm0
; SSE41-NEXT: andps %xmm2, %xmm0
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT: retq
;
; AVX-LABEL: ceil_maskz_ss_mask8:
; AVX: ## %bb.0:
; AVX-NEXT: vroundss $10, %xmm0, %xmm0, %xmm2
; AVX-NEXT: vcmpeqss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vandps %xmm2, %xmm0, %xmm0
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: retq
;
; AVX512-LABEL: ceil_maskz_ss_mask8:
; AVX512: ## %bb.0:
; AVX512-NEXT: vroundss $10, %xmm0, %xmm0, %xmm2
; AVX512-NEXT: vcmpeqss %xmm1, %xmm0, %k1
; AVX512-NEXT: vmovss %xmm2, %xmm1, %xmm0 {%k1} {z}
; AVX512-NEXT: retq
  %mask1 = fcmp oeq <4 x float> %x, %y
  %mask = extractelement <4 x i1> %mask1, i64 0
  %s = extractelement <4 x float> %x, i64 0
  %call = tail call float @llvm.ceil.f32(float %s)
  %low = select i1 %mask, float %call, float zeroinitializer
  %res = insertelement <4 x float> %y, float %low, i64 0
  ret <4 x float> %res
}

define <2 x double> @ceil_mask_sd_mask8(<2 x double> %x, <2 x double> %y, <2 x double> %w) nounwind {
; SSE41-LABEL: ceil_mask_sd_mask8:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundsd $10, %xmm0, %xmm3
; SSE41-NEXT: cmpeqsd %xmm1, %xmm0
; SSE41-NEXT: andpd %xmm0, %xmm3
; SSE41-NEXT: andnpd %xmm2, %xmm0
; SSE41-NEXT: orpd %xmm3, %xmm0
; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; SSE41-NEXT: retq
;
; AVX-LABEL: ceil_mask_sd_mask8:
; AVX: ## %bb.0:
; AVX-NEXT: vroundsd $10, %xmm0, %xmm0, %xmm3
; AVX-NEXT: vcmpeqsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vblendvpd %xmm0, %xmm3, %xmm2, %xmm0
; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT: retq
;
; AVX512-LABEL: ceil_mask_sd_mask8:
; AVX512: ## %bb.0:
; AVX512-NEXT: vroundsd $10, %xmm0, %xmm0, %xmm3
; AVX512-NEXT: vcmpeqsd %xmm1, %xmm0, %k1
; AVX512-NEXT: vmovsd %xmm3, %xmm1, %xmm2 {%k1}
; AVX512-NEXT: vmovapd %xmm2, %xmm0
; AVX512-NEXT: retq
  %mask1 = fcmp oeq <2 x double> %x, %y
  %mask = extractelement <2 x i1> %mask1, i64 0
  %s = extractelement <2 x double> %x, i64 0
  %call = tail call double @llvm.ceil.f64(double %s)
  %dst = extractelement <2 x double> %w, i64 0
  %low = select i1 %mask, double %call, double %dst
  %res = insertelement <2 x double> %y, double %low, i64 0
  ret <2 x double> %res
}

define <2 x double> @ceil_maskz_sd_mask8(<2 x double> %x, <2 x double> %y) nounwind {
; SSE41-LABEL: ceil_maskz_sd_mask8:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundsd $10, %xmm0, %xmm2
; SSE41-NEXT: cmpeqsd %xmm1, %xmm0
; SSE41-NEXT: andpd %xmm2, %xmm0
; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; SSE41-NEXT: retq
;
; AVX-LABEL: ceil_maskz_sd_mask8:
; AVX: ## %bb.0:
; AVX-NEXT: vroundsd $10, %xmm0, %xmm0, %xmm2
; AVX-NEXT: vcmpeqsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vandpd %xmm2, %xmm0, %xmm0
; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT: retq
;
; AVX512-LABEL: ceil_maskz_sd_mask8:
; AVX512: ## %bb.0:
; AVX512-NEXT: vroundsd $10, %xmm0, %xmm0, %xmm2
; AVX512-NEXT: vcmpeqsd %xmm1, %xmm0, %k1
; AVX512-NEXT: vmovsd %xmm2, %xmm1, %xmm0 {%k1} {z}
; AVX512-NEXT: retq
  %mask1 = fcmp oeq <2 x double> %x, %y
  %mask = extractelement <2 x i1> %mask1, i64 0
  %s = extractelement <2 x double> %x, i64 0
  %call = tail call double @llvm.ceil.f64(double %s)
  %low = select i1 %mask, double %call, double zeroinitializer
  %res = insertelement <2 x double> %y, double %low, i64 0
  ret <2 x double> %res
}