; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4a | FileCheck %s --check-prefix=SSE --check-prefix=SSE4A
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=VLX

; Make sure that we generate non-temporal stores for the test cases below.
; We use xorps for zeroing, so domain information isn't available anymore.

; Scalar versions (zeroing means we can do this even for fp types).

define void @test_zero_f32(float* %dst) {
; SSE-LABEL: test_zero_f32:
; SSE: # %bb.0:
; SSE-NEXT: xorl %eax, %eax
; SSE-NEXT: movntil %eax, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_f32:
; AVX: # %bb.0:
; AVX-NEXT: xorl %eax, %eax
; AVX-NEXT: movntil %eax, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_zero_f32:
; VLX: # %bb.0:
; VLX-NEXT: xorl %eax, %eax
; VLX-NEXT: movntil %eax, (%rdi)
; VLX-NEXT: retq
  store float zeroinitializer, float* %dst, align 1, !nontemporal !1
  ret void
}

define void @test_zero_i32(i32* %dst) {
; SSE-LABEL: test_zero_i32:
; SSE: # %bb.0:
; SSE-NEXT: xorl %eax, %eax
; SSE-NEXT: movntil %eax, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_i32:
; AVX: # %bb.0:
; AVX-NEXT: xorl %eax, %eax
; AVX-NEXT: movntil %eax, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_zero_i32:
; VLX: # %bb.0:
; VLX-NEXT: xorl %eax, %eax
; VLX-NEXT: movntil %eax, (%rdi)
; VLX-NEXT: retq
  store i32 zeroinitializer, i32* %dst, align 1, !nontemporal !1
  ret void
}

define void @test_zero_f64(double* %dst) {
; SSE-LABEL: test_zero_f64:
; SSE: # %bb.0:
; SSE-NEXT: xorl %eax, %eax
; SSE-NEXT: movntiq %rax, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_f64:
; AVX: # %bb.0:
; AVX-NEXT: xorl %eax, %eax
; AVX-NEXT: movntiq %rax, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_zero_f64:
; VLX: # %bb.0:
; VLX-NEXT: xorl %eax, %eax
; VLX-NEXT: movntiq %rax, (%rdi)
; VLX-NEXT: retq
  store double zeroinitializer, double* %dst, align 1, !nontemporal !1
  ret void
}

define void @test_zero_i64(i64* %dst) {
; SSE-LABEL: test_zero_i64:
; SSE: # %bb.0:
; SSE-NEXT: xorl %eax, %eax
; SSE-NEXT: movntiq %rax, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_i64:
; AVX: # %bb.0:
; AVX-NEXT: xorl %eax, %eax
; AVX-NEXT: movntiq %rax, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_zero_i64:
; VLX: # %bb.0:
; VLX-NEXT: xorl %eax, %eax
; VLX-NEXT: movntiq %rax, (%rdi)
; VLX-NEXT: retq
  store i64 zeroinitializer, i64* %dst, align 1, !nontemporal !1
  ret void
}

; And now XMM versions.

define void @test_zero_v4f32(<4 x float>* %dst) {
; SSE-LABEL: test_zero_v4f32:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v4f32:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovntps %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_zero_v4f32:
; VLX: # %bb.0:
; VLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; VLX-NEXT: vmovntps %xmm0, (%rdi)
; VLX-NEXT: retq
  store <4 x float> zeroinitializer, <4 x float>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_zero_v4i32(<4 x i32>* %dst) {
; SSE-LABEL: test_zero_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovntps %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_zero_v4i32:
; VLX: # %bb.0:
; VLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; VLX-NEXT: vmovntps %xmm0, (%rdi)
; VLX-NEXT: retq
  store <4 x i32> zeroinitializer, <4 x i32>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_zero_v2f64(<2 x double>* %dst) {
; SSE-LABEL: test_zero_v2f64:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v2f64:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovntps %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_zero_v2f64:
; VLX: # %bb.0:
; VLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; VLX-NEXT: vmovntps %xmm0, (%rdi)
; VLX-NEXT: retq
  store <2 x double> zeroinitializer, <2 x double>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_zero_v2i64(<2 x i64>* %dst) {
; SSE-LABEL: test_zero_v2i64:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v2i64:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovntps %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_zero_v2i64:
; VLX: # %bb.0:
; VLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; VLX-NEXT: vmovntps %xmm0, (%rdi)
; VLX-NEXT: retq
  store <2 x i64> zeroinitializer, <2 x i64>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_zero_v8i16(<8 x i16>* %dst) {
; SSE-LABEL: test_zero_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v8i16:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovntps %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_zero_v8i16:
; VLX: # %bb.0:
; VLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; VLX-NEXT: vmovntps %xmm0, (%rdi)
; VLX-NEXT: retq
  store <8 x i16> zeroinitializer, <8 x i16>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_zero_v16i8(<16 x i8>* %dst) {
; SSE-LABEL: test_zero_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v16i8:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovntps %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_zero_v16i8:
; VLX: # %bb.0:
; VLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; VLX-NEXT: vmovntps %xmm0, (%rdi)
; VLX-NEXT: retq
  store <16 x i8> zeroinitializer, <16 x i8>* %dst, align 16, !nontemporal !1
  ret void
}

; And now YMM versions.

define void @test_zero_v8f32(<8 x float>* %dst) {
; SSE-LABEL: test_zero_v8f32:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: movntps %xmm0, 16(%rdi)
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v8f32:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovntps %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; VLX-LABEL: test_zero_v8f32:
; VLX: # %bb.0:
; VLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; VLX-NEXT: vmovntps %ymm0, (%rdi)
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
  store <8 x float> zeroinitializer, <8 x float>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_zero_v8i32(<8 x i32>* %dst) {
; SSE-LABEL: test_zero_v8i32:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: movntps %xmm0, 16(%rdi)
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v8i32:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovntps %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; VLX-LABEL: test_zero_v8i32:
; VLX: # %bb.0:
; VLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; VLX-NEXT: vmovntps %ymm0, (%rdi)
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
  store <8 x i32> zeroinitializer, <8 x i32>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_zero_v4f64(<4 x double>* %dst) {
; SSE-LABEL: test_zero_v4f64:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: movntps %xmm0, 16(%rdi)
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v4f64:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovntps %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; VLX-LABEL: test_zero_v4f64:
; VLX: # %bb.0:
; VLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; VLX-NEXT: vmovntps %ymm0, (%rdi)
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
  store <4 x double> zeroinitializer, <4 x double>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_zero_v4i64(<4 x i64>* %dst) {
; SSE-LABEL: test_zero_v4i64:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: movntps %xmm0, 16(%rdi)
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v4i64:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovntps %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; VLX-LABEL: test_zero_v4i64:
; VLX: # %bb.0:
; VLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; VLX-NEXT: vmovntps %ymm0, (%rdi)
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
  store <4 x i64> zeroinitializer, <4 x i64>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_zero_v16i16(<16 x i16>* %dst) {
; SSE-LABEL: test_zero_v16i16:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: movntps %xmm0, 16(%rdi)
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v16i16:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovntps %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; VLX-LABEL: test_zero_v16i16:
; VLX: # %bb.0:
; VLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; VLX-NEXT: vmovntps %ymm0, (%rdi)
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
  store <16 x i16> zeroinitializer, <16 x i16>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_zero_v32i8(<32 x i8>* %dst) {
; SSE-LABEL: test_zero_v32i8:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: movntps %xmm0, 16(%rdi)
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v32i8:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovntps %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; VLX-LABEL: test_zero_v32i8:
; VLX: # %bb.0:
; VLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; VLX-NEXT: vmovntps %ymm0, (%rdi)
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
  store <32 x i8> zeroinitializer, <32 x i8>* %dst, align 32, !nontemporal !1
  ret void
}


; Check that we also handle arguments. Here the type survives longer.

; Scalar versions.

define void @test_arg_f32(float %arg, float* %dst) {
; SSE2-LABEL: test_arg_f32:
; SSE2: # %bb.0:
; SSE2-NEXT: movss %xmm0, (%rdi)
; SSE2-NEXT: retq
;
; SSE4A-LABEL: test_arg_f32:
; SSE4A: # %bb.0:
; SSE4A-NEXT: movntss %xmm0, (%rdi)
; SSE4A-NEXT: retq
;
; SSE41-LABEL: test_arg_f32:
; SSE41: # %bb.0:
; SSE41-NEXT: movss %xmm0, (%rdi)
; SSE41-NEXT: retq
;
; AVX-LABEL: test_arg_f32:
; AVX: # %bb.0:
; AVX-NEXT: vmovss %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_arg_f32:
; VLX: # %bb.0:
; VLX-NEXT: vmovss %xmm0, (%rdi)
; VLX-NEXT: retq
  store float %arg, float* %dst, align 1, !nontemporal !1
  ret void
}

define void @test_arg_i32(i32 %arg, i32* %dst) {
; SSE-LABEL: test_arg_i32:
; SSE: # %bb.0:
; SSE-NEXT: movntil %edi, (%rsi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_arg_i32:
; AVX: # %bb.0:
; AVX-NEXT: movntil %edi, (%rsi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_arg_i32:
; VLX: # %bb.0:
; VLX-NEXT: movntil %edi, (%rsi)
; VLX-NEXT: retq
  store i32 %arg, i32* %dst, align 1, !nontemporal !1
  ret void
}

define void @test_arg_f64(double %arg, double* %dst) {
; SSE2-LABEL: test_arg_f64:
; SSE2: # %bb.0:
; SSE2-NEXT: movsd %xmm0, (%rdi)
; SSE2-NEXT: retq
;
; SSE4A-LABEL: test_arg_f64:
; SSE4A: # %bb.0:
; SSE4A-NEXT: movntsd %xmm0, (%rdi)
; SSE4A-NEXT: retq
;
; SSE41-LABEL: test_arg_f64:
; SSE41: # %bb.0:
; SSE41-NEXT: movsd %xmm0, (%rdi)
; SSE41-NEXT: retq
;
; AVX-LABEL: test_arg_f64:
; AVX: # %bb.0:
; AVX-NEXT: vmovsd %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_arg_f64:
; VLX: # %bb.0:
; VLX-NEXT: vmovsd %xmm0, (%rdi)
; VLX-NEXT: retq
  store double %arg, double* %dst, align 1, !nontemporal !1
  ret void
}

define void @test_arg_i64(i64 %arg, i64* %dst) {
; SSE-LABEL: test_arg_i64:
; SSE: # %bb.0:
; SSE-NEXT: movntiq %rdi, (%rsi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_arg_i64:
; AVX: # %bb.0:
; AVX-NEXT: movntiq %rdi, (%rsi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_arg_i64:
; VLX: # %bb.0:
; VLX-NEXT: movntiq %rdi, (%rsi)
; VLX-NEXT: retq
  store i64 %arg, i64* %dst, align 1, !nontemporal !1
  ret void
}

; Extract versions

define void @test_extract_f32(<4 x float> %arg, float* %dst) {
; SSE2-LABEL: test_extract_f32:
; SSE2: # %bb.0:
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE2-NEXT: movss %xmm0, (%rdi)
; SSE2-NEXT: retq
;
; SSE4A-LABEL: test_extract_f32:
; SSE4A: # %bb.0:
; SSE4A-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE4A-NEXT: movntss %xmm0, (%rdi)
; SSE4A-NEXT: retq
;
; SSE41-LABEL: test_extract_f32:
; SSE41: # %bb.0:
; SSE41-NEXT: extractps $1, %xmm0, %eax
; SSE41-NEXT: movntil %eax, (%rdi)
; SSE41-NEXT: retq
;
; AVX-LABEL: test_extract_f32:
; AVX: # %bb.0:
; AVX-NEXT: vextractps $1, %xmm0, %eax
; AVX-NEXT: movntil %eax, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_extract_f32:
; VLX: # %bb.0:
; VLX-NEXT: vextractps $1, %xmm0, %eax
; VLX-NEXT: movntil %eax, (%rdi)
; VLX-NEXT: retq
  %1 = extractelement <4 x float> %arg, i32 1
  store float %1, float* %dst, align 1, !nontemporal !1
  ret void
}

define void @test_extract_i32(<4 x i32> %arg, i32* %dst) {
; SSE2-LABEL: test_extract_i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: movntil %eax, (%rdi)
; SSE2-NEXT: retq
;
; SSE4A-LABEL: test_extract_i32:
; SSE4A: # %bb.0:
; SSE4A-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE4A-NEXT: movd %xmm0, %eax
; SSE4A-NEXT: movntil %eax, (%rdi)
; SSE4A-NEXT: retq
;
; SSE41-LABEL: test_extract_i32:
; SSE41: # %bb.0:
; SSE41-NEXT: extractps $1, %xmm0, %eax
; SSE41-NEXT: movntil %eax, (%rdi)
; SSE41-NEXT: retq
;
; AVX-LABEL: test_extract_i32:
; AVX: # %bb.0:
; AVX-NEXT: vextractps $1, %xmm0, %eax
; AVX-NEXT: movntil %eax, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_extract_i32:
; VLX: # %bb.0:
; VLX-NEXT: vextractps $1, %xmm0, %eax
; VLX-NEXT: movntil %eax, (%rdi)
; VLX-NEXT: retq
  %1 = extractelement <4 x i32> %arg, i32 1
  store i32 %1, i32* %dst, align 1, !nontemporal !1
  ret void
}

define void @test_extract_f64(<2 x double> %arg, double* %dst) {
; SSE2-LABEL: test_extract_f64:
; SSE2: # %bb.0:
; SSE2-NEXT: movhpd %xmm0, (%rdi)
; SSE2-NEXT: retq
;
; SSE4A-LABEL: test_extract_f64:
; SSE4A: # %bb.0:
; SSE4A-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE4A-NEXT: movntsd %xmm0, (%rdi)
; SSE4A-NEXT: retq
;
; SSE41-LABEL: test_extract_f64:
; SSE41: # %bb.0:
; SSE41-NEXT: movhpd %xmm0, (%rdi)
; SSE41-NEXT: retq
;
; AVX-LABEL: test_extract_f64:
; AVX: # %bb.0:
; AVX-NEXT: vmovhpd %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_extract_f64:
; VLX: # %bb.0:
; VLX-NEXT: vmovhpd %xmm0, (%rdi)
; VLX-NEXT: retq
  %1 = extractelement <2 x double> %arg, i32 1
  store double %1, double* %dst, align 1, !nontemporal !1
  ret void
}

define void @test_extract_i64(<2 x i64> %arg, i64* %dst) {
; SSE2-LABEL: test_extract_i64:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: movntiq %rax, (%rdi)
; SSE2-NEXT: retq
;
; SSE4A-LABEL: test_extract_i64:
; SSE4A: # %bb.0:
; SSE4A-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE4A-NEXT: movq %xmm0, %rax
; SSE4A-NEXT: movntiq %rax, (%rdi)
; SSE4A-NEXT: retq
;
; SSE41-LABEL: test_extract_i64:
; SSE41: # %bb.0:
; SSE41-NEXT: pextrq $1, %xmm0, %rax
; SSE41-NEXT: movntiq %rax, (%rdi)
; SSE41-NEXT: retq
;
; AVX-LABEL: test_extract_i64:
; AVX: # %bb.0:
; AVX-NEXT: vpextrq $1, %xmm0, %rax
; AVX-NEXT: movntiq %rax, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_extract_i64:
; VLX: # %bb.0:
; VLX-NEXT: vpextrq $1, %xmm0, %rax
; VLX-NEXT: movntiq %rax, (%rdi)
; VLX-NEXT: retq
  %1 = extractelement <2 x i64> %arg, i32 1
  store i64 %1, i64* %dst, align 1, !nontemporal !1
  ret void
}

; And now XMM versions.

define void @test_arg_v4f32(<4 x float> %arg, <4 x float>* %dst) {
; SSE-LABEL: test_arg_v4f32:
; SSE: # %bb.0:
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_arg_v4f32:
; AVX: # %bb.0:
; AVX-NEXT: vmovntps %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_arg_v4f32:
; VLX: # %bb.0:
; VLX-NEXT: vmovntps %xmm0, (%rdi)
; VLX-NEXT: retq
  store <4 x float> %arg, <4 x float>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_arg_v4i32(<4 x i32> %arg, <4 x i32>* %dst) {
; SSE-LABEL: test_arg_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_arg_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vmovntps %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_arg_v4i32:
; VLX: # %bb.0:
; VLX-NEXT: vmovntps %xmm0, (%rdi)
; VLX-NEXT: retq
  store <4 x i32> %arg, <4 x i32>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_arg_v2f64(<2 x double> %arg, <2 x double>* %dst) {
; SSE-LABEL: test_arg_v2f64:
; SSE: # %bb.0:
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_arg_v2f64:
; AVX: # %bb.0:
; AVX-NEXT: vmovntps %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_arg_v2f64:
; VLX: # %bb.0:
; VLX-NEXT: vmovntps %xmm0, (%rdi)
; VLX-NEXT: retq
  store <2 x double> %arg, <2 x double>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_arg_v2i64(<2 x i64> %arg, <2 x i64>* %dst) {
; SSE-LABEL: test_arg_v2i64:
; SSE: # %bb.0:
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_arg_v2i64:
; AVX: # %bb.0:
; AVX-NEXT: vmovntps %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_arg_v2i64:
; VLX: # %bb.0:
; VLX-NEXT: vmovntps %xmm0, (%rdi)
; VLX-NEXT: retq
  store <2 x i64> %arg, <2 x i64>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_arg_v8i16(<8 x i16> %arg, <8 x i16>* %dst) {
; SSE-LABEL: test_arg_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_arg_v8i16:
; AVX: # %bb.0:
; AVX-NEXT: vmovntps %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_arg_v8i16:
; VLX: # %bb.0:
; VLX-NEXT: vmovntps %xmm0, (%rdi)
; VLX-NEXT: retq
  store <8 x i16> %arg, <8 x i16>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_arg_v16i8(<16 x i8> %arg, <16 x i8>* %dst) {
; SSE-LABEL: test_arg_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_arg_v16i8:
; AVX: # %bb.0:
; AVX-NEXT: vmovntps %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_arg_v16i8:
; VLX: # %bb.0:
; VLX-NEXT: vmovntps %xmm0, (%rdi)
; VLX-NEXT: retq
  store <16 x i8> %arg, <16 x i8>* %dst, align 16, !nontemporal !1
  ret void
}

; And now YMM versions.

define void @test_arg_v8f32(<8 x float> %arg, <8 x float>* %dst) {
; SSE-LABEL: test_arg_v8f32:
; SSE: # %bb.0:
; SSE-NEXT: movntps %xmm1, 16(%rdi)
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_arg_v8f32:
; AVX: # %bb.0:
; AVX-NEXT: vmovntps %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; VLX-LABEL: test_arg_v8f32:
; VLX: # %bb.0:
; VLX-NEXT: vmovntps %ymm0, (%rdi)
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
  store <8 x float> %arg, <8 x float>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_arg_v8i32(<8 x i32> %arg, <8 x i32>* %dst) {
; SSE-LABEL: test_arg_v8i32:
; SSE: # %bb.0:
; SSE-NEXT: movntps %xmm1, 16(%rdi)
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_arg_v8i32:
; AVX: # %bb.0:
; AVX-NEXT: vmovntps %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; VLX-LABEL: test_arg_v8i32:
; VLX: # %bb.0:
; VLX-NEXT: vmovntps %ymm0, (%rdi)
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
  store <8 x i32> %arg, <8 x i32>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_arg_v4f64(<4 x double> %arg, <4 x double>* %dst) {
; SSE-LABEL: test_arg_v4f64:
; SSE: # %bb.0:
; SSE-NEXT: movntps %xmm1, 16(%rdi)
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_arg_v4f64:
; AVX: # %bb.0:
; AVX-NEXT: vmovntps %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; VLX-LABEL: test_arg_v4f64:
; VLX: # %bb.0:
; VLX-NEXT: vmovntps %ymm0, (%rdi)
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
  store <4 x double> %arg, <4 x double>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_arg_v4i64(<4 x i64> %arg, <4 x i64>* %dst) {
; SSE-LABEL: test_arg_v4i64:
; SSE: # %bb.0:
; SSE-NEXT: movntps %xmm1, 16(%rdi)
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_arg_v4i64:
; AVX: # %bb.0:
; AVX-NEXT: vmovntps %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; VLX-LABEL: test_arg_v4i64:
; VLX: # %bb.0:
; VLX-NEXT: vmovntps %ymm0, (%rdi)
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
  store <4 x i64> %arg, <4 x i64>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_arg_v16i16(<16 x i16> %arg, <16 x i16>* %dst) {
; SSE-LABEL: test_arg_v16i16:
; SSE: # %bb.0:
; SSE-NEXT: movntps %xmm1, 16(%rdi)
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_arg_v16i16:
; AVX: # %bb.0:
; AVX-NEXT: vmovntps %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; VLX-LABEL: test_arg_v16i16:
; VLX: # %bb.0:
; VLX-NEXT: vmovntps %ymm0, (%rdi)
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
  store <16 x i16> %arg, <16 x i16>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_arg_v32i8(<32 x i8> %arg, <32 x i8>* %dst) {
; SSE-LABEL: test_arg_v32i8:
; SSE: # %bb.0:
; SSE-NEXT: movntps %xmm1, 16(%rdi)
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_arg_v32i8:
; AVX: # %bb.0:
; AVX-NEXT: vmovntps %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; VLX-LABEL: test_arg_v32i8:
; VLX: # %bb.0:
; VLX-NEXT: vmovntps %ymm0, (%rdi)
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
  store <32 x i8> %arg, <32 x i8>* %dst, align 32, !nontemporal !1
  ret void
}


; Now check that if the execution domain is trivially visible, we use it.
; We use an add to make the type survive all the way to the MOVNT.

define void @test_op_v4f32(<4 x float> %a, <4 x float> %b, <4 x float>* %dst) {
; SSE-LABEL: test_op_v4f32:
; SSE: # %bb.0:
; SSE-NEXT: addps %xmm1, %xmm0
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_op_v4f32:
; AVX: # %bb.0:
; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovntps %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_op_v4f32:
; VLX: # %bb.0:
; VLX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; VLX-NEXT: vmovntps %xmm0, (%rdi)
; VLX-NEXT: retq
  %r = fadd <4 x float> %a, %b
  store <4 x float> %r, <4 x float>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_op_v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32>* %dst) {
; SSE-LABEL: test_op_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: paddd %xmm1, %xmm0
; SSE-NEXT: movntdq %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_op_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovntdq %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_op_v4i32:
; VLX: # %bb.0:
; VLX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; VLX-NEXT: vmovntdq %xmm0, (%rdi)
; VLX-NEXT: retq
  %r = add <4 x i32> %a, %b
  store <4 x i32> %r, <4 x i32>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_op_v2f64(<2 x double> %a, <2 x double> %b, <2 x double>* %dst) {
; SSE-LABEL: test_op_v2f64:
; SSE: # %bb.0:
; SSE-NEXT: addpd %xmm1, %xmm0
; SSE-NEXT: movntpd %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_op_v2f64:
; AVX: # %bb.0:
; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovntpd %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_op_v2f64:
; VLX: # %bb.0:
; VLX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; VLX-NEXT: vmovntpd %xmm0, (%rdi)
; VLX-NEXT: retq
  %r = fadd <2 x double> %a, %b
  store <2 x double> %r, <2 x double>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_op_v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64>* %dst) {
; SSE-LABEL: test_op_v2i64:
; SSE: # %bb.0:
; SSE-NEXT: paddq %xmm1, %xmm0
; SSE-NEXT: movntdq %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_op_v2i64:
; AVX: # %bb.0:
; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovntdq %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_op_v2i64:
; VLX: # %bb.0:
; VLX-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; VLX-NEXT: vmovntdq %xmm0, (%rdi)
; VLX-NEXT: retq
  %r = add <2 x i64> %a, %b
  store <2 x i64> %r, <2 x i64>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_op_v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16>* %dst) {
; SSE-LABEL: test_op_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: paddw %xmm1, %xmm0
; SSE-NEXT: movntdq %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_op_v8i16:
; AVX: # %bb.0:
; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovntdq %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_op_v8i16:
; VLX: # %bb.0:
; VLX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; VLX-NEXT: vmovntdq %xmm0, (%rdi)
; VLX-NEXT: retq
  %r = add <8 x i16> %a, %b
  store <8 x i16> %r, <8 x i16>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_op_v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8>* %dst) {
; SSE-LABEL: test_op_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: paddb %xmm1, %xmm0
; SSE-NEXT: movntdq %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_op_v16i8:
; AVX: # %bb.0:
; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovntdq %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_op_v16i8:
; VLX: # %bb.0:
; VLX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; VLX-NEXT: vmovntdq %xmm0, (%rdi)
; VLX-NEXT: retq
  %r = add <16 x i8> %a, %b
  store <16 x i8> %r, <16 x i8>* %dst, align 16, !nontemporal !1
  ret void
}

; And now YMM versions.

define void @test_op_v8f32(<8 x float> %a, <8 x float> %b, <8 x float>* %dst) {
; SSE-LABEL: test_op_v8f32:
; SSE: # %bb.0:
; SSE-NEXT: addps %xmm2, %xmm0
; SSE-NEXT: addps %xmm3, %xmm1
; SSE-NEXT: movntps %xmm1, 16(%rdi)
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_op_v8f32:
; AVX: # %bb.0:
; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vmovntps %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; VLX-LABEL: test_op_v8f32:
; VLX: # %bb.0:
; VLX-NEXT: vaddps %ymm1, %ymm0, %ymm0
; VLX-NEXT: vmovntps %ymm0, (%rdi)
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
  %r = fadd <8 x float> %a, %b
  store <8 x float> %r, <8 x float>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_op_v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32>* %dst) {
; SSE-LABEL: test_op_v8i32:
; SSE: # %bb.0:
; SSE-NEXT: paddd %xmm2, %xmm0
; SSE-NEXT: paddd %xmm3, %xmm1
; SSE-NEXT: movntdq %xmm1, 16(%rdi)
; SSE-NEXT: movntdq %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX1-LABEL: test_op_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vmovntps %ymm0, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_op_v8i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovntdq %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; VLX-LABEL: test_op_v8i32:
; VLX: # %bb.0:
; VLX-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; VLX-NEXT: vmovntdq %ymm0, (%rdi)
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
  %r = add <8 x i32> %a, %b
  store <8 x i32> %r, <8 x i32>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_op_v4f64(<4 x double> %a, <4 x double> %b, <4 x double>* %dst) {
; SSE-LABEL: test_op_v4f64:
; SSE: # %bb.0:
; SSE-NEXT: addpd %xmm2, %xmm0
; SSE-NEXT: addpd %xmm3, %xmm1
; SSE-NEXT: movntpd %xmm1, 16(%rdi)
; SSE-NEXT: movntpd %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_op_v4f64:
; AVX: # %bb.0:
; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vmovntpd %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; VLX-LABEL: test_op_v4f64:
; VLX: # %bb.0:
; VLX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; VLX-NEXT: vmovntpd %ymm0, (%rdi)
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
  %r = fadd <4 x double> %a, %b
  store <4 x double> %r, <4 x double>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_op_v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64>* %dst) {
; SSE-LABEL: test_op_v4i64:
; SSE: # %bb.0:
; SSE-NEXT: paddq %xmm2, %xmm0
; SSE-NEXT: paddq %xmm3, %xmm1
; SSE-NEXT: movntdq %xmm1, 16(%rdi)
; SSE-NEXT: movntdq %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX1-LABEL: test_op_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vmovntps %ymm0, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_op_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovntdq %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; VLX-LABEL: test_op_v4i64:
; VLX: # %bb.0:
; VLX-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; VLX-NEXT: vmovntdq %ymm0, (%rdi)
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
  %r = add <4 x i64> %a, %b
  store <4 x i64> %r, <4 x i64>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_op_v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16>* %dst) {
; SSE-LABEL: test_op_v16i16:
; SSE: # %bb.0:
; SSE-NEXT: paddw %xmm2, %xmm0
; SSE-NEXT: paddw %xmm3, %xmm1
; SSE-NEXT: movntdq %xmm1, 16(%rdi)
; SSE-NEXT: movntdq %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX1-LABEL: test_op_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddw %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vmovntps %ymm0, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_op_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovntdq %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; VLX-LABEL: test_op_v16i16:
; VLX: # %bb.0:
; VLX-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; VLX-NEXT: vmovntdq %ymm0, (%rdi)
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
  %r = add <16 x i16> %a, %b
  store <16 x i16> %r, <16 x i16>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_op_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8>* %dst) {
; SSE-LABEL: test_op_v32i8:
; SSE: # %bb.0:
; SSE-NEXT: paddb %xmm2, %xmm0
; SSE-NEXT: paddb %xmm3, %xmm1
; SSE-NEXT: movntdq %xmm1, 16(%rdi)
; SSE-NEXT: movntdq %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX1-LABEL: test_op_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vmovntps %ymm0, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_op_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovntdq %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; VLX-LABEL: test_op_v32i8:
; VLX: # %bb.0:
; VLX-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; VLX-NEXT: vmovntdq %ymm0, (%rdi)
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
  %r = add <32 x i8> %a, %b
  store <32 x i8> %r, <32 x i8>* %dst, align 32, !nontemporal !1
  ret void
}

; 256-bit NT stores require 256-bit alignment.
; FIXME: For AVX, we could lower this to 2x movntps %xmm. Taken further, we
; could even scalarize to movnti when we have 1-alignment: nontemporal is
; probably always worth even some 20 instruction scalarization.
define void @test_unaligned_v8f32(<8 x float> %a, <8 x float> %b, <8 x float>* %dst) {
; SSE-LABEL: test_unaligned_v8f32:
; SSE: # %bb.0:
; SSE-NEXT: addps %xmm2, %xmm0
; SSE-NEXT: addps %xmm3, %xmm1
; SSE-NEXT: movntps %xmm1, 16(%rdi)
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_unaligned_v8f32:
; AVX: # %bb.0:
; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vmovups %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; VLX-LABEL: test_unaligned_v8f32:
; VLX: # %bb.0:
; VLX-NEXT: vaddps %ymm1, %ymm0, %ymm0
; VLX-NEXT: vmovups %ymm0, (%rdi)
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
  %r = fadd <8 x float> %a, %b
  store <8 x float> %r, <8 x float>* %dst, align 16, !nontemporal !1
  ret void
}

!1 = !{i32 1}