; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+sse2 < %s | FileCheck %s --check-prefix=ALL32 --check-prefix=SSE32
; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+sse2 < %s | FileCheck %s --check-prefix=ALL64 --check-prefix=SSE64
; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx < %s | FileCheck %s --check-prefix=ALL32 --check-prefix=AVX32 --check-prefix=AVXONLY32
; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx < %s | FileCheck %s --check-prefix=ALL64 --check-prefix=AVX64 --check-prefix=AVXONLY64
; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512f < %s | FileCheck %s --check-prefix=ALL32 --check-prefix=AVX32 --check-prefix=AVX51232 --check-prefix=KNL32
; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512f < %s | FileCheck %s --check-prefix=ALL64 --check-prefix=AVX64 --check-prefix=AVX51264 --check-prefix=KNL64
; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512vl,+avx512dq,+avx512bw < %s | FileCheck %s --check-prefix=ALL32 --check-prefix=AVX32 --check-prefix=AVX51232 --check-prefix=SKX32
; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512f,+avx512dq,+avx512bw < %s | FileCheck %s --check-prefix=ALL64 --check-prefix=AVX64 --check-prefix=AVX51264 --check-prefix=SKX64

define i32 @test_store_32(i32* nocapture %addr, i32 %value) {
; ALL32-LABEL: test_store_32:
; ALL32: # %bb.0: # %entry
; ALL32-NEXT: movl %esi, (%rdi)
; ALL32-NEXT: movl %esi, %eax
; ALL32-NEXT: retq
;
; ALL64-LABEL: test_store_32:
; ALL64: # %bb.0: # %entry
; ALL64-NEXT: movl {{[0-9]+}}(%esp), %eax
; ALL64-NEXT: movl {{[0-9]+}}(%esp), %ecx
; ALL64-NEXT: movl %eax, (%ecx)
; ALL64-NEXT: retl
entry:
  store i32 %value, i32* %addr, align 1
  ret i32 %value
}

define i16 @test_store_16(i16* nocapture %addr, i16 %value) {
; ALL32-LABEL: test_store_16:
; ALL32: # %bb.0: # %entry
; ALL32-NEXT: movw %si, (%rdi)
; ALL32-NEXT: movl %esi, %eax
; ALL32-NEXT: retq
;
; ALL64-LABEL: test_store_16:
; ALL64: # %bb.0: # %entry
; ALL64-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; ALL64-NEXT: movl {{[0-9]+}}(%esp), %ecx
; ALL64-NEXT: movw %ax, (%ecx)
; ALL64-NEXT: retl
entry:
  store i16 %value, i16* %addr, align 1
  ret i16 %value
}

define <4 x i32> @test_store_4xi32(<4 x i32>* nocapture %addr, <4 x i32> %value, <4 x i32> %value2) {
; SSE32-LABEL: test_store_4xi32:
; SSE32: # %bb.0:
; SSE32-NEXT: paddd %xmm1, %xmm0
; SSE32-NEXT: movdqu %xmm0, (%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_4xi32:
; SSE64: # %bb.0:
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: paddd %xmm1, %xmm0
; SSE64-NEXT: movdqu %xmm0, (%eax)
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_4xi32:
; AVX32: # %bb.0:
; AVX32-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX32-NEXT: vmovdqu %xmm0, (%rdi)
; AVX32-NEXT: retq
;
; AVX64-LABEL: test_store_4xi32:
; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX64-NEXT: vmovdqu %xmm0, (%eax)
; AVX64-NEXT: retl
  %foo = add <4 x i32> %value, %value2 ; to force integer type on store
  store <4 x i32> %foo, <4 x i32>* %addr, align 1
  ret <4 x i32> %foo
}

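; The *_aligned tests below store with the type's natural alignment, so fast-isel
; is expected to select the aligned move forms (movaps/movdqa/movapd and their
; VEX/EVEX counterparts); the unaligned tests use `align 1` and expect
; movups/movdqu/movupd instead.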
define <4 x i32> @test_store_4xi32_aligned(<4 x i32>* nocapture %addr, <4 x i32> %value, <4 x i32> %value2) {
; SSE32-LABEL: test_store_4xi32_aligned:
; SSE32: # %bb.0:
; SSE32-NEXT: paddd %xmm1, %xmm0
; SSE32-NEXT: movdqa %xmm0, (%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_4xi32_aligned:
; SSE64: # %bb.0:
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: paddd %xmm1, %xmm0
; SSE64-NEXT: movdqa %xmm0, (%eax)
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_4xi32_aligned:
; AVX32: # %bb.0:
; AVX32-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX32-NEXT: vmovdqa %xmm0, (%rdi)
; AVX32-NEXT: retq
;
; AVX64-LABEL: test_store_4xi32_aligned:
; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX64-NEXT: vmovdqa %xmm0, (%eax)
; AVX64-NEXT: retl
  %foo = add <4 x i32> %value, %value2 ; to force integer type on store
  store <4 x i32> %foo, <4 x i32>* %addr, align 16
  ret <4 x i32> %foo
}

define <4 x float> @test_store_4xf32(<4 x float>* nocapture %addr, <4 x float> %value) {
; SSE32-LABEL: test_store_4xf32:
; SSE32: # %bb.0:
; SSE32-NEXT: movups %xmm0, (%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_4xf32:
; SSE64: # %bb.0:
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: movups %xmm0, (%eax)
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_4xf32:
; AVX32: # %bb.0:
; AVX32-NEXT: vmovups %xmm0, (%rdi)
; AVX32-NEXT: retq
;
; AVX64-LABEL: test_store_4xf32:
; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vmovups %xmm0, (%eax)
; AVX64-NEXT: retl
  store <4 x float> %value, <4 x float>* %addr, align 1
  ret <4 x float> %value
}

define <4 x float> @test_store_4xf32_aligned(<4 x float>* nocapture %addr, <4 x float> %value) {
; SSE32-LABEL: test_store_4xf32_aligned:
; SSE32: # %bb.0:
; SSE32-NEXT: movaps %xmm0, (%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_4xf32_aligned:
; SSE64: # %bb.0:
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: movaps %xmm0, (%eax)
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_4xf32_aligned:
; AVX32: # %bb.0:
; AVX32-NEXT: vmovaps %xmm0, (%rdi)
; AVX32-NEXT: retq
;
; AVX64-LABEL: test_store_4xf32_aligned:
; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vmovaps %xmm0, (%eax)
; AVX64-NEXT: retl
  store <4 x float> %value, <4 x float>* %addr, align 16
  ret <4 x float> %value
}

define <2 x double> @test_store_2xf64(<2 x double>* nocapture %addr, <2 x double> %value, <2 x double> %value2) {
; SSE32-LABEL: test_store_2xf64:
; SSE32: # %bb.0:
; SSE32-NEXT: addpd %xmm1, %xmm0
; SSE32-NEXT: movupd %xmm0, (%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_2xf64:
; SSE64: # %bb.0:
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: addpd %xmm1, %xmm0
; SSE64-NEXT: movupd %xmm0, (%eax)
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_2xf64:
; AVX32: # %bb.0:
; AVX32-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX32-NEXT: vmovupd %xmm0, (%rdi)
; AVX32-NEXT: retq
;
; AVX64-LABEL: test_store_2xf64:
; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX64-NEXT: vmovupd %xmm0, (%eax)
; AVX64-NEXT: retl
  %foo = fadd <2 x double> %value, %value2 ; to force double type on store
  store <2 x double> %foo, <2 x double>* %addr, align 1
  ret <2 x double> %foo
}

define <2 x double> @test_store_2xf64_aligned(<2 x double>* nocapture %addr, <2 x double> %value, <2 x double> %value2) {
; SSE32-LABEL: test_store_2xf64_aligned:
; SSE32: # %bb.0:
; SSE32-NEXT: addpd %xmm1, %xmm0
; SSE32-NEXT: movapd %xmm0, (%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_2xf64_aligned:
; SSE64: # %bb.0:
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: addpd %xmm1, %xmm0
; SSE64-NEXT: movapd %xmm0, (%eax)
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_2xf64_aligned:
; AVX32: # %bb.0:
; AVX32-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX32-NEXT: vmovapd %xmm0, (%rdi)
; AVX32-NEXT: retq
;
; AVX64-LABEL: test_store_2xf64_aligned:
; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX64-NEXT: vmovapd %xmm0, (%eax)
; AVX64-NEXT: retl
  %foo = fadd <2 x double> %value, %value2 ; to force double type on store
  store <2 x double> %foo, <2 x double>* %addr, align 16
  ret <2 x double> %foo
}

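; For the 256-bit stores below, SSE2 is expected to split the access into two
; 128-bit moves, while AVX and AVX-512 use a single ymm move.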
define <8 x i32> @test_store_8xi32(<8 x i32>* nocapture %addr, <8 x i32> %value) {
; SSE32-LABEL: test_store_8xi32:
; SSE32: # %bb.0:
; SSE32-NEXT: movups %xmm0, (%rdi)
; SSE32-NEXT: movups %xmm1, 16(%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_8xi32:
; SSE64: # %bb.0:
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: movups %xmm0, (%eax)
; SSE64-NEXT: movups %xmm1, 16(%eax)
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_8xi32:
; AVX32: # %bb.0:
; AVX32-NEXT: vmovups %ymm0, (%rdi)
; AVX32-NEXT: retq
;
; AVX64-LABEL: test_store_8xi32:
; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vmovups %ymm0, (%eax)
; AVX64-NEXT: retl
  store <8 x i32> %value, <8 x i32>* %addr, align 1
  ret <8 x i32> %value
}

define <8 x i32> @test_store_8xi32_aligned(<8 x i32>* nocapture %addr, <8 x i32> %value) {
; SSE32-LABEL: test_store_8xi32_aligned:
; SSE32: # %bb.0:
; SSE32-NEXT: movaps %xmm0, (%rdi)
; SSE32-NEXT: movaps %xmm1, 16(%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_8xi32_aligned:
; SSE64: # %bb.0:
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: movaps %xmm0, (%eax)
; SSE64-NEXT: movaps %xmm1, 16(%eax)
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_8xi32_aligned:
; AVX32: # %bb.0:
; AVX32-NEXT: vmovaps %ymm0, (%rdi)
; AVX32-NEXT: retq
;
; AVX64-LABEL: test_store_8xi32_aligned:
; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vmovaps %ymm0, (%eax)
; AVX64-NEXT: retl
  store <8 x i32> %value, <8 x i32>* %addr, align 32
  ret <8 x i32> %value
}

define <8 x float> @test_store_8xf32(<8 x float>* nocapture %addr, <8 x float> %value) {
; SSE32-LABEL: test_store_8xf32:
; SSE32: # %bb.0:
; SSE32-NEXT: movups %xmm0, (%rdi)
; SSE32-NEXT: movups %xmm1, 16(%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_8xf32:
; SSE64: # %bb.0:
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: movups %xmm0, (%eax)
; SSE64-NEXT: movups %xmm1, 16(%eax)
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_8xf32:
; AVX32: # %bb.0:
; AVX32-NEXT: vmovups %ymm0, (%rdi)
; AVX32-NEXT: retq
;
; AVX64-LABEL: test_store_8xf32:
; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vmovups %ymm0, (%eax)
; AVX64-NEXT: retl
  store <8 x float> %value, <8 x float>* %addr, align 1
  ret <8 x float> %value
}

define <8 x float> @test_store_8xf32_aligned(<8 x float>* nocapture %addr, <8 x float> %value) {
; SSE32-LABEL: test_store_8xf32_aligned:
; SSE32: # %bb.0:
; SSE32-NEXT: movaps %xmm0, (%rdi)
; SSE32-NEXT: movaps %xmm1, 16(%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_8xf32_aligned:
; SSE64: # %bb.0:
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: movaps %xmm0, (%eax)
; SSE64-NEXT: movaps %xmm1, 16(%eax)
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_8xf32_aligned:
; AVX32: # %bb.0:
; AVX32-NEXT: vmovaps %ymm0, (%rdi)
; AVX32-NEXT: retq
;
; AVX64-LABEL: test_store_8xf32_aligned:
; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vmovaps %ymm0, (%eax)
; AVX64-NEXT: retl
  store <8 x float> %value, <8 x float>* %addr, align 32
  ret <8 x float> %value
}

define <4 x double> @test_store_4xf64(<4 x double>* nocapture %addr, <4 x double> %value, <4 x double> %value2) {
; SSE32-LABEL: test_store_4xf64:
; SSE32: # %bb.0:
; SSE32-NEXT: addpd %xmm3, %xmm1
; SSE32-NEXT: addpd %xmm2, %xmm0
; SSE32-NEXT: movupd %xmm0, (%rdi)
; SSE32-NEXT: movupd %xmm1, 16(%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_4xf64:
; SSE64: # %bb.0:
; SSE64-NEXT: subl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 16
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: addpd {{[0-9]+}}(%esp), %xmm1
; SSE64-NEXT: addpd %xmm2, %xmm0
; SSE64-NEXT: movupd %xmm0, (%eax)
; SSE64-NEXT: movupd %xmm1, 16(%eax)
; SSE64-NEXT: addl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 4
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_4xf64:
; AVX32: # %bb.0:
; AVX32-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX32-NEXT: vmovupd %ymm0, (%rdi)
; AVX32-NEXT: retq
;
; AVX64-LABEL: test_store_4xf64:
; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX64-NEXT: vmovupd %ymm0, (%eax)
; AVX64-NEXT: retl
  %foo = fadd <4 x double> %value, %value2 ; to force double type on store
  store <4 x double> %foo, <4 x double>* %addr, align 1
  ret <4 x double> %foo
}

define <4 x double> @test_store_4xf64_aligned(<4 x double>* nocapture %addr, <4 x double> %value, <4 x double> %value2) {
; SSE32-LABEL: test_store_4xf64_aligned:
; SSE32: # %bb.0:
; SSE32-NEXT: addpd %xmm3, %xmm1
; SSE32-NEXT: addpd %xmm2, %xmm0
; SSE32-NEXT: movapd %xmm0, (%rdi)
; SSE32-NEXT: movapd %xmm1, 16(%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_4xf64_aligned:
; SSE64: # %bb.0:
; SSE64-NEXT: subl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 16
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: addpd {{[0-9]+}}(%esp), %xmm1
; SSE64-NEXT: addpd %xmm2, %xmm0
; SSE64-NEXT: movapd %xmm0, (%eax)
; SSE64-NEXT: movapd %xmm1, 16(%eax)
; SSE64-NEXT: addl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 4
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_4xf64_aligned:
; AVX32: # %bb.0:
; AVX32-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX32-NEXT: vmovapd %ymm0, (%rdi)
; AVX32-NEXT: retq
;
; AVX64-LABEL: test_store_4xf64_aligned:
; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX64-NEXT: vmovapd %ymm0, (%eax)
; AVX64-NEXT: retl
  %foo = fadd <4 x double> %value, %value2 ; to force double type on store
  store <4 x double> %foo, <4 x double>* %addr, align 32
  ret <4 x double> %foo
}

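; For the 512-bit stores below, SSE2 is expected to split the access into four
; 128-bit moves, AVX without AVX-512 into two ymm moves, and AVX-512 into a
; single zmm move; the _aligned variants again select the aligned forms.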
define <16 x i32> @test_store_16xi32(<16 x i32>* nocapture %addr, <16 x i32> %value) {
; SSE32-LABEL: test_store_16xi32:
; SSE32: # %bb.0:
; SSE32-NEXT: movups %xmm0, (%rdi)
; SSE32-NEXT: movups %xmm1, 16(%rdi)
; SSE32-NEXT: movups %xmm2, 32(%rdi)
; SSE32-NEXT: movups %xmm3, 48(%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_16xi32:
; SSE64: # %bb.0:
; SSE64-NEXT: subl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 16
; SSE64-NEXT: movaps {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: movups %xmm0, (%eax)
; SSE64-NEXT: movups %xmm1, 16(%eax)
; SSE64-NEXT: movups %xmm2, 32(%eax)
; SSE64-NEXT: movups %xmm3, 48(%eax)
; SSE64-NEXT: addl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 4
; SSE64-NEXT: retl
;
; AVXONLY32-LABEL: test_store_16xi32:
; AVXONLY32: # %bb.0:
; AVXONLY32-NEXT: vmovups %ymm0, (%rdi)
; AVXONLY32-NEXT: vmovups %ymm1, 32(%rdi)
; AVXONLY32-NEXT: retq
;
; AVXONLY64-LABEL: test_store_16xi32:
; AVXONLY64: # %bb.0:
; AVXONLY64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVXONLY64-NEXT: vmovups %ymm0, (%eax)
; AVXONLY64-NEXT: vmovups %ymm1, 32(%eax)
; AVXONLY64-NEXT: retl
;
; AVX51232-LABEL: test_store_16xi32:
; AVX51232: # %bb.0:
; AVX51232-NEXT: vmovups %zmm0, (%rdi)
; AVX51232-NEXT: retq
;
; AVX51264-LABEL: test_store_16xi32:
; AVX51264: # %bb.0:
; AVX51264-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT: vmovups %zmm0, (%eax)
; AVX51264-NEXT: retl
  store <16 x i32> %value, <16 x i32>* %addr, align 1
  ret <16 x i32> %value
}

define <16 x i32> @test_store_16xi32_aligned(<16 x i32>* nocapture %addr, <16 x i32> %value) {
; SSE32-LABEL: test_store_16xi32_aligned:
; SSE32: # %bb.0:
; SSE32-NEXT: movaps %xmm0, (%rdi)
; SSE32-NEXT: movaps %xmm1, 16(%rdi)
; SSE32-NEXT: movaps %xmm2, 32(%rdi)
; SSE32-NEXT: movaps %xmm3, 48(%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_16xi32_aligned:
; SSE64: # %bb.0:
; SSE64-NEXT: subl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 16
; SSE64-NEXT: movaps {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: movaps %xmm0, (%eax)
; SSE64-NEXT: movaps %xmm1, 16(%eax)
; SSE64-NEXT: movaps %xmm2, 32(%eax)
; SSE64-NEXT: movaps %xmm3, 48(%eax)
; SSE64-NEXT: addl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 4
; SSE64-NEXT: retl
;
; AVXONLY32-LABEL: test_store_16xi32_aligned:
; AVXONLY32: # %bb.0:
; AVXONLY32-NEXT: vmovaps %ymm0, (%rdi)
; AVXONLY32-NEXT: vmovaps %ymm1, 32(%rdi)
; AVXONLY32-NEXT: retq
;
; AVXONLY64-LABEL: test_store_16xi32_aligned:
; AVXONLY64: # %bb.0:
; AVXONLY64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVXONLY64-NEXT: vmovaps %ymm0, (%eax)
; AVXONLY64-NEXT: vmovaps %ymm1, 32(%eax)
; AVXONLY64-NEXT: retl
;
; AVX51232-LABEL: test_store_16xi32_aligned:
; AVX51232: # %bb.0:
; AVX51232-NEXT: vmovaps %zmm0, (%rdi)
; AVX51232-NEXT: retq
;
; AVX51264-LABEL: test_store_16xi32_aligned:
; AVX51264: # %bb.0:
; AVX51264-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT: vmovaps %zmm0, (%eax)
; AVX51264-NEXT: retl
  store <16 x i32> %value, <16 x i32>* %addr, align 64
  ret <16 x i32> %value
}

define <16 x float> @test_store_16xf32(<16 x float>* nocapture %addr, <16 x float> %value) {
; SSE32-LABEL: test_store_16xf32:
; SSE32: # %bb.0:
; SSE32-NEXT: movups %xmm0, (%rdi)
; SSE32-NEXT: movups %xmm1, 16(%rdi)
; SSE32-NEXT: movups %xmm2, 32(%rdi)
; SSE32-NEXT: movups %xmm3, 48(%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_16xf32:
; SSE64: # %bb.0:
; SSE64-NEXT: subl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 16
; SSE64-NEXT: movaps {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: movups %xmm0, (%eax)
; SSE64-NEXT: movups %xmm1, 16(%eax)
; SSE64-NEXT: movups %xmm2, 32(%eax)
; SSE64-NEXT: movups %xmm3, 48(%eax)
; SSE64-NEXT: addl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 4
; SSE64-NEXT: retl
;
; AVXONLY32-LABEL: test_store_16xf32:
; AVXONLY32: # %bb.0:
; AVXONLY32-NEXT: vmovups %ymm0, (%rdi)
; AVXONLY32-NEXT: vmovups %ymm1, 32(%rdi)
; AVXONLY32-NEXT: retq
;
; AVXONLY64-LABEL: test_store_16xf32:
; AVXONLY64: # %bb.0:
; AVXONLY64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVXONLY64-NEXT: vmovups %ymm0, (%eax)
; AVXONLY64-NEXT: vmovups %ymm1, 32(%eax)
; AVXONLY64-NEXT: retl
;
; AVX51232-LABEL: test_store_16xf32:
; AVX51232: # %bb.0:
; AVX51232-NEXT: vmovups %zmm0, (%rdi)
; AVX51232-NEXT: retq
;
; AVX51264-LABEL: test_store_16xf32:
; AVX51264: # %bb.0:
; AVX51264-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT: vmovups %zmm0, (%eax)
; AVX51264-NEXT: retl
  store <16 x float> %value, <16 x float>* %addr, align 1
  ret <16 x float> %value
}

define <16 x float> @test_store_16xf32_aligned(<16 x float>* nocapture %addr, <16 x float> %value) {
; SSE32-LABEL: test_store_16xf32_aligned:
; SSE32: # %bb.0:
; SSE32-NEXT: movaps %xmm0, (%rdi)
; SSE32-NEXT: movaps %xmm1, 16(%rdi)
; SSE32-NEXT: movaps %xmm2, 32(%rdi)
; SSE32-NEXT: movaps %xmm3, 48(%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_16xf32_aligned:
; SSE64: # %bb.0:
; SSE64-NEXT: subl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 16
; SSE64-NEXT: movaps {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: movaps %xmm0, (%eax)
; SSE64-NEXT: movaps %xmm1, 16(%eax)
; SSE64-NEXT: movaps %xmm2, 32(%eax)
; SSE64-NEXT: movaps %xmm3, 48(%eax)
; SSE64-NEXT: addl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 4
; SSE64-NEXT: retl
;
; AVXONLY32-LABEL: test_store_16xf32_aligned:
; AVXONLY32: # %bb.0:
; AVXONLY32-NEXT: vmovaps %ymm0, (%rdi)
; AVXONLY32-NEXT: vmovaps %ymm1, 32(%rdi)
; AVXONLY32-NEXT: retq
;
; AVXONLY64-LABEL: test_store_16xf32_aligned:
; AVXONLY64: # %bb.0:
; AVXONLY64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVXONLY64-NEXT: vmovaps %ymm0, (%eax)
; AVXONLY64-NEXT: vmovaps %ymm1, 32(%eax)
; AVXONLY64-NEXT: retl
;
; AVX51232-LABEL: test_store_16xf32_aligned:
; AVX51232: # %bb.0:
; AVX51232-NEXT: vmovaps %zmm0, (%rdi)
; AVX51232-NEXT: retq
;
; AVX51264-LABEL: test_store_16xf32_aligned:
; AVX51264: # %bb.0:
; AVX51264-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT: vmovaps %zmm0, (%eax)
; AVX51264-NEXT: retl
  store <16 x float> %value, <16 x float>* %addr, align 64
  ret <16 x float> %value
}

define <8 x double> @test_store_8xf64(<8 x double>* nocapture %addr, <8 x double> %value, <8 x double> %value2) {
; SSE32-LABEL: test_store_8xf64:
; SSE32: # %bb.0:
; SSE32-NEXT: addpd %xmm7, %xmm3
; SSE32-NEXT: addpd %xmm6, %xmm2
; SSE32-NEXT: addpd %xmm5, %xmm1
; SSE32-NEXT: addpd %xmm4, %xmm0
; SSE32-NEXT: movupd %xmm0, (%rdi)
; SSE32-NEXT: movupd %xmm1, 16(%rdi)
; SSE32-NEXT: movupd %xmm2, 32(%rdi)
; SSE32-NEXT: movupd %xmm3, 48(%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_8xf64:
; SSE64: # %bb.0:
; SSE64-NEXT: subl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 16
; SSE64-NEXT: movapd {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: addpd {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT: addpd {{[0-9]+}}(%esp), %xmm2
; SSE64-NEXT: addpd {{[0-9]+}}(%esp), %xmm1
; SSE64-NEXT: addpd {{[0-9]+}}(%esp), %xmm0
; SSE64-NEXT: movupd %xmm0, (%eax)
; SSE64-NEXT: movupd %xmm1, 16(%eax)
; SSE64-NEXT: movupd %xmm2, 32(%eax)
; SSE64-NEXT: movupd %xmm3, 48(%eax)
; SSE64-NEXT: addl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 4
; SSE64-NEXT: retl
;
; AVXONLY32-LABEL: test_store_8xf64:
; AVXONLY32: # %bb.0:
; AVXONLY32-NEXT: vaddpd %ymm3, %ymm1, %ymm1
; AVXONLY32-NEXT: vaddpd %ymm2, %ymm0, %ymm0
; AVXONLY32-NEXT: vmovupd %ymm0, (%rdi)
; AVXONLY32-NEXT: vmovupd %ymm1, 32(%rdi)
; AVXONLY32-NEXT: retq
;
; AVXONLY64-LABEL: test_store_8xf64:
; AVXONLY64: # %bb.0:
; AVXONLY64-NEXT: pushl %ebp
; AVXONLY64-NEXT: .cfi_def_cfa_offset 8
; AVXONLY64-NEXT: .cfi_offset %ebp, -8
; AVXONLY64-NEXT: movl %esp, %ebp
; AVXONLY64-NEXT: .cfi_def_cfa_register %ebp
; AVXONLY64-NEXT: andl $-32, %esp
; AVXONLY64-NEXT: subl $32, %esp
; AVXONLY64-NEXT: movl 8(%ebp), %eax
; AVXONLY64-NEXT: vaddpd 40(%ebp), %ymm1, %ymm1
; AVXONLY64-NEXT: vaddpd %ymm2, %ymm0, %ymm0
; AVXONLY64-NEXT: vmovupd %ymm0, (%eax)
; AVXONLY64-NEXT: vmovupd %ymm1, 32(%eax)
; AVXONLY64-NEXT: movl %ebp, %esp
; AVXONLY64-NEXT: popl %ebp
; AVXONLY64-NEXT: .cfi_def_cfa %esp, 4
; AVXONLY64-NEXT: retl
;
; AVX51232-LABEL: test_store_8xf64:
; AVX51232: # %bb.0:
; AVX51232-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX51232-NEXT: vmovupd %zmm0, (%rdi)
; AVX51232-NEXT: retq
;
; AVX51264-LABEL: test_store_8xf64:
; AVX51264: # %bb.0:
; AVX51264-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX51264-NEXT: vmovupd %zmm0, (%eax)
; AVX51264-NEXT: retl
  %foo = fadd <8 x double> %value, %value2 ; to force double type on store
  store <8 x double> %foo, <8 x double>* %addr, align 1
  ret <8 x double> %foo
}

define <8 x double> @test_store_8xf64_aligned(<8 x double>* nocapture %addr, <8 x double> %value, <8 x double> %value2) {
; SSE32-LABEL: test_store_8xf64_aligned:
; SSE32: # %bb.0:
; SSE32-NEXT: addpd %xmm7, %xmm3
; SSE32-NEXT: addpd %xmm6, %xmm2
; SSE32-NEXT: addpd %xmm5, %xmm1
; SSE32-NEXT: addpd %xmm4, %xmm0
; SSE32-NEXT: movapd %xmm0, (%rdi)
; SSE32-NEXT: movapd %xmm1, 16(%rdi)
; SSE32-NEXT: movapd %xmm2, 32(%rdi)
; SSE32-NEXT: movapd %xmm3, 48(%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_8xf64_aligned:
; SSE64: # %bb.0:
; SSE64-NEXT: subl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 16
; SSE64-NEXT: movapd {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: addpd {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT: addpd {{[0-9]+}}(%esp), %xmm2
; SSE64-NEXT: addpd {{[0-9]+}}(%esp), %xmm1
; SSE64-NEXT: addpd {{[0-9]+}}(%esp), %xmm0
; SSE64-NEXT: movapd %xmm0, (%eax)
; SSE64-NEXT: movapd %xmm1, 16(%eax)
; SSE64-NEXT: movapd %xmm2, 32(%eax)
; SSE64-NEXT: movapd %xmm3, 48(%eax)
; SSE64-NEXT: addl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 4
; SSE64-NEXT: retl
;
; AVXONLY32-LABEL: test_store_8xf64_aligned:
; AVXONLY32: # %bb.0:
; AVXONLY32-NEXT: vaddpd %ymm3, %ymm1, %ymm1
; AVXONLY32-NEXT: vaddpd %ymm2, %ymm0, %ymm0
; AVXONLY32-NEXT: vmovapd %ymm0, (%rdi)
; AVXONLY32-NEXT: vmovapd %ymm1, 32(%rdi)
; AVXONLY32-NEXT: retq
;
; AVXONLY64-LABEL: test_store_8xf64_aligned:
; AVXONLY64: # %bb.0:
; AVXONLY64-NEXT: pushl %ebp
; AVXONLY64-NEXT: .cfi_def_cfa_offset 8
; AVXONLY64-NEXT: .cfi_offset %ebp, -8
; AVXONLY64-NEXT: movl %esp, %ebp
; AVXONLY64-NEXT: .cfi_def_cfa_register %ebp
; AVXONLY64-NEXT: andl $-32, %esp
; AVXONLY64-NEXT: subl $32, %esp
; AVXONLY64-NEXT: movl 8(%ebp), %eax
; AVXONLY64-NEXT: vaddpd 40(%ebp), %ymm1, %ymm1
; AVXONLY64-NEXT: vaddpd %ymm2, %ymm0, %ymm0
; AVXONLY64-NEXT: vmovapd %ymm0, (%eax)
; AVXONLY64-NEXT: vmovapd %ymm1, 32(%eax)
; AVXONLY64-NEXT: movl %ebp, %esp
; AVXONLY64-NEXT: popl %ebp
; AVXONLY64-NEXT: .cfi_def_cfa %esp, 4
; AVXONLY64-NEXT: retl
;
; AVX51232-LABEL: test_store_8xf64_aligned:
; AVX51232: # %bb.0:
; AVX51232-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX51232-NEXT: vmovapd %zmm0, (%rdi)
; AVX51232-NEXT: retq
;
; AVX51264-LABEL: test_store_8xf64_aligned:
; AVX51264: # %bb.0:
; AVX51264-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX51264-NEXT: vmovapd %zmm0, (%eax)
; AVX51264-NEXT: retl
  %foo = fadd <8 x double> %value, %value2 ; to force double type on store
  store <8 x double> %foo, <8 x double>* %addr, align 64
  ret <8 x double> %foo
}