; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+sse2 < %s | FileCheck %s --check-prefixes=ALL32,SSE32
; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+sse2 < %s | FileCheck %s --check-prefixes=ALL64,SSE64
; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx < %s | FileCheck %s --check-prefixes=ALL32,AVX32,AVXONLY32
; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx < %s | FileCheck %s --check-prefixes=ALL64,AVX64,AVXONLY64
; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512f < %s | FileCheck %s --check-prefixes=ALL32,AVX32,AVX51232
; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512f < %s | FileCheck %s --check-prefixes=ALL64,AVX64,AVX51264
; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512vl,+avx512dq,+avx512bw < %s | FileCheck %s --check-prefixes=ALL32,AVX32,AVX51232
; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512f,+avx512dq,+avx512bw < %s | FileCheck %s --check-prefixes=ALL64,AVX64,AVX51264

define i32 @test_store_32(i32* nocapture %addr, i32 %value) {
; ALL32-LABEL: test_store_32:
; ALL32: # %bb.0: # %entry
; ALL32-NEXT: movl %esi, %eax
; ALL32-NEXT: movl %esi, (%rdi)
; ALL32-NEXT: retq
;
; ALL64-LABEL: test_store_32:
; ALL64: # %bb.0: # %entry
; ALL64-NEXT: movl {{[0-9]+}}(%esp), %eax
; ALL64-NEXT: movl {{[0-9]+}}(%esp), %ecx
; ALL64-NEXT: movl %eax, (%ecx)
; ALL64-NEXT: retl
entry:
  store i32 %value, i32* %addr, align 1
  ret i32 %value
}

define i16 @test_store_16(i16* nocapture %addr, i16 %value) {
; ALL32-LABEL: test_store_16:
; ALL32: # %bb.0: # %entry
; ALL32-NEXT: movl %esi, %eax
; ALL32-NEXT: movw %ax, (%rdi)
; ALL32-NEXT: # kill: def $ax killed $ax killed $eax
; ALL32-NEXT: retq
;
; ALL64-LABEL: test_store_16:
; ALL64: # %bb.0: # %entry
; ALL64-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; ALL64-NEXT: movl {{[0-9]+}}(%esp), %ecx
; ALL64-NEXT: movw %ax, (%ecx)
; ALL64-NEXT: retl
entry:
  store i16 %value, i16* %addr, align 1
  ret i16 %value
}

define <4 x i32> @test_store_4xi32(<4 x i32>* nocapture %addr, <4 x i32> %value, <4 x i32> %value2) {
; SSE32-LABEL: test_store_4xi32:
; SSE32: # %bb.0:
; SSE32-NEXT: paddd %xmm1, %xmm0
; SSE32-NEXT: movdqu %xmm0, (%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_4xi32:
; SSE64: # %bb.0:
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: paddd %xmm1, %xmm0
; SSE64-NEXT: movdqu %xmm0, (%eax)
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_4xi32:
; AVX32: # %bb.0:
; AVX32-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX32-NEXT: vmovdqu %xmm0, (%rdi)
; AVX32-NEXT: retq
;
; AVX64-LABEL: test_store_4xi32:
; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX64-NEXT: vmovdqu %xmm0, (%eax)
; AVX64-NEXT: retl
  %foo = add <4 x i32> %value, %value2 ; to force integer type on store
  store <4 x i32> %foo, <4 x i32>* %addr, align 1
  ret <4 x i32> %foo
}

define <4 x i32> @test_store_4xi32_aligned(<4 x i32>* nocapture %addr, <4 x i32> %value, <4 x i32> %value2) {
; SSE32-LABEL: test_store_4xi32_aligned:
; SSE32: # %bb.0:
; SSE32-NEXT: paddd %xmm1, %xmm0
; SSE32-NEXT: movdqa %xmm0, (%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_4xi32_aligned:
; SSE64: # %bb.0:
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: paddd %xmm1, %xmm0
; SSE64-NEXT: movdqa %xmm0, (%eax)
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_4xi32_aligned:
; AVX32: # %bb.0:
; AVX32-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX32-NEXT: vmovdqa %xmm0, (%rdi)
; AVX32-NEXT: retq
;
; AVX64-LABEL: test_store_4xi32_aligned:
; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX64-NEXT: vmovdqa %xmm0, (%eax)
; AVX64-NEXT: retl
  %foo = add <4 x i32> %value, %value2 ; to force integer type on store
  store <4 x i32> %foo, <4 x i32>* %addr, align 16
  ret <4 x i32> %foo
}

define <4 x float> @test_store_4xf32(<4 x float>* nocapture %addr, <4 x float> %value) {
; SSE32-LABEL: test_store_4xf32:
; SSE32: # %bb.0:
; SSE32-NEXT: movups %xmm0, (%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_4xf32:
; SSE64: # %bb.0:
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: movups %xmm0, (%eax)
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_4xf32:
; AVX32: # %bb.0:
; AVX32-NEXT: vmovups %xmm0, (%rdi)
; AVX32-NEXT: retq
;
; AVX64-LABEL: test_store_4xf32:
; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vmovups %xmm0, (%eax)
; AVX64-NEXT: retl
  store <4 x float> %value, <4 x float>* %addr, align 1
  ret <4 x float> %value
}

define <4 x float> @test_store_4xf32_aligned(<4 x float>* nocapture %addr, <4 x float> %value) {
; SSE32-LABEL: test_store_4xf32_aligned:
; SSE32: # %bb.0:
; SSE32-NEXT: movaps %xmm0, (%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_4xf32_aligned:
; SSE64: # %bb.0:
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: movaps %xmm0, (%eax)
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_4xf32_aligned:
; AVX32: # %bb.0:
; AVX32-NEXT: vmovaps %xmm0, (%rdi)
; AVX32-NEXT: retq
;
; AVX64-LABEL: test_store_4xf32_aligned:
; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vmovaps %xmm0, (%eax)
; AVX64-NEXT: retl
  store <4 x float> %value, <4 x float>* %addr, align 16
  ret <4 x float> %value
}

define <2 x double> @test_store_2xf64(<2 x double>* nocapture %addr, <2 x double> %value, <2 x double> %value2) {
; SSE32-LABEL: test_store_2xf64:
; SSE32: # %bb.0:
; SSE32-NEXT: addpd %xmm1, %xmm0
; SSE32-NEXT: movupd %xmm0, (%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_2xf64:
; SSE64: # %bb.0:
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: addpd %xmm1, %xmm0
; SSE64-NEXT: movupd %xmm0, (%eax)
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_2xf64:
; AVX32: # %bb.0:
; AVX32-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX32-NEXT: vmovupd %xmm0, (%rdi)
; AVX32-NEXT: retq
;
; AVX64-LABEL: test_store_2xf64:
; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX64-NEXT: vmovupd %xmm0, (%eax)
; AVX64-NEXT: retl
  %foo = fadd <2 x double> %value, %value2 ; to force double type on store
  store <2 x double> %foo, <2 x double>* %addr, align 1
  ret <2 x double> %foo
}

define <2 x double> @test_store_2xf64_aligned(<2 x double>* nocapture %addr, <2 x double> %value, <2 x double> %value2) {
; SSE32-LABEL: test_store_2xf64_aligned:
; SSE32: # %bb.0:
; SSE32-NEXT: addpd %xmm1, %xmm0
; SSE32-NEXT: movapd %xmm0, (%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_2xf64_aligned:
; SSE64: # %bb.0:
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: addpd %xmm1, %xmm0
; SSE64-NEXT: movapd %xmm0, (%eax)
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_2xf64_aligned:
; AVX32: # %bb.0:
; AVX32-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX32-NEXT: vmovapd %xmm0, (%rdi)
; AVX32-NEXT: retq
;
; AVX64-LABEL: test_store_2xf64_aligned:
; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX64-NEXT: vmovapd %xmm0, (%eax)
; AVX64-NEXT: retl
  %foo = fadd <2 x double> %value, %value2 ; to force double type on store
  store <2 x double> %foo, <2 x double>* %addr, align 16
  ret <2 x double> %foo
}

define <8 x i32> @test_store_8xi32(<8 x i32>* nocapture %addr, <8 x i32> %value) {
; SSE32-LABEL: test_store_8xi32:
; SSE32: # %bb.0:
; SSE32-NEXT: movups %xmm0, (%rdi)
; SSE32-NEXT: movups %xmm1, 16(%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_8xi32:
; SSE64: # %bb.0:
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: movups %xmm0, (%eax)
; SSE64-NEXT: movups %xmm1, 16(%eax)
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_8xi32:
; AVX32: # %bb.0:
; AVX32-NEXT: vmovups %ymm0, (%rdi)
; AVX32-NEXT: retq
;
; AVX64-LABEL: test_store_8xi32:
; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vmovups %ymm0, (%eax)
; AVX64-NEXT: retl
  store <8 x i32> %value, <8 x i32>* %addr, align 1
  ret <8 x i32> %value
}

define <8 x i32> @test_store_8xi32_aligned(<8 x i32>* nocapture %addr, <8 x i32> %value) {
; SSE32-LABEL: test_store_8xi32_aligned:
; SSE32: # %bb.0:
; SSE32-NEXT: movaps %xmm0, (%rdi)
; SSE32-NEXT: movaps %xmm1, 16(%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_8xi32_aligned:
; SSE64: # %bb.0:
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: movaps %xmm0, (%eax)
; SSE64-NEXT: movaps %xmm1, 16(%eax)
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_8xi32_aligned:
; AVX32: # %bb.0:
; AVX32-NEXT: vmovaps %ymm0, (%rdi)
; AVX32-NEXT: retq
;
; AVX64-LABEL: test_store_8xi32_aligned:
; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vmovaps %ymm0, (%eax)
; AVX64-NEXT: retl
  store <8 x i32> %value, <8 x i32>* %addr, align 32
  ret <8 x i32> %value
}

define <8 x float> @test_store_8xf32(<8 x float>* nocapture %addr, <8 x float> %value) {
; SSE32-LABEL: test_store_8xf32:
; SSE32: # %bb.0:
; SSE32-NEXT: movups %xmm0, (%rdi)
; SSE32-NEXT: movups %xmm1, 16(%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_8xf32:
; SSE64: # %bb.0:
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: movups %xmm0, (%eax)
; SSE64-NEXT: movups %xmm1, 16(%eax)
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_8xf32:
; AVX32: # %bb.0:
; AVX32-NEXT: vmovups %ymm0, (%rdi)
; AVX32-NEXT: retq
;
; AVX64-LABEL: test_store_8xf32:
; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vmovups %ymm0, (%eax)
; AVX64-NEXT: retl
  store <8 x float> %value, <8 x float>* %addr, align 1
  ret <8 x float> %value
}

define <8 x float> @test_store_8xf32_aligned(<8 x float>* nocapture %addr, <8 x float> %value) {
; SSE32-LABEL: test_store_8xf32_aligned:
; SSE32: # %bb.0:
; SSE32-NEXT: movaps %xmm0, (%rdi)
; SSE32-NEXT: movaps %xmm1, 16(%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_8xf32_aligned:
; SSE64: # %bb.0:
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: movaps %xmm0, (%eax)
; SSE64-NEXT: movaps %xmm1, 16(%eax)
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_8xf32_aligned:
; AVX32: # %bb.0:
; AVX32-NEXT: vmovaps %ymm0, (%rdi)
; AVX32-NEXT: retq
;
; AVX64-LABEL: test_store_8xf32_aligned:
; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vmovaps %ymm0, (%eax)
; AVX64-NEXT: retl
  store <8 x float> %value, <8 x float>* %addr, align 32
  ret <8 x float> %value
}

define <4 x double> @test_store_4xf64(<4 x double>* nocapture %addr, <4 x double> %value, <4 x double> %value2) {
; SSE32-LABEL: test_store_4xf64:
; SSE32: # %bb.0:
; SSE32-NEXT: addpd %xmm2, %xmm0
; SSE32-NEXT: movupd %xmm0, (%rdi)
; SSE32-NEXT: addpd %xmm3, %xmm1
; SSE32-NEXT: movupd %xmm1, 16(%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_4xf64:
; SSE64: # %bb.0:
; SSE64-NEXT: subl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 16
; SSE64-NEXT: movapd {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: addpd %xmm2, %xmm0
; SSE64-NEXT: movupd %xmm0, (%eax)
; SSE64-NEXT: addpd %xmm3, %xmm1
; SSE64-NEXT: movupd %xmm1, 16(%eax)
; SSE64-NEXT: addl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 4
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_4xf64:
; AVX32: # %bb.0:
; AVX32-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX32-NEXT: vmovupd %ymm0, (%rdi)
; AVX32-NEXT: retq
;
; AVX64-LABEL: test_store_4xf64:
; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX64-NEXT: vmovupd %ymm0, (%eax)
; AVX64-NEXT: retl
  %foo = fadd <4 x double> %value, %value2 ; to force double type on store
  store <4 x double> %foo, <4 x double>* %addr, align 1
  ret <4 x double> %foo
}

define <4 x double> @test_store_4xf64_aligned(<4 x double>* nocapture %addr, <4 x double> %value, <4 x double> %value2) {
; SSE32-LABEL: test_store_4xf64_aligned:
; SSE32: # %bb.0:
; SSE32-NEXT: addpd %xmm2, %xmm0
; SSE32-NEXT: movapd %xmm0, (%rdi)
; SSE32-NEXT: addpd %xmm3, %xmm1
; SSE32-NEXT: movapd %xmm1, 16(%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_4xf64_aligned:
; SSE64: # %bb.0:
; SSE64-NEXT: subl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 16
; SSE64-NEXT: movapd {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: addpd %xmm2, %xmm0
; SSE64-NEXT: movapd %xmm0, (%eax)
; SSE64-NEXT: addpd %xmm3, %xmm1
; SSE64-NEXT: movapd %xmm1, 16(%eax)
; SSE64-NEXT: addl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 4
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_4xf64_aligned:
; AVX32: # %bb.0:
; AVX32-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX32-NEXT: vmovapd %ymm0, (%rdi)
; AVX32-NEXT: retq
;
; AVX64-LABEL: test_store_4xf64_aligned:
; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX64-NEXT: vmovapd %ymm0, (%eax)
; AVX64-NEXT: retl
  %foo = fadd <4 x double> %value, %value2 ; to force double type on store
  store <4 x double> %foo, <4 x double>* %addr, align 32
  ret <4 x double> %foo
}

define <16 x i32> @test_store_16xi32(<16 x i32>* nocapture %addr, <16 x i32> %value) {
; SSE32-LABEL: test_store_16xi32:
; SSE32: # %bb.0:
; SSE32-NEXT: movups %xmm0, (%rdi)
; SSE32-NEXT: movups %xmm1, 16(%rdi)
; SSE32-NEXT: movups %xmm2, 32(%rdi)
; SSE32-NEXT: movups %xmm3, 48(%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_16xi32:
; SSE64: # %bb.0:
; SSE64-NEXT: subl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 16
; SSE64-NEXT: movaps {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: movups %xmm0, (%eax)
; SSE64-NEXT: movups %xmm1, 16(%eax)
; SSE64-NEXT: movups %xmm2, 32(%eax)
; SSE64-NEXT: movups %xmm3, 48(%eax)
; SSE64-NEXT: addl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 4
; SSE64-NEXT: retl
;
; AVXONLY32-LABEL: test_store_16xi32:
; AVXONLY32: # %bb.0:
; AVXONLY32-NEXT: vmovups %ymm0, (%rdi)
; AVXONLY32-NEXT: vmovups %ymm1, 32(%rdi)
; AVXONLY32-NEXT: retq
;
; AVXONLY64-LABEL: test_store_16xi32:
; AVXONLY64: # %bb.0:
; AVXONLY64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVXONLY64-NEXT: vmovups %ymm0, (%eax)
; AVXONLY64-NEXT: vmovups %ymm1, 32(%eax)
; AVXONLY64-NEXT: retl
;
; AVX51232-LABEL: test_store_16xi32:
; AVX51232: # %bb.0:
; AVX51232-NEXT: vmovups %zmm0, (%rdi)
; AVX51232-NEXT: retq
;
; AVX51264-LABEL: test_store_16xi32:
; AVX51264: # %bb.0:
; AVX51264-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT: vmovups %zmm0, (%eax)
; AVX51264-NEXT: retl
  store <16 x i32> %value, <16 x i32>* %addr, align 1
  ret <16 x i32> %value
}

define <16 x i32> @test_store_16xi32_aligned(<16 x i32>* nocapture %addr, <16 x i32> %value) {
; SSE32-LABEL: test_store_16xi32_aligned:
; SSE32: # %bb.0:
; SSE32-NEXT: movaps %xmm0, (%rdi)
; SSE32-NEXT: movaps %xmm1, 16(%rdi)
; SSE32-NEXT: movaps %xmm2, 32(%rdi)
; SSE32-NEXT: movaps %xmm3, 48(%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_16xi32_aligned:
; SSE64: # %bb.0:
; SSE64-NEXT: subl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 16
; SSE64-NEXT: movaps {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: movaps %xmm0, (%eax)
; SSE64-NEXT: movaps %xmm1, 16(%eax)
; SSE64-NEXT: movaps %xmm2, 32(%eax)
; SSE64-NEXT: movaps %xmm3, 48(%eax)
; SSE64-NEXT: addl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 4
; SSE64-NEXT: retl
;
; AVXONLY32-LABEL: test_store_16xi32_aligned:
; AVXONLY32: # %bb.0:
; AVXONLY32-NEXT: vmovaps %ymm0, (%rdi)
; AVXONLY32-NEXT: vmovaps %ymm1, 32(%rdi)
; AVXONLY32-NEXT: retq
;
; AVXONLY64-LABEL: test_store_16xi32_aligned:
; AVXONLY64: # %bb.0:
; AVXONLY64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVXONLY64-NEXT: vmovaps %ymm0, (%eax)
; AVXONLY64-NEXT: vmovaps %ymm1, 32(%eax)
; AVXONLY64-NEXT: retl
;
; AVX51232-LABEL: test_store_16xi32_aligned:
; AVX51232: # %bb.0:
; AVX51232-NEXT: vmovaps %zmm0, (%rdi)
; AVX51232-NEXT: retq
;
; AVX51264-LABEL: test_store_16xi32_aligned:
; AVX51264: # %bb.0:
; AVX51264-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT: vmovaps %zmm0, (%eax)
; AVX51264-NEXT: retl
  store <16 x i32> %value, <16 x i32>* %addr, align 64
  ret <16 x i32> %value
}

define <16 x float> @test_store_16xf32(<16 x float>* nocapture %addr, <16 x float> %value) {
; SSE32-LABEL: test_store_16xf32:
; SSE32: # %bb.0:
; SSE32-NEXT: movups %xmm0, (%rdi)
; SSE32-NEXT: movups %xmm1, 16(%rdi)
; SSE32-NEXT: movups %xmm2, 32(%rdi)
; SSE32-NEXT: movups %xmm3, 48(%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_16xf32:
; SSE64: # %bb.0:
; SSE64-NEXT: subl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 16
; SSE64-NEXT: movaps {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: movups %xmm0, (%eax)
; SSE64-NEXT: movups %xmm1, 16(%eax)
; SSE64-NEXT: movups %xmm2, 32(%eax)
; SSE64-NEXT: movups %xmm3, 48(%eax)
; SSE64-NEXT: addl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 4
; SSE64-NEXT: retl
;
; AVXONLY32-LABEL: test_store_16xf32:
; AVXONLY32: # %bb.0:
; AVXONLY32-NEXT: vmovups %ymm0, (%rdi)
; AVXONLY32-NEXT: vmovups %ymm1, 32(%rdi)
; AVXONLY32-NEXT: retq
;
; AVXONLY64-LABEL: test_store_16xf32:
; AVXONLY64: # %bb.0:
; AVXONLY64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVXONLY64-NEXT: vmovups %ymm0, (%eax)
; AVXONLY64-NEXT: vmovups %ymm1, 32(%eax)
; AVXONLY64-NEXT: retl
;
; AVX51232-LABEL: test_store_16xf32:
; AVX51232: # %bb.0:
; AVX51232-NEXT: vmovups %zmm0, (%rdi)
; AVX51232-NEXT: retq
;
; AVX51264-LABEL: test_store_16xf32:
; AVX51264: # %bb.0:
; AVX51264-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT: vmovups %zmm0, (%eax)
; AVX51264-NEXT: retl
  store <16 x float> %value, <16 x float>* %addr, align 1
  ret <16 x float> %value
}

define <16 x float> @test_store_16xf32_aligned(<16 x float>* nocapture %addr, <16 x float> %value) {
; SSE32-LABEL: test_store_16xf32_aligned:
; SSE32: # %bb.0:
; SSE32-NEXT: movaps %xmm0, (%rdi)
; SSE32-NEXT: movaps %xmm1, 16(%rdi)
; SSE32-NEXT: movaps %xmm2, 32(%rdi)
; SSE32-NEXT: movaps %xmm3, 48(%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_16xf32_aligned:
; SSE64: # %bb.0:
; SSE64-NEXT: subl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 16
; SSE64-NEXT: movaps {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: movaps %xmm0, (%eax)
; SSE64-NEXT: movaps %xmm1, 16(%eax)
; SSE64-NEXT: movaps %xmm2, 32(%eax)
; SSE64-NEXT: movaps %xmm3, 48(%eax)
; SSE64-NEXT: addl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 4
; SSE64-NEXT: retl
;
; AVXONLY32-LABEL: test_store_16xf32_aligned:
; AVXONLY32: # %bb.0:
; AVXONLY32-NEXT: vmovaps %ymm0, (%rdi)
; AVXONLY32-NEXT: vmovaps %ymm1, 32(%rdi)
; AVXONLY32-NEXT: retq
;
; AVXONLY64-LABEL: test_store_16xf32_aligned:
; AVXONLY64: # %bb.0:
; AVXONLY64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVXONLY64-NEXT: vmovaps %ymm0, (%eax)
; AVXONLY64-NEXT: vmovaps %ymm1, 32(%eax)
; AVXONLY64-NEXT: retl
;
; AVX51232-LABEL: test_store_16xf32_aligned:
; AVX51232: # %bb.0:
; AVX51232-NEXT: vmovaps %zmm0, (%rdi)
; AVX51232-NEXT: retq
;
; AVX51264-LABEL: test_store_16xf32_aligned:
; AVX51264: # %bb.0:
; AVX51264-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT: vmovaps %zmm0, (%eax)
; AVX51264-NEXT: retl
  store <16 x float> %value, <16 x float>* %addr, align 64
  ret <16 x float> %value
}

define <8 x double> @test_store_8xf64(<8 x double>* nocapture %addr, <8 x double> %value, <8 x double> %value2) {
; SSE32-LABEL: test_store_8xf64:
; SSE32: # %bb.0:
; SSE32-NEXT: addpd %xmm4, %xmm0
; SSE32-NEXT: movupd %xmm0, (%rdi)
; SSE32-NEXT: addpd %xmm5, %xmm1
; SSE32-NEXT: movupd %xmm1, 16(%rdi)
; SSE32-NEXT: addpd %xmm6, %xmm2
; SSE32-NEXT: movupd %xmm2, 32(%rdi)
; SSE32-NEXT: addpd %xmm7, %xmm3
; SSE32-NEXT: movupd %xmm3, 48(%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_8xf64:
; SSE64: # %bb.0:
; SSE64-NEXT: subl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 16
; SSE64-NEXT: movapd {{[0-9]+}}(%esp), %xmm4
; SSE64-NEXT: movapd {{[0-9]+}}(%esp), %xmm5
; SSE64-NEXT: movapd {{[0-9]+}}(%esp), %xmm6
; SSE64-NEXT: movapd {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT: addpd %xmm4, %xmm3
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: addpd {{[0-9]+}}(%esp), %xmm0
; SSE64-NEXT: movupd %xmm0, (%eax)
; SSE64-NEXT: addpd %xmm6, %xmm1
; SSE64-NEXT: movupd %xmm1, 16(%eax)
; SSE64-NEXT: addpd %xmm5, %xmm2
; SSE64-NEXT: movupd %xmm2, 32(%eax)
; SSE64-NEXT: movupd %xmm3, 48(%eax)
; SSE64-NEXT: addl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 4
; SSE64-NEXT: retl
;
; AVXONLY32-LABEL: test_store_8xf64:
; AVXONLY32: # %bb.0:
; AVXONLY32-NEXT: vaddpd %ymm2, %ymm0, %ymm0
; AVXONLY32-NEXT: vmovupd %ymm0, (%rdi)
; AVXONLY32-NEXT: vaddpd %ymm3, %ymm1, %ymm1
; AVXONLY32-NEXT: vmovupd %ymm1, 32(%rdi)
; AVXONLY32-NEXT: retq
;
; AVXONLY64-LABEL: test_store_8xf64:
; AVXONLY64: # %bb.0:
; AVXONLY64-NEXT: pushl %ebp
; AVXONLY64-NEXT: .cfi_def_cfa_offset 8
; AVXONLY64-NEXT: .cfi_offset %ebp, -8
; AVXONLY64-NEXT: movl %esp, %ebp
; AVXONLY64-NEXT: .cfi_def_cfa_register %ebp
; AVXONLY64-NEXT: andl $-32, %esp
; AVXONLY64-NEXT: subl $32, %esp
; AVXONLY64-NEXT: vmovapd 40(%ebp), %ymm3
; AVXONLY64-NEXT: movl 8(%ebp), %eax
; AVXONLY64-NEXT: vaddpd %ymm2, %ymm0, %ymm0
; AVXONLY64-NEXT: vmovupd %ymm0, (%eax)
; AVXONLY64-NEXT: vaddpd %ymm3, %ymm1, %ymm1
; AVXONLY64-NEXT: vmovupd %ymm1, 32(%eax)
; AVXONLY64-NEXT: movl %ebp, %esp
; AVXONLY64-NEXT: popl %ebp
; AVXONLY64-NEXT: .cfi_def_cfa %esp, 4
; AVXONLY64-NEXT: retl
;
; AVX51232-LABEL: test_store_8xf64:
; AVX51232: # %bb.0:
; AVX51232-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX51232-NEXT: vmovupd %zmm0, (%rdi)
; AVX51232-NEXT: retq
;
; AVX51264-LABEL: test_store_8xf64:
; AVX51264: # %bb.0:
; AVX51264-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX51264-NEXT: vmovupd %zmm0, (%eax)
; AVX51264-NEXT: retl
  %foo = fadd <8 x double> %value, %value2 ; to force double type on store
  store <8 x double> %foo, <8 x double>* %addr, align 1
  ret <8 x double> %foo
}

define <8 x double> @test_store_8xf64_aligned(<8 x double>* nocapture %addr, <8 x double> %value, <8 x double> %value2) {
; SSE32-LABEL: test_store_8xf64_aligned:
; SSE32: # %bb.0:
; SSE32-NEXT: addpd %xmm4, %xmm0
; SSE32-NEXT: movapd %xmm0, (%rdi)
; SSE32-NEXT: addpd %xmm5, %xmm1
; SSE32-NEXT: movapd %xmm1, 16(%rdi)
; SSE32-NEXT: addpd %xmm6, %xmm2
; SSE32-NEXT: movapd %xmm2, 32(%rdi)
; SSE32-NEXT: addpd %xmm7, %xmm3
; SSE32-NEXT: movapd %xmm3, 48(%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_8xf64_aligned:
; SSE64: # %bb.0:
; SSE64-NEXT: subl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 16
; SSE64-NEXT: movapd {{[0-9]+}}(%esp), %xmm4
; SSE64-NEXT: movapd {{[0-9]+}}(%esp), %xmm5
; SSE64-NEXT: movapd {{[0-9]+}}(%esp), %xmm6
; SSE64-NEXT: movapd {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT: addpd %xmm4, %xmm3
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: addpd {{[0-9]+}}(%esp), %xmm0
; SSE64-NEXT: movapd %xmm0, (%eax)
; SSE64-NEXT: addpd %xmm6, %xmm1
; SSE64-NEXT: movapd %xmm1, 16(%eax)
; SSE64-NEXT: addpd %xmm5, %xmm2
; SSE64-NEXT: movapd %xmm2, 32(%eax)
; SSE64-NEXT: movapd %xmm3, 48(%eax)
; SSE64-NEXT: addl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 4
; SSE64-NEXT: retl
;
; AVXONLY32-LABEL: test_store_8xf64_aligned:
; AVXONLY32: # %bb.0:
; AVXONLY32-NEXT: vaddpd %ymm2, %ymm0, %ymm0
; AVXONLY32-NEXT: vmovapd %ymm0, (%rdi)
; AVXONLY32-NEXT: vaddpd %ymm3, %ymm1, %ymm1
; AVXONLY32-NEXT: vmovapd %ymm1, 32(%rdi)
; AVXONLY32-NEXT: retq
;
; AVXONLY64-LABEL: test_store_8xf64_aligned:
; AVXONLY64: # %bb.0:
; AVXONLY64-NEXT: pushl %ebp
; AVXONLY64-NEXT: .cfi_def_cfa_offset 8
; AVXONLY64-NEXT: .cfi_offset %ebp, -8
; AVXONLY64-NEXT: movl %esp, %ebp
; AVXONLY64-NEXT: .cfi_def_cfa_register %ebp
; AVXONLY64-NEXT: andl $-32, %esp
; AVXONLY64-NEXT: subl $32, %esp
; AVXONLY64-NEXT: vmovapd 40(%ebp), %ymm3
; AVXONLY64-NEXT: movl 8(%ebp), %eax
; AVXONLY64-NEXT: vaddpd %ymm2, %ymm0, %ymm0
; AVXONLY64-NEXT: vmovapd %ymm0, (%eax)
; AVXONLY64-NEXT: vaddpd %ymm3, %ymm1, %ymm1
; AVXONLY64-NEXT: vmovapd %ymm1, 32(%eax)
; AVXONLY64-NEXT: movl %ebp, %esp
; AVXONLY64-NEXT: popl %ebp
; AVXONLY64-NEXT: .cfi_def_cfa %esp, 4
; AVXONLY64-NEXT: retl
;
; AVX51232-LABEL: test_store_8xf64_aligned:
; AVX51232: # %bb.0:
; AVX51232-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX51232-NEXT: vmovapd %zmm0, (%rdi)
; AVX51232-NEXT: retq
;
; AVX51264-LABEL: test_store_8xf64_aligned:
; AVX51264: # %bb.0:
; AVX51264-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX51264-NEXT: vmovapd %zmm0, (%eax)
; AVX51264-NEXT: retl
  %foo = fadd <8 x double> %value, %value2 ; to force double type on store
  store <8 x double> %foo, <8 x double>* %addr, align 64
  ret <8 x double> %foo
}