; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx2 | FileCheck %s --check-prefix=X32
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s --check-prefix=X64

; Check that 256-bit vector stores tagged with !nontemporal metadata are lowered
; to the corresponding non-temporal instructions (vmovntps, vmovntpd, vmovntdq)
; under AVX2, with the scalar loads from %loadptr interleaved between the stores.

define i32 @f(<8 x float> %A, i8* %B, <4 x double> %C, <4 x i64> %E, <8 x i32> %F, <16 x i16> %G, <32 x i8> %H, i32* %loadptr) nounwind {
; X32-LABEL: f:
; X32:       # %bb.0:
; X32-NEXT:    pushl %ebp
; X32-NEXT:    movl %esp, %ebp
; X32-NEXT:    andl $-32, %esp
; X32-NEXT:    subl $32, %esp
; X32-NEXT:    vmovdqa 104(%ebp), %ymm3
; X32-NEXT:    vmovdqa 72(%ebp), %ymm4
; X32-NEXT:    vmovdqa 40(%ebp), %ymm5
; X32-NEXT:    movl 8(%ebp), %ecx
; X32-NEXT:    movl 136(%ebp), %edx
; X32-NEXT:    movl (%edx), %eax
; X32-NEXT:    vaddps {{\.LCPI.*}}, %ymm0, %ymm0
; X32-NEXT:    vmovntps %ymm0, (%ecx)
; X32-NEXT:    vpaddq {{\.LCPI.*}}, %ymm2, %ymm0
; X32-NEXT:    addl (%edx), %eax
; X32-NEXT:    vmovntdq %ymm0, (%ecx)
; X32-NEXT:    vaddpd {{\.LCPI.*}}, %ymm1, %ymm0
; X32-NEXT:    addl (%edx), %eax
; X32-NEXT:    vmovntpd %ymm0, (%ecx)
; X32-NEXT:    vpaddd {{\.LCPI.*}}, %ymm5, %ymm0
; X32-NEXT:    addl (%edx), %eax
; X32-NEXT:    vmovntdq %ymm0, (%ecx)
; X32-NEXT:    vpaddw {{\.LCPI.*}}, %ymm4, %ymm0
; X32-NEXT:    addl (%edx), %eax
; X32-NEXT:    vmovntdq %ymm0, (%ecx)
; X32-NEXT:    vpaddb {{\.LCPI.*}}, %ymm3, %ymm0
; X32-NEXT:    addl (%edx), %eax
; X32-NEXT:    vmovntdq %ymm0, (%ecx)
; X32-NEXT:    movl %ebp, %esp
; X32-NEXT:    popl %ebp
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: f:
; X64:       # %bb.0:
; X64-NEXT:    movl (%rsi), %eax
; X64-NEXT:    vaddps {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT:    vmovntps %ymm0, (%rdi)
; X64-NEXT:    vpaddq {{.*}}(%rip), %ymm2, %ymm0
; X64-NEXT:    addl (%rsi), %eax
; X64-NEXT:    vmovntdq %ymm0, (%rdi)
; X64-NEXT:    vaddpd {{.*}}(%rip), %ymm1, %ymm0
; X64-NEXT:    addl (%rsi), %eax
; X64-NEXT:    vmovntpd %ymm0, (%rdi)
; X64-NEXT:    vpaddd {{.*}}(%rip), %ymm3, %ymm0
; X64-NEXT:    addl (%rsi), %eax
; X64-NEXT:    vmovntdq %ymm0, (%rdi)
; X64-NEXT:    vpaddw {{.*}}(%rip), %ymm4, %ymm0
; X64-NEXT:    addl (%rsi), %eax
; X64-NEXT:    vmovntdq %ymm0, (%rdi)
; X64-NEXT:    vpaddb {{.*}}(%rip), %ymm5, %ymm0
; X64-NEXT:    addl (%rsi), %eax
; X64-NEXT:    vmovntdq %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %v0 = load i32, i32* %loadptr, align 1
  %cast = bitcast i8* %B to <8 x float>*
  %A2 = fadd <8 x float> %A, <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>
  store <8 x float> %A2, <8 x float>* %cast, align 32, !nontemporal !0
  %v1 = load i32, i32* %loadptr, align 1
  %cast1 = bitcast i8* %B to <4 x i64>*
  %E2 = add <4 x i64> %E, <i64 1, i64 2, i64 3, i64 4>
  store <4 x i64> %E2, <4 x i64>* %cast1, align 32, !nontemporal !0
  %v2 = load i32, i32* %loadptr, align 1
  %cast2 = bitcast i8* %B to <4 x double>*
  %C2 = fadd <4 x double> %C, <double 1.0, double 2.0, double 3.0, double 4.0>
  store <4 x double> %C2, <4 x double>* %cast2, align 32, !nontemporal !0
  %v3 = load i32, i32* %loadptr, align 1
  %cast3 = bitcast i8* %B to <8 x i32>*
  %F2 = add <8 x i32> %F, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  store <8 x i32> %F2, <8 x i32>* %cast3, align 32, !nontemporal !0
  %v4 = load i32, i32* %loadptr, align 1
  %cast4 = bitcast i8* %B to <16 x i16>*
  %G2 = add <16 x i16> %G, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>
  store <16 x i16> %G2, <16 x i16>* %cast4, align 32, !nontemporal !0
  %v5 = load i32, i32* %loadptr, align 1
  %cast5 = bitcast i8* %B to <32 x i8>*
  %H2 = add <32 x i8> %H, <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>
  store <32 x i8> %H2, <32 x i8>* %cast5, align 32, !nontemporal !0
  %v6 = load i32, i32* %loadptr, align 1
  %sum1 = add i32 %v0, %v1
  %sum2 = add i32 %sum1, %v2
  %sum3 = add i32 %sum2, %v3
  %sum4 = add i32 %sum3, %v4
  %sum5 = add i32 %sum4, %v5
  %sum6 = add i32 %sum5, %v6
  ret i32 %sum5
}

!0 = !{i32 1}