; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512BW

@a = global [1024 x i8] zeroinitializer, align 16
@b = global [1024 x i8] zeroinitializer, align 16

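; The functions below sum the absolute differences of the global byte arrays
; @a and @b and reduce the result to a single i32. The checks verify whether
; each subtarget can lower the abs(sub)/add reduction to psadbw/vpsadbw.

; sad_16i8 processes 16 bytes per loop iteration.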
define i32 @sad_16i8() nounwind {
; SSE2-LABEL: sad_16i8:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    pxor %xmm0, %xmm0
; SSE2-NEXT:    movq $-1024, %rax # imm = 0xFC00
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    .p2align 4, 0x90
; SSE2-NEXT:  .LBB0_1: # %vector.body
; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
; SSE2-NEXT:    movdqu a+1024(%rax), %xmm2
; SSE2-NEXT:    movdqu b+1024(%rax), %xmm3
; SSE2-NEXT:    psadbw %xmm2, %xmm3
; SSE2-NEXT:    paddd %xmm3, %xmm1
; SSE2-NEXT:    addq $4, %rax
; SSE2-NEXT:    jne .LBB0_1
; SSE2-NEXT:  # BB#2: # %middle.block
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    paddd %xmm0, %xmm0
; SSE2-NEXT:    paddd %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT:    paddd %xmm1, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    retq
;
; AVX2-LABEL: sad_16i8:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    vpxor %ymm0, %ymm0, %ymm0
; AVX2-NEXT:    movq $-1024, %rax # imm = 0xFC00
; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    .p2align 4, 0x90
; AVX2-NEXT:  .LBB0_1: # %vector.body
; AVX2-NEXT:    # =>This Inner Loop Header: Depth=1
; AVX2-NEXT:    vmovdqu a+1024(%rax), %xmm2
; AVX2-NEXT:    vpsadbw b+1024(%rax), %xmm2, %xmm2
; AVX2-NEXT:    vpaddd %xmm1, %xmm2, %xmm2
; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT:    addq $4, %rax
; AVX2-NEXT:    jne .LBB0_1
; AVX2-NEXT:  # BB#2: # %middle.block
; AVX2-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: sad_16i8:
; AVX512F:       # BB#0: # %entry
; AVX512F-NEXT:    vpxord %zmm0, %zmm0, %zmm0
; AVX512F-NEXT:    movq $-1024, %rax # imm = 0xFC00
; AVX512F-NEXT:    .p2align 4, 0x90
; AVX512F-NEXT:  .LBB0_1: # %vector.body
; AVX512F-NEXT:    # =>This Inner Loop Header: Depth=1
; AVX512F-NEXT:    vmovdqu a+1024(%rax), %xmm1
; AVX512F-NEXT:    vpsadbw b+1024(%rax), %xmm1, %xmm1
; AVX512F-NEXT:    vpaddd %xmm0, %xmm1, %xmm1
; AVX512F-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
; AVX512F-NEXT:    addq $4, %rax
; AVX512F-NEXT:    jne .LBB0_1
; AVX512F-NEXT:  # BB#2: # %middle.block
; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,0,1]
; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,0,1,0,1,0,1]
; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    vmovd %xmm0, %eax
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: sad_16i8:
; AVX512BW:       # BB#0: # %entry
; AVX512BW-NEXT:    vpxord %zmm0, %zmm0, %zmm0
; AVX512BW-NEXT:    movq $-1024, %rax # imm = 0xFC00
; AVX512BW-NEXT:    .p2align 4, 0x90
; AVX512BW-NEXT:  .LBB0_1: # %vector.body
; AVX512BW-NEXT:    # =>This Inner Loop Header: Depth=1
; AVX512BW-NEXT:    vmovdqu a+1024(%rax), %xmm1
; AVX512BW-NEXT:    vpsadbw b+1024(%rax), %xmm1, %xmm1
; AVX512BW-NEXT:    vpaddd %xmm0, %xmm1, %xmm1
; AVX512BW-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    addq $4, %rax
; AVX512BW-NEXT:    jne .LBB0_1
; AVX512BW-NEXT:  # BB#2: # %middle.block
; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,0,1]
; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,0,1,0,1,0,1]
; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vmovd %xmm0, %eax
; AVX512BW-NEXT:    retq
entry:
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.phi = phi <16 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ]
  %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index
  %1 = bitcast i8* %0 to <16 x i8>*
  %wide.load = load <16 x i8>, <16 x i8>* %1, align 4
  %2 = zext <16 x i8> %wide.load to <16 x i32>
  %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index
  %4 = bitcast i8* %3 to <16 x i8>*
  %wide.load1 = load <16 x i8>, <16 x i8>* %4, align 4
  %5 = zext <16 x i8> %wide.load1 to <16 x i32>
  %6 = sub nsw <16 x i32> %2, %5
  %7 = icmp sgt <16 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %8 = sub nsw <16 x i32> zeroinitializer, %6
  %9 = select <16 x i1> %7, <16 x i32> %6, <16 x i32> %8
  %10 = add nsw <16 x i32> %9, %vec.phi
  %index.next = add i64 %index, 4
  %11 = icmp eq i64 %index.next, 1024
  br i1 %11, label %middle.block, label %vector.body

middle.block:
  %.lcssa = phi <16 x i32> [ %10, %vector.body ]
  %rdx.shuf = shufflevector <16 x i32> %.lcssa, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <16 x i32> %.lcssa, %rdx.shuf
  %rdx.shuf2 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx2 = add <16 x i32> %bin.rdx, %rdx.shuf2
  %rdx.shuf3 = shufflevector <16 x i32> %bin.rdx2, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx3 = add <16 x i32> %bin.rdx2, %rdx.shuf3
  %rdx.shuf4 = shufflevector <16 x i32> %bin.rdx3, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx4 = add <16 x i32> %bin.rdx3, %rdx.shuf4
  %12 = extractelement <16 x i32> %bin.rdx4, i32 0
  ret i32 %12
}

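; sad_32i8 processes 32 bytes per loop iteration.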
define i32 @sad_32i8() nounwind {
; SSE2-LABEL: sad_32i8:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    pxor %xmm12, %xmm12
; SSE2-NEXT:    movq $-1024, %rax # imm = 0xFC00
; SSE2-NEXT:    pxor %xmm4, %xmm4
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    pxor %xmm0, %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    pxor %xmm13, %xmm13
; SSE2-NEXT:    pxor %xmm15, %xmm15
; SSE2-NEXT:    pxor %xmm5, %xmm5
; SSE2-NEXT:    pxor %xmm14, %xmm14
; SSE2-NEXT:    .p2align 4, 0x90
; SSE2-NEXT:  .LBB1_1: # %vector.body
; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
; SSE2-NEXT:    movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT:    movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT:    movdqa a+1040(%rax), %xmm0
; SSE2-NEXT:    movdqa a+1024(%rax), %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm1[2,3,0,1]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3],xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[2,3,0,1]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3],xmm7[4],xmm12[4],xmm7[5],xmm12[5],xmm7[6],xmm12[6],xmm7[7],xmm12[7]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3],xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7]
; SSE2-NEXT:    movdqa %xmm1, %xmm6
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
; SSE2-NEXT:    movdqa b+1040(%rax), %xmm3
; SSE2-NEXT:    movdqa b+1024(%rax), %xmm5
; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm3[2,3,0,1]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3],xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7]
; SSE2-NEXT:    movdqa %xmm3, %xmm10
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7]
; SSE2-NEXT:    psubd %xmm3, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm5[2,3,0,1]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3]
; SSE2-NEXT:    psubd %xmm10, %xmm2
; SSE2-NEXT:    movdqa %xmm5, %xmm3
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
; SSE2-NEXT:    psubd %xmm5, %xmm1
; SSE2-NEXT:    movdqa %xmm7, %xmm5
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm12[4],xmm7[5],xmm12[5],xmm7[6],xmm12[6],xmm7[7],xmm12[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3],xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3]
; SSE2-NEXT:    psubd %xmm3, %xmm6
; SSE2-NEXT:    movdqa %xmm4, %xmm10
; SSE2-NEXT:    movdqa %xmm9, %xmm4
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7]
; SSE2-NEXT:    psubd %xmm9, %xmm7
; SSE2-NEXT:    movdqa %xmm8, %xmm3
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3]
; SSE2-NEXT:    psubd %xmm4, %xmm5
; SSE2-NEXT:    movdqa %xmm11, %xmm4
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
; SSE2-NEXT:    psubd %xmm11, %xmm8
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3]
; SSE2-NEXT:    psubd %xmm4, %xmm3
; SSE2-NEXT:    movdqa %xmm3, %xmm4
; SSE2-NEXT:    psrad $31, %xmm4
; SSE2-NEXT:    paddd %xmm4, %xmm3
; SSE2-NEXT:    pxor %xmm4, %xmm3
; SSE2-NEXT:    movdqa %xmm8, %xmm4
; SSE2-NEXT:    psrad $31, %xmm4
; SSE2-NEXT:    paddd %xmm4, %xmm8
; SSE2-NEXT:    pxor %xmm4, %xmm8
; SSE2-NEXT:    movdqa %xmm5, %xmm4
; SSE2-NEXT:    psrad $31, %xmm4
; SSE2-NEXT:    paddd %xmm4, %xmm5
; SSE2-NEXT:    pxor %xmm4, %xmm5
; SSE2-NEXT:    movdqa %xmm7, %xmm4
; SSE2-NEXT:    psrad $31, %xmm4
; SSE2-NEXT:    paddd %xmm4, %xmm7
; SSE2-NEXT:    pxor %xmm4, %xmm7
; SSE2-NEXT:    movdqa %xmm6, %xmm4
; SSE2-NEXT:    psrad $31, %xmm4
; SSE2-NEXT:    paddd %xmm4, %xmm6
; SSE2-NEXT:    pxor %xmm4, %xmm6
; SSE2-NEXT:    movdqa %xmm1, %xmm4
; SSE2-NEXT:    psrad $31, %xmm4
; SSE2-NEXT:    paddd %xmm4, %xmm1
; SSE2-NEXT:    pxor %xmm4, %xmm1
; SSE2-NEXT:    movdqa %xmm2, %xmm4
; SSE2-NEXT:    psrad $31, %xmm4
; SSE2-NEXT:    paddd %xmm4, %xmm2
; SSE2-NEXT:    pxor %xmm4, %xmm2
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    psrad $31, %xmm4
; SSE2-NEXT:    paddd %xmm4, %xmm0
; SSE2-NEXT:    pxor %xmm4, %xmm0
; SSE2-NEXT:    movdqa %xmm10, %xmm4
; SSE2-NEXT:    paddd %xmm0, %xmm15
; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT:    paddd %xmm2, %xmm13
; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
; SSE2-NEXT:    paddd %xmm1, %xmm2
; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT:    paddd %xmm6, %xmm4
; SSE2-NEXT:    paddd %xmm7, %xmm14
; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
; SSE2-NEXT:    paddd %xmm5, %xmm6
; SSE2-NEXT:    movdqa %xmm6, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
; SSE2-NEXT:    paddd %xmm8, %xmm1
; SSE2-NEXT:    paddd %xmm3, %xmm0
; SSE2-NEXT:    addq $4, %rax
; SSE2-NEXT:    jne .LBB1_1
; SSE2-NEXT:  # BB#2: # %middle.block
; SSE2-NEXT:    paddd %xmm15, %xmm2
; SSE2-NEXT:    paddd %xmm14, %xmm1
; SSE2-NEXT:    paddd %xmm13, %xmm4
; SSE2-NEXT:    paddd %xmm5, %xmm0
; SSE2-NEXT:    paddd %xmm4, %xmm0
; SSE2-NEXT:    paddd %xmm2, %xmm1
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE2-NEXT:    paddd %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    retq
;
; AVX2-LABEL: sad_32i8:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    vpxor %ymm0, %ymm0, %ymm0
; AVX2-NEXT:    movq $-1024, %rax # imm = 0xFC00
; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    .p2align 4, 0x90
; AVX2-NEXT:  .LBB1_1: # %vector.body
; AVX2-NEXT:    # =>This Inner Loop Header: Depth=1
; AVX2-NEXT:    vmovdqa a+1024(%rax), %ymm2
; AVX2-NEXT:    vpsadbw b+1024(%rax), %ymm2, %ymm2
; AVX2-NEXT:    vpaddd %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    addq $4, %rax
; AVX2-NEXT:    jne .LBB1_1
; AVX2-NEXT:  # BB#2: # %middle.block
; AVX2-NEXT:    vpaddd %ymm0, %ymm1, %ymm1
; AVX2-NEXT:    vpaddd %ymm0, %ymm0, %ymm0
; AVX2-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: sad_32i8:
; AVX512F:       # BB#0: # %entry
; AVX512F-NEXT:    vpxord %zmm0, %zmm0, %zmm0
; AVX512F-NEXT:    movq $-1024, %rax # imm = 0xFC00
; AVX512F-NEXT:    vpxord %zmm1, %zmm1, %zmm1
; AVX512F-NEXT:    .p2align 4, 0x90
; AVX512F-NEXT:  .LBB1_1: # %vector.body
; AVX512F-NEXT:    # =>This Inner Loop Header: Depth=1
; AVX512F-NEXT:    vmovdqa a+1024(%rax), %ymm2
; AVX512F-NEXT:    vpsadbw b+1024(%rax), %ymm2, %ymm2
; AVX512F-NEXT:    vpaddd %ymm1, %ymm2, %ymm2
; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm1
; AVX512F-NEXT:    addq $4, %rax
; AVX512F-NEXT:    jne .LBB1_1
; AVX512F-NEXT:  # BB#2: # %middle.block
; AVX512F-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,0,1]
; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,0,1,0,1,0,1]
; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    vmovd %xmm0, %eax
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: sad_32i8:
; AVX512BW:       # BB#0: # %entry
; AVX512BW-NEXT:    vpxord %zmm0, %zmm0, %zmm0
; AVX512BW-NEXT:    movq $-1024, %rax # imm = 0xFC00
; AVX512BW-NEXT:    vpxord %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT:    .p2align 4, 0x90
; AVX512BW-NEXT:  .LBB1_1: # %vector.body
; AVX512BW-NEXT:    # =>This Inner Loop Header: Depth=1
; AVX512BW-NEXT:    vmovdqa a+1024(%rax), %ymm2
; AVX512BW-NEXT:    vpsadbw b+1024(%rax), %ymm2, %ymm2
; AVX512BW-NEXT:    vpaddd %ymm1, %ymm2, %ymm2
; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm1
; AVX512BW-NEXT:    addq $4, %rax
; AVX512BW-NEXT:    jne .LBB1_1
; AVX512BW-NEXT:  # BB#2: # %middle.block
; AVX512BW-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,0,1]
; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,0,1,0,1,0,1]
; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vmovd %xmm0, %eax
; AVX512BW-NEXT:    retq
entry:
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.phi = phi <32 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ]
  %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index
  %1 = bitcast i8* %0 to <32 x i8>*
  %wide.load = load <32 x i8>, <32 x i8>* %1, align 32
  %2 = zext <32 x i8> %wide.load to <32 x i32>
  %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index
  %4 = bitcast i8* %3 to <32 x i8>*
  %wide.load1 = load <32 x i8>, <32 x i8>* %4, align 32
  %5 = zext <32 x i8> %wide.load1 to <32 x i32>
  %6 = sub nsw <32 x i32> %2, %5
  %7 = icmp sgt <32 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %8 = sub nsw <32 x i32> zeroinitializer, %6
  %9 = select <32 x i1> %7, <32 x i32> %6, <32 x i32> %8
  %10 = add nsw <32 x i32> %9, %vec.phi
  %index.next = add i64 %index, 4
  %11 = icmp eq i64 %index.next, 1024
  br i1 %11, label %middle.block, label %vector.body

middle.block:
  %.lcssa = phi <32 x i32> [ %10, %vector.body ]
  %rdx.shuf = shufflevector <32 x i32> %.lcssa, <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <32 x i32> %.lcssa, %rdx.shuf
  %rdx.shuf2 = shufflevector <32 x i32> %bin.rdx, <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx2 = add <32 x i32> %bin.rdx, %rdx.shuf2
  %rdx.shuf3 = shufflevector <32 x i32> %bin.rdx2, <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx3 = add <32 x i32> %bin.rdx2, %rdx.shuf3
  %rdx.shuf4 = shufflevector <32 x i32> %bin.rdx3, <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx4 = add <32 x i32> %bin.rdx3, %rdx.shuf4
  %rdx.shuf5 = shufflevector <32 x i32> %bin.rdx4, <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx5 = add <32 x i32> %bin.rdx4, %rdx.shuf5
  %12 = extractelement <32 x i32> %bin.rdx5, i32 0
  ret i32 %12
}

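; sad_avx64i8 processes 64 bytes per loop iteration; of the checked subtargets
; only AVX512BW keeps the whole vector in one register and uses vpsadbw.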
398define i32 @sad_avx64i8() nounwind {
399; SSE2-LABEL: sad_avx64i8:
400; SSE2:       # BB#0: # %entry
401; SSE2-NEXT:    subq $232, %rsp
402; SSE2-NEXT:    pxor %xmm8, %xmm8
403; SSE2-NEXT:    movq $-1024, %rax # imm = 0xFC00
404; SSE2-NEXT:    pxor %xmm5, %xmm5
405; SSE2-NEXT:    pxor %xmm2, %xmm2
406; SSE2-NEXT:    pxor %xmm1, %xmm1
407; SSE2-NEXT:    pxor %xmm3, %xmm3
408; SSE2-NEXT:    pxor %xmm6, %xmm6
409; SSE2-NEXT:    pxor %xmm0, %xmm0
410; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
411; SSE2-NEXT:    pxor %xmm13, %xmm13
412; SSE2-NEXT:    pxor %xmm10, %xmm10
413; SSE2-NEXT:    pxor %xmm0, %xmm0
414; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
415; SSE2-NEXT:    pxor %xmm12, %xmm12
416; SSE2-NEXT:    pxor %xmm0, %xmm0
417; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
418; SSE2-NEXT:    pxor %xmm11, %xmm11
419; SSE2-NEXT:    pxor %xmm15, %xmm15
420; SSE2-NEXT:    pxor %xmm9, %xmm9
421; SSE2-NEXT:    pxor %xmm7, %xmm7
422; SSE2-NEXT:    pxor %xmm0, %xmm0
423; SSE2-NEXT:    .p2align 4, 0x90
424; SSE2-NEXT:  .LBB2_1: # %vector.body
425; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
426; SSE2-NEXT:    movdqa %xmm12, {{[0-9]+}}(%rsp) # 16-byte Spill
427; SSE2-NEXT:    movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
428; SSE2-NEXT:    movdqa %xmm15, {{[0-9]+}}(%rsp) # 16-byte Spill
429; SSE2-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
430; SSE2-NEXT:    movdqa %xmm11, {{[0-9]+}}(%rsp) # 16-byte Spill
431; SSE2-NEXT:    movdqa %xmm9, {{[0-9]+}}(%rsp) # 16-byte Spill
432; SSE2-NEXT:    movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill
433; SSE2-NEXT:    movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill
434; SSE2-NEXT:    movdqa %xmm6, {{[0-9]+}}(%rsp) # 16-byte Spill
435; SSE2-NEXT:    movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
436; SSE2-NEXT:    movdqa %xmm5, {{[0-9]+}}(%rsp) # 16-byte Spill
437; SSE2-NEXT:    movdqa %xmm13, {{[0-9]+}}(%rsp) # 16-byte Spill
438; SSE2-NEXT:    movdqa %xmm10, {{[0-9]+}}(%rsp) # 16-byte Spill
439; SSE2-NEXT:    movdqa a+1040(%rax), %xmm13
440; SSE2-NEXT:    movdqa a+1024(%rax), %xmm1
441; SSE2-NEXT:    movdqa a+1056(%rax), %xmm3
442; SSE2-NEXT:    movdqa a+1072(%rax), %xmm6
443; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
444; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
445; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
446; SSE2-NEXT:    movdqa %xmm3, %xmm12
447; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
448; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
449; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm13[2,3,0,1]
450; SSE2-NEXT:    punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7]
451; SSE2-NEXT:    movdqa %xmm10, -{{[0-9]+}}(%rsp) # 16-byte Spill
452; SSE2-NEXT:    punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7]
453; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
454; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
455; SSE2-NEXT:    movdqa %xmm1, %xmm0
456; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
457; SSE2-NEXT:    movdqa %xmm0, %xmm15
458; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
459; SSE2-NEXT:    punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3],xmm13[4],xmm8[4],xmm13[5],xmm8[5],xmm13[6],xmm8[6],xmm13[7],xmm8[7]
460; SSE2-NEXT:    movdqa %xmm13, %xmm0
461; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
462; SSE2-NEXT:    punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm8[4],xmm13[5],xmm8[5],xmm13[6],xmm8[6],xmm13[7],xmm8[7]
463; SSE2-NEXT:    movdqa b+1040(%rax), %xmm7
464; SSE2-NEXT:    movdqa b+1024(%rax), %xmm11
465; SSE2-NEXT:    movdqa b+1056(%rax), %xmm9
466; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[2,3,0,1]
467; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
468; SSE2-NEXT:    movdqa %xmm7, %xmm4
469; SSE2-NEXT:    punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
470; SSE2-NEXT:    psubd %xmm7, %xmm13
471; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm11[2,3,0,1]
472; SSE2-NEXT:    punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3],xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7]
473; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3]
474; SSE2-NEXT:    psubd %xmm4, %xmm0
475; SSE2-NEXT:    movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
476; SSE2-NEXT:    movdqa %xmm11, %xmm4
477; SSE2-NEXT:    punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7]
478; SSE2-NEXT:    psubd %xmm11, %xmm1
479; SSE2-NEXT:    pshufd {{.*#+}} xmm14 = xmm9[2,3,0,1]
480; SSE2-NEXT:    punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
481; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3]
482; SSE2-NEXT:    psubd %xmm4, %xmm15
483; SSE2-NEXT:    movdqa %xmm15, -{{[0-9]+}}(%rsp) # 16-byte Spill
484; SSE2-NEXT:    movdqa %xmm9, %xmm4
485; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
486; SSE2-NEXT:    punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
487; SSE2-NEXT:    psubd %xmm9, %xmm3
488; SSE2-NEXT:    movdqa %xmm5, %xmm0
489; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
490; SSE2-NEXT:    psubd %xmm5, %xmm10
491; SSE2-NEXT:    movdqa %xmm2, %xmm15
492; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
493; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
494; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3]
495; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
496; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
497; SSE2-NEXT:    psubd %xmm0, %xmm5
498; SSE2-NEXT:    movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill
499; SSE2-NEXT:    movdqa %xmm7, %xmm0
500; SSE2-NEXT:    punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
501; SSE2-NEXT:    psubd %xmm7, %xmm2
502; SSE2-NEXT:    movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
503; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[2,3,0,1]
504; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7]
505; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
506; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
507; SSE2-NEXT:    punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3]
508; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
509; SSE2-NEXT:    psubd %xmm0, %xmm15
510; SSE2-NEXT:    movdqa %xmm2, %xmm11
511; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3]
512; SSE2-NEXT:    punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7]
513; SSE2-NEXT:    punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3],xmm14[4],xmm8[4],xmm14[5],xmm8[5],xmm14[6],xmm8[6],xmm14[7],xmm8[7]
514; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7]
515; SSE2-NEXT:    psubd %xmm4, %xmm12
516; SSE2-NEXT:    movdqa %xmm14, %xmm0
517; SSE2-NEXT:    punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3]
518; SSE2-NEXT:    psubd %xmm14, %xmm2
519; SSE2-NEXT:    movdqa %xmm2, %xmm14
520; SSE2-NEXT:    movdqa %xmm6, %xmm9
521; SSE2-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3]
522; SSE2-NEXT:    punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7]
523; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
524; SSE2-NEXT:    psubd %xmm0, %xmm11
525; SSE2-NEXT:    movdqa b+1072(%rax), %xmm0
526; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
527; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
528; SSE2-NEXT:    movdqa %xmm0, %xmm5
529; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
530; SSE2-NEXT:    psubd %xmm0, %xmm6
531; SSE2-NEXT:    punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
532; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
533; SSE2-NEXT:    psubd %xmm5, %xmm9
534; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
535; SSE2-NEXT:    movdqa %xmm7, %xmm0
536; SSE2-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
537; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7]
538; SSE2-NEXT:    movdqa %xmm4, %xmm5
539; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3]
540; SSE2-NEXT:    psubd %xmm4, %xmm7
541; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
542; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
543; SSE2-NEXT:    psubd %xmm5, %xmm0
544; SSE2-NEXT:    movdqa %xmm0, %xmm4
545; SSE2-NEXT:    psrad $31, %xmm4
546; SSE2-NEXT:    paddd %xmm4, %xmm0
547; SSE2-NEXT:    pxor %xmm4, %xmm0
548; SSE2-NEXT:    movdqa %xmm7, %xmm4
549; SSE2-NEXT:    psrad $31, %xmm4
550; SSE2-NEXT:    paddd %xmm4, %xmm7
551; SSE2-NEXT:    pxor %xmm4, %xmm7
552; SSE2-NEXT:    movdqa %xmm9, %xmm4
553; SSE2-NEXT:    psrad $31, %xmm4
554; SSE2-NEXT:    paddd %xmm4, %xmm9
555; SSE2-NEXT:    pxor %xmm4, %xmm9
556; SSE2-NEXT:    movdqa %xmm6, %xmm4
557; SSE2-NEXT:    psrad $31, %xmm4
558; SSE2-NEXT:    paddd %xmm4, %xmm6
559; SSE2-NEXT:    pxor %xmm4, %xmm6
560; SSE2-NEXT:    movdqa %xmm6, -{{[0-9]+}}(%rsp) # 16-byte Spill
561; SSE2-NEXT:    movdqa %xmm11, %xmm4
562; SSE2-NEXT:    psrad $31, %xmm4
563; SSE2-NEXT:    paddd %xmm4, %xmm11
564; SSE2-NEXT:    pxor %xmm4, %xmm11
565; SSE2-NEXT:    movdqa %xmm14, %xmm4
566; SSE2-NEXT:    psrad $31, %xmm4
567; SSE2-NEXT:    paddd %xmm4, %xmm14
568; SSE2-NEXT:    pxor %xmm4, %xmm14
569; SSE2-NEXT:    movdqa %xmm12, %xmm4
570; SSE2-NEXT:    psrad $31, %xmm4
571; SSE2-NEXT:    paddd %xmm4, %xmm12
572; SSE2-NEXT:    pxor %xmm4, %xmm12
573; SSE2-NEXT:    movdqa %xmm12, {{[0-9]+}}(%rsp) # 16-byte Spill
574; SSE2-NEXT:    movdqa %xmm15, %xmm4
575; SSE2-NEXT:    psrad $31, %xmm4
576; SSE2-NEXT:    paddd %xmm4, %xmm15
577; SSE2-NEXT:    pxor %xmm4, %xmm15
578; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
579; SSE2-NEXT:    movdqa %xmm2, %xmm4
580; SSE2-NEXT:    psrad $31, %xmm4
581; SSE2-NEXT:    paddd %xmm4, %xmm2
582; SSE2-NEXT:    pxor %xmm4, %xmm2
583; SSE2-NEXT:    movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
584; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
585; SSE2-NEXT:    movdqa %xmm2, %xmm4
586; SSE2-NEXT:    psrad $31, %xmm4
587; SSE2-NEXT:    paddd %xmm4, %xmm2
588; SSE2-NEXT:    pxor %xmm4, %xmm2
589; SSE2-NEXT:    movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
590; SSE2-NEXT:    movdqa %xmm10, %xmm4
591; SSE2-NEXT:    psrad $31, %xmm4
592; SSE2-NEXT:    paddd %xmm4, %xmm10
593; SSE2-NEXT:    pxor %xmm4, %xmm10
594; SSE2-NEXT:    movdqa %xmm3, %xmm4
595; SSE2-NEXT:    psrad $31, %xmm4
596; SSE2-NEXT:    paddd %xmm4, %xmm3
597; SSE2-NEXT:    pxor %xmm4, %xmm3
598; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
599; SSE2-NEXT:    movdqa %xmm2, %xmm4
600; SSE2-NEXT:    psrad $31, %xmm4
601; SSE2-NEXT:    paddd %xmm4, %xmm2
602; SSE2-NEXT:    pxor %xmm4, %xmm2
603; SSE2-NEXT:    movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
604; SSE2-NEXT:    movdqa %xmm1, %xmm4
605; SSE2-NEXT:    psrad $31, %xmm4
606; SSE2-NEXT:    paddd %xmm4, %xmm1
607; SSE2-NEXT:    pxor %xmm4, %xmm1
608; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
609; SSE2-NEXT:    movdqa %xmm2, %xmm4
610; SSE2-NEXT:    psrad $31, %xmm4
611; SSE2-NEXT:    paddd %xmm4, %xmm2
612; SSE2-NEXT:    pxor %xmm4, %xmm2
613; SSE2-NEXT:    movdqa %xmm2, %xmm5
614; SSE2-NEXT:    movdqa %xmm13, %xmm4
615; SSE2-NEXT:    psrad $31, %xmm4
616; SSE2-NEXT:    paddd %xmm4, %xmm13
617; SSE2-NEXT:    pxor %xmm4, %xmm13
618; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
619; SSE2-NEXT:    paddd %xmm13, %xmm2
620; SSE2-NEXT:    movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
621; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm12 # 16-byte Reload
622; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
623; SSE2-NEXT:    paddd %xmm5, %xmm6
624; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
625; SSE2-NEXT:    paddd %xmm1, %xmm4
626; SSE2-NEXT:    movdqa %xmm4, {{[0-9]+}}(%rsp) # 16-byte Spill
627; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
628; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
629; SSE2-NEXT:    paddd -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Folded Reload
630; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
631; SSE2-NEXT:    paddd %xmm3, %xmm4
632; SSE2-NEXT:    movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill
633; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
634; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
635; SSE2-NEXT:    paddd %xmm10, %xmm4
636; SSE2-NEXT:    movdqa %xmm4, {{[0-9]+}}(%rsp) # 16-byte Spill
637; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm13 # 16-byte Reload
638; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm10 # 16-byte Reload
639; SSE2-NEXT:    paddd -{{[0-9]+}}(%rsp), %xmm13 # 16-byte Folded Reload
640; SSE2-NEXT:    paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload
641; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
642; SSE2-NEXT:    paddd %xmm15, %xmm1
643; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm15 # 16-byte Reload
644; SSE2-NEXT:    paddd {{[0-9]+}}(%rsp), %xmm12 # 16-byte Folded Reload
645; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
646; SSE2-NEXT:    paddd %xmm14, %xmm4
647; SSE2-NEXT:    movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill
648; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
649; SSE2-NEXT:    paddd %xmm11, %xmm4
650; SSE2-NEXT:    movdqa %xmm4, {{[0-9]+}}(%rsp) # 16-byte Spill
651; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm11 # 16-byte Reload
652; SSE2-NEXT:    paddd -{{[0-9]+}}(%rsp), %xmm15 # 16-byte Folded Reload
653; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
654; SSE2-NEXT:    paddd %xmm9, %xmm4
655; SSE2-NEXT:    movdqa %xmm4, {{[0-9]+}}(%rsp) # 16-byte Spill
656; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm9 # 16-byte Reload
657; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
658; SSE2-NEXT:    paddd %xmm7, %xmm4
659; SSE2-NEXT:    movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill
660; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload
661; SSE2-NEXT:    movdqa (%rsp), %xmm4 # 16-byte Reload
662; SSE2-NEXT:    paddd %xmm0, %xmm4
663; SSE2-NEXT:    movdqa %xmm4, (%rsp) # 16-byte Spill
664; SSE2-NEXT:    movdqa (%rsp), %xmm0 # 16-byte Reload
665; SSE2-NEXT:    addq $4, %rax
666; SSE2-NEXT:    jne .LBB2_1
667; SSE2-NEXT:  # BB#2: # %middle.block
668; SSE2-NEXT:    paddd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
669; SSE2-NEXT:    paddd %xmm7, %xmm13
670; SSE2-NEXT:    paddd -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Folded Reload
671; SSE2-NEXT:    paddd %xmm15, %xmm6
672; SSE2-NEXT:    paddd %xmm11, %xmm3
673; SSE2-NEXT:    paddd %xmm0, %xmm10
674; SSE2-NEXT:    paddd %xmm12, %xmm2
675; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
676; SSE2-NEXT:    paddd %xmm9, %xmm0
677; SSE2-NEXT:    paddd %xmm2, %xmm0
678; SSE2-NEXT:    paddd %xmm3, %xmm10
679; SSE2-NEXT:    paddd %xmm5, %xmm6
680; SSE2-NEXT:    paddd %xmm1, %xmm13
681; SSE2-NEXT:    paddd %xmm6, %xmm13
682; SSE2-NEXT:    paddd %xmm0, %xmm10
683; SSE2-NEXT:    paddd %xmm13, %xmm10
684; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm10[2,3,0,1]
685; SSE2-NEXT:    paddd %xmm10, %xmm0
686; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
687; SSE2-NEXT:    paddd %xmm0, %xmm1
688; SSE2-NEXT:    movd %xmm1, %eax
689; SSE2-NEXT:    addq $232, %rsp
690; SSE2-NEXT:    retq
691;
692; AVX2-LABEL: sad_avx64i8:
693; AVX2:       # BB#0: # %entry
694; AVX2-NEXT:    vpxor %ymm0, %ymm0, %ymm0
695; AVX2-NEXT:    movq $-1024, %rax # imm = 0xFC00
696; AVX2-NEXT:    vpxor %ymm2, %ymm2, %ymm2
697; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
698; AVX2-NEXT:    vpxor %ymm3, %ymm3, %ymm3
699; AVX2-NEXT:    vpxor %ymm4, %ymm4, %ymm4
700; AVX2-NEXT:    vpxor %ymm6, %ymm6, %ymm6
701; AVX2-NEXT:    vpxor %ymm5, %ymm5, %ymm5
702; AVX2-NEXT:    vpxor %ymm7, %ymm7, %ymm7
703; AVX2-NEXT:    .p2align 4, 0x90
704; AVX2-NEXT:  .LBB2_1: # %vector.body
705; AVX2-NEXT:    # =>This Inner Loop Header: Depth=1
706; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
707; AVX2-NEXT:    vmovdqu %ymm8, -{{[0-9]+}}(%rsp) # 32-byte Spill
708; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
709; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
710; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
711; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm12 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
712; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm13 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
713; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm14 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
714; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
715; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
716; AVX2-NEXT:    vpsubd %ymm8, %ymm15, %ymm8
717; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
718; AVX2-NEXT:    vpsubd %ymm15, %ymm14, %ymm14
719; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
720; AVX2-NEXT:    vpsubd %ymm15, %ymm13, %ymm13
721; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
722; AVX2-NEXT:    vpsubd %ymm15, %ymm12, %ymm12
723; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
724; AVX2-NEXT:    vpsubd %ymm15, %ymm11, %ymm11
725; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
726; AVX2-NEXT:    vpsubd %ymm15, %ymm10, %ymm10
727; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
728; AVX2-NEXT:    vpsubd %ymm15, %ymm9, %ymm9
729; AVX2-NEXT:    vmovdqu %ymm9, -{{[0-9]+}}(%rsp) # 32-byte Spill
730; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
731; AVX2-NEXT:    vmovdqu -{{[0-9]+}}(%rsp), %ymm9 # 32-byte Reload
732; AVX2-NEXT:    vpsubd %ymm15, %ymm9, %ymm15
733; AVX2-NEXT:    vpabsd %ymm8, %ymm8
734; AVX2-NEXT:    vpaddd %ymm3, %ymm8, %ymm3
735; AVX2-NEXT:    vpabsd %ymm14, %ymm8
736; AVX2-NEXT:    vpaddd %ymm1, %ymm8, %ymm1
737; AVX2-NEXT:    vpabsd %ymm13, %ymm8
738; AVX2-NEXT:    vpaddd %ymm2, %ymm8, %ymm2
739; AVX2-NEXT:    vpabsd %ymm12, %ymm8
740; AVX2-NEXT:    vpaddd %ymm0, %ymm8, %ymm0
741; AVX2-NEXT:    vpabsd %ymm11, %ymm8
742; AVX2-NEXT:    vpaddd %ymm4, %ymm8, %ymm4
743; AVX2-NEXT:    vpabsd %ymm10, %ymm8
744; AVX2-NEXT:    vpaddd %ymm6, %ymm8, %ymm6
745; AVX2-NEXT:    vpabsd -{{[0-9]+}}(%rsp), %ymm8 # 32-byte Folded Reload
746; AVX2-NEXT:    vpaddd %ymm5, %ymm8, %ymm5
747; AVX2-NEXT:    vpabsd %ymm15, %ymm8
748; AVX2-NEXT:    vpaddd %ymm7, %ymm8, %ymm7
749; AVX2-NEXT:    addq $4, %rax
750; AVX2-NEXT:    jne .LBB2_1
751; AVX2-NEXT:  # BB#2: # %middle.block
752; AVX2-NEXT:    vpaddd %ymm6, %ymm2, %ymm2
753; AVX2-NEXT:    vpaddd %ymm7, %ymm3, %ymm3
754; AVX2-NEXT:    vpaddd %ymm4, %ymm0, %ymm0
755; AVX2-NEXT:    vpaddd %ymm5, %ymm1, %ymm1
756; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
757; AVX2-NEXT:    vpaddd %ymm3, %ymm2, %ymm1
758; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
759; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
760; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
761; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
762; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
763; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
764; AVX2-NEXT:    vmovd %xmm0, %eax
765; AVX2-NEXT:    vzeroupper
766; AVX2-NEXT:    retq
767;
768; AVX512F-LABEL: sad_avx64i8:
769; AVX512F:       # BB#0: # %entry
770; AVX512F-NEXT:    vpxord %zmm0, %zmm0, %zmm0
771; AVX512F-NEXT:    movq $-1024, %rax # imm = 0xFC00
772; AVX512F-NEXT:    vpxord %zmm1, %zmm1, %zmm1
773; AVX512F-NEXT:    vpxord %zmm2, %zmm2, %zmm2
774; AVX512F-NEXT:    vpxord %zmm3, %zmm3, %zmm3
775; AVX512F-NEXT:    .p2align 4, 0x90
776; AVX512F-NEXT:  .LBB2_1: # %vector.body
777; AVX512F-NEXT:    # =>This Inner Loop Header: Depth=1
778; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
779; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
780; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
781; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
782; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
783; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
784; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
785; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
786; AVX512F-NEXT:    vpsubd %zmm11, %zmm7, %zmm7
787; AVX512F-NEXT:    vpsubd %zmm10, %zmm6, %zmm6
788; AVX512F-NEXT:    vpsubd %zmm9, %zmm5, %zmm5
789; AVX512F-NEXT:    vpsubd %zmm8, %zmm4, %zmm4
790; AVX512F-NEXT:    vpabsd %zmm4, %zmm4
791; AVX512F-NEXT:    vpabsd %zmm5, %zmm5
792; AVX512F-NEXT:    vpabsd %zmm6, %zmm6
793; AVX512F-NEXT:    vpabsd %zmm7, %zmm7
794; AVX512F-NEXT:    vpaddd %zmm3, %zmm7, %zmm3
795; AVX512F-NEXT:    vpaddd %zmm2, %zmm6, %zmm2
796; AVX512F-NEXT:    vpaddd %zmm1, %zmm5, %zmm1
797; AVX512F-NEXT:    vpaddd %zmm0, %zmm4, %zmm0
798; AVX512F-NEXT:    addq $4, %rax
799; AVX512F-NEXT:    jne .LBB2_1
800; AVX512F-NEXT:  # BB#2: # %middle.block
801; AVX512F-NEXT:    vpaddd %zmm2, %zmm0, %zmm0
802; AVX512F-NEXT:    vpaddd %zmm3, %zmm1, %zmm1
803; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
804; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,0,1]
805; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
806; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,0,1,0,1,0,1]
807; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
808; AVX512F-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
809; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
810; AVX512F-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
811; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
812; AVX512F-NEXT:    vmovd %xmm0, %eax
813; AVX512F-NEXT:    retq
814;
815; AVX512BW-LABEL: sad_avx64i8:
816; AVX512BW:       # BB#0: # %entry
817; AVX512BW-NEXT:    vpxord %zmm0, %zmm0, %zmm0
818; AVX512BW-NEXT:    movq $-1024, %rax # imm = 0xFC00
819; AVX512BW-NEXT:    vpxord %zmm1, %zmm1, %zmm1
820; AVX512BW-NEXT:    .p2align 4, 0x90
821; AVX512BW-NEXT:  .LBB2_1: # %vector.body
822; AVX512BW-NEXT:    # =>This Inner Loop Header: Depth=1
823; AVX512BW-NEXT:    vmovdqu8 a+1024(%rax), %zmm2
824; AVX512BW-NEXT:    vpsadbw b+1024(%rax), %zmm2, %zmm2
825; AVX512BW-NEXT:    vpaddd %zmm1, %zmm2, %zmm1
826; AVX512BW-NEXT:    addq $4, %rax
827; AVX512BW-NEXT:    jne .LBB2_1
828; AVX512BW-NEXT:  # BB#2: # %middle.block
829; AVX512BW-NEXT:    vpaddd %zmm0, %zmm1, %zmm1
830; AVX512BW-NEXT:    vpaddd %zmm0, %zmm0, %zmm0
831; AVX512BW-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
832; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,0,1]
833; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
834; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,0,1,0,1,0,1]
835; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
836; AVX512BW-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
837; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
838; AVX512BW-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
839; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
840; AVX512BW-NEXT:    vmovd %xmm0, %eax
841; AVX512BW-NEXT:    retq
842entry:
843  br label %vector.body
844
845vector.body:
846  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
847  %vec.phi = phi <64 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ]
848  %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index
849  %1 = bitcast i8* %0 to <64 x i8>*
850  %wide.load = load <64 x i8>, <64 x i8>* %1, align 64
851  %2 = zext <64 x i8> %wide.load to <64 x i32>
852  %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index
853  %4 = bitcast i8* %3 to <64 x i8>*
854  %wide.load1 = load <64 x i8>, <64 x i8>* %4, align 64
855  %5 = zext <64 x i8> %wide.load1 to <64 x i32>
856  %6 = sub nsw <64 x i32> %2, %5
857  %7 = icmp sgt <64 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
858  %8 = sub nsw <64 x i32> zeroinitializer, %6
859  %9 = select <64 x i1> %7, <64 x i32> %6, <64 x i32> %8
860  %10 = add nsw <64 x i32> %9, %vec.phi
861  %index.next = add i64 %index, 4
862  %11 = icmp eq i64 %index.next, 1024
863  br i1 %11, label %middle.block, label %vector.body
864
middle.block:
  %.lcssa = phi <64 x i32> [ %10, %vector.body ]
  %rdx.shuf = shufflevector <64 x i32> %.lcssa, <64 x i32> undef, <64 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <64 x i32> %.lcssa, %rdx.shuf
  %rdx.shuf2 = shufflevector <64 x i32> %bin.rdx, <64 x i32> undef, <64 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx2 = add <64 x i32> %bin.rdx, %rdx.shuf2
  %rdx.shuf3 = shufflevector <64 x i32> %bin.rdx2, <64 x i32> undef, <64 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx3 = add <64 x i32> %bin.rdx2, %rdx.shuf3
  %rdx.shuf4 = shufflevector <64 x i32> %bin.rdx3, <64 x i32> undef, <64 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx4 = add <64 x i32> %bin.rdx3, %rdx.shuf4
  %rdx.shuf5 = shufflevector <64 x i32> %bin.rdx4, <64 x i32> undef, <64 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx5 = add <64 x i32> %bin.rdx4, %rdx.shuf5
  %rdx.shuf6 = shufflevector <64 x i32> %bin.rdx5, <64 x i32> undef, <64 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx6 = add <64 x i32> %bin.rdx5, %rdx.shuf6
  %12 = extractelement <64 x i32> %bin.rdx6, i32 0
  ret i32 %12
}

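; sad_2i8 runs the same abs-diff reduction over just two byte lanes. The
; checks below show it is still matched to PSADBW once the unused upper bytes
; of each operand are zeroed out (PAND with 65535 on SSE2, a VPBLENDW against
; a zeroed register on the AVX targets), so only the two live bytes feed the
; sum.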
define i32 @sad_2i8() nounwind {
; SSE2-LABEL: sad_2i8:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    pxor %xmm0, %xmm0
; SSE2-NEXT:    movq $-1024, %rax # imm = 0xFC00
; SSE2-NEXT:    movl $65535, %ecx # imm = 0xFFFF
; SSE2-NEXT:    movd %ecx, %xmm1
; SSE2-NEXT:    .p2align 4, 0x90
; SSE2-NEXT:  .LBB3_1: # %vector.body
; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE2-NEXT:    pand %xmm1, %xmm3
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    psadbw %xmm3, %xmm2
; SSE2-NEXT:    paddq %xmm2, %xmm0
; SSE2-NEXT:    addq $4, %rax
; SSE2-NEXT:    jne .LBB3_1
; SSE2-NEXT:  # BB#2: # %middle.block
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT:    paddq %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    retq
;
; AVX2-LABEL: sad_2i8:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    movq $-1024, %rax # imm = 0xFC00
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    .p2align 4, 0x90
; AVX2-NEXT:  .LBB3_1: # %vector.body
; AVX2-NEXT:    # =>This Inner Loop Header: Depth=1
; AVX2-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX2-NEXT:    vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7]
; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7]
; AVX2-NEXT:    vpsadbw %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
; AVX2-NEXT:    addq $4, %rax
; AVX2-NEXT:    jne .LBB3_1
; AVX2-NEXT:  # BB#2: # %middle.block
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; AVX2-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: sad_2i8:
; AVX512F:       # BB#0: # %entry
; AVX512F-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; AVX512F-NEXT:    movq $-1024, %rax # imm = 0xFC00
; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT:    .p2align 4, 0x90
; AVX512F-NEXT:  .LBB3_1: # %vector.body
; AVX512F-NEXT:    # =>This Inner Loop Header: Depth=1
; AVX512F-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX512F-NEXT:    vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; AVX512F-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7]
; AVX512F-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7]
; AVX512F-NEXT:    vpsadbw %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
; AVX512F-NEXT:    addq $4, %rax
; AVX512F-NEXT:    jne .LBB3_1
; AVX512F-NEXT:  # BB#2: # %middle.block
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; AVX512F-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; AVX512F-NEXT:    vmovd %xmm0, %eax
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: sad_2i8:
; AVX512BW:       # BB#0: # %entry
; AVX512BW-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT:    movq $-1024, %rax # imm = 0xFC00
; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT:    .p2align 4, 0x90
; AVX512BW-NEXT:  .LBB3_1: # %vector.body
; AVX512BW-NEXT:    # =>This Inner Loop Header: Depth=1
; AVX512BW-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX512BW-NEXT:    vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7]
; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7]
; AVX512BW-NEXT:    vpsadbw %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
; AVX512BW-NEXT:    addq $4, %rax
; AVX512BW-NEXT:    jne .LBB3_1
; AVX512BW-NEXT:  # BB#2: # %middle.block
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; AVX512BW-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT:    vmovd %xmm0, %eax
; AVX512BW-NEXT:    retq
entry:
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.phi = phi <2 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ]
  %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index
  %1 = bitcast i8* %0 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %1, align 4
  %2 = zext <2 x i8> %wide.load to <2 x i32>
  %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index
  %4 = bitcast i8* %3 to <2 x i8>*
  %wide.load1 = load <2 x i8>, <2 x i8>* %4, align 4
  %5 = zext <2 x i8> %wide.load1 to <2 x i32>
  %6 = sub nsw <2 x i32> %2, %5
  %7 = icmp sgt <2 x i32> %6, <i32 -1, i32 -1>
  %8 = sub nsw <2 x i32> zeroinitializer, %6
  %9 = select <2 x i1> %7, <2 x i32> %6, <2 x i32> %8
  %10 = add nsw <2 x i32> %9, %vec.phi
  %index.next = add i64 %index, 4
  %11 = icmp eq i64 %index.next, 1024
  br i1 %11, label %middle.block, label %vector.body

middle.block:
  %.lcssa = phi <2 x i32> [ %10, %vector.body ]
  %rdx.shuf = shufflevector <2 x i32> %.lcssa, <2 x i32> undef, <2 x i32> <i32 1, i32 undef>
  %bin.rdx = add <2 x i32> %.lcssa, %rdx.shuf
  %12 = extractelement <2 x i32> %bin.rdx, i32 0
  ret i32 %12
}