; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512BW

@a = global [1024 x i8] zeroinitializer, align 16
@b = global [1024 x i8] zeroinitializer, align 16

define i32 @sad_16i8() nounwind {
; SSE2-LABEL: sad_16i8:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: .LBB0_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
; SSE2-NEXT: movdqu a+1024(%rax), %xmm2
; SSE2-NEXT: movdqu b+1024(%rax), %xmm3
; SSE2-NEXT: psadbw %xmm2, %xmm3
; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: addq $4, %rax
; SSE2-NEXT: jne .LBB0_1
; SSE2-NEXT: # BB#2: # %middle.block
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: paddd %xmm0, %xmm0
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: retq
;
; AVX2-LABEL: sad_16i8:
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpxor %ymm0, %ymm0, %ymm0
; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: .p2align 4, 0x90
; AVX2-NEXT: .LBB0_1: # %vector.body
; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
; AVX2-NEXT: vmovdqu a+1024(%rax), %xmm2
; AVX2-NEXT: vpsadbw b+1024(%rax), %xmm2, %xmm2
; AVX2-NEXT: vpaddd %xmm1, %xmm2, %xmm2
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: addq $4, %rax
; AVX2-NEXT: jne .LBB0_1
; AVX2-NEXT: # BB#2: # %middle.block
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: sad_16i8:
; AVX512F: # BB#0: # %entry
; AVX512F-NEXT: vpxord %zmm0, %zmm0, %zmm0
; AVX512F-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX512F-NEXT: .p2align 4, 0x90
; AVX512F-NEXT: .LBB0_1: # %vector.body
; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1
; AVX512F-NEXT: vmovdqu a+1024(%rax), %xmm1
; AVX512F-NEXT: vpsadbw b+1024(%rax), %xmm1, %xmm1
; AVX512F-NEXT: vpaddd %xmm0, %xmm1, %xmm1
; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
; AVX512F-NEXT: addq $4, %rax
; AVX512F-NEXT: jne .LBB0_1
; AVX512F-NEXT: # BB#2: # %middle.block
; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,0,1]
; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,0,1,0,1,0,1]
; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vmovd %xmm0, %eax
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: sad_16i8:
; AVX512BW: # BB#0: # %entry
; AVX512BW-NEXT: vpxord %zmm0, %zmm0, %zmm0
; AVX512BW-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX512BW-NEXT: .p2align 4, 0x90
; AVX512BW-NEXT: .LBB0_1: # %vector.body
; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1
; AVX512BW-NEXT: vmovdqu a+1024(%rax), %xmm1
; AVX512BW-NEXT: vpsadbw b+1024(%rax), %xmm1, %xmm1
; AVX512BW-NEXT: vpaddd %xmm0, %xmm1, %xmm1
; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT: addq $4, %rax
; AVX512BW-NEXT: jne .LBB0_1
; AVX512BW-NEXT: # BB#2: # %middle.block
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,0,1]
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,0,1,0,1,0,1]
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovd %xmm0, %eax
; AVX512BW-NEXT: retq
entry:
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.phi = phi <16 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ]
  %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index
  %1 = bitcast i8* %0 to <16 x i8>*
  %wide.load = load <16 x i8>, <16 x i8>* %1, align 4
  %2 = zext <16 x i8> %wide.load to <16 x i32>
  %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index
  %4 = bitcast i8* %3 to <16 x i8>*
  %wide.load1 = load <16 x i8>, <16 x i8>* %4, align 4
  %5 = zext <16 x i8> %wide.load1 to <16 x i32>
  %6 = sub nsw <16 x i32> %2, %5
  %7 = icmp sgt <16 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %8 = sub nsw <16 x i32> zeroinitializer, %6
  %9 = select <16 x i1> %7, <16 x i32> %6, <16 x i32> %8
  %10 = add nsw <16 x i32> %9, %vec.phi
  %index.next = add i64 %index, 4
  %11 = icmp eq i64 %index.next, 1024
  br i1 %11, label %middle.block, label %vector.body

middle.block:
  %.lcssa = phi <16 x i32> [ %10, %vector.body ]
  %rdx.shuf = shufflevector <16 x i32> %.lcssa, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <16 x i32> %.lcssa, %rdx.shuf
  %rdx.shuf2 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx2 = add <16 x i32> %bin.rdx, %rdx.shuf2
  %rdx.shuf3 = shufflevector <16 x i32> %bin.rdx2, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx3 = add <16 x i32> %bin.rdx2, %rdx.shuf3
  %rdx.shuf4 = shufflevector <16 x i32> %bin.rdx3, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx4 = add <16 x i32> %bin.rdx3, %rdx.shuf4
  %12 = extractelement <16 x i32> %bin.rdx4, i32 0
  ret i32 %12
}

define i32 @sad_32i8() nounwind {
; SSE2-LABEL: sad_32i8:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: pxor %xmm12, %xmm12
; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm13, %xmm13
; SSE2-NEXT: pxor %xmm15, %xmm15
; SSE2-NEXT: pxor %xmm5, %xmm5
; SSE2-NEXT: pxor %xmm14, %xmm14
; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: .LBB1_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa a+1040(%rax), %xmm0
; SSE2-NEXT: movdqa a+1024(%rax), %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm1[2,3,0,1]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3],xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7]
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,0,1]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3],xmm7[4],xmm12[4],xmm7[5],xmm12[5],xmm7[6],xmm12[6],xmm7[7],xmm12[7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3],xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7]
; SSE2-NEXT: movdqa %xmm1, %xmm6
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
; SSE2-NEXT: movdqa b+1040(%rax), %xmm3
; SSE2-NEXT: movdqa b+1024(%rax), %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm3[2,3,0,1]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3],xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7]
; SSE2-NEXT: movdqa %xmm3, %xmm10
; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7]
; SSE2-NEXT: psubd %xmm3, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm5[2,3,0,1]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3]
; SSE2-NEXT: psubd %xmm10, %xmm2
; SSE2-NEXT: movdqa %xmm5, %xmm3
; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
; SSE2-NEXT: psubd %xmm5, %xmm1
; SSE2-NEXT: movdqa %xmm7, %xmm5
; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm12[4],xmm7[5],xmm12[5],xmm7[6],xmm12[6],xmm7[7],xmm12[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3],xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3]
; SSE2-NEXT: psubd %xmm3, %xmm6
; SSE2-NEXT: movdqa %xmm4, %xmm10
; SSE2-NEXT: movdqa %xmm9, %xmm4
; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7]
; SSE2-NEXT: psubd %xmm9, %xmm7
; SSE2-NEXT: movdqa %xmm8, %xmm3
; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3]
; SSE2-NEXT: psubd %xmm4, %xmm5
; SSE2-NEXT: movdqa %xmm11, %xmm4
; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
; SSE2-NEXT: psubd %xmm11, %xmm8
; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3]
; SSE2-NEXT: psubd %xmm4, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: paddd %xmm4, %xmm3
; SSE2-NEXT: pxor %xmm4, %xmm3
; SSE2-NEXT: movdqa %xmm8, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: paddd %xmm4, %xmm8
; SSE2-NEXT: pxor %xmm4, %xmm8
; SSE2-NEXT: movdqa %xmm5, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: paddd %xmm4, %xmm5
; SSE2-NEXT: pxor %xmm4, %xmm5
; SSE2-NEXT: movdqa %xmm7, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: paddd %xmm4, %xmm7
; SSE2-NEXT: pxor %xmm4, %xmm7
; SSE2-NEXT: movdqa %xmm6, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: paddd %xmm4, %xmm6
; SSE2-NEXT: pxor %xmm4, %xmm6
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: paddd %xmm4, %xmm1
; SSE2-NEXT: pxor %xmm4, %xmm1
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: paddd %xmm4, %xmm2
; SSE2-NEXT: pxor %xmm4, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: paddd %xmm4, %xmm0
; SSE2-NEXT: pxor %xmm4, %xmm0
; SSE2-NEXT: movdqa %xmm10, %xmm4
; SSE2-NEXT: paddd %xmm0, %xmm15
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: paddd %xmm2, %xmm13
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
; SSE2-NEXT: paddd %xmm1, %xmm2
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd %xmm6, %xmm4
; SSE2-NEXT: paddd %xmm7, %xmm14
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
; SSE2-NEXT: paddd %xmm5, %xmm6
; SSE2-NEXT: movdqa %xmm6, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
; SSE2-NEXT: paddd %xmm8, %xmm1
; SSE2-NEXT: paddd %xmm3, %xmm0
; SSE2-NEXT: addq $4, %rax
; SSE2-NEXT: jne .LBB1_1
; SSE2-NEXT: # BB#2: # %middle.block
; SSE2-NEXT: paddd %xmm15, %xmm2
; SSE2-NEXT: paddd %xmm14, %xmm1
; SSE2-NEXT: paddd %xmm13, %xmm4
; SSE2-NEXT: paddd %xmm5, %xmm0
; SSE2-NEXT: paddd %xmm4, %xmm0
; SSE2-NEXT: paddd %xmm2, %xmm1
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
; AVX2-LABEL: sad_32i8:
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpxor %ymm0, %ymm0, %ymm0
; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: .p2align 4, 0x90
; AVX2-NEXT: .LBB1_1: # %vector.body
; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
; AVX2-NEXT: vmovdqa a+1024(%rax), %ymm2
; AVX2-NEXT: vpsadbw b+1024(%rax), %ymm2, %ymm2
; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1
; AVX2-NEXT: addq $4, %rax
; AVX2-NEXT: jne .LBB1_1
; AVX2-NEXT: # BB#2: # %middle.block
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm0, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: sad_32i8:
; AVX512F: # BB#0: # %entry
; AVX512F-NEXT: vpxord %zmm0, %zmm0, %zmm0
; AVX512F-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512F-NEXT: .p2align 4, 0x90
; AVX512F-NEXT: .LBB1_1: # %vector.body
; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1
; AVX512F-NEXT: vmovdqa a+1024(%rax), %ymm2
; AVX512F-NEXT: vpsadbw b+1024(%rax), %ymm2, %ymm2
; AVX512F-NEXT: vpaddd %ymm1, %ymm2, %ymm2
; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1
; AVX512F-NEXT: addq $4, %rax
; AVX512F-NEXT: jne .LBB1_1
; AVX512F-NEXT: # BB#2: # %middle.block
; AVX512F-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,0,1]
; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,0,1,0,1,0,1]
; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vmovd %xmm0, %eax
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: sad_32i8:
; AVX512BW: # BB#0: # %entry
; AVX512BW-NEXT: vpxord %zmm0, %zmm0, %zmm0
; AVX512BW-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: .p2align 4, 0x90
; AVX512BW-NEXT: .LBB1_1: # %vector.body
; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1
; AVX512BW-NEXT: vmovdqa a+1024(%rax), %ymm2
; AVX512BW-NEXT: vpsadbw b+1024(%rax), %ymm2, %ymm2
; AVX512BW-NEXT: vpaddd %ymm1, %ymm2, %ymm2
; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1
; AVX512BW-NEXT: addq $4, %rax
; AVX512BW-NEXT: jne .LBB1_1
; AVX512BW-NEXT: # BB#2: # %middle.block
; AVX512BW-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,0,1]
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,0,1,0,1,0,1]
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovd %xmm0, %eax
; AVX512BW-NEXT: retq
entry:
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.phi = phi <32 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ]
  %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index
  %1 = bitcast i8* %0 to <32 x i8>*
  %wide.load = load <32 x i8>, <32 x i8>* %1, align 32
  %2 = zext <32 x i8> %wide.load to <32 x i32>
  %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index
  %4 = bitcast i8* %3 to <32 x i8>*
  %wide.load1 = load <32 x i8>, <32 x i8>* %4, align 32
  %5 = zext <32 x i8> %wide.load1 to <32 x i32>
  %6 = sub nsw <32 x i32> %2, %5
  %7 = icmp sgt <32 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %8 = sub nsw <32 x i32> zeroinitializer, %6
  %9 = select <32 x i1> %7, <32 x i32> %6, <32 x i32> %8
  %10 = add nsw <32 x i32> %9, %vec.phi
  %index.next = add i64 %index, 4
  %11 = icmp eq i64 %index.next, 1024
  br i1 %11, label %middle.block, label %vector.body

middle.block:
  %.lcssa = phi <32 x i32> [ %10, %vector.body ]
  %rdx.shuf = shufflevector <32 x i32> %.lcssa, <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <32 x i32> %.lcssa, %rdx.shuf
  %rdx.shuf2 = shufflevector <32 x i32> %bin.rdx, <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx2 = add <32 x i32> %bin.rdx, %rdx.shuf2
  %rdx.shuf3 = shufflevector <32 x i32> %bin.rdx2, <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx3 = add <32 x i32> %bin.rdx2, %rdx.shuf3
  %rdx.shuf4 = shufflevector <32 x i32> %bin.rdx3, <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx4 = add <32 x i32> %bin.rdx3, %rdx.shuf4
  %rdx.shuf5 = shufflevector <32 x i32> %bin.rdx4, <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx5 = add <32 x i32> %bin.rdx4, %rdx.shuf5
  %12 = extractelement <32 x i32> %bin.rdx5, i32 0
  ret i32 %12
}

define i32 @sad_avx64i8() nounwind {
; SSE2-LABEL: sad_avx64i8:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: subq $232, %rsp
; SSE2-NEXT: pxor %xmm8, %xmm8
; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
; SSE2-NEXT: pxor %xmm5, %xmm5
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pxor %xmm6, %xmm6
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: pxor %xmm13, %xmm13
; SSE2-NEXT: pxor %xmm10, %xmm10
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: pxor %xmm12, %xmm12
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: pxor %xmm11, %xmm11
; SSE2-NEXT: pxor %xmm15, %xmm15
; SSE2-NEXT: pxor %xmm9, %xmm9
; SSE2-NEXT: pxor %xmm7, %xmm7
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: .LBB2_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
; SSE2-NEXT: movdqa %xmm12, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm15, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm11, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm9, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm6, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm5, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm13, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm10, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa a+1040(%rax), %xmm13
; SSE2-NEXT: movdqa a+1024(%rax), %xmm1
; SSE2-NEXT: movdqa a+1056(%rax), %xmm3
; SSE2-NEXT: movdqa a+1072(%rax), %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
; SSE2-NEXT: movdqa %xmm3, %xmm12
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm13[2,3,0,1]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7]
; SSE2-NEXT: movdqa %xmm10, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
; SSE2-NEXT: movdqa %xmm0, %xmm15
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3],xmm13[4],xmm8[4],xmm13[5],xmm8[5],xmm13[6],xmm8[6],xmm13[7],xmm8[7]
; SSE2-NEXT: movdqa %xmm13, %xmm0
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm8[4],xmm13[5],xmm8[5],xmm13[6],xmm8[6],xmm13[7],xmm8[7]
; SSE2-NEXT: movdqa b+1040(%rax), %xmm7
; SSE2-NEXT: movdqa b+1024(%rax), %xmm11
; SSE2-NEXT: movdqa b+1056(%rax), %xmm9
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[2,3,0,1]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
; SSE2-NEXT: movdqa %xmm7, %xmm4
; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
; SSE2-NEXT: psubd %xmm7, %xmm13
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm11[2,3,0,1]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3],xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3]
; SSE2-NEXT: psubd %xmm4, %xmm0
; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm11, %xmm4
; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7]
; SSE2-NEXT: psubd %xmm11, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm9[2,3,0,1]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3]
; SSE2-NEXT: psubd %xmm4, %xmm15
; SSE2-NEXT: movdqa %xmm15, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm9, %xmm4
; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
; SSE2-NEXT: psubd %xmm9, %xmm3
; SSE2-NEXT: movdqa %xmm5, %xmm0
; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
; SSE2-NEXT: psubd %xmm5, %xmm10
; SSE2-NEXT: movdqa %xmm2, %xmm15
; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
; SSE2-NEXT: psubd %xmm0, %xmm5
; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm7, %xmm0
; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
; SSE2-NEXT: psubd %xmm7, %xmm2
; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[2,3,0,1]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7]
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
; SSE2-NEXT: psubd %xmm0, %xmm15
; SSE2-NEXT: movdqa %xmm2, %xmm11
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3],xmm14[4],xmm8[4],xmm14[5],xmm8[5],xmm14[6],xmm8[6],xmm14[7],xmm8[7]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7]
; SSE2-NEXT: psubd %xmm4, %xmm12
; SSE2-NEXT: movdqa %xmm14, %xmm0
; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3]
; SSE2-NEXT: psubd %xmm14, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm14
; SSE2-NEXT: movdqa %xmm6, %xmm9
; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
; SSE2-NEXT: psubd %xmm0, %xmm11
; SSE2-NEXT: movdqa b+1072(%rax), %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
; SSE2-NEXT: psubd %xmm0, %xmm6
; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
; SSE2-NEXT: psubd %xmm5, %xmm9
; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
; SSE2-NEXT: movdqa %xmm7, %xmm0
; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7]
; SSE2-NEXT: movdqa %xmm4, %xmm5
; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3]
; SSE2-NEXT: psubd %xmm4, %xmm7
; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
; SSE2-NEXT: psubd %xmm5, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: paddd %xmm4, %xmm0
; SSE2-NEXT: pxor %xmm4, %xmm0
; SSE2-NEXT: movdqa %xmm7, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: paddd %xmm4, %xmm7
; SSE2-NEXT: pxor %xmm4, %xmm7
; SSE2-NEXT: movdqa %xmm9, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: paddd %xmm4, %xmm9
; SSE2-NEXT: pxor %xmm4, %xmm9
; SSE2-NEXT: movdqa %xmm6, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: paddd %xmm4, %xmm6
; SSE2-NEXT: pxor %xmm4, %xmm6
; SSE2-NEXT: movdqa %xmm6, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm11, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: paddd %xmm4, %xmm11
; SSE2-NEXT: pxor %xmm4, %xmm11
; SSE2-NEXT: movdqa %xmm14, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: paddd %xmm4, %xmm14
; SSE2-NEXT: pxor %xmm4, %xmm14
; SSE2-NEXT: movdqa %xmm12, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: paddd %xmm4, %xmm12
; SSE2-NEXT: pxor %xmm4, %xmm12
; SSE2-NEXT: movdqa %xmm12, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm15, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: paddd %xmm4, %xmm15
; SSE2-NEXT: pxor %xmm4, %xmm15
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: paddd %xmm4, %xmm2
; SSE2-NEXT: pxor %xmm4, %xmm2
; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: paddd %xmm4, %xmm2
; SSE2-NEXT: pxor %xmm4, %xmm2
; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm10, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: paddd %xmm4, %xmm10
; SSE2-NEXT: pxor %xmm4, %xmm10
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: paddd %xmm4, %xmm3
; SSE2-NEXT: pxor %xmm4, %xmm3
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: paddd %xmm4, %xmm2
; SSE2-NEXT: pxor %xmm4, %xmm2
; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: paddd %xmm4, %xmm1
; SSE2-NEXT: pxor %xmm4, %xmm1
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: paddd %xmm4, %xmm2
; SSE2-NEXT: pxor %xmm4, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm5
; SSE2-NEXT: movdqa %xmm13, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: paddd %xmm4, %xmm13
; SSE2-NEXT: pxor %xmm4, %xmm13
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
; SSE2-NEXT: paddd %xmm13, %xmm2
; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm12 # 16-byte Reload
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
; SSE2-NEXT: paddd %xmm5, %xmm6
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
; SSE2-NEXT: paddd %xmm1, %xmm4
; SSE2-NEXT: movdqa %xmm4, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Folded Reload
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
; SSE2-NEXT: paddd %xmm3, %xmm4
; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
; SSE2-NEXT: paddd %xmm10, %xmm4
; SSE2-NEXT: movdqa %xmm4, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm13 # 16-byte Reload
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm10 # 16-byte Reload
; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm13 # 16-byte Folded Reload
; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd %xmm15, %xmm1
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm15 # 16-byte Reload
; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm12 # 16-byte Folded Reload
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
; SSE2-NEXT: paddd %xmm14, %xmm4
; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
; SSE2-NEXT: paddd %xmm11, %xmm4
; SSE2-NEXT: movdqa %xmm4, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm11 # 16-byte Reload
; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm15 # 16-byte Folded Reload
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
; SSE2-NEXT: paddd %xmm9, %xmm4
; SSE2-NEXT: movdqa %xmm4, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 # 16-byte Reload
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
; SSE2-NEXT: paddd %xmm7, %xmm4
; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload
; SSE2-NEXT: movdqa (%rsp), %xmm4 # 16-byte Reload
; SSE2-NEXT: paddd %xmm0, %xmm4
; SSE2-NEXT: movdqa %xmm4, (%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: addq $4, %rax
; SSE2-NEXT: jne .LBB2_1
; SSE2-NEXT: # BB#2: # %middle.block
; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
; SSE2-NEXT: paddd %xmm7, %xmm13
; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Folded Reload
; SSE2-NEXT: paddd %xmm15, %xmm6
; SSE2-NEXT: paddd %xmm11, %xmm3
; SSE2-NEXT: paddd %xmm0, %xmm10
; SSE2-NEXT: paddd %xmm12, %xmm2
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: paddd %xmm9, %xmm0
; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: paddd %xmm3, %xmm10
; SSE2-NEXT: paddd %xmm5, %xmm6
; SSE2-NEXT: paddd %xmm1, %xmm13
; SSE2-NEXT: paddd %xmm6, %xmm13
; SSE2-NEXT: paddd %xmm0, %xmm10
; SSE2-NEXT: paddd %xmm13, %xmm10
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,0,1]
; SSE2-NEXT: paddd %xmm10, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: addq $232, %rsp
; SSE2-NEXT: retq
;
; AVX2-LABEL: sad_avx64i8:
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpxor %ymm0, %ymm0, %ymm0
; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpxor %ymm3, %ymm3, %ymm3
; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4
; AVX2-NEXT: vpxor %ymm6, %ymm6, %ymm6
; AVX2-NEXT: vpxor %ymm5, %ymm5, %ymm5
; AVX2-NEXT: vpxor %ymm7, %ymm7, %ymm7
; AVX2-NEXT: .p2align 4, 0x90
; AVX2-NEXT: .LBB2_1: # %vector.body
; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vmovdqu %ymm8, -{{[0-9]+}}(%rsp) # 32-byte Spill
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm12 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm13 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm14 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpsubd %ymm8, %ymm15, %ymm8
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpsubd %ymm15, %ymm14, %ymm14
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpsubd %ymm15, %ymm13, %ymm13
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpsubd %ymm15, %ymm12, %ymm12
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpsubd %ymm15, %ymm11, %ymm11
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpsubd %ymm15, %ymm10, %ymm10
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpsubd %ymm15, %ymm9, %ymm9
; AVX2-NEXT: vmovdqu %ymm9, -{{[0-9]+}}(%rsp) # 32-byte Spill
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vmovdqu -{{[0-9]+}}(%rsp), %ymm9 # 32-byte Reload
; AVX2-NEXT: vpsubd %ymm15, %ymm9, %ymm15
; AVX2-NEXT: vpabsd %ymm8, %ymm8
; AVX2-NEXT: vpaddd %ymm3, %ymm8, %ymm3
; AVX2-NEXT: vpabsd %ymm14, %ymm8
; AVX2-NEXT: vpaddd %ymm1, %ymm8, %ymm1
; AVX2-NEXT: vpabsd %ymm13, %ymm8
; AVX2-NEXT: vpaddd %ymm2, %ymm8, %ymm2
; AVX2-NEXT: vpabsd %ymm12, %ymm8
; AVX2-NEXT: vpaddd %ymm0, %ymm8, %ymm0
; AVX2-NEXT: vpabsd %ymm11, %ymm8
; AVX2-NEXT: vpaddd %ymm4, %ymm8, %ymm4
; AVX2-NEXT: vpabsd %ymm10, %ymm8
; AVX2-NEXT: vpaddd %ymm6, %ymm8, %ymm6
; AVX2-NEXT: vpabsd -{{[0-9]+}}(%rsp), %ymm8 # 32-byte Folded Reload
; AVX2-NEXT: vpaddd %ymm5, %ymm8, %ymm5
; AVX2-NEXT: vpabsd %ymm15, %ymm8
; AVX2-NEXT: vpaddd %ymm7, %ymm8, %ymm7
; AVX2-NEXT: addq $4, %rax
; AVX2-NEXT: jne .LBB2_1
; AVX2-NEXT: # BB#2: # %middle.block
; AVX2-NEXT: vpaddd %ymm6, %ymm2, %ymm2
; AVX2-NEXT: vpaddd %ymm7, %ymm3, %ymm3
; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %ymm5, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %ymm3, %ymm2, %ymm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: sad_avx64i8:
; AVX512F: # BB#0: # %entry
; AVX512F-NEXT: vpxord %zmm0, %zmm0, %zmm0
; AVX512F-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512F-NEXT: vpxord %zmm2, %zmm2, %zmm2
; AVX512F-NEXT: vpxord %zmm3, %zmm3, %zmm3
; AVX512F-NEXT: .p2align 4, 0x90
; AVX512F-NEXT: .LBB2_1: # %vector.body
; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; AVX512F-NEXT: vpsubd %zmm11, %zmm7, %zmm7
; AVX512F-NEXT: vpsubd %zmm10, %zmm6, %zmm6
; AVX512F-NEXT: vpsubd %zmm9, %zmm5, %zmm5
; AVX512F-NEXT: vpsubd %zmm8, %zmm4, %zmm4
; AVX512F-NEXT: vpabsd %zmm4, %zmm4
; AVX512F-NEXT: vpabsd %zmm5, %zmm5
; AVX512F-NEXT: vpabsd %zmm6, %zmm6
; AVX512F-NEXT: vpabsd %zmm7, %zmm7
; AVX512F-NEXT: vpaddd %zmm3, %zmm7, %zmm3
; AVX512F-NEXT: vpaddd %zmm2, %zmm6, %zmm2
; AVX512F-NEXT: vpaddd %zmm1, %zmm5, %zmm1
; AVX512F-NEXT: vpaddd %zmm0, %zmm4, %zmm0
; AVX512F-NEXT: addq $4, %rax
; AVX512F-NEXT: jne .LBB2_1
; AVX512F-NEXT: # BB#2: # %middle.block
; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0
; AVX512F-NEXT: vpaddd %zmm3, %zmm1, %zmm1
; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,0,1]
; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,0,1,0,1,0,1]
; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vmovd %xmm0, %eax
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: sad_avx64i8:
; AVX512BW: # BB#0: # %entry
; AVX512BW-NEXT: vpxord %zmm0, %zmm0, %zmm0
; AVX512BW-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: .p2align 4, 0x90
; AVX512BW-NEXT: .LBB2_1: # %vector.body
; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1
; AVX512BW-NEXT: vmovdqu8 a+1024(%rax), %zmm2
; AVX512BW-NEXT: vpsadbw b+1024(%rax), %zmm2, %zmm2
; AVX512BW-NEXT: vpaddd %zmm1, %zmm2, %zmm1
; AVX512BW-NEXT: addq $4, %rax
; AVX512BW-NEXT: jne .LBB2_1
; AVX512BW-NEXT: # BB#2: # %middle.block
; AVX512BW-NEXT: vpaddd %zmm0, %zmm1, %zmm1
; AVX512BW-NEXT: vpaddd %zmm0, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,0,1]
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,0,1,0,1,0,1]
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovd %xmm0, %eax
; AVX512BW-NEXT: retq
entry:
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.phi = phi <64 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ]
  %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index
  %1 = bitcast i8* %0 to <64 x i8>*
  %wide.load = load <64 x i8>, <64 x i8>* %1, align 64
  %2 = zext <64 x i8> %wide.load to <64 x i32>
  %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index
  %4 = bitcast i8* %3 to <64 x i8>*
  %wide.load1 = load <64 x i8>, <64 x i8>* %4, align 64
  %5 = zext <64 x i8> %wide.load1 to <64 x i32>
  %6 = sub nsw <64 x i32> %2, %5
  %7 = icmp sgt <64 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %8 = sub nsw <64 x i32> zeroinitializer, %6
  %9 = select <64 x i1> %7, <64 x i32> %6, <64 x i32> %8
  %10 = add nsw <64 x i32> %9, %vec.phi
  %index.next = add i64 %index, 4
  %11 = icmp eq i64 %index.next, 1024
  br i1 %11, label %middle.block, label %vector.body

middle.block:
  %.lcssa = phi <64 x i32> [ %10, %vector.body ]
  %rdx.shuf = shufflevector <64 x i32> %.lcssa, <64 x i32> undef, <64 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <64 x i32> %.lcssa, %rdx.shuf
  %rdx.shuf2 = shufflevector <64 x i32> %bin.rdx, <64 x i32> undef, <64 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx2 = add <64 x i32> %bin.rdx, %rdx.shuf2
  %rdx.shuf3 = shufflevector <64 x i32> %bin.rdx2, <64 x i32> undef, <64 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx3 = add <64 x i32> %bin.rdx2, %rdx.shuf3
  %rdx.shuf4 = shufflevector <64 x i32> %bin.rdx3, <64 x i32> undef, <64 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx4 = add <64 x i32> %bin.rdx3, %rdx.shuf4
  %rdx.shuf5 = shufflevector <64 x i32> %bin.rdx4, <64 x i32> undef, <64 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx5 = add <64 x i32> %bin.rdx4, %rdx.shuf5
  %rdx.shuf6 = shufflevector <64 x i32> %bin.rdx5, <64 x i32> undef, <64 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx6 = add <64 x i32> %bin.rdx5, %rdx.shuf6
  %12 = extractelement <64 x i32> %bin.rdx6, i32 0
  ret i32 %12
}

define i32 @sad_2i8() nounwind {
; SSE2-LABEL: sad_2i8:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
; SSE2-NEXT: movl $65535, %ecx # imm = 0xFFFF
; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: .LBB3_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE2-NEXT: pand %xmm1, %xmm3
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: psadbw %xmm3, %xmm2
; SSE2-NEXT: paddq %xmm2, %xmm0
; SSE2-NEXT: addq $4, %rax
; SSE2-NEXT: jne .LBB3_1
; SSE2-NEXT: # BB#2: # %middle.block
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT: paddq %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
; AVX2-LABEL: sad_2i8:
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: .p2align 4, 0x90
; AVX2-NEXT: .LBB3_1: # %vector.body
; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
; AVX2-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX2-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7]
; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7]
; AVX2-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpaddq %xmm1, %xmm2, %xmm1
; AVX2-NEXT: addq $4, %rax
; AVX2-NEXT: jne .LBB3_1
; AVX2-NEXT: # BB#2: # %middle.block
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; AVX2-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: retq
;
; AVX512F-LABEL: sad_2i8:
; AVX512F: # BB#0: # %entry
; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: .p2align 4, 0x90
; AVX512F-NEXT: .LBB3_1: # %vector.body
; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1
; AVX512F-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX512F-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7]
; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7]
; AVX512F-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX512F-NEXT: vpaddq %xmm1, %xmm2, %xmm1
; AVX512F-NEXT: addq $4, %rax
; AVX512F-NEXT: jne .LBB3_1
; AVX512F-NEXT: # BB#2: # %middle.block
; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; AVX512F-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: vmovd %xmm0, %eax
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: sad_2i8:
; AVX512BW: # BB#0: # %entry
; AVX512BW-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: .p2align 4, 0x90
; AVX512BW-NEXT: .LBB3_1: # %vector.body
; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1
; AVX512BW-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX512BW-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; AVX512BW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7]
; AVX512BW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7]
; AVX512BW-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT: vpaddq %xmm1, %xmm2, %xmm1
; AVX512BW-NEXT: addq $4, %rax
; AVX512BW-NEXT: jne .LBB3_1
; AVX512BW-NEXT: # BB#2: # %middle.block
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; AVX512BW-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT: vmovd %xmm0, %eax
; AVX512BW-NEXT: retq
entry:
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.phi = phi <2 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ]
  %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index
  %1 = bitcast i8* %0 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %1, align 4
  %2 = zext <2 x i8> %wide.load to <2 x i32>
  %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index
  %4 = bitcast i8* %3 to <2 x i8>*
  %wide.load1 = load <2 x i8>, <2 x i8>* %4, align 4
  %5 = zext <2 x i8> %wide.load1 to <2 x i32>
  %6 = sub nsw <2 x i32> %2, %5
  %7 = icmp sgt <2 x i32> %6, <i32 -1, i32 -1>
  %8 = sub nsw <2 x i32> zeroinitializer, %6
  %9 = select <2 x i1> %7, <2 x i32> %6, <2 x i32> %8
  %10 = add nsw <2 x i32> %9, %vec.phi
  %index.next = add i64 %index, 4
  %11 = icmp eq i64 %index.next, 1024
  br i1 %11, label %middle.block, label %vector.body

middle.block:
  %.lcssa = phi <2 x i32> [ %10, %vector.body ]
  %rdx.shuf = shufflevector <2 x i32> %.lcssa, <2 x i32> undef, <2 x i32> <i32 1, i32 undef>
  %bin.rdx = add <2 x i32> %.lcssa, %rdx.shuf
  %12 = extractelement <2 x i32> %bin.rdx, i32 0
  ret i32 %12
}