; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse,+sse2 | FileCheck %s --check-prefix=X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse,+sse2 | FileCheck %s --check-prefix=X64

; If the target does not have a single div/rem operation,
; the -div-rem-pairs pass will decompose the remainder calculation as:
;   X % Y --> X - ((X / Y) * Y)
; But if the target does have a single div/rem operation,
; the opposite transform is likely beneficial.
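;
; A minimal illustrative sketch of that decomposed form in IR (value names
; are illustrative; the test bodies below follow exactly this shape):
;   %div = udiv i32 %x, %y
;   %t1 = mul i32 %div, %y
;   %t2 = sub i32 %x, %t1 ; equivalent to urem i32 %x, %y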

define i8 @scalar_i8(i8 %x, i8 %y, i8* %divdst) nounwind {
; X86-LABEL: scalar_i8:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movb {{[0-9]+}}(%esp), %ch
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: divb %ch
; X86-NEXT: movb %al, (%edx)
; X86-NEXT: mulb %ch
; X86-NEXT: subb %al, %cl
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: scalar_i8:
; X64: # %bb.0:
; X64-NEXT: movzbl %dil, %ecx
; X64-NEXT: movl %ecx, %eax
; X64-NEXT: divb %sil
; X64-NEXT: movb %al, (%rdx)
; X64-NEXT: mulb %sil
; X64-NEXT: subb %al, %cl
; X64-NEXT: movl %ecx, %eax
; X64-NEXT: retq
  %div = udiv i8 %x, %y
  store i8 %div, i8* %divdst, align 4
  %t1 = mul i8 %div, %y
  %t2 = sub i8 %x, %t1
  ret i8 %t2
}

define i16 @scalar_i16(i16 %x, i16 %y, i16* %divdst) nounwind {
; X86-LABEL: scalar_i16:
; X86: # %bb.0:
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: divw %si
; X86-NEXT: # kill: def $ax killed $ax def $eax
; X86-NEXT: movw %ax, (%edi)
; X86-NEXT: imull %eax, %esi
; X86-NEXT: subl %esi, %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: retl
;
; X64-LABEL: scalar_i16:
; X64: # %bb.0:
; X64-NEXT: movq %rdx, %rcx
; X64-NEXT: movl %edi, %eax
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divw %si
; X64-NEXT: # kill: def $ax killed $ax def $eax
; X64-NEXT: movw %ax, (%rcx)
; X64-NEXT: imull %eax, %esi
; X64-NEXT: subl %esi, %edi
; X64-NEXT: movl %edi, %eax
; X64-NEXT: retq
  %div = udiv i16 %x, %y
  store i16 %div, i16* %divdst, align 4
  %t1 = mul i16 %div, %y
  %t2 = sub i16 %x, %t1
  ret i16 %t2
}

define i32 @scalar_i32(i32 %x, i32 %y, i32* %divdst) nounwind {
; X86-LABEL: scalar_i32:
; X86: # %bb.0:
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: divl %edi
; X86-NEXT: movl %eax, (%esi)
; X86-NEXT: imull %edi, %eax
; X86-NEXT: subl %eax, %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: retl
;
; X64-LABEL: scalar_i32:
; X64: # %bb.0:
; X64-NEXT: movq %rdx, %rcx
; X64-NEXT: movl %edi, %eax
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divl %esi
; X64-NEXT: movl %eax, (%rcx)
; X64-NEXT: imull %esi, %eax
; X64-NEXT: subl %eax, %edi
; X64-NEXT: movl %edi, %eax
; X64-NEXT: retq
  %div = udiv i32 %x, %y
  store i32 %div, i32* %divdst, align 4
  %t1 = mul i32 %div, %y
  %t2 = sub i32 %x, %t1
  ret i32 %t2
}

define i64 @scalar_i64(i64 %x, i64 %y, i64* %divdst) nounwind {
; X86-LABEL: scalar_i64:
; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %ebp
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: calll __udivdi3
; X86-NEXT: addl $16, %esp
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl %ecx, 4(%edx)
; X86-NEXT: movl %eax, (%edx)
; X86-NEXT: imull %eax, %ebx
; X86-NEXT: mull %ebp
; X86-NEXT: addl %ebx, %edx
; X86-NEXT: imull %ebp, %ecx
; X86-NEXT: addl %edx, %ecx
; X86-NEXT: subl %eax, %esi
; X86-NEXT: sbbl %ecx, %edi
; X86-NEXT: movl %esi, %eax
; X86-NEXT: movl %edi, %edx
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64-LABEL: scalar_i64:
; X64: # %bb.0:
; X64-NEXT: movq %rdx, %rcx
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divq %rsi
; X64-NEXT: movq %rax, (%rcx)
; X64-NEXT: imulq %rsi, %rax
; X64-NEXT: subq %rax, %rdi
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: retq
  %div = udiv i64 %x, %y
  store i64 %div, i64* %divdst, align 4
  %t1 = mul i64 %div, %y
  %t2 = sub i64 %x, %t1
  ret i64 %t2
}

define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, <16 x i8>* %divdst) nounwind {
; X86-LABEL: vector_i128_i8:
; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: andl $-16, %esp
; X86-NEXT: subl $48, %esp
; X86-NEXT: movdqa %xmm0, (%esp)
; X86-NEXT: movdqa %xmm1, {{[0-9]+}}(%esp)
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: divb {{[0-9]+}}(%esp)
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: movd %eax, %xmm2
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: divb {{[0-9]+}}(%esp)
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: movd %eax, %xmm3
; X86-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: divb {{[0-9]+}}(%esp)
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: movd %eax, %xmm4
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: divb {{[0-9]+}}(%esp)
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: movd %eax, %xmm2
; X86-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
; X86-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: divb {{[0-9]+}}(%esp)
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: movd %eax, %xmm3
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: divb {{[0-9]+}}(%esp)
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: movd %eax, %xmm4
; X86-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: divb {{[0-9]+}}(%esp)
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: movd %eax, %xmm5
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: divb {{[0-9]+}}(%esp)
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: movd %eax, %xmm3
; X86-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: divb {{[0-9]+}}(%esp)
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: movd %eax, %xmm5
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: divb {{[0-9]+}}(%esp)
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: movd %eax, %xmm6
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: divb {{[0-9]+}}(%esp)
; X86-NEXT: movzbl %al, %edx
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: divb {{[0-9]+}}(%esp)
; X86-NEXT: movzbl %al, %esi
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: divb {{[0-9]+}}(%esp)
; X86-NEXT: movzbl %al, %edi
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: divb {{[0-9]+}}(%esp)
; X86-NEXT: movzbl %al, %ebx
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: divb {{[0-9]+}}(%esp)
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: movzbl (%esp), %eax
; X86-NEXT: divb {{[0-9]+}}(%esp)
; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; X86-NEXT: movd %edx, %xmm4
; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; X86-NEXT: movd %esi, %xmm2
; X86-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
; X86-NEXT: movd %edi, %xmm5
; X86-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
; X86-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
; X86-NEXT: movd %ebx, %xmm4
; X86-NEXT: movzbl %cl, %ecx
; X86-NEXT: movd %ecx, %xmm6
; X86-NEXT: movl 8(%ebp), %ecx
; X86-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: movd %eax, %xmm5
; X86-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
; X86-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; X86-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
; X86-NEXT: movdqa %xmm5, %xmm2
; X86-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; X86-NEXT: movdqa %xmm2, (%ecx)
; X86-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-NEXT: movdqa %xmm1, %xmm2
; X86-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; X86-NEXT: pmullw %xmm3, %xmm2
; X86-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; X86-NEXT: pand %xmm3, %xmm2
; X86-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-NEXT: pmullw %xmm5, %xmm1
; X86-NEXT: pand %xmm3, %xmm1
; X86-NEXT: packuswb %xmm2, %xmm1
; X86-NEXT: psubb %xmm1, %xmm0
; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64-LABEL: vector_i128_i8:
; X64: # %bb.0:
; X64-NEXT: pushq %rbp
; X64-NEXT: pushq %r15
; X64-NEXT: pushq %r14
; X64-NEXT: pushq %r13
; X64-NEXT: pushq %r12
; X64-NEXT: pushq %rbx
; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: movd %eax, %xmm2
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
; X64-NEXT: movzbl %al, %r8d
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
; X64-NEXT: movzbl %al, %r9d
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
; X64-NEXT: movzbl %al, %r10d
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
; X64-NEXT: movzbl %al, %r11d
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
; X64-NEXT: movzbl %al, %r14d
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
; X64-NEXT: movzbl %al, %r15d
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
; X64-NEXT: movzbl %al, %r12d
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
; X64-NEXT: movzbl %al, %r13d
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
; X64-NEXT: movzbl %al, %edi
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
; X64-NEXT: movzbl %al, %esi
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
; X64-NEXT: movzbl %al, %ebx
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
; X64-NEXT: movzbl %al, %ebp
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
; X64-NEXT: movzbl %al, %edx
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
; X64-NEXT: movl %eax, %ecx
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
; X64-NEXT: movd %r8d, %xmm3
; X64-NEXT: movd %r9d, %xmm4
; X64-NEXT: movd %r10d, %xmm5
; X64-NEXT: movd %r11d, %xmm6
; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
; X64-NEXT: movd %r14d, %xmm2
; X64-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
; X64-NEXT: movd %r15d, %xmm4
; X64-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
; X64-NEXT: movd %r12d, %xmm3
; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
; X64-NEXT: movd %r13d, %xmm6
; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; X64-NEXT: movd %edi, %xmm4
; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; X64-NEXT: movd %esi, %xmm2
; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
; X64-NEXT: movd %ebx, %xmm5
; X64-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
; X64-NEXT: movd %ebp, %xmm6
; X64-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
; X64-NEXT: movd %edx, %xmm2
; X64-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; X64-NEXT: movzbl %cl, %ecx
; X64-NEXT: movd %ecx, %xmm4
; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: movd %eax, %xmm6
; X64-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
; X64-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
; X64-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
; X64-NEXT: movdqa %xmm6, %xmm2
; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; X64-NEXT: movdqa %xmm2, (%rax)
; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-NEXT: movdqa %xmm1, %xmm2
; X64-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; X64-NEXT: pmullw %xmm3, %xmm2
; X64-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; X64-NEXT: pand %xmm3, %xmm2
; X64-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-NEXT: pmullw %xmm6, %xmm1
; X64-NEXT: pand %xmm3, %xmm1
; X64-NEXT: packuswb %xmm2, %xmm1
; X64-NEXT: psubb %xmm1, %xmm0
; X64-NEXT: popq %rbx
; X64-NEXT: popq %r12
; X64-NEXT: popq %r13
; X64-NEXT: popq %r14
; X64-NEXT: popq %r15
; X64-NEXT: popq %rbp
; X64-NEXT: retq
  %div = udiv <16 x i8> %x, %y
  store <16 x i8> %div, <16 x i8>* %divdst, align 16
  %t1 = mul <16 x i8> %div, %y
  %t2 = sub <16 x i8> %x, %t1
  ret <16 x i8> %t2
}

define <8 x i16> @vector_i128_i16(<8 x i16> %x, <8 x i16> %y, <8 x i16>* %divdst) nounwind {
; X86-LABEL: vector_i128_i16:
; X86: # %bb.0:
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: pextrw $7, %xmm0, %eax
; X86-NEXT: pextrw $7, %xmm1, %esi
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: divw %si
; X86-NEXT: # kill: def $ax killed $ax def $eax
; X86-NEXT: movd %eax, %xmm2
; X86-NEXT: pextrw $6, %xmm0, %eax
; X86-NEXT: pextrw $6, %xmm1, %esi
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: divw %si
; X86-NEXT: # kill: def $ax killed $ax def $eax
; X86-NEXT: movd %eax, %xmm3
; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; X86-NEXT: pextrw $5, %xmm0, %eax
; X86-NEXT: pextrw $5, %xmm1, %esi
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: divw %si
; X86-NEXT: # kill: def $ax killed $ax def $eax
; X86-NEXT: movd %eax, %xmm4
; X86-NEXT: pextrw $4, %xmm0, %eax
; X86-NEXT: pextrw $4, %xmm1, %esi
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: divw %si
; X86-NEXT: # kill: def $ax killed $ax def $eax
; X86-NEXT: movd %eax, %xmm2
; X86-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X86-NEXT: pextrw $3, %xmm0, %eax
; X86-NEXT: pextrw $3, %xmm1, %esi
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: divw %si
; X86-NEXT: # kill: def $ax killed $ax def $eax
; X86-NEXT: movd %eax, %xmm3
; X86-NEXT: pextrw $2, %xmm0, %eax
; X86-NEXT: pextrw $2, %xmm1, %esi
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: divw %si
; X86-NEXT: # kill: def $ax killed $ax def $eax
; X86-NEXT: movd %eax, %xmm4
; X86-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; X86-NEXT: pextrw $1, %xmm0, %eax
; X86-NEXT: pextrw $1, %xmm1, %esi
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: divw %si
; X86-NEXT: # kill: def $ax killed $ax def $eax
; X86-NEXT: movd %eax, %xmm3
; X86-NEXT: movd %xmm0, %eax
; X86-NEXT: movd %xmm1, %esi
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: divw %si
; X86-NEXT: # kill: def $ax killed $ax def $eax
; X86-NEXT: movd %eax, %xmm5
; X86-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
; X86-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
; X86-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0]
; X86-NEXT: movdqa %xmm5, (%ecx)
; X86-NEXT: pmullw %xmm1, %xmm5
; X86-NEXT: psubw %xmm5, %xmm0
; X86-NEXT: popl %esi
; X86-NEXT: retl
;
; X64-LABEL: vector_i128_i16:
; X64: # %bb.0:
; X64-NEXT: pextrw $7, %xmm0, %eax
; X64-NEXT: pextrw $7, %xmm1, %ecx
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divw %cx
; X64-NEXT: # kill: def $ax killed $ax def $eax
; X64-NEXT: movd %eax, %xmm2
; X64-NEXT: pextrw $6, %xmm0, %eax
; X64-NEXT: pextrw $6, %xmm1, %ecx
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divw %cx
; X64-NEXT: # kill: def $ax killed $ax def $eax
; X64-NEXT: movd %eax, %xmm3
; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; X64-NEXT: pextrw $5, %xmm0, %eax
; X64-NEXT: pextrw $5, %xmm1, %ecx
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divw %cx
; X64-NEXT: # kill: def $ax killed $ax def $eax
; X64-NEXT: movd %eax, %xmm4
; X64-NEXT: pextrw $4, %xmm0, %eax
; X64-NEXT: pextrw $4, %xmm1, %ecx
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divw %cx
; X64-NEXT: # kill: def $ax killed $ax def $eax
; X64-NEXT: movd %eax, %xmm2
; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X64-NEXT: pextrw $3, %xmm0, %eax
; X64-NEXT: pextrw $3, %xmm1, %ecx
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divw %cx
; X64-NEXT: # kill: def $ax killed $ax def $eax
; X64-NEXT: movd %eax, %xmm3
; X64-NEXT: pextrw $2, %xmm0, %eax
; X64-NEXT: pextrw $2, %xmm1, %ecx
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divw %cx
; X64-NEXT: # kill: def $ax killed $ax def $eax
; X64-NEXT: movd %eax, %xmm4
; X64-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; X64-NEXT: pextrw $1, %xmm0, %eax
; X64-NEXT: pextrw $1, %xmm1, %ecx
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divw %cx
; X64-NEXT: # kill: def $ax killed $ax def $eax
; X64-NEXT: movd %eax, %xmm3
; X64-NEXT: movd %xmm0, %eax
; X64-NEXT: movd %xmm1, %ecx
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divw %cx
; X64-NEXT: # kill: def $ax killed $ax def $eax
; X64-NEXT: movd %eax, %xmm5
; X64-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
; X64-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
; X64-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0]
; X64-NEXT: movdqa %xmm5, (%rdi)
; X64-NEXT: pmullw %xmm1, %xmm5
; X64-NEXT: psubw %xmm5, %xmm0
; X64-NEXT: retq
  %div = udiv <8 x i16> %x, %y
  store <8 x i16> %div, <8 x i16>* %divdst, align 16
  %t1 = mul <8 x i16> %div, %y
  %t2 = sub <8 x i16> %x, %t1
  ret <8 x i16> %t2
}

define <4 x i32> @vector_i128_i32(<4 x i32> %x, <4 x i32> %y, <4 x i32>* %divdst) nounwind {
; X86-LABEL: vector_i128_i32:
; X86: # %bb.0:
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
; X86-NEXT: movd %xmm2, %eax
; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
; X86-NEXT: movd %xmm2, %esi
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: divl %esi
; X86-NEXT: movd %eax, %xmm2
; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; X86-NEXT: movd %xmm3, %eax
; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; X86-NEXT: movd %xmm3, %esi
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: divl %esi
; X86-NEXT: movd %eax, %xmm3
; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; X86-NEXT: movd %xmm0, %eax
; X86-NEXT: movd %xmm1, %esi
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: divl %esi
; X86-NEXT: movd %eax, %xmm2
; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
; X86-NEXT: movd %xmm4, %eax
; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1]
; X86-NEXT: movd %xmm4, %esi
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: divl %esi
; X86-NEXT: movd %eax, %xmm4
; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; X86-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; X86-NEXT: movdqa %xmm2, (%ecx)
; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; X86-NEXT: pmuludq %xmm1, %xmm2
; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; X86-NEXT: pmuludq %xmm3, %xmm1
; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; X86-NEXT: psubd %xmm2, %xmm0
; X86-NEXT: popl %esi
; X86-NEXT: retl
;
; X64-LABEL: vector_i128_i32:
; X64: # %bb.0:
; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
; X64-NEXT: movd %xmm2, %eax
; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
; X64-NEXT: movd %xmm2, %ecx
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divl %ecx
; X64-NEXT: movd %eax, %xmm2
; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; X64-NEXT: movd %xmm3, %eax
; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; X64-NEXT: movd %xmm3, %ecx
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divl %ecx
; X64-NEXT: movd %eax, %xmm3
; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; X64-NEXT: movd %xmm0, %eax
; X64-NEXT: movd %xmm1, %ecx
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divl %ecx
; X64-NEXT: movd %eax, %xmm2
; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
; X64-NEXT: movd %xmm4, %eax
; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1]
; X64-NEXT: movd %xmm4, %ecx
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divl %ecx
; X64-NEXT: movd %eax, %xmm4
; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; X64-NEXT: movdqa %xmm2, (%rdi)
; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; X64-NEXT: pmuludq %xmm1, %xmm2
; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; X64-NEXT: pmuludq %xmm3, %xmm1
; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; X64-NEXT: psubd %xmm2, %xmm0
; X64-NEXT: retq
  %div = udiv <4 x i32> %x, %y
  store <4 x i32> %div, <4 x i32>* %divdst, align 16
  %t1 = mul <4 x i32> %div, %y
  %t2 = sub <4 x i32> %x, %t1
  ret <4 x i32> %t2
}

define <2 x i64> @vector_i128_i64(<2 x i64> %x, <2 x i64> %y, <2 x i64>* %divdst) nounwind {
; X86-LABEL: vector_i128_i64:
; X86: # %bb.0:
; X86-NEXT: pushl %esi
; X86-NEXT: subl $72, %esp
; X86-NEXT: movdqu %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-NEXT: movd %xmm1, (%esp)
; X86-NEXT: calll __udivdi3
; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; X86-NEXT: movd %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; X86-NEXT: movd %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT: movd %xmm1, (%esp)
; X86-NEXT: movd %edx, %xmm0
; X86-NEXT: movd %eax, %xmm1
; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X86-NEXT: movdqu %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: calll __udivdi3
; X86-NEXT: movd %edx, %xmm0
; X86-NEXT: movd %eax, %xmm1
; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; X86-NEXT: movdqa %xmm1, (%esi)
; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm3 # 16-byte Reload
; X86-NEXT: movdqa %xmm3, %xmm0
; X86-NEXT: psrlq $32, %xmm0
; X86-NEXT: pmuludq %xmm1, %xmm0
; X86-NEXT: movdqa %xmm1, %xmm2
; X86-NEXT: psrlq $32, %xmm2
; X86-NEXT: pmuludq %xmm3, %xmm2
; X86-NEXT: paddq %xmm0, %xmm2
; X86-NEXT: psllq $32, %xmm2
; X86-NEXT: pmuludq %xmm3, %xmm1
; X86-NEXT: paddq %xmm2, %xmm1
; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT: psubq %xmm1, %xmm0
; X86-NEXT: addl $72, %esp
; X86-NEXT: popl %esi
; X86-NEXT: retl
;
; X64-LABEL: vector_i128_i64:
; X64: # %bb.0:
; X64-NEXT: movq %xmm0, %rax
; X64-NEXT: movq %xmm1, %rcx
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divq %rcx
; X64-NEXT: movq %rax, %xmm2
; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; X64-NEXT: movq %xmm3, %rax
; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; X64-NEXT: movq %xmm3, %rcx
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divq %rcx
; X64-NEXT: movq %rax, %xmm3
; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; X64-NEXT: movdqa %xmm2, (%rdi)
; X64-NEXT: movdqa %xmm1, %xmm3
; X64-NEXT: psrlq $32, %xmm3
; X64-NEXT: pmuludq %xmm2, %xmm3
; X64-NEXT: movdqa %xmm2, %xmm4
; X64-NEXT: psrlq $32, %xmm4
; X64-NEXT: pmuludq %xmm1, %xmm4
; X64-NEXT: paddq %xmm3, %xmm4
; X64-NEXT: psllq $32, %xmm4
; X64-NEXT: pmuludq %xmm1, %xmm2
; X64-NEXT: paddq %xmm4, %xmm2
; X64-NEXT: psubq %xmm2, %xmm0
; X64-NEXT: retq
  %div = udiv <2 x i64> %x, %y
  store <2 x i64> %div, <2 x i64>* %divdst, align 16
  %t1 = mul <2 x i64> %div, %y
  %t2 = sub <2 x i64> %x, %t1
  ret <2 x i64> %t2
}

; Special tests.

define i32 @scalar_i32_commutative(i32 %x, i32* %ysrc, i32* %divdst) nounwind {
; X86-LABEL: scalar_i32_commutative:
; X86: # %bb.0:
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl (%eax), %edi
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: divl %edi
; X86-NEXT: movl %eax, (%esi)
; X86-NEXT: imull %eax, %edi
; X86-NEXT: subl %edi, %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: retl
;
; X64-LABEL: scalar_i32_commutative:
; X64: # %bb.0:
; X64-NEXT: movq %rdx, %rcx
; X64-NEXT: movl (%rsi), %esi
; X64-NEXT: movl %edi, %eax
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divl %esi
; X64-NEXT: movl %eax, (%rcx)
; X64-NEXT: imull %eax, %esi
; X64-NEXT: subl %esi, %edi
; X64-NEXT: movl %edi, %eax
; X64-NEXT: retq
  %y = load i32, i32* %ysrc, align 4
  %div = udiv i32 %x, %y
  store i32 %div, i32* %divdst, align 4
  %t1 = mul i32 %y, %div ; commutative
  %t2 = sub i32 %x, %t1
  ret i32 %t2
}

; We do not care about extra uses.
define i32 @extrause(i32 %x, i32 %y, i32* %divdst, i32* %t1dst) nounwind {
; X86-LABEL: extrause:
; X86: # %bb.0:
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: divl %ebx
; X86-NEXT: movl %eax, (%edi)
; X86-NEXT: imull %ebx, %eax
; X86-NEXT: movl %eax, (%esi)
; X86-NEXT: subl %eax, %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
; X86-NEXT: retl
;
; X64-LABEL: extrause:
; X64: # %bb.0:
; X64-NEXT: movq %rdx, %r8
; X64-NEXT: movl %edi, %eax
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divl %esi
; X64-NEXT: movl %eax, (%r8)
; X64-NEXT: imull %esi, %eax
; X64-NEXT: movl %eax, (%rcx)
; X64-NEXT: subl %eax, %edi
; X64-NEXT: movl %edi, %eax
; X64-NEXT: retq
  %div = udiv i32 %x, %y
  store i32 %div, i32* %divdst, align 4
  %t1 = mul i32 %div, %y
  store i32 %t1, i32* %t1dst, align 4
  %t2 = sub i32 %x, %t1
  ret i32 %t2
}

; 'rem' should appear next to 'div'.
define i32 @multiple_bb(i32 %x, i32 %y, i32* %divdst, i1 zeroext %store_urem, i32* %uremdst) nounwind {
; X86-LABEL: multiple_bb:
; X86: # %bb.0:
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movb {{[0-9]+}}(%esp), %bl
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: divl %esi
; X86-NEXT: movl %eax, (%edi)
; X86-NEXT: testb %bl, %bl
; X86-NEXT: je .LBB10_2
; X86-NEXT: # %bb.1: # %do_urem
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl %eax, %edi
; X86-NEXT: imull %esi, %edi
; X86-NEXT: subl %edi, %ecx
; X86-NEXT: movl %ecx, (%edx)
; X86-NEXT: .LBB10_2: # %end
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
; X86-NEXT: retl
;
; X64-LABEL: multiple_bb:
; X64: # %bb.0:
; X64-NEXT: movq %rdx, %r9
; X64-NEXT: movl %edi, %eax
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divl %esi
; X64-NEXT: movl %eax, (%r9)
; X64-NEXT: testb %cl, %cl
; X64-NEXT: je .LBB10_2
; X64-NEXT: # %bb.1: # %do_urem
; X64-NEXT: movl %eax, %ecx
; X64-NEXT: imull %esi, %ecx
; X64-NEXT: subl %ecx, %edi
; X64-NEXT: movl %edi, (%r8)
; X64-NEXT: .LBB10_2: # %end
; X64-NEXT: retq
  %div = udiv i32 %x, %y
  store i32 %div, i32* %divdst, align 4
  br i1 %store_urem, label %do_urem, label %end
do_urem:
  %t1 = mul i32 %div, %y
  %t2 = sub i32 %x, %t1
  store i32 %t2, i32* %uremdst, align 4
  br label %end
end:
  ret i32 %div
}

define i32 @negative_different_x(i32 %x0, i32 %x1, i32 %y, i32* %divdst) nounwind {
; X86-LABEL: negative_different_x:
; X86: # %bb.0:
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: divl %edi
; X86-NEXT: movl %eax, (%esi)
; X86-NEXT: imull %edi, %eax
; X86-NEXT: subl %eax, %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: retl
;
; X64-LABEL: negative_different_x:
; X64: # %bb.0:
; X64-NEXT: movl %edx, %r8d
; X64-NEXT: movl %edi, %eax
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divl %r8d
; X64-NEXT: movl %eax, (%rcx)
; X64-NEXT: imull %r8d, %eax
; X64-NEXT: subl %eax, %esi
; X64-NEXT: movl %esi, %eax
; X64-NEXT: retq
  %div = udiv i32 %x0, %y ; not %x1
  store i32 %div, i32* %divdst, align 4
  %t1 = mul i32 %div, %y
  %t2 = sub i32 %x1, %t1 ; not %x0
  ret i32 %t2
}