; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2

define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) {
; SSE-LABEL: fold_srem_vec_1:
; SSE:       # %bb.0:
; SSE-NEXT:    pextrw $3, %xmm0, %eax
; SSE-NEXT:    movswl %ax, %ecx
; SSE-NEXT:    imull $32081, %ecx, %ecx # imm = 0x7D51
; SSE-NEXT:    shrl $16, %ecx
; SSE-NEXT:    subl %eax, %ecx
; SSE-NEXT:    movzwl %cx, %ecx
; SSE-NEXT:    movswl %cx, %edx
; SSE-NEXT:    shrl $15, %ecx
; SSE-NEXT:    sarl $9, %edx
; SSE-NEXT:    addl %ecx, %edx
; SSE-NEXT:    imull $-1003, %edx, %ecx # imm = 0xFC15
; SSE-NEXT:    subl %ecx, %eax
; SSE-NEXT:    movd %xmm0, %ecx
; SSE-NEXT:    movswl %cx, %edx
; SSE-NEXT:    imull $-21385, %edx, %edx # imm = 0xAC77
; SSE-NEXT:    shrl $16, %edx
; SSE-NEXT:    addl %ecx, %edx
; SSE-NEXT:    movzwl %dx, %edx
; SSE-NEXT:    movswl %dx, %esi
; SSE-NEXT:    shrl $15, %edx
; SSE-NEXT:    sarl $6, %esi
; SSE-NEXT:    addl %edx, %esi
; SSE-NEXT:    imull $95, %esi, %edx
; SSE-NEXT:    subl %edx, %ecx
; SSE-NEXT:    movd %ecx, %xmm1
; SSE-NEXT:    pextrw $1, %xmm0, %ecx
; SSE-NEXT:    movswl %cx, %edx
; SSE-NEXT:    imull $-16913, %edx, %edx # imm = 0xBDEF
; SSE-NEXT:    movl %edx, %esi
; SSE-NEXT:    shrl $31, %esi
; SSE-NEXT:    sarl $21, %edx
; SSE-NEXT:    addl %esi, %edx
; SSE-NEXT:    imull $-124, %edx, %edx
; SSE-NEXT:    subl %edx, %ecx
; SSE-NEXT:    pinsrw $1, %ecx, %xmm1
; SSE-NEXT:    pextrw $2, %xmm0, %ecx
; SSE-NEXT:    movswl %cx, %edx
; SSE-NEXT:    imull $2675, %edx, %edx # imm = 0xA73
; SSE-NEXT:    movl %edx, %esi
; SSE-NEXT:    shrl $31, %esi
; SSE-NEXT:    sarl $18, %edx
; SSE-NEXT:    addl %esi, %edx
; SSE-NEXT:    imull $98, %edx, %edx
; SSE-NEXT:    subl %edx, %ecx
; SSE-NEXT:    pinsrw $2, %ecx, %xmm1
; SSE-NEXT:    pinsrw $3, %eax, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: fold_srem_vec_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpextrw $3, %xmm0, %eax
; AVX-NEXT:    movswl %ax, %ecx
; AVX-NEXT:    imull $32081, %ecx, %ecx # imm = 0x7D51
; AVX-NEXT:    shrl $16, %ecx
; AVX-NEXT:    subl %eax, %ecx
; AVX-NEXT:    movzwl %cx, %ecx
; AVX-NEXT:    movswl %cx, %edx
; AVX-NEXT:    shrl $15, %ecx
; AVX-NEXT:    sarl $9, %edx
; AVX-NEXT:    addl %ecx, %edx
; AVX-NEXT:    imull $-1003, %edx, %ecx # imm = 0xFC15
; AVX-NEXT:    subl %ecx, %eax
; AVX-NEXT:    vmovd %xmm0, %ecx
; AVX-NEXT:    movswl %cx, %edx
; AVX-NEXT:    imull $-21385, %edx, %edx # imm = 0xAC77
; AVX-NEXT:    shrl $16, %edx
; AVX-NEXT:    addl %ecx, %edx
; AVX-NEXT:    movzwl %dx, %edx
; AVX-NEXT:    movswl %dx, %esi
; AVX-NEXT:    shrl $15, %edx
; AVX-NEXT:    sarl $6, %esi
; AVX-NEXT:    addl %edx, %esi
; AVX-NEXT:    imull $95, %esi, %edx
; AVX-NEXT:    subl %edx, %ecx
; AVX-NEXT:    vmovd %ecx, %xmm1
; AVX-NEXT:    vpextrw $1, %xmm0, %ecx
; AVX-NEXT:    movswl %cx, %edx
; AVX-NEXT:    imull $-16913, %edx, %edx # imm = 0xBDEF
; AVX-NEXT:    movl %edx, %esi
; AVX-NEXT:    shrl $31, %esi
; AVX-NEXT:    sarl $21, %edx
; AVX-NEXT:    addl %esi, %edx
; AVX-NEXT:    imull $-124, %edx, %edx
; AVX-NEXT:    subl %edx, %ecx
; AVX-NEXT:    vpinsrw $1, %ecx, %xmm1, %xmm1
; AVX-NEXT:    vpextrw $2, %xmm0, %ecx
; AVX-NEXT:    movswl %cx, %edx
; AVX-NEXT:    imull $2675, %edx, %edx # imm = 0xA73
; AVX-NEXT:    movl %edx, %esi
; AVX-NEXT:    shrl $31, %esi
; AVX-NEXT:    sarl $18, %edx
; AVX-NEXT:    addl %esi, %edx
; AVX-NEXT:    imull $98, %edx, %edx
; AVX-NEXT:    subl %edx, %ecx
; AVX-NEXT:    vpinsrw $2, %ecx, %xmm1, %xmm0
; AVX-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = srem <4 x i16> %x, <i16 95, i16 -124, i16 98, i16 -1003>
  ret <4 x i16> %1
}

define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) {
; SSE-LABEL: fold_srem_vec_2:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151]
; SSE-NEXT:    pmulhw %xmm0, %xmm1
; SSE-NEXT:    paddw %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm2
; SSE-NEXT:    psrlw $15, %xmm2
; SSE-NEXT:    psraw $6, %xmm1
; SSE-NEXT:    paddw %xmm2, %xmm1
; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm1
; SSE-NEXT:    psubw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: fold_srem_vec_2:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhw {{.*}}(%rip), %xmm0, %xmm1
; AVX-NEXT:    vpaddw %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vpsrlw $15, %xmm1, %xmm2
; AVX-NEXT:    vpsraw $6, %xmm1, %xmm1
; AVX-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpmullw {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = srem <4 x i16> %x, <i16 95, i16 95, i16 95, i16 95>
  ret <4 x i16> %1
}


; Don't fold if we can combine srem with sdiv.
define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) {
; SSE-LABEL: combine_srem_sdiv:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151]
; SSE-NEXT:    pmulhw %xmm0, %xmm1
; SSE-NEXT:    paddw %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm2
; SSE-NEXT:    psrlw $15, %xmm2
; SSE-NEXT:    psraw $6, %xmm1
; SSE-NEXT:    paddw %xmm2, %xmm1
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [95,95,95,95,95,95,95,95]
; SSE-NEXT:    pmullw %xmm1, %xmm2
; SSE-NEXT:    psubw %xmm2, %xmm0
; SSE-NEXT:    paddw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_srem_sdiv:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhw {{.*}}(%rip), %xmm0, %xmm1
; AVX-NEXT:    vpaddw %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vpsrlw $15, %xmm1, %xmm2
; AVX-NEXT:    vpsraw $6, %xmm1, %xmm1
; AVX-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpmullw {{.*}}(%rip), %xmm1, %xmm2
; AVX-NEXT:    vpsubw %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = srem <4 x i16> %x, <i16 95, i16 95, i16 95, i16 95>
  %2 = sdiv <4 x i16> %x, <i16 95, i16 95, i16 95, i16 95>
  %3 = add <4 x i16> %1, %2
  ret <4 x i16> %3
}

; Don't fold for divisors that are a power of two.
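; Each power-of-two lane is expected to use the usual "round toward zero, then
; subtract" sequence rather than a multiply, roughly:
;   t = (x < 0 ? x + (2^k - 1) : x) & -2^k
;   rem = x - t
; which is what the leal/testw/cmovnsl/andl/subl chains below check for; the
; remaining lane (divisor 95) still goes through the magic-multiply expansion.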
define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) {
; SSE-LABEL: dont_fold_srem_power_of_two:
; SSE:       # %bb.0:
; SSE-NEXT:    pextrw $1, %xmm0, %eax
; SSE-NEXT:    leal 31(%rax), %ecx
; SSE-NEXT:    testw %ax, %ax
; SSE-NEXT:    cmovnsl %eax, %ecx
; SSE-NEXT:    andl $-32, %ecx
; SSE-NEXT:    subl %ecx, %eax
; SSE-NEXT:    movd %xmm0, %ecx
; SSE-NEXT:    leal 63(%rcx), %edx
; SSE-NEXT:    testw %cx, %cx
; SSE-NEXT:    cmovnsl %ecx, %edx
; SSE-NEXT:    andl $-64, %edx
; SSE-NEXT:    subl %edx, %ecx
; SSE-NEXT:    movd %ecx, %xmm1
; SSE-NEXT:    pinsrw $1, %eax, %xmm1
; SSE-NEXT:    pextrw $2, %xmm0, %eax
; SSE-NEXT:    leal 7(%rax), %ecx
; SSE-NEXT:    testw %ax, %ax
; SSE-NEXT:    cmovnsl %eax, %ecx
; SSE-NEXT:    andl $-8, %ecx
; SSE-NEXT:    subl %ecx, %eax
; SSE-NEXT:    pinsrw $2, %eax, %xmm1
; SSE-NEXT:    pextrw $3, %xmm0, %eax
; SSE-NEXT:    movswl %ax, %ecx
; SSE-NEXT:    imull $-21385, %ecx, %ecx # imm = 0xAC77
; SSE-NEXT:    shrl $16, %ecx
; SSE-NEXT:    addl %eax, %ecx
; SSE-NEXT:    movzwl %cx, %ecx
; SSE-NEXT:    movswl %cx, %edx
; SSE-NEXT:    shrl $15, %ecx
; SSE-NEXT:    sarl $6, %edx
; SSE-NEXT:    addl %ecx, %edx
; SSE-NEXT:    imull $95, %edx, %ecx
; SSE-NEXT:    subl %ecx, %eax
; SSE-NEXT:    pinsrw $3, %eax, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: dont_fold_srem_power_of_two:
; AVX:       # %bb.0:
; AVX-NEXT:    vpextrw $1, %xmm0, %eax
; AVX-NEXT:    leal 31(%rax), %ecx
; AVX-NEXT:    testw %ax, %ax
; AVX-NEXT:    cmovnsl %eax, %ecx
; AVX-NEXT:    andl $-32, %ecx
; AVX-NEXT:    subl %ecx, %eax
; AVX-NEXT:    vmovd %xmm0, %ecx
; AVX-NEXT:    leal 63(%rcx), %edx
; AVX-NEXT:    testw %cx, %cx
; AVX-NEXT:    cmovnsl %ecx, %edx
; AVX-NEXT:    andl $-64, %edx
; AVX-NEXT:    subl %edx, %ecx
; AVX-NEXT:    vmovd %ecx, %xmm1
; AVX-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm1
; AVX-NEXT:    vpextrw $2, %xmm0, %eax
; AVX-NEXT:    leal 7(%rax), %ecx
; AVX-NEXT:    testw %ax, %ax
; AVX-NEXT:    cmovnsl %eax, %ecx
; AVX-NEXT:    andl $-8, %ecx
; AVX-NEXT:    subl %ecx, %eax
; AVX-NEXT:    vpinsrw $2, %eax, %xmm1, %xmm1
; AVX-NEXT:    vpextrw $3, %xmm0, %eax
; AVX-NEXT:    movswl %ax, %ecx
; AVX-NEXT:    imull $-21385, %ecx, %ecx # imm = 0xAC77
; AVX-NEXT:    shrl $16, %ecx
; AVX-NEXT:    addl %eax, %ecx
; AVX-NEXT:    movzwl %cx, %ecx
; AVX-NEXT:    movswl %cx, %edx
; AVX-NEXT:    shrl $15, %ecx
; AVX-NEXT:    sarl $6, %edx
; AVX-NEXT:    addl %ecx, %edx
; AVX-NEXT:    imull $95, %edx, %ecx
; AVX-NEXT:    subl %ecx, %eax
; AVX-NEXT:    vpinsrw $3, %eax, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = srem <4 x i16> %x, <i16 64, i16 32, i16 8, i16 95>
  ret <4 x i16> %1
}

; Don't fold if the divisor is one.
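; srem by 1 is always 0, so that lane needs no computation; the zeroed element
; shows up below as the pxor/vpxor of %xmm1 before the other lanes are inserted.
; The remaining lanes (654, 23, 5423) are expected to use the scalar
; magic-multiply expansion, roughly rem = x - (x / d) * d with x / d computed as
; a multiply-high plus shift and sign correction.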
define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) {
; SSE-LABEL: dont_fold_srem_one:
; SSE:       # %bb.0:
; SSE-NEXT:    pextrw $2, %xmm0, %eax
; SSE-NEXT:    movswl %ax, %ecx
; SSE-NEXT:    imull $-19945, %ecx, %ecx # imm = 0xB217
; SSE-NEXT:    shrl $16, %ecx
; SSE-NEXT:    addl %eax, %ecx
; SSE-NEXT:    movzwl %cx, %ecx
; SSE-NEXT:    movswl %cx, %edx
; SSE-NEXT:    shrl $15, %ecx
; SSE-NEXT:    sarl $4, %edx
; SSE-NEXT:    addl %ecx, %edx
; SSE-NEXT:    leal (%rdx,%rdx,2), %ecx
; SSE-NEXT:    shll $3, %ecx
; SSE-NEXT:    subl %ecx, %edx
; SSE-NEXT:    addl %eax, %edx
; SSE-NEXT:    pextrw $1, %xmm0, %eax
; SSE-NEXT:    movswl %ax, %ecx
; SSE-NEXT:    imull $12827, %ecx, %ecx # imm = 0x321B
; SSE-NEXT:    movl %ecx, %esi
; SSE-NEXT:    shrl $31, %esi
; SSE-NEXT:    sarl $23, %ecx
; SSE-NEXT:    addl %esi, %ecx
; SSE-NEXT:    imull $654, %ecx, %ecx # imm = 0x28E
; SSE-NEXT:    subl %ecx, %eax
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    pinsrw $1, %eax, %xmm1
; SSE-NEXT:    pinsrw $2, %edx, %xmm1
; SSE-NEXT:    pextrw $3, %xmm0, %eax
; SSE-NEXT:    movswl %ax, %ecx
; SSE-NEXT:    imull $12375, %ecx, %ecx # imm = 0x3057
; SSE-NEXT:    movl %ecx, %edx
; SSE-NEXT:    shrl $31, %edx
; SSE-NEXT:    sarl $26, %ecx
; SSE-NEXT:    addl %edx, %ecx
; SSE-NEXT:    imull $5423, %ecx, %ecx # imm = 0x152F
; SSE-NEXT:    subl %ecx, %eax
; SSE-NEXT:    pinsrw $3, %eax, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: dont_fold_srem_one:
; AVX:       # %bb.0:
; AVX-NEXT:    vpextrw $2, %xmm0, %eax
; AVX-NEXT:    movswl %ax, %ecx
; AVX-NEXT:    imull $-19945, %ecx, %ecx # imm = 0xB217
; AVX-NEXT:    shrl $16, %ecx
; AVX-NEXT:    addl %eax, %ecx
; AVX-NEXT:    movzwl %cx, %ecx
; AVX-NEXT:    movswl %cx, %edx
; AVX-NEXT:    shrl $15, %ecx
; AVX-NEXT:    sarl $4, %edx
; AVX-NEXT:    addl %ecx, %edx
; AVX-NEXT:    leal (%rdx,%rdx,2), %ecx
; AVX-NEXT:    shll $3, %ecx
; AVX-NEXT:    subl %ecx, %edx
; AVX-NEXT:    addl %eax, %edx
; AVX-NEXT:    vpextrw $1, %xmm0, %eax
; AVX-NEXT:    movswl %ax, %ecx
; AVX-NEXT:    imull $12827, %ecx, %ecx # imm = 0x321B
; AVX-NEXT:    movl %ecx, %esi
; AVX-NEXT:    shrl $31, %esi
; AVX-NEXT:    sarl $23, %ecx
; AVX-NEXT:    addl %esi, %ecx
; AVX-NEXT:    imull $654, %ecx, %ecx # imm = 0x28E
; AVX-NEXT:    subl %ecx, %eax
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm1
; AVX-NEXT:    vpinsrw $2, %edx, %xmm1, %xmm1
; AVX-NEXT:    vpextrw $3, %xmm0, %eax
; AVX-NEXT:    movswl %ax, %ecx
; AVX-NEXT:    imull $12375, %ecx, %ecx # imm = 0x3057
; AVX-NEXT:    movl %ecx, %edx
; AVX-NEXT:    shrl $31, %edx
; AVX-NEXT:    sarl $26, %ecx
; AVX-NEXT:    addl %edx, %ecx
; AVX-NEXT:    imull $5423, %ecx, %ecx # imm = 0x152F
; AVX-NEXT:    subl %ecx, %eax
; AVX-NEXT:    vpinsrw $3, %eax, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = srem <4 x i16> %x, <i16 1, i16 654, i16 23, i16 5423>
  ret <4 x i16> %1
}

; Don't fold if the divisor is 2^15.
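; As an i16 constant, 32768 wraps to -32768 (INT16_MIN), so this lane is a
; remainder by the minimum signed value; the checks below expect the scalar
; leal 32767(%rax)/testw/cmovnsl/andl $-32768/addl sequence for it rather than a
; multiply-based expansion, while the other non-trivial lanes keep the
; magic-multiply form.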
define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) {
; SSE-LABEL: dont_fold_urem_i16_smax:
; SSE:       # %bb.0:
; SSE-NEXT:    pextrw $2, %xmm0, %eax
; SSE-NEXT:    movswl %ax, %ecx
; SSE-NEXT:    imull $-19945, %ecx, %ecx # imm = 0xB217
; SSE-NEXT:    shrl $16, %ecx
; SSE-NEXT:    addl %eax, %ecx
; SSE-NEXT:    movzwl %cx, %ecx
; SSE-NEXT:    movswl %cx, %edx
; SSE-NEXT:    shrl $15, %ecx
; SSE-NEXT:    sarl $4, %edx
; SSE-NEXT:    addl %ecx, %edx
; SSE-NEXT:    leal (%rdx,%rdx,2), %ecx
; SSE-NEXT:    shll $3, %ecx
; SSE-NEXT:    subl %ecx, %edx
; SSE-NEXT:    addl %eax, %edx
; SSE-NEXT:    pextrw $1, %xmm0, %eax
; SSE-NEXT:    leal 32767(%rax), %ecx
; SSE-NEXT:    testw %ax, %ax
; SSE-NEXT:    cmovnsl %eax, %ecx
; SSE-NEXT:    andl $-32768, %ecx # imm = 0x8000
; SSE-NEXT:    addl %eax, %ecx
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    pinsrw $1, %ecx, %xmm1
; SSE-NEXT:    pinsrw $2, %edx, %xmm1
; SSE-NEXT:    pextrw $3, %xmm0, %eax
; SSE-NEXT:    movswl %ax, %ecx
; SSE-NEXT:    imull $12375, %ecx, %ecx # imm = 0x3057
; SSE-NEXT:    movl %ecx, %edx
; SSE-NEXT:    shrl $31, %edx
; SSE-NEXT:    sarl $26, %ecx
; SSE-NEXT:    addl %edx, %ecx
; SSE-NEXT:    imull $5423, %ecx, %ecx # imm = 0x152F
; SSE-NEXT:    subl %ecx, %eax
; SSE-NEXT:    pinsrw $3, %eax, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: dont_fold_urem_i16_smax:
; AVX:       # %bb.0:
; AVX-NEXT:    vpextrw $2, %xmm0, %eax
; AVX-NEXT:    movswl %ax, %ecx
; AVX-NEXT:    imull $-19945, %ecx, %ecx # imm = 0xB217
; AVX-NEXT:    shrl $16, %ecx
; AVX-NEXT:    addl %eax, %ecx
; AVX-NEXT:    movzwl %cx, %ecx
; AVX-NEXT:    movswl %cx, %edx
; AVX-NEXT:    shrl $15, %ecx
; AVX-NEXT:    sarl $4, %edx
; AVX-NEXT:    addl %ecx, %edx
; AVX-NEXT:    leal (%rdx,%rdx,2), %ecx
; AVX-NEXT:    shll $3, %ecx
; AVX-NEXT:    subl %ecx, %edx
; AVX-NEXT:    addl %eax, %edx
; AVX-NEXT:    vpextrw $1, %xmm0, %eax
; AVX-NEXT:    leal 32767(%rax), %ecx
; AVX-NEXT:    testw %ax, %ax
; AVX-NEXT:    cmovnsl %eax, %ecx
; AVX-NEXT:    andl $-32768, %ecx # imm = 0x8000
; AVX-NEXT:    addl %eax, %ecx
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpinsrw $1, %ecx, %xmm1, %xmm1
; AVX-NEXT:    vpinsrw $2, %edx, %xmm1, %xmm1
; AVX-NEXT:    vpextrw $3, %xmm0, %eax
; AVX-NEXT:    movswl %ax, %ecx
; AVX-NEXT:    imull $12375, %ecx, %ecx # imm = 0x3057
; AVX-NEXT:    movl %ecx, %edx
; AVX-NEXT:    shrl $31, %edx
; AVX-NEXT:    sarl $26, %ecx
; AVX-NEXT:    addl %edx, %ecx
; AVX-NEXT:    imull $5423, %ecx, %ecx # imm = 0x152F
; AVX-NEXT:    subl %ecx, %eax
; AVX-NEXT:    vpinsrw $3, %eax, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = srem <4 x i16> %x, <i16 1, i16 32768, i16 23, i16 5423>
  ret <4 x i16> %1
}

; Don't fold i64 srem.
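; SSE/AVX/AVX2 have no vector multiply-high for i64 elements, so each
; non-trivial lane is expected to be scalarized: the 64-bit magic constant is
; loaded with movabsq, one-operand imulq leaves the high half of the 128-bit
; product in %rdx, and the remainder is rebuilt with shifts, adds and a multiply
; by the original divisor. The lane with divisor 1 folds to zero.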
define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) {
; SSE-LABEL: dont_fold_srem_i64:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm1, %xmm2
; SSE-NEXT:    movq %xmm1, %rcx
; SSE-NEXT:    movabsq $-5614226457215950491, %rdx # imm = 0xB21642C8590B2165
; SSE-NEXT:    movq %rcx, %rax
; SSE-NEXT:    imulq %rdx
; SSE-NEXT:    addq %rcx, %rdx
; SSE-NEXT:    movq %rdx, %rax
; SSE-NEXT:    shrq $63, %rax
; SSE-NEXT:    sarq $4, %rdx
; SSE-NEXT:    addq %rax, %rdx
; SSE-NEXT:    leaq (%rdx,%rdx,2), %rax
; SSE-NEXT:    shlq $3, %rax
; SSE-NEXT:    subq %rax, %rdx
; SSE-NEXT:    addq %rcx, %rdx
; SSE-NEXT:    movq %rdx, %xmm1
; SSE-NEXT:    pextrq $1, %xmm2, %rcx
; SSE-NEXT:    movabsq $6966426675817289639, %rdx # imm = 0x60ADB826E5E517A7
; SSE-NEXT:    movq %rcx, %rax
; SSE-NEXT:    imulq %rdx
; SSE-NEXT:    movq %rdx, %rax
; SSE-NEXT:    shrq $63, %rax
; SSE-NEXT:    sarq $11, %rdx
; SSE-NEXT:    addq %rax, %rdx
; SSE-NEXT:    imulq $5423, %rdx, %rax # imm = 0x152F
; SSE-NEXT:    subq %rax, %rcx
; SSE-NEXT:    movq %rcx, %xmm2
; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE-NEXT:    pextrq $1, %xmm0, %rcx
; SSE-NEXT:    movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5
; SSE-NEXT:    movq %rcx, %rax
; SSE-NEXT:    imulq %rdx
; SSE-NEXT:    movq %rdx, %rax
; SSE-NEXT:    shrq $63, %rax
; SSE-NEXT:    sarq $8, %rdx
; SSE-NEXT:    addq %rax, %rdx
; SSE-NEXT:    imulq $654, %rdx, %rax # imm = 0x28E
; SSE-NEXT:    subq %rax, %rcx
; SSE-NEXT:    movq %rcx, %xmm0
; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
; SSE-NEXT:    retq
;
; AVX1-LABEL: dont_fold_srem_i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovq %xmm1, %rcx
; AVX1-NEXT:    movabsq $-5614226457215950491, %rdx # imm = 0xB21642C8590B2165
; AVX1-NEXT:    movq %rcx, %rax
; AVX1-NEXT:    imulq %rdx
; AVX1-NEXT:    addq %rcx, %rdx
; AVX1-NEXT:    movq %rdx, %rax
; AVX1-NEXT:    shrq $63, %rax
; AVX1-NEXT:    sarq $4, %rdx
; AVX1-NEXT:    addq %rax, %rdx
; AVX1-NEXT:    leaq (%rdx,%rdx,2), %rax
; AVX1-NEXT:    shlq $3, %rax
; AVX1-NEXT:    subq %rax, %rdx
; AVX1-NEXT:    addq %rcx, %rdx
; AVX1-NEXT:    vmovq %rdx, %xmm2
; AVX1-NEXT:    vpextrq $1, %xmm1, %rcx
; AVX1-NEXT:    movabsq $6966426675817289639, %rdx # imm = 0x60ADB826E5E517A7
; AVX1-NEXT:    movq %rcx, %rax
; AVX1-NEXT:    imulq %rdx
; AVX1-NEXT:    movq %rdx, %rax
; AVX1-NEXT:    shrq $63, %rax
; AVX1-NEXT:    sarq $11, %rdx
; AVX1-NEXT:    addq %rax, %rdx
; AVX1-NEXT:    imulq $5423, %rdx, %rax # imm = 0x152F
; AVX1-NEXT:    subq %rax, %rcx
; AVX1-NEXT:    vmovq %rcx, %xmm1
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX1-NEXT:    vpextrq $1, %xmm0, %rcx
; AVX1-NEXT:    movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5
; AVX1-NEXT:    movq %rcx, %rax
; AVX1-NEXT:    imulq %rdx
; AVX1-NEXT:    movq %rdx, %rax
; AVX1-NEXT:    shrq $63, %rax
; AVX1-NEXT:    sarq $8, %rdx
; AVX1-NEXT:    addq %rax, %rdx
; AVX1-NEXT:    imulq $654, %rdx, %rax # imm = 0x28E
; AVX1-NEXT:    subq %rax, %rcx
; AVX1-NEXT:    vmovq %rcx, %xmm0
; AVX1-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: dont_fold_srem_i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vmovq %xmm1, %rcx
; AVX2-NEXT:    movabsq $-5614226457215950491, %rdx # imm = 0xB21642C8590B2165
; AVX2-NEXT:    movq %rcx, %rax
; AVX2-NEXT:    imulq %rdx
; AVX2-NEXT:    addq %rcx, %rdx
; AVX2-NEXT:    movq %rdx, %rax
; AVX2-NEXT:    shrq $63, %rax
; AVX2-NEXT:    sarq $4, %rdx
; AVX2-NEXT:    addq %rax, %rdx
; AVX2-NEXT:    leaq (%rdx,%rdx,2), %rax
; AVX2-NEXT:    shlq $3, %rax
; AVX2-NEXT:    subq %rax, %rdx
; AVX2-NEXT:    addq %rcx, %rdx
; AVX2-NEXT:    vmovq %rdx, %xmm2
; AVX2-NEXT:    vpextrq $1, %xmm1, %rcx
; AVX2-NEXT:    movabsq $6966426675817289639, %rdx # imm = 0x60ADB826E5E517A7
; AVX2-NEXT:    movq %rcx, %rax
; AVX2-NEXT:    imulq %rdx
; AVX2-NEXT:    movq %rdx, %rax
; AVX2-NEXT:    shrq $63, %rax
; AVX2-NEXT:    sarq $11, %rdx
; AVX2-NEXT:    addq %rax, %rdx
; AVX2-NEXT:    imulq $5423, %rdx, %rax # imm = 0x152F
; AVX2-NEXT:    subq %rax, %rcx
; AVX2-NEXT:    vmovq %rcx, %xmm1
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX2-NEXT:    vpextrq $1, %xmm0, %rcx
; AVX2-NEXT:    movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5
; AVX2-NEXT:    movq %rcx, %rax
; AVX2-NEXT:    imulq %rdx
; AVX2-NEXT:    movq %rdx, %rax
; AVX2-NEXT:    shrq $63, %rax
; AVX2-NEXT:    sarq $8, %rdx
; AVX2-NEXT:    addq %rax, %rdx
; AVX2-NEXT:    imulq $654, %rdx, %rax # imm = 0x28E
; AVX2-NEXT:    subq %rax, %rcx
; AVX2-NEXT:    vmovq %rcx, %xmm0
; AVX2-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %1 = srem <4 x i64> %x, <i64 1, i64 654, i64 23, i64 5423>
  ret <4 x i64> %1
}