; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -disable-peephole -mtriple=i686-unknown-unknown -mattr=+avx,+xop | FileCheck %s --check-prefixes=CHECK,X86
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx,+xop | FileCheck %s --check-prefixes=CHECK,X64
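; These tests load the first source operand of each XOP intrinsic from
; memory. Folding that load requires the backend to commute the operands:
; for the vpcom* comparisons the immediate predicate is rewritten to the
; swapped form (lt <-> gt, le <-> ge), eq/neq are symmetric and keep their
; predicate, and the false/true predicates constant-fold to zero/all-ones.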
define <16 x i8> @commute_fold_vpcomb(<16 x i8>* %a0, <16 x i8> %a1) {
; X86-LABEL: commute_fold_vpcomb:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcomgtb (%eax), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: commute_fold_vpcomb:
; X64:       # %bb.0:
; X64-NEXT:    vpcomgtb (%rdi), %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = load <16 x i8>, <16 x i8>* %a0
  %2 = call <16 x i8> @llvm.x86.xop.vpcomb(<16 x i8> %1, <16 x i8> %a1, i8 0) ; vpcomltb
  ret <16 x i8> %2
}
declare <16 x i8> @llvm.x86.xop.vpcomb(<16 x i8>, <16 x i8>, i8) nounwind readnone

define <4 x i32> @commute_fold_vpcomd(<4 x i32>* %a0, <4 x i32> %a1) {
; X86-LABEL: commute_fold_vpcomd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcomged (%eax), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: commute_fold_vpcomd:
; X64:       # %bb.0:
; X64-NEXT:    vpcomged (%rdi), %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = load <4 x i32>, <4 x i32>* %a0
  %2 = call <4 x i32> @llvm.x86.xop.vpcomd(<4 x i32> %1, <4 x i32> %a1, i8 1) ; vpcomled
  ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.xop.vpcomd(<4 x i32>, <4 x i32>, i8) nounwind readnone

define <2 x i64> @commute_fold_vpcomq(<2 x i64>* %a0, <2 x i64> %a1) {
; X86-LABEL: commute_fold_vpcomq:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcomltq (%eax), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: commute_fold_vpcomq:
; X64:       # %bb.0:
; X64-NEXT:    vpcomltq (%rdi), %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = load <2 x i64>, <2 x i64>* %a0
  %2 = call <2 x i64> @llvm.x86.xop.vpcomq(<2 x i64> %1, <2 x i64> %a1, i8 2) ; vpcomgtq
  ret <2 x i64> %2
}
declare <2 x i64> @llvm.x86.xop.vpcomq(<2 x i64>, <2 x i64>, i8) nounwind readnone

define <16 x i8> @commute_fold_vpcomub(<16 x i8>* %a0, <16 x i8> %a1) {
; X86-LABEL: commute_fold_vpcomub:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcomleub (%eax), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: commute_fold_vpcomub:
; X64:       # %bb.0:
; X64-NEXT:    vpcomleub (%rdi), %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = load <16 x i8>, <16 x i8>* %a0
  %2 = call <16 x i8> @llvm.x86.xop.vpcomub(<16 x i8> %1, <16 x i8> %a1, i8 3) ; vpcomgeub
  ret <16 x i8> %2
}
declare <16 x i8> @llvm.x86.xop.vpcomub(<16 x i8>, <16 x i8>, i8) nounwind readnone

define <4 x i32> @commute_fold_vpcomud(<4 x i32>* %a0, <4 x i32> %a1) {
; X86-LABEL: commute_fold_vpcomud:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcomeqd (%eax), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: commute_fold_vpcomud:
; X64:       # %bb.0:
; X64-NEXT:    vpcomeqd (%rdi), %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = load <4 x i32>, <4 x i32>* %a0
  %2 = call <4 x i32> @llvm.x86.xop.vpcomud(<4 x i32> %1, <4 x i32> %a1, i8 4) ; vpcomequd
  ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.xop.vpcomud(<4 x i32>, <4 x i32>, i8) nounwind readnone

define <2 x i64> @commute_fold_vpcomuq(<2 x i64>* %a0, <2 x i64> %a1) {
; X86-LABEL: commute_fold_vpcomuq:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcomneqq (%eax), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: commute_fold_vpcomuq:
; X64:       # %bb.0:
; X64-NEXT:    vpcomneqq (%rdi), %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = load <2 x i64>, <2 x i64>* %a0
  %2 = call <2 x i64> @llvm.x86.xop.vpcomuq(<2 x i64> %1, <2 x i64> %a1, i8 5) ; vpcomnequq
  ret <2 x i64> %2
}
declare <2 x i64> @llvm.x86.xop.vpcomuq(<2 x i64>, <2 x i64>, i8) nounwind readnone

define <8 x i16> @commute_fold_vpcomuw(<8 x i16>* %a0, <8 x i16> %a1) {
; CHECK-LABEL: commute_fold_vpcomuw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = load <8 x i16>, <8 x i16>* %a0
  %2 = call <8 x i16> @llvm.x86.xop.vpcomuw(<8 x i16> %1, <8 x i16> %a1, i8 6) ; vpcomfalseuw
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.x86.xop.vpcomuw(<8 x i16>, <8 x i16>, i8) nounwind readnone

define <8 x i16> @commute_fold_vpcomw(<8 x i16>* %a0, <8 x i16> %a1) {
; CHECK-LABEL: commute_fold_vpcomw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = load <8 x i16>, <8 x i16>* %a0
  %2 = call <8 x i16> @llvm.x86.xop.vpcomw(<8 x i16> %1, <8 x i16> %a1, i8 7) ; vpcomtruew
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.x86.xop.vpcomw(<8 x i16>, <8 x i16>, i8) nounwind readnone
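; For the vpmacs*/vpmadcs* multiply-accumulate tests that follow, only the
; two multiplicand operands commute, so the folded form keeps the loaded
; value in the memory-capable source slot
; (e.g. vpmacsdd %xmm1, (%eax), %xmm0, %xmm0) while the accumulator
; operand stays in place.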
define <4 x i32> @commute_fold_vpmacsdd(<4 x i32>* %a0, <4 x i32> %a1, <4 x i32> %a2) {
; X86-LABEL: commute_fold_vpmacsdd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmacsdd %xmm1, (%eax), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: commute_fold_vpmacsdd:
; X64:       # %bb.0:
; X64-NEXT:    vpmacsdd %xmm1, (%rdi), %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = load <4 x i32>, <4 x i32>* %a0
  %2 = call <4 x i32> @llvm.x86.xop.vpmacsdd(<4 x i32> %1, <4 x i32> %a1, <4 x i32> %a2)
  ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.xop.vpmacsdd(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone

define <2 x i64> @commute_fold_vpmacsdqh(<4 x i32>* %a0, <4 x i32> %a1, <2 x i64> %a2) {
; X86-LABEL: commute_fold_vpmacsdqh:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmacsdqh %xmm1, (%eax), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: commute_fold_vpmacsdqh:
; X64:       # %bb.0:
; X64-NEXT:    vpmacsdqh %xmm1, (%rdi), %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = load <4 x i32>, <4 x i32>* %a0
  %2 = call <2 x i64> @llvm.x86.xop.vpmacsdqh(<4 x i32> %1, <4 x i32> %a1, <2 x i64> %a2)
  ret <2 x i64> %2
}
declare <2 x i64> @llvm.x86.xop.vpmacsdqh(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone

define <2 x i64> @commute_fold_vpmacsdql(<4 x i32>* %a0, <4 x i32> %a1, <2 x i64> %a2) {
; X86-LABEL: commute_fold_vpmacsdql:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmacsdql %xmm1, (%eax), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: commute_fold_vpmacsdql:
; X64:       # %bb.0:
; X64-NEXT:    vpmacsdql %xmm1, (%rdi), %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = load <4 x i32>, <4 x i32>* %a0
  %2 = call <2 x i64> @llvm.x86.xop.vpmacsdql(<4 x i32> %1, <4 x i32> %a1, <2 x i64> %a2)
  ret <2 x i64> %2
}
declare <2 x i64> @llvm.x86.xop.vpmacsdql(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone

define <4 x i32> @commute_fold_vpmacssdd(<4 x i32>* %a0, <4 x i32> %a1, <4 x i32> %a2) {
; X86-LABEL: commute_fold_vpmacssdd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmacssdd %xmm1, (%eax), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: commute_fold_vpmacssdd:
; X64:       # %bb.0:
; X64-NEXT:    vpmacssdd %xmm1, (%rdi), %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = load <4 x i32>, <4 x i32>* %a0
  %2 = call <4 x i32> @llvm.x86.xop.vpmacssdd(<4 x i32> %1, <4 x i32> %a1, <4 x i32> %a2)
  ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.xop.vpmacssdd(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone

define <2 x i64> @commute_fold_vpmacssdqh(<4 x i32>* %a0, <4 x i32> %a1, <2 x i64> %a2) {
; X86-LABEL: commute_fold_vpmacssdqh:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmacssdqh %xmm1, (%eax), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: commute_fold_vpmacssdqh:
; X64:       # %bb.0:
; X64-NEXT:    vpmacssdqh %xmm1, (%rdi), %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = load <4 x i32>, <4 x i32>* %a0
  %2 = call <2 x i64> @llvm.x86.xop.vpmacssdqh(<4 x i32> %1, <4 x i32> %a1, <2 x i64> %a2)
  ret <2 x i64> %2
}
declare <2 x i64> @llvm.x86.xop.vpmacssdqh(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone

define <2 x i64> @commute_fold_vpmacssdql(<4 x i32>* %a0, <4 x i32> %a1, <2 x i64> %a2) {
; X86-LABEL: commute_fold_vpmacssdql:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmacssdql %xmm1, (%eax), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: commute_fold_vpmacssdql:
; X64:       # %bb.0:
; X64-NEXT:    vpmacssdql %xmm1, (%rdi), %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = load <4 x i32>, <4 x i32>* %a0
  %2 = call <2 x i64> @llvm.x86.xop.vpmacssdql(<4 x i32> %1, <4 x i32> %a1, <2 x i64> %a2)
  ret <2 x i64> %2
}
declare <2 x i64> @llvm.x86.xop.vpmacssdql(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone

define <4 x i32> @commute_fold_vpmacsswd(<8 x i16>* %a0, <8 x i16> %a1, <4 x i32> %a2) {
; X86-LABEL: commute_fold_vpmacsswd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmacsswd %xmm1, (%eax), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: commute_fold_vpmacsswd:
; X64:       # %bb.0:
; X64-NEXT:    vpmacsswd %xmm1, (%rdi), %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = load <8 x i16>, <8 x i16>* %a0
  %2 = call <4 x i32> @llvm.x86.xop.vpmacsswd(<8 x i16> %1, <8 x i16> %a1, <4 x i32> %a2)
  ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.xop.vpmacsswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone

define <8 x i16> @commute_fold_vpmacssww(<8 x i16>* %a0, <8 x i16> %a1, <8 x i16> %a2) {
; X86-LABEL: commute_fold_vpmacssww:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmacssww %xmm1, (%eax), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: commute_fold_vpmacssww:
; X64:       # %bb.0:
; X64-NEXT:    vpmacssww %xmm1, (%rdi), %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = load <8 x i16>, <8 x i16>* %a0
  %2 = call <8 x i16> @llvm.x86.xop.vpmacssww(<8 x i16> %1, <8 x i16> %a1, <8 x i16> %a2)
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.x86.xop.vpmacssww(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone

define <4 x i32> @commute_fold_vpmacswd(<8 x i16>* %a0, <8 x i16> %a1, <4 x i32> %a2) {
; X86-LABEL: commute_fold_vpmacswd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmacswd %xmm1, (%eax), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: commute_fold_vpmacswd:
; X64:       # %bb.0:
; X64-NEXT:    vpmacswd %xmm1, (%rdi), %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = load <8 x i16>, <8 x i16>* %a0
  %2 = call <4 x i32> @llvm.x86.xop.vpmacswd(<8 x i16> %1, <8 x i16> %a1, <4 x i32> %a2)
  ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.xop.vpmacswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone

define <8 x i16> @commute_fold_vpmacsww(<8 x i16>* %a0, <8 x i16> %a1, <8 x i16> %a2) {
; X86-LABEL: commute_fold_vpmacsww:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmacsww %xmm1, (%eax), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: commute_fold_vpmacsww:
; X64:       # %bb.0:
; X64-NEXT:    vpmacsww %xmm1, (%rdi), %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = load <8 x i16>, <8 x i16>* %a0
  %2 = call <8 x i16> @llvm.x86.xop.vpmacsww(<8 x i16> %1, <8 x i16> %a1, <8 x i16> %a2)
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.x86.xop.vpmacsww(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone
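; The vpmadcs*wd tests below additionally sum adjacent word products before
; accumulating into the dword accumulator; their word multiplicands commute
; just like the vpmacs* forms above.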
define <4 x i32> @commute_fold_vpmadcsswd(<8 x i16>* %a0, <8 x i16> %a1, <4 x i32> %a2) {
; X86-LABEL: commute_fold_vpmadcsswd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmadcsswd %xmm1, (%eax), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: commute_fold_vpmadcsswd:
; X64:       # %bb.0:
; X64-NEXT:    vpmadcsswd %xmm1, (%rdi), %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = load <8 x i16>, <8 x i16>* %a0
  %2 = call <4 x i32> @llvm.x86.xop.vpmadcsswd(<8 x i16> %1, <8 x i16> %a1, <4 x i32> %a2)
  ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.xop.vpmadcsswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone

define <4 x i32> @commute_fold_vpmadcswd(<8 x i16>* %a0, <8 x i16> %a1, <4 x i32> %a2) {
; X86-LABEL: commute_fold_vpmadcswd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmadcswd %xmm1, (%eax), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: commute_fold_vpmadcswd:
; X64:       # %bb.0:
; X64-NEXT:    vpmadcswd %xmm1, (%rdi), %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = load <8 x i16>, <8 x i16>* %a0
  %2 = call <4 x i32> @llvm.x86.xop.vpmadcswd(<8 x i16> %1, <8 x i16> %a1, <4 x i32> %a2)
  ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.xop.vpmadcswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone