//===-- X86InstrSSE.td - SSE Instruction Set ---------------*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file describes the X86 SSE instruction set, defining the instructions,
// and properties of the instructions which are needed for code generation,
// machine code emission, and analysis.
//
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// SSE 1 & 2 Instructions Classes
//===----------------------------------------------------------------------===//

/// sse12_fp_scalar - SSE 1 & 2 scalar instructions class
multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
                           RegisterClass RC, X86MemOperand x86memop,
                           Domain d, X86FoldableSchedWrite sched,
                           bit Is2Addr = 1> {
let isCodeGenOnly = 1 in {
  let isCommutable = 1 in {
    def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
                !if(Is2Addr,
                    !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                    !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], d>,
                Sched<[sched]>;
  }
  def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
              !if(Is2Addr,
                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                  !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
              [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], d>,
              Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}

/// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class
multiclass sse12_fp_scalar_int<bits<8> opc,
                               SDPatternOperator OpNode, RegisterClass RC,
                               ValueType VT, string asm, Operand memopr,
                               PatFrags mem_frags, Domain d,
                               X86FoldableSchedWrite sched, bit Is2Addr = 1> {
let hasSideEffects = 0 in {
  def rr_Int : SI_Int<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
               !if(Is2Addr,
                   !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
                   !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
               [(set RC:$dst, (VT (OpNode RC:$src1, RC:$src2)))], d>,
               Sched<[sched]>;
  let mayLoad = 1 in
  def rm_Int : SI_Int<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2),
               !if(Is2Addr,
                   !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
                   !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
               [(set RC:$dst, (VT (OpNode RC:$src1, (mem_frags addr:$src2))))], d>,
               Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
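// Illustrative note (editorial addition, not in upstream): a hypothetical
// direct instantiation such as
//   defm ADDSS : sse12_fp_scalar<0x58, "addss", any_fadd, FR32, f32mem,
//                                SSEPackedSingle, SchedWriteFAdd.Scl>, XS;
// would expand to ADDSSrr (register-register, commutable) and ADDSSrm
// (register-memory, with the load folded and scheduled via sched.Folded).
// Upstream reaches this multiclass through wrapper multiclasses rather than
// instantiating it directly; the names above are only an example.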
/// sse12_fp_packed - SSE 1 & 2 packed instructions class
multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
                           RegisterClass RC, ValueType vt,
                           X86MemOperand x86memop, PatFrag mem_frag,
                           Domain d, X86FoldableSchedWrite sched,
                           bit Is2Addr = 1> {
  let isCommutable = 1 in
    def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
                !if(Is2Addr,
                    !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                    !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], d>,
                Sched<[sched]>;
  let mayLoad = 1 in
    def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
                !if(Is2Addr,
                    !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                    !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))],
                d>,
                Sched<[sched.Folded, sched.ReadAfterFold]>;
}

/// sse12_fp_packed_logical_rm - SSE 1 & 2 packed instructions class
multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
                                      string OpcodeStr, X86MemOperand x86memop,
                                      X86FoldableSchedWrite sched,
                                      list<dag> pat_rr, list<dag> pat_rm,
                                      bit Is2Addr = 1> {
  let isCommutable = 1, hasSideEffects = 0 in
    def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
                !if(Is2Addr,
                    !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                    !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                pat_rr, d>,
                Sched<[sched]>;
  let hasSideEffects = 0, mayLoad = 1 in
    def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
                !if(Is2Addr,
                    !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                    !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                pat_rm, d>,
                Sched<[sched.Folded, sched.ReadAfterFold]>;
}


// Alias instructions that map fld0 to xorps for sse or vxorps for avx.
// This is expanded by ExpandPostRAPseudos.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, SchedRW = [WriteZero] in {
  def FsFLD0SH : I<0, Pseudo, (outs FR16:$dst), (ins), "",
                   [(set FR16:$dst, fp16imm0)]>, Requires<[HasSSE2, NoAVX512]>;
  def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
                   [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1, NoAVX512]>;
  def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
                   [(set FR64:$dst, fp64imm0)]>, Requires<[HasSSE2, NoAVX512]>;
  def FsFLD0F128 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
                     [(set VR128:$dst, fp128imm0)]>, Requires<[HasSSE1, NoAVX512]>;
}

//===----------------------------------------------------------------------===//
// AVX & SSE - Zero/One Vectors
//===----------------------------------------------------------------------===//

// Alias instruction that maps zero vector to pxor / xorp* for sse.
// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
// swizzled by ExecutionDomainFix to pxor.
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-zeros value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in {
def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
               [(set VR128:$dst, (v4f32 immAllZerosV))]>;
}

let Predicates = [NoAVX512] in {
def : Pat<(v16i8 immAllZerosV), (V_SET0)>;
def : Pat<(v8i16 immAllZerosV), (V_SET0)>;
def : Pat<(v8f16 immAllZerosV), (V_SET0)>;
def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
def : Pat<(v2i64 immAllZerosV), (V_SET0)>;
def : Pat<(v2f64 immAllZerosV), (V_SET0)>;
}
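// Illustrative note (editorial addition, not in upstream): with the patterns
// above, e.g. returning <4 x i32> zeroinitializer selects the single V_SET0
// pseudo regardless of element type, so one zero idiom covers every 128-bit
// vector type and can be rematerialized or folded as a load as needed.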
// The same as done above but for AVX. The 256-bit AVX1 ISA doesn't support PI,
// and doesn't need it because on Sandy Bridge the register is set to zero
// at the rename stage without using any execution unit, so SET0PSY
// and SET0PDY can be used for vector int instructions without penalty.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in {
def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                 [(set VR256:$dst, (v8i32 immAllZerosV))]>;
}

let Predicates = [NoAVX512] in {
def : Pat<(v32i8 immAllZerosV), (AVX_SET0)>;
def : Pat<(v16i16 immAllZerosV), (AVX_SET0)>;
def : Pat<(v16f16 immAllZerosV), (AVX_SET0)>;
def : Pat<(v4i64 immAllZerosV), (AVX_SET0)>;
def : Pat<(v8f32 immAllZerosV), (AVX_SET0)>;
def : Pat<(v4f64 immAllZerosV), (AVX_SET0)>;
}

// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-ones value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, SchedRW = [WriteZero] in {
  def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "",
                       [(set VR128:$dst, (v4i32 immAllOnesV))]>;
  let Predicates = [HasAVX1Only, OptForMinSize] in {
  def AVX1_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                          [(set VR256:$dst, (v8i32 immAllOnesV))]>;
  }
  let Predicates = [HasAVX2] in
  def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                          [(set VR256:$dst, (v8i32 immAllOnesV))]>;
}
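// Illustrative note (editorial addition, not in upstream): the all-ones
// pseudos are typically expanded post-RA to a compare-equal of a register
// with itself (e.g. "pcmpeqd %xmm0, %xmm0"), an idiom that needs no
// constant-pool load unless folding one is actually profitable.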
192//===----------------------------------------------------------------------===// 193 194multiclass sse12_move_rr<SDNode OpNode, ValueType vt, string base_opc, 195 string asm_opr, Domain d, string Name> { 196 let isCommutable = 1 in 197 def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst), 198 (ins VR128:$src1, VR128:$src2), 199 !strconcat(base_opc, asm_opr), 200 [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))], d>, 201 Sched<[SchedWriteFShuffle.XMM]>; 202 203 // For the disassembler 204 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in 205 def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst), 206 (ins VR128:$src1, VR128:$src2), 207 !strconcat(base_opc, asm_opr), []>, 208 Sched<[SchedWriteFShuffle.XMM]>, FoldGenData<Name#rr>; 209} 210 211multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt, 212 X86MemOperand x86memop, string OpcodeStr, 213 Domain d, string Name, Predicate pred> { 214 // AVX 215 let Predicates = [UseAVX, OptForSize] in 216 defm V#NAME : sse12_move_rr<OpNode, vt, OpcodeStr, 217 "\t{$src2, $src1, $dst|$dst, $src1, $src2}", d, 218 "V"#Name>, 219 VEX_4V, VEX_LIG, VEX_WIG; 220 221 def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src), 222 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 223 [(store RC:$src, addr:$dst)], d>, 224 VEX, VEX_LIG, Sched<[WriteFStore]>, VEX_WIG; 225 // SSE1 & 2 226 let Constraints = "$src1 = $dst" in { 227 let Predicates = [pred, NoSSE41_Or_OptForSize] in 228 defm NAME : sse12_move_rr<OpNode, vt, OpcodeStr, 229 "\t{$src2, $dst|$dst, $src2}", d, Name>; 230 } 231 232 def NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src), 233 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 234 [(store RC:$src, addr:$dst)], d>, 235 Sched<[WriteFStore]>; 236 237 def : InstAlias<"v"#OpcodeStr#".s\t{$src2, $src1, $dst|$dst, $src1, $src2}", 238 (!cast<Instruction>("V"#NAME#"rr_REV") 239 VR128:$dst, VR128:$src1, VR128:$src2), 0>; 240 def : InstAlias<OpcodeStr#".s\t{$src2, $dst|$dst, $src2}", 241 (!cast<Instruction>(NAME#"rr_REV") 242 VR128:$dst, VR128:$src2), 0>; 243} 244 245// Loading from memory automatically zeroing upper bits. 246multiclass sse12_move_rm<RegisterClass RC, ValueType vt, X86MemOperand x86memop, 247 PatFrag mem_pat, PatFrag vzloadfrag, string OpcodeStr, 248 Domain d> { 249 def V#NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src), 250 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 251 [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>, 252 VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG; 253 def NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src), 254 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 255 [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>, 256 Sched<[WriteFLoad]>; 257 258 // _alt version uses FR32/FR64 register class. 
// Loading from memory automatically zeroing upper bits.
multiclass sse12_move_rm<RegisterClass RC, ValueType vt, X86MemOperand x86memop,
                         PatFrag mem_pat, PatFrag vzloadfrag, string OpcodeStr,
                         Domain d> {
  def V#NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>,
                     VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG;
  def NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                   [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>,
                   Sched<[WriteFLoad]>;

  // _alt version uses FR32/FR64 register class.
  let isCodeGenOnly = 1 in {
    def V#NAME#rm_alt : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                           !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                           [(set RC:$dst, (mem_pat addr:$src))], d>,
                           VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG;
    def NAME#rm_alt : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                         !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                         [(set RC:$dst, (mem_pat addr:$src))], d>,
                         Sched<[WriteFLoad]>;
  }
}

defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
                        SSEPackedSingle, "MOVSS", UseSSE1>, XS;
defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
                        SSEPackedDouble, "MOVSD", UseSSE2>, XD;

let canFoldAsLoad = 1, isReMaterializable = 1 in {
  defm MOVSS : sse12_move_rm<FR32, v4f32, f32mem, loadf32, X86vzload32, "movss",
                             SSEPackedSingle>, XS;
  defm MOVSD : sse12_move_rm<FR64, v2f64, f64mem, loadf64, X86vzload64, "movsd",
                             SSEPackedDouble>, XD;
}

// Patterns
let Predicates = [UseAVX] in {
  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
            (VMOVSSrm addr:$src)>;
  def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
            (VMOVSDrm addr:$src)>;

  // Represent the same patterns above but in the form they appear for
  // 256-bit types
  def : Pat<(v8f32 (X86vzload32 addr:$src)),
            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
  def : Pat<(v4f64 (X86vzload64 addr:$src)),
            (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
}

let Predicates = [UseAVX, OptForSize] in {
  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
  // MOVSS to the lower bits.
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (VMOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (VMOVSSrr (v4i32 (V_SET0)), VR128:$src)>;

  // Move low f32 and clear high bits.
  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v4f32 (VMOVSSrr (v4f32 (V_SET0)),
              (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)))), sub_xmm)>;
  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v4i32 (VMOVSSrr (v4i32 (V_SET0)),
              (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)))), sub_xmm)>;
}
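// Illustrative note (editorial addition, not in upstream): X86vzmovl means
// "keep element 0, zero the upper elements"; the OptForSize patterns build
// it from a zeroed register plus a movss merge, while without OptForSize a
// blend against zero is generally preferred on SSE4.1+ targets.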
let Predicates = [UseSSE1, NoSSE41_Or_OptForSize] in {
// Move scalar to XMM zero-extended, zeroing a VR128 then do a
// MOVSS to the lower bits.
def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
          (MOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
          (MOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
}

let Predicates = [UseSSE2] in
def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
          (MOVSDrm addr:$src)>;

let Predicates = [UseSSE1] in
def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
          (MOVSSrm addr:$src)>;

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
//===----------------------------------------------------------------------===//

multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
                            X86MemOperand x86memop, PatFrag ld_frag,
                            string asm, Domain d,
                            X86SchedWriteMoveLS sched> {
let hasSideEffects = 0, isMoveReg = 1 in
  def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], d>,
              Sched<[sched.RR]>;
let canFoldAsLoad = 1, isReMaterializable = 1 in
  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
              [(set RC:$dst, (ld_frag addr:$src))], d>,
              Sched<[sched.RM]>;
}

let Predicates = [HasAVX, NoVLX] in {
defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
                                SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                                PS, VEX, VEX_WIG;
defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
                                SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                                PD, VEX, VEX_WIG;
defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
                                SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                                PS, VEX, VEX_WIG;
defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
                                SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                                PD, VEX, VEX_WIG;

defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32, "movaps",
                                 SSEPackedSingle, SchedWriteFMoveLS.YMM>,
                                 PS, VEX, VEX_L, VEX_WIG;
defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64, "movapd",
                                 SSEPackedDouble, SchedWriteFMoveLS.YMM>,
                                 PD, VEX, VEX_L, VEX_WIG;
defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32, "movups",
                                 SSEPackedSingle, SchedWriteFMoveLS.YMM>,
                                 PS, VEX, VEX_L, VEX_WIG;
defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, "movupd",
                                 SSEPackedDouble, SchedWriteFMoveLS.YMM>,
                                 PD, VEX, VEX_L, VEX_WIG;
}

let Predicates = [UseSSE1] in {
defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
                               SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                               PS;
defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
                               SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                               PS;
}
let Predicates = [UseSSE2] in {
defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
                               SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                               PD;
defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
                               SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                               PD;
}
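// Illustrative note (editorial addition, not in upstream): the aligned forms
// (movaps/movapd, matched via alignedload* fragments) fault on a misaligned
// address, while movups/movupd accept any address; the separate XMM/YMM
// SchedWriteFMoveLS bundles let each width be costed independently.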
"movapd\t{$src, $dst|$dst, $src}", 406 [(alignedstore (v2f64 VR128:$src), addr:$dst)]>, 407 VEX, VEX_WIG; 408def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), 409 "movups\t{$src, $dst|$dst, $src}", 410 [(store (v4f32 VR128:$src), addr:$dst)]>, 411 VEX, VEX_WIG; 412def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), 413 "movupd\t{$src, $dst|$dst, $src}", 414 [(store (v2f64 VR128:$src), addr:$dst)]>, 415 VEX, VEX_WIG; 416} // SchedRW 417 418let SchedRW = [SchedWriteFMoveLS.YMM.MR] in { 419def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), 420 "movaps\t{$src, $dst|$dst, $src}", 421 [(alignedstore (v8f32 VR256:$src), addr:$dst)]>, 422 VEX, VEX_L, VEX_WIG; 423def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), 424 "movapd\t{$src, $dst|$dst, $src}", 425 [(alignedstore (v4f64 VR256:$src), addr:$dst)]>, 426 VEX, VEX_L, VEX_WIG; 427def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), 428 "movups\t{$src, $dst|$dst, $src}", 429 [(store (v8f32 VR256:$src), addr:$dst)]>, 430 VEX, VEX_L, VEX_WIG; 431def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), 432 "movupd\t{$src, $dst|$dst, $src}", 433 [(store (v4f64 VR256:$src), addr:$dst)]>, 434 VEX, VEX_L, VEX_WIG; 435} // SchedRW 436} // Predicate 437 438// For disassembler 439let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, 440 isMoveReg = 1 in { 441let SchedRW = [SchedWriteFMoveLS.XMM.RR] in { 442 def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst), 443 (ins VR128:$src), 444 "movaps\t{$src, $dst|$dst, $src}", []>, 445 VEX, VEX_WIG, FoldGenData<"VMOVAPSrr">; 446 def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst), 447 (ins VR128:$src), 448 "movapd\t{$src, $dst|$dst, $src}", []>, 449 VEX, VEX_WIG, FoldGenData<"VMOVAPDrr">; 450 def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst), 451 (ins VR128:$src), 452 "movups\t{$src, $dst|$dst, $src}", []>, 453 VEX, VEX_WIG, FoldGenData<"VMOVUPSrr">; 454 def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst), 455 (ins VR128:$src), 456 "movupd\t{$src, $dst|$dst, $src}", []>, 457 VEX, VEX_WIG, FoldGenData<"VMOVUPDrr">; 458} // SchedRW 459 460let SchedRW = [SchedWriteFMoveLS.YMM.RR] in { 461 def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst), 462 (ins VR256:$src), 463 "movaps\t{$src, $dst|$dst, $src}", []>, 464 VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVAPSYrr">; 465 def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst), 466 (ins VR256:$src), 467 "movapd\t{$src, $dst|$dst, $src}", []>, 468 VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVAPDYrr">; 469 def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst), 470 (ins VR256:$src), 471 "movups\t{$src, $dst|$dst, $src}", []>, 472 VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVUPSYrr">; 473 def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst), 474 (ins VR256:$src), 475 "movupd\t{$src, $dst|$dst, $src}", []>, 476 VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVUPDYrr">; 477} // SchedRW 478} // Predicate 479 480// Reversed version with ".s" suffix for GAS compatibility. 
// Reversed version with ".s" suffix for GAS compatibility.
def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
                (VMOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
                (VMOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
                (VMOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
                (VMOVUPDrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
                (VMOVAPSYrr_REV VR256:$dst, VR256:$src), 0>;
def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
                (VMOVAPDYrr_REV VR256:$dst, VR256:$src), 0>;
def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
                (VMOVUPSYrr_REV VR256:$dst, VR256:$src), 0>;
def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
                (VMOVUPDYrr_REV VR256:$dst, VR256:$src), 0>;

let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movaps\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v4f32 VR128:$src), addr:$dst)]>;
def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movapd\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v2f64 VR128:$src), addr:$dst)]>;
def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movups\t{$src, $dst|$dst, $src}",
                   [(store (v4f32 VR128:$src), addr:$dst)]>;
def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movupd\t{$src, $dst|$dst, $src}",
                   [(store (v2f64 VR128:$src), addr:$dst)]>;
} // SchedRW

// For disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    isMoveReg = 1, SchedRW = [SchedWriteFMoveLS.XMM.RR] in {
  def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movaps\t{$src, $dst|$dst, $src}", []>,
                         FoldGenData<"MOVAPSrr">;
  def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movapd\t{$src, $dst|$dst, $src}", []>,
                         FoldGenData<"MOVAPDrr">;
  def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movups\t{$src, $dst|$dst, $src}", []>,
                         FoldGenData<"MOVUPSrr">;
  def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movupd\t{$src, $dst|$dst, $src}", []>,
                         FoldGenData<"MOVUPDrr">;
}

// Reversed version with ".s" suffix for GAS compatibility.
def : InstAlias<"movaps.s\t{$src, $dst|$dst, $src}",
                (MOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movapd.s\t{$src, $dst|$dst, $src}",
                (MOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movups.s\t{$src, $dst|$dst, $src}",
                (MOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movupd.s\t{$src, $dst|$dst, $src}",
                (MOVUPDrr_REV VR128:$dst, VR128:$src), 0>;
let Predicates = [HasAVX, NoVLX] in {
  // 256-bit loads/stores need to use floating point load/store in case we
  // don't have AVX2. Execution domain fixing will convert to integer if AVX2
  // is available and changing the domain is beneficial.
  def : Pat<(alignedloadv4i64 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(alignedloadv8i32 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(alignedloadv16i16 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(alignedloadv32i8 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(loadv4i64 addr:$src),
            (VMOVUPSYrm addr:$src)>;
  def : Pat<(loadv8i32 addr:$src),
            (VMOVUPSYrm addr:$src)>;
  def : Pat<(loadv16i16 addr:$src),
            (VMOVUPSYrm addr:$src)>;
  def : Pat<(loadv32i8 addr:$src),
            (VMOVUPSYrm addr:$src)>;

  def : Pat<(alignedstore (v4i64 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore (v8i32 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore (v16i16 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore (v32i8 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v4i64 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v8i32 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v16i16 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v32i8 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;

  def : Pat<(alignedloadv8f16 addr:$src),
            (VMOVAPSrm addr:$src)>;
  def : Pat<(loadv8f16 addr:$src),
            (VMOVUPSrm addr:$src)>;
  def : Pat<(alignedstore (v8f16 VR128:$src), addr:$dst),
            (VMOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8f16 VR128:$src), addr:$dst),
            (VMOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedloadv16f16 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(loadv16f16 addr:$src),
            (VMOVUPSYrm addr:$src)>;
  def : Pat<(alignedstore (v16f16 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v16f16 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
}
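// Illustrative note (editorial addition, not in upstream): for example, a
// v8i32 load selected as VMOVUPSYrm here may later be rewritten to
// VMOVDQUYrm by the execution-domain pass when AVX2 is available and the
// value's users live in the integer domain.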
// Use movaps / movups for SSE integer load / store (one byte shorter).
// The instructions selected below are then converted to MOVDQA/MOVDQU
// during the SSE domain pass.
let Predicates = [UseSSE1] in {
  def : Pat<(alignedloadv2i64 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(alignedloadv4i32 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(alignedloadv8i16 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(alignedloadv16i8 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(loadv2i64 addr:$src),
            (MOVUPSrm addr:$src)>;
  def : Pat<(loadv4i32 addr:$src),
            (MOVUPSrm addr:$src)>;
  def : Pat<(loadv8i16 addr:$src),
            (MOVUPSrm addr:$src)>;
  def : Pat<(loadv16i8 addr:$src),
            (MOVUPSrm addr:$src)>;

  def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v2i64 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
}

let Predicates = [UseSSE2] in {
  def : Pat<(alignedloadv8f16 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(loadv8f16 addr:$src),
            (MOVUPSrm addr:$src)>;
  def : Pat<(alignedstore (v8f16 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8f16 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Low packed FP Instructions
//===----------------------------------------------------------------------===//
multiclass sse12_mov_hilo_packed_base<bits<8> opc, SDPatternOperator pdnode,
                                      string base_opc, string asm_opr> {
  // No pattern as they need to be special cased between high and low.
  let hasSideEffects = 0, mayLoad = 1 in
  def PSrm : PI<opc, MRMSrcMem,
                (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
                !strconcat(base_opc, "s", asm_opr),
                [], SSEPackedSingle>, PS,
                Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;

  def PDrm : PI<opc, MRMSrcMem,
                (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
                !strconcat(base_opc, "d", asm_opr),
                [(set VR128:$dst, (v2f64 (pdnode VR128:$src1,
                                          (scalar_to_vector (loadf64 addr:$src2)))))],
                SSEPackedDouble>, PD,
                Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
}

multiclass sse12_mov_hilo_packed<bits<8> opc, SDPatternOperator pdnode,
                                 string base_opc> {
  let Predicates = [UseAVX] in
  defm V#NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc,
                                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}">,
                                           VEX_4V, VEX_WIG;

  let Constraints = "$src1 = $dst" in
  defm NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc,
                                         "\t{$src2, $dst|$dst, $src2}">;
}

defm MOVL : sse12_mov_hilo_packed<0x12, X86Movsd, "movlp">;

let SchedRW = [WriteFStore] in {
let Predicates = [UseAVX] in {
let mayStore = 1, hasSideEffects = 0 in
def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                     "movlps\t{$src, $dst|$dst, $src}",
                     []>,
                     VEX, VEX_WIG;
def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                     "movlpd\t{$src, $dst|$dst, $src}",
                     [(store (f64 (extractelt (v2f64 VR128:$src),
                                   (iPTR 0))), addr:$dst)]>,
                     VEX, VEX_WIG;
} // UseAVX
let mayStore = 1, hasSideEffects = 0 in
def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlps\t{$src, $dst|$dst, $src}",
                   []>;
def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (extractelt (v2f64 VR128:$src),
                                 (iPTR 0))), addr:$dst)]>;
} // SchedRW

let Predicates = [UseSSE1] in {
  // This pattern helps select MOVLPS on SSE1 only targets. With SSE2 we'll
  // end up with a movsd or blend instead of shufp.
  // No need for aligned load, we're only loading 64-bits.
  def : Pat<(X86Shufp (v4f32 (simple_load addr:$src2)), VR128:$src1,
                      (i8 -28)),
            (MOVLPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86Shufp (v4f32 (X86vzload64 addr:$src2)), VR128:$src1, (i8 -28)),
            (MOVLPSrm VR128:$src1, addr:$src2)>;

  def : Pat<(v4f32 (X86vzload64 addr:$src)),
            (MOVLPSrm (v4f32 (V_SET0)), addr:$src)>;
  def : Pat<(X86vextractstore64 (v4f32 VR128:$src), addr:$dst),
            (MOVLPSmr addr:$dst, VR128:$src)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Hi packed FP Instructions
//===----------------------------------------------------------------------===//

defm MOVH : sse12_mov_hilo_packed<0x16, X86Unpckl, "movhp">;

let SchedRW = [WriteFStore] in {
// v2f64 extract element 1 is always custom lowered to unpack high to low
// and extract element 0 so the non-store version isn't too horrible.
let Predicates = [UseAVX] in {
let mayStore = 1, hasSideEffects = 0 in
def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                     "movhps\t{$src, $dst|$dst, $src}",
                     []>, VEX, VEX_WIG;
def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                     "movhpd\t{$src, $dst|$dst, $src}",
                     [(store (f64 (extractelt
                                   (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
                                   (iPTR 0))), addr:$dst)]>, VEX, VEX_WIG;
} // UseAVX
let mayStore = 1, hasSideEffects = 0 in
def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhps\t{$src, $dst|$dst, $src}",
                   []>;
def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (extractelt
                                 (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
                                 (iPTR 0))), addr:$dst)]>;
} // SchedRW

let Predicates = [UseAVX] in {
  // MOVHPD patterns
  def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))),
            (VMOVHPDrm VR128:$src1, addr:$src2)>;

  def : Pat<(store (f64 (extractelt
                         (v2f64 (X86VPermilpi VR128:$src, (i8 1))),
                         (iPTR 0))), addr:$dst),
            (VMOVHPDmr addr:$dst, VR128:$src)>;

  // MOVLPD patterns
  def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload64 addr:$src2))),
            (VMOVLPDrm VR128:$src1, addr:$src2)>;
}

let Predicates = [UseSSE1] in {
  // This pattern helps select MOVHPS on SSE1 only targets. With SSE2 we'll
  // end up with a movsd or blend instead of shufp.
  // No need for aligned load, we're only loading 64-bits.
  def : Pat<(X86Movlhps VR128:$src1, (v4f32 (simple_load addr:$src2))),
            (MOVHPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86Movlhps VR128:$src1, (v4f32 (X86vzload64 addr:$src2))),
            (MOVHPSrm VR128:$src1, addr:$src2)>;

  def : Pat<(X86vextractstore64 (v4f32 (X86Movhlps VR128:$src, VR128:$src)),
                                addr:$dst),
            (MOVHPSmr addr:$dst, VR128:$src)>;
}

let Predicates = [UseSSE2] in {
  // MOVHPD patterns
  def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))),
            (MOVHPDrm VR128:$src1, addr:$src2)>;

  def : Pat<(store (f64 (extractelt
                         (v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))),
                         (iPTR 0))), addr:$dst),
            (MOVHPDmr addr:$dst, VR128:$src)>;

  // MOVLPD patterns
  def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload64 addr:$src2))),
            (MOVLPDrm VR128:$src1, addr:$src2)>;
}
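// Illustrative note (editorial addition, not in upstream): the store
// patterns above recognize "shuffle the high f64 into element 0, then store
// element 0" and emit a single movhpd/vmovhpd store of the upper half,
// avoiding the explicit shuffle altogether.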
let Predicates = [UseSSE2, NoSSE41_Or_OptForSize] in {
  // Use MOVLPD to load into the low bits from a full vector unless we can use
  // BLENDPD.
  def : Pat<(X86Movsd VR128:$src1, (v2f64 (simple_load addr:$src2))),
            (MOVLPDrm VR128:$src1, addr:$src2)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions
//===----------------------------------------------------------------------===//

let Predicates = [UseAVX] in {
  def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst),
                        (ins VR128:$src1, VR128:$src2),
                        "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                        [(set VR128:$dst,
                          (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
                        VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG;
  let isCommutable = 1 in
  def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst),
                        (ins VR128:$src1, VR128:$src2),
                        "movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                        [(set VR128:$dst,
                          (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
                        VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG,
                        NotMemoryFoldable;
}
let Constraints = "$src1 = $dst" in {
  def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
                      (ins VR128:$src1, VR128:$src2),
                      "movlhps\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
                      Sched<[SchedWriteFShuffle.XMM]>;
  let isCommutable = 1 in
  def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
                      (ins VR128:$src1, VR128:$src2),
                      "movhlps\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
                      Sched<[SchedWriteFShuffle.XMM]>, NotMemoryFoldable;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Conversion Instructions
//===----------------------------------------------------------------------===//

multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                       SDPatternOperator OpNode, X86MemOperand x86memop, PatFrag ld_frag,
                       string asm, string mem, X86FoldableSchedWrite sched,
                       Domain d,
                       SchedRead Int2Fpu = ReadDefault> {
  let ExeDomain = d in {
  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
              [(set DstRC:$dst, (OpNode SrcRC:$src))]>,
              Sched<[sched, Int2Fpu]>;
  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
              mem#"\t{$src, $dst|$dst, $src}",
              [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>,
              Sched<[sched.Folded]>;
  }
}

multiclass sse12_cvt_p<bits<8> opc, RegisterClass RC, X86MemOperand x86memop,
                       ValueType DstTy, ValueType SrcTy, PatFrag ld_frag,
                       string asm, Domain d, X86FoldableSchedWrite sched> {
let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1 in {
  def rr : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), asm,
             [(set RC:$dst, (DstTy (any_sint_to_fp (SrcTy RC:$src))))], d>,
             Sched<[sched]>;
  let mayLoad = 1 in
  def rm : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), asm,
             [(set RC:$dst, (DstTy (any_sint_to_fp
                                    (SrcTy (ld_frag addr:$src)))))], d>,
             Sched<[sched.Folded]>;
}
}
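// Illustrative note (editorial addition, not in upstream): the 0x2C forms
// below (cvttss2si/cvttsd2si, "tt" = truncate) match any_fp_to_sint, i.e. a
// C-style cast toward zero, while the 0x2D forms round under the current
// MXCSR mode and therefore match lrint/llrint instead.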
$src}"), []>, 882 Sched<[sched, ReadDefault, ReadInt2Fpu]>; 883 let mayLoad = 1 in 884 def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), 885 (ins DstRC:$src1, x86memop:$src), 886 asm#"{"#mem#"}\t{$src, $src1, $dst|$dst, $src1, $src}", []>, 887 Sched<[sched.Folded, sched.ReadAfterFold]>; 888} // hasSideEffects = 0 889} 890 891let isCodeGenOnly = 1, Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in { 892defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, any_fp_to_sint, f32mem, loadf32, 893 "cvttss2si", "cvttss2si", 894 WriteCvtSS2I, SSEPackedSingle>, 895 XS, VEX, VEX_LIG; 896defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, any_fp_to_sint, f32mem, loadf32, 897 "cvttss2si", "cvttss2si", 898 WriteCvtSS2I, SSEPackedSingle>, 899 XS, VEX, VEX_W, VEX_LIG; 900defm VCVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, any_fp_to_sint, f64mem, loadf64, 901 "cvttsd2si", "cvttsd2si", 902 WriteCvtSD2I, SSEPackedDouble>, 903 XD, VEX, VEX_LIG; 904defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, any_fp_to_sint, f64mem, loadf64, 905 "cvttsd2si", "cvttsd2si", 906 WriteCvtSD2I, SSEPackedDouble>, 907 XD, VEX, VEX_W, VEX_LIG; 908 909defm VCVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, lrint, f32mem, loadf32, 910 "cvtss2si", "cvtss2si", 911 WriteCvtSS2I, SSEPackedSingle>, 912 XS, VEX, VEX_LIG; 913defm VCVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, llrint, f32mem, loadf32, 914 "cvtss2si", "cvtss2si", 915 WriteCvtSS2I, SSEPackedSingle>, 916 XS, VEX, VEX_W, VEX_LIG; 917defm VCVTSD2SI : sse12_cvt_s<0x2D, FR64, GR32, lrint, f64mem, loadf64, 918 "cvtsd2si", "cvtsd2si", 919 WriteCvtSD2I, SSEPackedDouble>, 920 XD, VEX, VEX_LIG; 921defm VCVTSD2SI64 : sse12_cvt_s<0x2D, FR64, GR64, llrint, f64mem, loadf64, 922 "cvtsd2si", "cvtsd2si", 923 WriteCvtSD2I, SSEPackedDouble>, 924 XD, VEX, VEX_W, VEX_LIG; 925} 926 927// The assembler can recognize rr 64-bit instructions by seeing a rxx 928// register, but the same isn't true when only using memory operands, 929// provide other assembly "l" and "q" forms to address this explicitly 930// where appropriate to do so. 
// The assembler can recognize rr 64-bit instructions by seeing a rxx
// register, but the same isn't true when only using memory operands;
// provide other assembly "l" and "q" forms to address this explicitly
// where appropriate to do so.
let isCodeGenOnly = 1 in {
defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss", "l",
                                WriteCvtI2SS, SSEPackedSingle>, XS, VEX_4V,
                                VEX_LIG, SIMD_EXC;
defm VCVTSI642SS : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss", "q",
                                  WriteCvtI2SS, SSEPackedSingle>, XS, VEX_4V,
                                  VEX_W, VEX_LIG, SIMD_EXC;
defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd", "l",
                                WriteCvtI2SD, SSEPackedDouble>, XD, VEX_4V,
                                VEX_LIG;
defm VCVTSI642SD : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd", "q",
                                  WriteCvtI2SD, SSEPackedDouble>, XD, VEX_4V,
                                  VEX_W, VEX_LIG, SIMD_EXC;
} // isCodeGenOnly = 1

let Predicates = [UseAVX] in {
  def : Pat<(f32 (any_sint_to_fp (loadi32 addr:$src))),
            (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f32 (any_sint_to_fp (loadi64 addr:$src))),
            (VCVTSI642SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f64 (any_sint_to_fp (loadi32 addr:$src))),
            (VCVTSI2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f64 (any_sint_to_fp (loadi64 addr:$src))),
            (VCVTSI642SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;

  def : Pat<(f32 (any_sint_to_fp GR32:$src)),
            (VCVTSI2SSrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
  def : Pat<(f32 (any_sint_to_fp GR64:$src)),
            (VCVTSI642SSrr (f32 (IMPLICIT_DEF)), GR64:$src)>;
  def : Pat<(f64 (any_sint_to_fp GR32:$src)),
            (VCVTSI2SDrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
  def : Pat<(f64 (any_sint_to_fp GR64:$src)),
            (VCVTSI642SDrr (f64 (IMPLICIT_DEF)), GR64:$src)>;

  def : Pat<(i64 (lrint FR32:$src)), (VCVTSS2SI64rr FR32:$src)>;
  def : Pat<(i64 (lrint (loadf32 addr:$src))), (VCVTSS2SI64rm addr:$src)>;

  def : Pat<(i64 (lrint FR64:$src)), (VCVTSD2SI64rr FR64:$src)>;
  def : Pat<(i64 (lrint (loadf64 addr:$src))), (VCVTSD2SI64rm addr:$src)>;
}
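// Illustrative note (editorial addition, not in upstream): the
// (IMPLICIT_DEF) first operand merely satisfies the three-operand VEX form;
// the merged upper bits are undefined, and a later pass may insert a
// dependency-breaking idiom so the conversion does not stall on a stale
// register.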
let isCodeGenOnly = 1 in {
defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, any_fp_to_sint, f32mem, loadf32,
                             "cvttss2si", "cvttss2si",
                             WriteCvtSS2I, SSEPackedSingle>, XS, SIMD_EXC;
defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, any_fp_to_sint, f32mem, loadf32,
                               "cvttss2si", "cvttss2si",
                               WriteCvtSS2I, SSEPackedSingle>, XS, REX_W, SIMD_EXC;
defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, any_fp_to_sint, f64mem, loadf64,
                             "cvttsd2si", "cvttsd2si",
                             WriteCvtSD2I, SSEPackedDouble>, XD, SIMD_EXC;
defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, any_fp_to_sint, f64mem, loadf64,
                               "cvttsd2si", "cvttsd2si",
                               WriteCvtSD2I, SSEPackedDouble>, XD, REX_W, SIMD_EXC;

defm CVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, lrint, f32mem, loadf32,
                            "cvtss2si", "cvtss2si",
                            WriteCvtSS2I, SSEPackedSingle>, XS, SIMD_EXC;
defm CVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, llrint, f32mem, loadf32,
                              "cvtss2si", "cvtss2si",
                              WriteCvtSS2I, SSEPackedSingle>, XS, REX_W, SIMD_EXC;
defm CVTSD2SI : sse12_cvt_s<0x2D, FR64, GR32, lrint, f64mem, loadf64,
                            "cvtsd2si", "cvtsd2si",
                            WriteCvtSD2I, SSEPackedDouble>, XD, SIMD_EXC;
defm CVTSD2SI64 : sse12_cvt_s<0x2D, FR64, GR64, llrint, f64mem, loadf64,
                              "cvtsd2si", "cvtsd2si",
                              WriteCvtSD2I, SSEPackedDouble>, XD, REX_W, SIMD_EXC;

defm CVTSI2SS : sse12_cvt_s<0x2A, GR32, FR32, any_sint_to_fp, i32mem, loadi32,
                            "cvtsi2ss", "cvtsi2ss{l}",
                            WriteCvtI2SS, SSEPackedSingle, ReadInt2Fpu>, XS, SIMD_EXC;
defm CVTSI642SS : sse12_cvt_s<0x2A, GR64, FR32, any_sint_to_fp, i64mem, loadi64,
                              "cvtsi2ss", "cvtsi2ss{q}",
                              WriteCvtI2SS, SSEPackedSingle, ReadInt2Fpu>, XS, REX_W, SIMD_EXC;
defm CVTSI2SD : sse12_cvt_s<0x2A, GR32, FR64, any_sint_to_fp, i32mem, loadi32,
                            "cvtsi2sd", "cvtsi2sd{l}",
                            WriteCvtI2SD, SSEPackedDouble, ReadInt2Fpu>, XD;
defm CVTSI642SD : sse12_cvt_s<0x2A, GR64, FR64, any_sint_to_fp, i64mem, loadi64,
                              "cvtsi2sd", "cvtsi2sd{q}",
                              WriteCvtI2SD, SSEPackedDouble, ReadInt2Fpu>, XD, REX_W, SIMD_EXC;
} // isCodeGenOnly = 1

let Predicates = [UseSSE1] in {
  def : Pat<(i64 (lrint FR32:$src)), (CVTSS2SI64rr FR32:$src)>;
  def : Pat<(i64 (lrint (loadf32 addr:$src))), (CVTSS2SI64rm addr:$src)>;
}

let Predicates = [UseSSE2] in {
  def : Pat<(i64 (lrint FR64:$src)), (CVTSD2SI64rr FR64:$src)>;
  def : Pat<(i64 (lrint (loadf64 addr:$src))), (CVTSD2SI64rm addr:$src)>;
}

// Conversion Instructions Intrinsics - Match intrinsics which expect MM
// and/or XMM operand(s).

multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                          ValueType DstVT, ValueType SrcVT, SDNode OpNode,
                          Operand memop, PatFrags mem_frags, string asm,
                          X86FoldableSchedWrite sched, Domain d> {
let ExeDomain = d in {
  def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
                  !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
                  [(set DstRC:$dst, (DstVT (OpNode (SrcVT SrcRC:$src))))]>,
                  Sched<[sched]>;
  def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
                  !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
                  [(set DstRC:$dst, (DstVT (OpNode (SrcVT (mem_frags addr:$src)))))]>,
                  Sched<[sched.Folded]>;
}
}

multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
                                RegisterClass DstRC, X86MemOperand x86memop,
                                string asm, string mem, X86FoldableSchedWrite sched,
                                Domain d, bit Is2Addr = 1> {
let hasSideEffects = 0, ExeDomain = d in {
  def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2),
                  !if(Is2Addr,
                      !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
                      !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                  []>, Sched<[sched, ReadDefault, ReadInt2Fpu]>;
  let mayLoad = 1 in
  def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst),
                  (ins DstRC:$src1, x86memop:$src2),
                  !if(Is2Addr,
                      asm#"{"#mem#"}\t{$src2, $dst|$dst, $src2}",
                      asm#"{"#mem#"}\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}

let Uses = [MXCSR], mayRaiseFPException = 1 in {
let Predicates = [UseAVX] in {
defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64,
                                X86cvts2si, sdmem, sse_load_f64, "cvtsd2si",
                                WriteCvtSD2I, SSEPackedDouble>, XD, VEX, VEX_LIG;
defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64,
                                  X86cvts2si, sdmem, sse_load_f64, "cvtsd2si",
                                  WriteCvtSD2I, SSEPackedDouble>, XD, VEX, VEX_W, VEX_LIG;
}
defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64, X86cvts2si,
                               sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I,
                               SSEPackedDouble>, XD;
defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64, X86cvts2si,
                                 sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I,
                                 SSEPackedDouble>, XD, REX_W;
}
let Predicates = [UseAVX] in {
defm VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
                                      i32mem, "cvtsi2ss", "l", WriteCvtI2SS, SSEPackedSingle, 0>,
                                      XS, VEX_4V, VEX_LIG, SIMD_EXC;
defm VCVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
                                        i64mem, "cvtsi2ss", "q", WriteCvtI2SS, SSEPackedSingle, 0>,
                                        XS, VEX_4V, VEX_LIG, VEX_W, SIMD_EXC;
defm VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
                                      i32mem, "cvtsi2sd", "l", WriteCvtI2SD, SSEPackedDouble, 0>,
                                      XD, VEX_4V, VEX_LIG;
defm VCVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
                                        i64mem, "cvtsi2sd", "q", WriteCvtI2SD, SSEPackedDouble, 0>,
                                        XD, VEX_4V, VEX_LIG, VEX_W, SIMD_EXC;
}
let Constraints = "$src1 = $dst" in {
  defm CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
                                       i32mem, "cvtsi2ss", "l", WriteCvtI2SS, SSEPackedSingle>,
                                       XS, SIMD_EXC;
  defm CVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
                                         i64mem, "cvtsi2ss", "q", WriteCvtI2SS, SSEPackedSingle>,
                                         XS, REX_W, SIMD_EXC;
  defm CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
                                       i32mem, "cvtsi2sd", "l", WriteCvtI2SD, SSEPackedDouble>,
                                       XD;
  defm CVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
                                         i64mem, "cvtsi2sd", "q", WriteCvtI2SD, SSEPackedDouble>,
                                         XD, REX_W, SIMD_EXC;
}

def : InstAlias<"vcvtsi2ss{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                (VCVTSI2SSrr_Int VR128:$dst, VR128:$src1, GR32:$src2), 0, "att">;
def : InstAlias<"vcvtsi2ss{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                (VCVTSI642SSrr_Int VR128:$dst, VR128:$src1, GR64:$src2), 0, "att">;
def : InstAlias<"vcvtsi2sd{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                (VCVTSI2SDrr_Int VR128:$dst, VR128:$src1, GR32:$src2), 0, "att">;
def : InstAlias<"vcvtsi2sd{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                (VCVTSI642SDrr_Int VR128:$dst, VR128:$src1, GR64:$src2), 0, "att">;

def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
                (VCVTSI2SSrm_Int VR128:$dst, VR128:$src1, i32mem:$src), 0, "att">;
def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
                (VCVTSI2SDrm_Int VR128:$dst, VR128:$src1, i32mem:$src), 0, "att">;

def : InstAlias<"cvtsi2ss{l}\t{$src, $dst|$dst, $src}",
                (CVTSI2SSrr_Int VR128:$dst, GR32:$src), 0, "att">;
def : InstAlias<"cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
                (CVTSI642SSrr_Int VR128:$dst, GR64:$src), 0, "att">;
def : InstAlias<"cvtsi2sd{l}\t{$src, $dst|$dst, $src}",
                (CVTSI2SDrr_Int VR128:$dst, GR32:$src), 0, "att">;
def : InstAlias<"cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
                (CVTSI642SDrr_Int VR128:$dst, GR64:$src), 0, "att">;

def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}",
                (CVTSI2SSrm_Int VR128:$dst, i32mem:$src), 0, "att">;
def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}",
                (CVTSI2SDrm_Int VR128:$dst, i32mem:$src), 0, "att">;

/// SSE 1 Only

// Aliases for intrinsics
let Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in {
defm VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int,
                                 ssmem, sse_load_f32, "cvttss2si",
                                 WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_LIG;
defm VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32,
                                   X86cvtts2Int, ssmem, sse_load_f32,
                                   "cvttss2si", WriteCvtSS2I, SSEPackedSingle>,
                                   XS, VEX, VEX_LIG, VEX_W;
defm VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int,
                                 sdmem, sse_load_f64, "cvttsd2si",
                                 WriteCvtSS2I, SSEPackedDouble>, XD, VEX, VEX_LIG;
defm VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64,
                                   X86cvtts2Int, sdmem, sse_load_f64,
                                   "cvttsd2si", WriteCvtSS2I, SSEPackedDouble>,
                                   XD, VEX, VEX_LIG, VEX_W;
}
let Uses = [MXCSR], mayRaiseFPException = 1 in {
defm CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int,
                                ssmem, sse_load_f32, "cvttss2si",
                                WriteCvtSS2I, SSEPackedSingle>, XS;
defm CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32,
                                  X86cvtts2Int, ssmem, sse_load_f32,
                                  "cvttss2si", WriteCvtSS2I, SSEPackedSingle>,
                                  XS, REX_W;
defm CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int,
                                sdmem, sse_load_f64, "cvttsd2si",
                                WriteCvtSD2I, SSEPackedDouble>, XD;
defm CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64,
                                  X86cvtts2Int, sdmem, sse_load_f64,
                                  "cvttsd2si", WriteCvtSD2I, SSEPackedDouble>,
                                  XD, REX_W;
}

def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SIrm_Int GR32:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SIrm_Int GR32:$dst, f64mem:$src), 0, "att">;
def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SI64rm_Int GR64:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SI64rm_Int GR64:$dst, f64mem:$src), 0, "att">;

def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SIrm_Int GR32:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SIrm_Int GR32:$dst, f64mem:$src), 0, "att">;
def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SI64rm_Int GR64:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SI64rm_Int GR64:$dst, f64mem:$src), 0, "att">;

let Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in {
defm VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si,
                                ssmem, sse_load_f32, "cvtss2si",
                                WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_LIG;
defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si,
                                  ssmem, sse_load_f32, "cvtss2si",
                                  WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_W, VEX_LIG;
}
let Uses = [MXCSR], mayRaiseFPException = 1 in {
defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si,
                               ssmem, sse_load_f32, "cvtss2si",
                               WriteCvtSS2I, SSEPackedSingle>, XS;
defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si,
                                 ssmem, sse_load_f32, "cvtss2si",
                                 WriteCvtSS2I, SSEPackedSingle>, XS, REX_W;
defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, load,
                             "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                             SSEPackedSingle, WriteCvtI2PS>,
                             PS, VEX, Requires<[HasAVX, NoVLX]>, VEX_WIG;
defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, load,
                              "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                              SSEPackedSingle, WriteCvtI2PSY>,
                              PS, VEX, VEX_L, Requires<[HasAVX, NoVLX]>, VEX_WIG;

defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memop,
                            "cvtdq2ps\t{$src, $dst|$dst, $src}",
                            SSEPackedSingle, WriteCvtI2PS>,
                            PS, Requires<[UseSSE2]>;
}

// AVX aliases
def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">;
def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;

// SSE aliases
def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">;
def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;

/// SSE 2 Only

// Convert scalar double to scalar single
let isCodeGenOnly = 1, hasSideEffects = 0, Predicates = [UseAVX],
    ExeDomain = SSEPackedSingle in {
def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
                       (ins FR32:$src1, FR64:$src2),
                       "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                       VEX_4V, VEX_LIG, VEX_WIG,
                       Sched<[WriteCvtSD2SS]>, SIMD_EXC;
let mayLoad = 1 in
def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst),
                    (ins FR32:$src1, f64mem:$src2),
                    "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                    XD, VEX_4V, VEX_LIG, VEX_WIG,
                    Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>, SIMD_EXC;
}

def : Pat<(f32 (any_fpround FR64:$src)),
          (VCVTSD2SSrr (f32 (IMPLICIT_DEF)), FR64:$src)>,
          Requires<[UseAVX]>;
CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src), 1301 "cvtsd2ss\t{$src, $dst|$dst, $src}", 1302 [(set FR32:$dst, (any_fpround (loadf64 addr:$src)))]>, 1303 XD, Requires<[UseSSE2, OptForSize]>, 1304 Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>, SIMD_EXC; 1305} 1306 1307let Uses = [MXCSR], mayRaiseFPException = 1, ExeDomain = SSEPackedSingle in { 1308def VCVTSD2SSrr_Int: I<0x5A, MRMSrcReg, 1309 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), 1310 "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", 1311 [(set VR128:$dst, 1312 (v4f32 (X86frounds VR128:$src1, (v2f64 VR128:$src2))))]>, 1313 XD, VEX_4V, VEX_LIG, VEX_WIG, Requires<[UseAVX]>, 1314 Sched<[WriteCvtSD2SS]>; 1315def VCVTSD2SSrm_Int: I<0x5A, MRMSrcMem, 1316 (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2), 1317 "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", 1318 [(set VR128:$dst, 1319 (v4f32 (X86frounds VR128:$src1, (sse_load_f64 addr:$src2))))]>, 1320 XD, VEX_4V, VEX_LIG, VEX_WIG, Requires<[UseAVX]>, 1321 Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>; 1322let Constraints = "$src1 = $dst" in { 1323def CVTSD2SSrr_Int: I<0x5A, MRMSrcReg, 1324 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), 1325 "cvtsd2ss\t{$src2, $dst|$dst, $src2}", 1326 [(set VR128:$dst, 1327 (v4f32 (X86frounds VR128:$src1, (v2f64 VR128:$src2))))]>, 1328 XD, Requires<[UseSSE2]>, Sched<[WriteCvtSD2SS]>; 1329def CVTSD2SSrm_Int: I<0x5A, MRMSrcMem, 1330 (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2), 1331 "cvtsd2ss\t{$src2, $dst|$dst, $src2}", 1332 [(set VR128:$dst, 1333 (v4f32 (X86frounds VR128:$src1, (sse_load_f64 addr:$src2))))]>, 1334 XD, Requires<[UseSSE2]>, 1335 Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>; 1336} 1337} 1338 1339// Convert scalar single to scalar double 1340// SSE2 instructions with XS prefix 1341let isCodeGenOnly = 1, hasSideEffects = 0, ExeDomain = SSEPackedSingle in { 1342def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), 1343 (ins FR64:$src1, FR32:$src2), 1344 "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, 1345 XS, VEX_4V, VEX_LIG, VEX_WIG, 1346 Sched<[WriteCvtSS2SD]>, Requires<[UseAVX]>, SIMD_EXC; 1347let mayLoad = 1 in 1348def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), 1349 (ins FR64:$src1, f32mem:$src2), 1350 "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, 1351 XS, VEX_4V, VEX_LIG, VEX_WIG, 1352 Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>, 1353 Requires<[UseAVX, OptForSize]>, SIMD_EXC; 1354} // isCodeGenOnly = 1, hasSideEffects = 0 1355 1356def : Pat<(f64 (any_fpextend FR32:$src)), 1357 (VCVTSS2SDrr (f64 (IMPLICIT_DEF)), FR32:$src)>, Requires<[UseAVX]>; 1358def : Pat<(any_fpextend (loadf32 addr:$src)), 1359 (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX, OptForSize]>; 1360 1361let isCodeGenOnly = 1, ExeDomain = SSEPackedSingle in { 1362def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src), 1363 "cvtss2sd\t{$src, $dst|$dst, $src}", 1364 [(set FR64:$dst, (any_fpextend FR32:$src))]>, 1365 XS, Requires<[UseSSE2]>, Sched<[WriteCvtSS2SD]>, SIMD_EXC; 1366def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src), 1367 "cvtss2sd\t{$src, $dst|$dst, $src}", 1368 [(set FR64:$dst, (any_fpextend (loadf32 addr:$src)))]>, 1369 XS, Requires<[UseSSE2, OptForSize]>, 1370 Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>, SIMD_EXC; 1371} // isCodeGenOnly = 1 1372 1373let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1, 1374 ExeDomain = SSEPackedSingle in { 1375def 
VCVTSS2SDrr_Int: I<0x5A, MRMSrcReg, 1376 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), 1377 "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", 1378 []>, XS, VEX_4V, VEX_LIG, VEX_WIG, 1379 Requires<[HasAVX]>, Sched<[WriteCvtSS2SD]>; 1380let mayLoad = 1 in 1381def VCVTSS2SDrm_Int: I<0x5A, MRMSrcMem, 1382 (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2), 1383 "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", 1384 []>, XS, VEX_4V, VEX_LIG, VEX_WIG, Requires<[HasAVX]>, 1385 Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>; 1386let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix 1387def CVTSS2SDrr_Int: I<0x5A, MRMSrcReg, 1388 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), 1389 "cvtss2sd\t{$src2, $dst|$dst, $src2}", 1390 []>, XS, Requires<[UseSSE2]>, 1391 Sched<[WriteCvtSS2SD]>; 1392let mayLoad = 1 in 1393def CVTSS2SDrm_Int: I<0x5A, MRMSrcMem, 1394 (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2), 1395 "cvtss2sd\t{$src2, $dst|$dst, $src2}", 1396 []>, XS, Requires<[UseSSE2]>, 1397 Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>; 1398} 1399} // hasSideEffects = 0 1400 1401// Patterns used for matching (v)cvtsi2ss, (v)cvtsi2sd, (v)cvtsd2ss and 1402// (v)cvtss2sd intrinsic sequences from clang which produce unnecessary 1403// vmovs{s,d} instructions 1404let Predicates = [UseAVX] in { 1405def : Pat<(v4f32 (X86Movss 1406 (v4f32 VR128:$dst), 1407 (v4f32 (scalar_to_vector 1408 (f32 (any_fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))), 1409 (VCVTSD2SSrr_Int VR128:$dst, VR128:$src)>; 1410 1411def : Pat<(v2f64 (X86Movsd 1412 (v2f64 VR128:$dst), 1413 (v2f64 (scalar_to_vector 1414 (f64 (any_fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))), 1415 (VCVTSS2SDrr_Int VR128:$dst, VR128:$src)>; 1416 1417def : Pat<(v4f32 (X86Movss 1418 (v4f32 VR128:$dst), 1419 (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR64:$src)))))), 1420 (VCVTSI642SSrr_Int VR128:$dst, GR64:$src)>; 1421 1422def : Pat<(v4f32 (X86Movss 1423 (v4f32 VR128:$dst), 1424 (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi64 addr:$src))))))), 1425 (VCVTSI642SSrm_Int VR128:$dst, addr:$src)>; 1426 1427def : Pat<(v4f32 (X86Movss 1428 (v4f32 VR128:$dst), 1429 (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR32:$src)))))), 1430 (VCVTSI2SSrr_Int VR128:$dst, GR32:$src)>; 1431 1432def : Pat<(v4f32 (X86Movss 1433 (v4f32 VR128:$dst), 1434 (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi32 addr:$src))))))), 1435 (VCVTSI2SSrm_Int VR128:$dst, addr:$src)>; 1436 1437def : Pat<(v2f64 (X86Movsd 1438 (v2f64 VR128:$dst), 1439 (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR64:$src)))))), 1440 (VCVTSI642SDrr_Int VR128:$dst, GR64:$src)>; 1441 1442def : Pat<(v2f64 (X86Movsd 1443 (v2f64 VR128:$dst), 1444 (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi64 addr:$src))))))), 1445 (VCVTSI642SDrm_Int VR128:$dst, addr:$src)>; 1446 1447def : Pat<(v2f64 (X86Movsd 1448 (v2f64 VR128:$dst), 1449 (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR32:$src)))))), 1450 (VCVTSI2SDrr_Int VR128:$dst, GR32:$src)>; 1451 1452def : Pat<(v2f64 (X86Movsd 1453 (v2f64 VR128:$dst), 1454 (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi32 addr:$src))))))), 1455 (VCVTSI2SDrm_Int VR128:$dst, addr:$src)>; 1456} // Predicates = [UseAVX] 1457 1458let Predicates = [UseSSE2] in { 1459def : Pat<(v4f32 (X86Movss 1460 (v4f32 VR128:$dst), 1461 (v4f32 (scalar_to_vector 1462 (f32 (any_fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))), 1463 (CVTSD2SSrr_Int VR128:$dst, VR128:$src)>; 1464 1465def : Pat<(v2f64 (X86Movsd 1466 (v2f64 
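//
// For example (a hedged sketch of the kind of code clang lowers this way):
//   __m128 f(__m128 a, __m128d b) {
//     a[0] = b[0];             // f64 -> f32 on element 0
//     return a;
//   }
// Without these patterns the conversion and the element insert would select
// separately, e.g.:
//   vcvtsd2ss %xmm1, %xmm1, %xmm2
//   vmovss    %xmm2, %xmm0, %xmm0
// whereas the patterns fold the move into the conversion:
//   vcvtsd2ss %xmm1, %xmm0, %xmm0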
let Predicates = [UseAVX] in {
def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector
                     (f32 (any_fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
          (VCVTSD2SSrr_Int VR128:$dst, VR128:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector
                     (f64 (any_fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
          (VCVTSS2SDrr_Int VR128:$dst, VR128:$src)>;

def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR64:$src)))))),
          (VCVTSI642SSrr_Int VR128:$dst, GR64:$src)>;

def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi64 addr:$src))))))),
          (VCVTSI642SSrm_Int VR128:$dst, addr:$src)>;

def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR32:$src)))))),
          (VCVTSI2SSrr_Int VR128:$dst, GR32:$src)>;

def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi32 addr:$src))))))),
          (VCVTSI2SSrm_Int VR128:$dst, addr:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR64:$src)))))),
          (VCVTSI642SDrr_Int VR128:$dst, GR64:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi64 addr:$src))))))),
          (VCVTSI642SDrm_Int VR128:$dst, addr:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR32:$src)))))),
          (VCVTSI2SDrr_Int VR128:$dst, GR32:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi32 addr:$src))))))),
          (VCVTSI2SDrm_Int VR128:$dst, addr:$src)>;
} // Predicates = [UseAVX]

let Predicates = [UseSSE2] in {
def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector
                     (f32 (any_fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
          (CVTSD2SSrr_Int VR128:$dst, VR128:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector
                     (f64 (any_fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
          (CVTSS2SDrr_Int VR128:$dst, VR128:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR64:$src)))))),
          (CVTSI642SDrr_Int VR128:$dst, GR64:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi64 addr:$src))))))),
          (CVTSI642SDrm_Int VR128:$dst, addr:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR32:$src)))))),
          (CVTSI2SDrr_Int VR128:$dst, GR32:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi32 addr:$src))))))),
          (CVTSI2SDrm_Int VR128:$dst, addr:$src)>;
} // Predicates = [UseSSE2]

let Predicates = [UseSSE1] in {
def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR64:$src)))))),
          (CVTSI642SSrr_Int VR128:$dst, GR64:$src)>;

def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi64 addr:$src))))))),
          (CVTSI642SSrm_Int VR128:$dst, addr:$src)>;

def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR32:$src)))))),
          (CVTSI2SSrr_Int VR128:$dst, GR32:$src)>;

def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi32 addr:$src))))))),
          (CVTSI2SSrm_Int VR128:$dst, addr:$src)>;
} // Predicates = [UseSSE1]

let Predicates = [HasAVX, NoVLX] in {
// Convert packed single/double fp to doubleword
def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>,
                       VEX, Sched<[WriteCvtPS2I]>, VEX_WIG, SIMD_EXC;
def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "cvtps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (X86cvtp2Int (loadv4f32 addr:$src))))]>,
                       VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG, SIMD_EXC;
def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                        "cvtps2dq\t{$src, $dst|$dst, $src}",
                        [(set VR256:$dst,
                          (v8i32 (X86cvtp2Int (v8f32 VR256:$src))))]>,
                        VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG, SIMD_EXC;
def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                        "cvtps2dq\t{$src, $dst|$dst, $src}",
                        [(set VR256:$dst,
                          (v8i32 (X86cvtp2Int (loadv8f32 addr:$src))))]>,
                        VEX, VEX_L, Sched<[WriteCvtPS2IYLd]>, VEX_WIG, SIMD_EXC;
}
def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "cvtps2dq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>,
                     Sched<[WriteCvtPS2I]>, SIMD_EXC;
def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                     "cvtps2dq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (v4i32 (X86cvtp2Int (memopv4f32 addr:$src))))]>,
                     Sched<[WriteCvtPS2ILd]>, SIMD_EXC;


// Convert Packed Double FP to Packed DW Integers
let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.
def VCVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "vcvtpd2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
                      VEX, Sched<[WriteCvtPD2I]>, VEX_WIG;

// XMM only
def VCVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                      "vcvtpd2dq{x}\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (X86cvtp2Int (loadv2f64 addr:$src))))]>, VEX,
                      Sched<[WriteCvtPD2ILd]>, VEX_WIG;

// YMM only
def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                       "vcvtpd2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (X86cvtp2Int (v4f64 VR256:$src))))]>,
                       VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG;
def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                       "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (X86cvtp2Int (loadv4f64 addr:$src))))]>,
                       VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG;
}

def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
                (VCVTPD2DQrr VR128:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
                (VCVTPD2DQYrr VR128:$dst, VR256:$src), 0, "att">;
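
// Illustrative example (not from the original source): with a register
// source the operand size is implied by the register class, but a memory
// source is ambiguous in AT&T syntax, hence the explicit-size mnemonics:
//   vcvtpd2dq  %ymm1, %xmm0     # source size implied by %ymm1
//   vcvtpd2dqx (%rax), %xmm0    # 128-bit memory source
//   vcvtpd2dqy (%rax), %xmm0    # 256-bit memory source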

def CVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                     "cvtpd2dq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (v4i32 (X86cvtp2Int (memopv2f64 addr:$src))))]>,
                     Sched<[WriteCvtPD2ILd]>, SIMD_EXC;
def CVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "cvtpd2dq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
                     Sched<[WriteCvtPD2I]>, SIMD_EXC;

// Convert with truncation packed single/double fp to doubleword
// SSE2 packed instructions with XS prefix
let Uses = [MXCSR], mayRaiseFPException = 1 in {
let Predicates = [HasAVX, NoVLX] in {
def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                         "cvttps2dq\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                           (v4i32 (X86any_cvttp2si (v4f32 VR128:$src))))]>,
                         VEX, Sched<[WriteCvtPS2I]>, VEX_WIG;
def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                         "cvttps2dq\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                           (v4i32 (X86any_cvttp2si (loadv4f32 addr:$src))))]>,
                         VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG;
def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                          "cvttps2dq\t{$src, $dst|$dst, $src}",
                          [(set VR256:$dst,
                            (v8i32 (X86any_cvttp2si (v8f32 VR256:$src))))]>,
                          VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG;
def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                          "cvttps2dq\t{$src, $dst|$dst, $src}",
                          [(set VR256:$dst,
                            (v8i32 (X86any_cvttp2si (loadv8f32 addr:$src))))]>,
                          VEX, VEX_L,
                          Sched<[WriteCvtPS2IYLd]>, VEX_WIG;
}

def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvttps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (X86any_cvttp2si (v4f32 VR128:$src))))]>,
                       Sched<[WriteCvtPS2I]>;
def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "cvttps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (X86any_cvttp2si (memopv4f32 addr:$src))))]>,
                       Sched<[WriteCvtPS2ILd]>;
}

// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.
let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
// XMM only
def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "cvttpd2dq\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v4i32 (X86any_cvttp2si (v2f64 VR128:$src))))]>,
                        VEX, Sched<[WriteCvtPD2I]>, VEX_WIG;
def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                        "cvttpd2dq{x}\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v4i32 (X86any_cvttp2si (loadv2f64 addr:$src))))]>,
                        VEX, Sched<[WriteCvtPD2ILd]>, VEX_WIG;

// YMM only
def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                         "cvttpd2dq\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                           (v4i32 (X86any_cvttp2si (v4f64 VR256:$src))))]>,
                         VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG;
def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                         "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                           (v4i32 (X86any_cvttp2si (loadv4f64 addr:$src))))]>,
                         VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG;
} // Predicates = [HasAVX, NoVLX]

def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
                (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}",
                (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0, "att">;

let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4i32 (any_fp_to_sint (v4f64 VR256:$src))),
            (VCVTTPD2DQYrr VR256:$src)>;
  def : Pat<(v4i32 (any_fp_to_sint (loadv4f64 addr:$src))),
            (VCVTTPD2DQYrm addr:$src)>;
}

def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "cvttpd2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (X86any_cvttp2si (v2f64 VR128:$src))))]>,
                      Sched<[WriteCvtPD2I]>, SIMD_EXC;
def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                      "cvttpd2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (X86any_cvttp2si (memopv2f64 addr:$src))))]>,
                      Sched<[WriteCvtPD2ILd]>, SIMD_EXC;

// Convert packed single to packed double
let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
  // SSE2 instructions without OpSize prefix
def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    "vcvtps2pd\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (v2f64 (X86any_vfpext (v4f32 VR128:$src))))]>,
                    PS, VEX, Sched<[WriteCvtPS2PD]>, VEX_WIG;
def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                    "vcvtps2pd\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>,
                    PS, VEX, Sched<[WriteCvtPS2PD.Folded]>, VEX_WIG;
def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
                     [(set VR256:$dst, (v4f64 (any_fpextend (v4f32 VR128:$src))))]>,
                     PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY]>, VEX_WIG;
def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
                     [(set VR256:$dst, (v4f64 (extloadv4f32 addr:$src)))]>,
                     PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY.Folded]>, VEX_WIG;
}

let Predicates = [UseSSE2], Uses = [MXCSR], mayRaiseFPException = 1 in {
def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                   "cvtps2pd\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (v2f64 (X86any_vfpext (v4f32 VR128:$src))))]>,
                   PS, Sched<[WriteCvtPS2PD]>;
def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                   "cvtps2pd\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>,
                   PS, Sched<[WriteCvtPS2PD.Folded]>;
}

// Convert Packed DW Integers to Packed Double FP
let Predicates = [HasAVX, NoVLX] in {
let hasSideEffects = 0, mayLoad = 1 in
def VCVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                       "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v2f64 (X86any_VSintToFP
                                  (bc_v4i32
                                    (v2i64 (scalar_to_vector
                                             (loadi64 addr:$src)))))))]>,
                       VEX, Sched<[WriteCvtI2PDLd]>, VEX_WIG;
def VCVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v2f64 (X86any_VSintToFP (v4i32 VR128:$src))))]>,
                       VEX, Sched<[WriteCvtI2PD]>, VEX_WIG;
def VCVTDQ2PDYrm : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
                        "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                        [(set VR256:$dst,
                          (v4f64 (any_sint_to_fp (loadv4i32 addr:$src))))]>,
                        VEX, VEX_L, Sched<[WriteCvtI2PDYLd]>,
                        VEX_WIG;
def VCVTDQ2PDYrr : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                        "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                        [(set VR256:$dst,
                          (v4f64 (any_sint_to_fp (v4i32 VR128:$src))))]>,
                        VEX, VEX_L, Sched<[WriteCvtI2PDY]>, VEX_WIG;
}

let hasSideEffects = 0, mayLoad = 1 in
def CVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                      "cvtdq2pd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v2f64 (X86any_VSintToFP
                                 (bc_v4i32
                                   (v2i64 (scalar_to_vector
                                            (loadi64 addr:$src)))))))]>,
                      Sched<[WriteCvtI2PDLd]>;
def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "cvtdq2pd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v2f64 (X86any_VSintToFP (v4i32 VR128:$src))))]>,
                      Sched<[WriteCvtI2PD]>;

// AVX register conversion intrinsics
let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v2f64 (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
            (VCVTDQ2PDrm addr:$src)>;
} // Predicates = [HasAVX, NoVLX]

// SSE2 register conversion intrinsics
let Predicates = [UseSSE2] in {
  def : Pat<(v2f64 (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
            (CVTDQ2PDrm addr:$src)>;
} // Predicates = [UseSSE2]

// Convert packed double to packed single
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.
let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
// XMM only
def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtpd2ps\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (v4f32 (X86any_vfpround (v2f64 VR128:$src))))]>,
                       VEX, Sched<[WriteCvtPD2PS]>, VEX_WIG;
def VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "cvtpd2ps{x}\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (v4f32 (X86any_vfpround (loadv2f64 addr:$src))))]>,
                       VEX, Sched<[WriteCvtPD2PS.Folded]>, VEX_WIG;

def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                        "cvtpd2ps\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (v4f32 (X86any_vfpround (v4f64 VR256:$src))))]>,
                        VEX, VEX_L, Sched<[WriteCvtPD2PSY]>, VEX_WIG;
def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                        "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (v4f32 (X86any_vfpround (loadv4f64 addr:$src))))]>,
                        VEX, VEX_L, Sched<[WriteCvtPD2PSY.Folded]>, VEX_WIG;
} // Predicates = [HasAVX, NoVLX]

def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
                (VCVTPD2PSrr VR128:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}",
                (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0, "att">;

def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "cvtpd2ps\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (v4f32 (X86any_vfpround (v2f64 VR128:$src))))]>,
                     Sched<[WriteCvtPD2PS]>, SIMD_EXC;
def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                     "cvtpd2ps\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (v4f32 (X86any_vfpround (memopv2f64 addr:$src))))]>,
                     Sched<[WriteCvtPD2PS.Folded]>, SIMD_EXC;

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Compare Instructions
//===----------------------------------------------------------------------===//

// sse12_cmp_scalar - sse 1 & 2 compare scalar instructions
multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
                            Operand memop, SDNode OpNode, ValueType VT,
                            PatFrag ld_frag, string asm,
                            X86FoldableSchedWrite sched,
                            PatFrags mem_frags> {
  def rr_Int : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src2, u8imm:$cc), asm,
                    [(set VR128:$dst, (OpNode (VT VR128:$src1),
                                              VR128:$src2, timm:$cc))]>,
               Sched<[sched]>, SIMD_EXC;
  let mayLoad = 1 in
  def rm_Int : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
                    (ins VR128:$src1, memop:$src2, u8imm:$cc), asm,
                    [(set VR128:$dst, (OpNode (VT VR128:$src1),
                                              (mem_frags addr:$src2), timm:$cc))]>,
               Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;

  let isCodeGenOnly = 1 in {
    let isCommutable = 1 in
    def rr : SIi8<0xC2, MRMSrcReg,
                  (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
                  [(set RC:$dst, (OpNode RC:$src1, RC:$src2, timm:$cc))]>,
             Sched<[sched]>, SIMD_EXC;
    def rm : SIi8<0xC2, MRMSrcMem,
                  (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
                  [(set RC:$dst, (OpNode RC:$src1,
                                         (ld_frag addr:$src2), timm:$cc))]>,
             Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
  }
}

let ExeDomain = SSEPackedSingle in
defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, ssmem, X86cmps, v4f32, loadf32,
                 "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
                 SchedWriteFCmpSizes.PS.Scl, sse_load_f32>,
                 XS, VEX_4V, VEX_LIG, VEX_WIG;
let ExeDomain = SSEPackedDouble in
defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, sdmem, X86cmps, v2f64, loadf64,
                 "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
                 SchedWriteFCmpSizes.PD.Scl, sse_load_f64>,
                 XD, VEX_4V, VEX_LIG, VEX_WIG;

let Constraints = "$src1 = $dst" in {
  let ExeDomain = SSEPackedSingle in
  defm CMPSS : sse12_cmp_scalar<FR32, f32mem, ssmem, X86cmps, v4f32, loadf32,
                  "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                  SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, XS;
  let ExeDomain = SSEPackedDouble in
  defm CMPSD : sse12_cmp_scalar<FR64, f64mem, sdmem, X86cmps, v2f64, loadf64,
                  "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                  SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, XD;
}

// sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS
multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDPatternOperator OpNode,
                         ValueType vt, X86MemOperand x86memop,
                         PatFrag ld_frag, string OpcodeStr, Domain d,
                         X86FoldableSchedWrite sched = WriteFComX> {
  let ExeDomain = d in {
  def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
             [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
          Sched<[sched]>, SIMD_EXC;
  let mayLoad = 1 in
  def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
             [(set EFLAGS, (OpNode (vt RC:$src1),
                                   (ld_frag addr:$src2)))]>,
          Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
}
}

// sse12_ord_cmp_int - Intrinsic version of sse12_ord_cmp
multiclass sse12_ord_cmp_int<bits<8> opc, RegisterClass RC, SDNode OpNode,
                             ValueType vt, Operand memop,
                             PatFrags mem_frags, string OpcodeStr,
                             Domain d,
                             X86FoldableSchedWrite sched = WriteFComX> {
let ExeDomain = d in {
  def rr_Int: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
                 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
                 [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
              Sched<[sched]>, SIMD_EXC;
let mayLoad = 1 in
  def rm_Int: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, memop:$src2),
                 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
                 [(set EFLAGS, (OpNode (vt RC:$src1),
                                       (mem_frags addr:$src2)))]>,
              Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
}
}

let Defs = [EFLAGS] in {
  defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86any_fcmp, f32, f32mem, loadf32,
                                "ucomiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
  defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86any_fcmp, f64, f64mem, loadf64,
                                "ucomisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;
  defm VCOMISS  : sse12_ord_cmp<0x2F, FR32, X86strict_fcmps, f32, f32mem, loadf32,
                                "comiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
  defm VCOMISD  : sse12_ord_cmp<0x2F, FR64, X86strict_fcmps, f64, f64mem, loadf64,
                                "comisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;

  let isCodeGenOnly = 1 in {
    defm VUCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
                      sse_load_f32, "ucomiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
    defm VUCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
                      sse_load_f64, "ucomisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;

    defm VCOMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
                      sse_load_f32, "comiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
    defm VCOMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
                      sse_load_f64, "comisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;
  }
  defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86any_fcmp, f32, f32mem, loadf32,
                               "ucomiss", SSEPackedSingle>, PS;
  defm UCOMISD : sse12_ord_cmp<0x2E, FR64, X86any_fcmp, f64, f64mem, loadf64,
                               "ucomisd", SSEPackedDouble>, PD;
  defm COMISS  : sse12_ord_cmp<0x2F, FR32, X86strict_fcmps, f32, f32mem, loadf32,
                               "comiss", SSEPackedSingle>, PS;
  defm COMISD  : sse12_ord_cmp<0x2F, FR64, X86strict_fcmps, f64, f64mem, loadf64,
                               "comisd", SSEPackedDouble>, PD;

  let isCodeGenOnly = 1 in {
    defm UCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
                      sse_load_f32, "ucomiss", SSEPackedSingle>, PS;
    defm UCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
                      sse_load_f64, "ucomisd", SSEPackedDouble>, PD;

    defm COMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
                      sse_load_f32, "comiss", SSEPackedSingle>, PS;
    defm COMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
                      sse_load_f64, "comisd", SSEPackedDouble>, PD;
  }
} // Defs = [EFLAGS]

// sse12_cmp_packed - sse 1 & 2 compare packed instructions
multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
                            ValueType VT, string asm,
                            X86FoldableSchedWrite sched,
                            Domain d, PatFrag ld_frag> {
  let isCommutable = 1 in
  def rri : PIi8<0xC2, MRMSrcReg,
                 (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
                 [(set RC:$dst, (VT (X86any_cmpp RC:$src1, RC:$src2, timm:$cc)))], d>,
            Sched<[sched]>, SIMD_EXC;
  def rmi : PIi8<0xC2, MRMSrcMem,
                 (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
                 [(set RC:$dst,
                   (VT (X86any_cmpp RC:$src1, (ld_frag addr:$src2), timm:$cc)))], d>,
            Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
}

defm VCMPPS : sse12_cmp_packed<VR128, f128mem, v4f32,
                "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
                SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, loadv4f32>, PS, VEX_4V, VEX_WIG;
defm VCMPPD : sse12_cmp_packed<VR128, f128mem, v2f64,
                "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
                SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, loadv2f64>, PD, VEX_4V, VEX_WIG;
defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, v8f32,
                "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
                SchedWriteFCmpSizes.PS.YMM, SSEPackedSingle, loadv8f32>, PS, VEX_4V, VEX_L, VEX_WIG;
defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, v4f64,
                "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
                SchedWriteFCmpSizes.PD.YMM, SSEPackedDouble, loadv4f64>, PD, VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst" in {
  defm CMPPS : sse12_cmp_packed<VR128, f128mem, v4f32,
                 "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                 SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, memopv4f32>, PS;
  defm CMPPD : sse12_cmp_packed<VR128, f128mem, v2f64,
                 "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                 SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, memopv2f64>, PD;
}

def CommutableCMPCC : PatLeaf<(timm), [{
  uint64_t Imm = N->getZExtValue() & 0x7;
  return (Imm == 0x00 || Imm == 0x03 || Imm == 0x04 || Imm == 0x07);
}]>;
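
// Hedged note (not from the original source): masked to 3 bits, those are
// the order-insensitive CMPPS/CMPPD predicates:
//   0x00 = EQ, 0x03 = UNORD, 0x04 = NEQ, 0x07 = ORD
// e.g. cmpeqps gives the same result whichever operand comes first, so a
// compare with the load in the first operand can be commuted to fold it.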

// Patterns to select compares with loads in the first operand.
let Predicates = [HasAVX] in {
  def : Pat<(v4f64 (X86any_cmpp (loadv4f64 addr:$src2), VR256:$src1,
                                CommutableCMPCC:$cc)),
            (VCMPPDYrmi VR256:$src1, addr:$src2, timm:$cc)>;

  def : Pat<(v8f32 (X86any_cmpp (loadv8f32 addr:$src2), VR256:$src1,
                                CommutableCMPCC:$cc)),
            (VCMPPSYrmi VR256:$src1, addr:$src2, timm:$cc)>;

  def : Pat<(v2f64 (X86any_cmpp (loadv2f64 addr:$src2), VR128:$src1,
                                CommutableCMPCC:$cc)),
            (VCMPPDrmi VR128:$src1, addr:$src2, timm:$cc)>;

  def : Pat<(v4f32 (X86any_cmpp (loadv4f32 addr:$src2), VR128:$src1,
                                CommutableCMPCC:$cc)),
            (VCMPPSrmi VR128:$src1, addr:$src2, timm:$cc)>;

  def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1,
                          CommutableCMPCC:$cc)),
            (VCMPSDrm FR64:$src1, addr:$src2, timm:$cc)>;

  def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1,
                          CommutableCMPCC:$cc)),
            (VCMPSSrm FR32:$src1, addr:$src2, timm:$cc)>;
}

let Predicates = [UseSSE2] in {
  def : Pat<(v2f64 (X86any_cmpp (memopv2f64 addr:$src2), VR128:$src1,
                                CommutableCMPCC:$cc)),
            (CMPPDrmi VR128:$src1, addr:$src2, timm:$cc)>;

  def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1,
                          CommutableCMPCC:$cc)),
            (CMPSDrm FR64:$src1, addr:$src2, timm:$cc)>;
}

let Predicates = [UseSSE1] in {
  def : Pat<(v4f32 (X86any_cmpp (memopv4f32 addr:$src2), VR128:$src1,
                                CommutableCMPCC:$cc)),
            (CMPPSrmi VR128:$src1, addr:$src2, timm:$cc)>;

  def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1,
                          CommutableCMPCC:$cc)),
            (CMPSSrm FR32:$src1, addr:$src2, timm:$cc)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Shuffle Instructions
//===----------------------------------------------------------------------===//

/// sse12_shuffle - sse 1 & 2 fp shuffle instructions
multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
                         ValueType vt, string asm, PatFrag mem_frag,
                         X86FoldableSchedWrite sched, Domain d,
                         bit IsCommutable = 0> {
  def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
                 (ins RC:$src1, x86memop:$src2, u8imm:$src3), asm,
                 [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
                                              (i8 timm:$src3))))], d>,
            Sched<[sched.Folded, sched.ReadAfterFold]>;
  let isCommutable = IsCommutable in
  def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
                 (ins RC:$src1, RC:$src2, u8imm:$src3), asm,
                 [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
                                              (i8 timm:$src3))))], d>,
            Sched<[sched]>;
}

let Predicates = [HasAVX, NoVLX] in {
  defm VSHUFPS  : sse12_shuffle<VR128, f128mem, v4f32,
        "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
        loadv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>,
        PS, VEX_4V, VEX_WIG;
  defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
        "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
        loadv8f32, SchedWriteFShuffle.YMM, SSEPackedSingle>,
        PS, VEX_4V, VEX_L, VEX_WIG;
  defm VSHUFPD  : sse12_shuffle<VR128, f128mem, v2f64,
        "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
        loadv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble>,
        PD, VEX_4V, VEX_WIG;
  defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
        "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
        loadv4f64, SchedWriteFShuffle.YMM, SSEPackedDouble>,
        PD, VEX_4V, VEX_L, VEX_WIG;
}
let Constraints = "$src1 = $dst" in {
  defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
                  "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                  memopv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
  defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
                  "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                  memopv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Unpack FP Instructions
//===----------------------------------------------------------------------===//

/// sse12_unpack_interleave - sse 1 & 2 fp unpack and interleave
multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
                                   PatFrag mem_frag, RegisterClass RC,
                                   X86MemOperand x86memop, string asm,
                                   X86FoldableSchedWrite sched, Domain d,
                                   bit IsCommutable = 0> {
  let isCommutable = IsCommutable in
  def rr : PI<opc, MRMSrcReg,
              (outs RC:$dst), (ins RC:$src1, RC:$src2),
              asm, [(set RC:$dst,
                     (vt (OpNode RC:$src1, RC:$src2)))], d>,
           Sched<[sched]>;
  def rm : PI<opc, MRMSrcMem,
              (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
              asm, [(set RC:$dst,
                     (vt (OpNode RC:$src1,
                                 (mem_frag addr:$src2))))], d>,
           Sched<[sched.Folded, sched.ReadAfterFold]>;
}

let Predicates = [HasAVX, NoVLX] in {
defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, load,
      VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
      SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, load,
      VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
      SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD, VEX_4V, VEX_WIG;
defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, load,
      VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
      SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, load,
      VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
      SchedWriteFShuffle.XMM, SSEPackedDouble>, PD, VEX_4V, VEX_WIG;

defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, load,
      VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
      SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, load,
      VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
      SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, load,
      VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
      SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, load,
      VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
      SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
}// Predicates = [HasAVX, NoVLX]

let Constraints = "$src1 = $dst" in {
  defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memop,
        VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}",
        SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
  defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memop,
        VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}",
        SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD;
  defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memop,
        VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}",
        SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
  defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memop,
        VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}",
        SchedWriteFShuffle.XMM, SSEPackedDouble>, PD;
} // Constraints = "$src1 = $dst"

let Predicates = [HasAVX1Only] in {
  def : Pat<(v8i32 (X86Unpckl VR256:$src1, (loadv8i32 addr:$src2))),
            (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)),
            (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (X86Unpckh VR256:$src1, (loadv8i32 addr:$src2))),
            (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)),
            (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v4i64 (X86Unpckl VR256:$src1, (loadv4i64 addr:$src2))),
            (VUNPCKLPDYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)),
            (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v4i64 (X86Unpckh VR256:$src1, (loadv4i64 addr:$src2))),
            (VUNPCKHPDYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)),
            (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
}
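
// Illustrative sketch (not from the original source) of why the pattern
// below is safe: on SSE2, UNPCKLPDrm needs a 16-byte-aligned memory operand,
// while MOVHPD performs the equivalent merge from an 8-byte load with no
// alignment requirement:
//   unpcklpd (%rax), %xmm0   # xmm0 = { xmm0[0], mem[0] }, needs alignment
//   movhpd   (%rax), %xmm0   # xmm0 = { xmm0[0], mem[0] }, unaligned ok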
f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}", 2149 SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD; 2150 defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memop, 2151 VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}", 2152 SchedWriteFShuffle.XMM, SSEPackedSingle>, PS; 2153 defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memop, 2154 VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}", 2155 SchedWriteFShuffle.XMM, SSEPackedDouble>, PD; 2156} // Constraints = "$src1 = $dst" 2157 2158let Predicates = [HasAVX1Only] in { 2159 def : Pat<(v8i32 (X86Unpckl VR256:$src1, (loadv8i32 addr:$src2))), 2160 (VUNPCKLPSYrm VR256:$src1, addr:$src2)>; 2161 def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)), 2162 (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>; 2163 def : Pat<(v8i32 (X86Unpckh VR256:$src1, (loadv8i32 addr:$src2))), 2164 (VUNPCKHPSYrm VR256:$src1, addr:$src2)>; 2165 def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)), 2166 (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>; 2167 2168 def : Pat<(v4i64 (X86Unpckl VR256:$src1, (loadv4i64 addr:$src2))), 2169 (VUNPCKLPDYrm VR256:$src1, addr:$src2)>; 2170 def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)), 2171 (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>; 2172 def : Pat<(v4i64 (X86Unpckh VR256:$src1, (loadv4i64 addr:$src2))), 2173 (VUNPCKHPDYrm VR256:$src1, addr:$src2)>; 2174 def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)), 2175 (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>; 2176} 2177 2178let Predicates = [UseSSE2] in { 2179 // Use MOVHPD if the load isn't aligned enough for UNPCKLPD. 2180 def : Pat<(v2f64 (X86Unpckl VR128:$src1, 2181 (v2f64 (simple_load addr:$src2)))), 2182 (MOVHPDrm VR128:$src1, addr:$src2)>; 2183} 2184 2185//===----------------------------------------------------------------------===// 2186// SSE 1 & 2 - Extract Floating-Point Sign mask 2187//===----------------------------------------------------------------------===// 2188 2189/// sse12_extr_sign_mask - sse 1 & 2 unpack and interleave 2190multiclass sse12_extr_sign_mask<RegisterClass RC, ValueType vt, 2191 string asm, Domain d> { 2192 def rr : PI<0x50, MRMSrcReg, (outs GR32orGR64:$dst), (ins RC:$src), 2193 !strconcat(asm, "\t{$src, $dst|$dst, $src}"), 2194 [(set GR32orGR64:$dst, (X86movmsk (vt RC:$src)))], d>, 2195 Sched<[WriteFMOVMSK]>; 2196} 2197 2198let Predicates = [HasAVX] in { 2199 defm VMOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps", 2200 SSEPackedSingle>, PS, VEX, VEX_WIG; 2201 defm VMOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd", 2202 SSEPackedDouble>, PD, VEX, VEX_WIG; 2203 defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, v8f32, "movmskps", 2204 SSEPackedSingle>, PS, VEX, VEX_L, VEX_WIG; 2205 defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, v4f64, "movmskpd", 2206 SSEPackedDouble>, PD, VEX, VEX_L, VEX_WIG; 2207 2208 // Also support integer VTs to avoid a int->fp bitcast in the DAG. 2209 def : Pat<(X86movmsk (v4i32 VR128:$src)), 2210 (VMOVMSKPSrr VR128:$src)>; 2211 def : Pat<(X86movmsk (v2i64 VR128:$src)), 2212 (VMOVMSKPDrr VR128:$src)>; 2213 def : Pat<(X86movmsk (v8i32 VR256:$src)), 2214 (VMOVMSKPSYrr VR256:$src)>; 2215 def : Pat<(X86movmsk (v4i64 VR256:$src)), 2216 (VMOVMSKPDYrr VR256:$src)>; 2217} 2218 2219defm MOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps", 2220 SSEPackedSingle>, PS; 2221defm MOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd", 2222 SSEPackedDouble>, PD; 2223 2224let Predicates = [UseSSE2] in { 2225 // Also support integer VTs to avoid a int->fp bitcast in the DAG. 
2226 def : Pat<(X86movmsk (v4i32 VR128:$src)), 2227 (MOVMSKPSrr VR128:$src)>; 2228 def : Pat<(X86movmsk (v2i64 VR128:$src)), 2229 (MOVMSKPDrr VR128:$src)>; 2230} 2231 2232//===---------------------------------------------------------------------===// 2233// SSE2 - Packed Integer Logical Instructions 2234//===---------------------------------------------------------------------===// 2235 2236let ExeDomain = SSEPackedInt in { // SSE integer instructions 2237 2238/// PDI_binop_rm - Simple SSE2 binary operator. 2239multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, 2240 ValueType OpVT, RegisterClass RC, PatFrag memop_frag, 2241 X86MemOperand x86memop, X86FoldableSchedWrite sched, 2242 bit IsCommutable, bit Is2Addr> { 2243 let isCommutable = IsCommutable in 2244 def rr : PDI<opc, MRMSrcReg, (outs RC:$dst), 2245 (ins RC:$src1, RC:$src2), 2246 !if(Is2Addr, 2247 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 2248 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 2249 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>, 2250 Sched<[sched]>; 2251 def rm : PDI<opc, MRMSrcMem, (outs RC:$dst), 2252 (ins RC:$src1, x86memop:$src2), 2253 !if(Is2Addr, 2254 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 2255 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 2256 [(set RC:$dst, (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>, 2257 Sched<[sched.Folded, sched.ReadAfterFold]>; 2258} 2259} // ExeDomain = SSEPackedInt 2260 2261multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode, 2262 ValueType OpVT128, ValueType OpVT256, 2263 X86SchedWriteWidths sched, bit IsCommutable, 2264 Predicate prd> { 2265let Predicates = [HasAVX, prd] in 2266 defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128, 2267 VR128, load, i128mem, sched.XMM, 2268 IsCommutable, 0>, VEX_4V, VEX_WIG; 2269 2270let Constraints = "$src1 = $dst" in 2271 defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128, 2272 memop, i128mem, sched.XMM, IsCommutable, 1>; 2273 2274let Predicates = [HasAVX2, prd] in 2275 defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, 2276 OpVT256, VR256, load, i256mem, sched.YMM, 2277 IsCommutable, 0>, VEX_4V, VEX_L, VEX_WIG; 2278} 2279 2280// These are ordered here for pattern ordering requirements with the fp versions 2281 2282defm PAND : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64, 2283 SchedWriteVecLogic, 1, NoVLX>; 2284defm POR : PDI_binop_all<0xEB, "por", or, v2i64, v4i64, 2285 SchedWriteVecLogic, 1, NoVLX>; 2286defm PXOR : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64, 2287 SchedWriteVecLogic, 1, NoVLX>; 2288defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64, 2289 SchedWriteVecLogic, 0, NoVLX>; 2290 2291//===----------------------------------------------------------------------===// 2292// SSE 1 & 2 - Logical Instructions 2293//===----------------------------------------------------------------------===// 2294 2295/// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops 2296/// 2297/// There are no patterns here because isel prefers integer versions for SSE2 2298/// and later. There are SSE1 v4f32 patterns later. 
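
// For example (illustrative, not from the original source), this covers the
// _mm_movemask_ps-style idiom:
//   int m = _mm_movemask_ps(v);   // bit i = sign bit of element i
// even when the source vector has an integer type, avoiding a
// domain-crossing bitcast before MOVMSKPS.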

//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Logical Instructions
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in { // SSE integer instructions

/// PDI_binop_rm - Simple SSE2 binary operator.
multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                        ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                        X86MemOperand x86memop, X86FoldableSchedWrite sched,
                        bit IsCommutable, bit Is2Addr> {
  let isCommutable = IsCommutable in
  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
               (ins RC:$src1, RC:$src2),
               !if(Is2Addr,
                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
               [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
           Sched<[sched]>;
  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
               (ins RC:$src1, x86memop:$src2),
               !if(Is2Addr,
                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
               [(set RC:$dst, (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
           Sched<[sched.Folded, sched.ReadAfterFold]>;
}
} // ExeDomain = SSEPackedInt

multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode,
                         ValueType OpVT128, ValueType OpVT256,
                         X86SchedWriteWidths sched, bit IsCommutable,
                         Predicate prd> {
let Predicates = [HasAVX, prd] in
  defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128,
                             VR128, load, i128mem, sched.XMM,
                             IsCommutable, 0>, VEX_4V, VEX_WIG;

let Constraints = "$src1 = $dst" in
  defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128,
                           memop, i128mem, sched.XMM, IsCommutable, 1>;

let Predicates = [HasAVX2, prd] in
  defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode,
                               OpVT256, VR256, load, i256mem, sched.YMM,
                               IsCommutable, 0>, VEX_4V, VEX_L, VEX_WIG;
}

// These are ordered here for pattern ordering requirements with the fp versions

defm PAND  : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64,
                           SchedWriteVecLogic, 1, NoVLX>;
defm POR   : PDI_binop_all<0xEB, "por", or, v2i64, v4i64,
                           SchedWriteVecLogic, 1, NoVLX>;
defm PXOR  : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64,
                           SchedWriteVecLogic, 1, NoVLX>;
defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64,
                           SchedWriteVecLogic, 0, NoVLX>;

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Logical Instructions
//===----------------------------------------------------------------------===//

/// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops
///
/// There are no patterns here because isel prefers integer versions for SSE2
/// and later. There are SSE1 v4f32 patterns later.
multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
                                   X86SchedWriteWidths sched> {
  let Predicates = [HasAVX, NoVLX] in {
  defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle,
        !strconcat(OpcodeStr, "ps"), f256mem, sched.YMM,
        [], [], 0>, PS, VEX_4V, VEX_L, VEX_WIG;

  defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
        !strconcat(OpcodeStr, "pd"), f256mem, sched.YMM,
        [], [], 0>, PD, VEX_4V, VEX_L, VEX_WIG;

  defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
        !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM,
        [], [], 0>, PS, VEX_4V, VEX_WIG;

  defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
        !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM,
        [], [], 0>, PD, VEX_4V, VEX_WIG;
  }

  let Constraints = "$src1 = $dst" in {
  defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
        !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM,
        [], []>, PS;

  defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
        !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM,
        [], []>, PD;
  }
}

defm AND : sse12_fp_packed_logical<0x54, "and", SchedWriteFLogic>;
defm OR  : sse12_fp_packed_logical<0x56, "or", SchedWriteFLogic>;
defm XOR : sse12_fp_packed_logical<0x57, "xor", SchedWriteFLogic>;
let isCommutable = 0 in
  defm ANDN : sse12_fp_packed_logical<0x55, "andn", SchedWriteFLogic>;

let Predicates = [HasAVX2, NoVLX] in {
  def : Pat<(v32i8 (and VR256:$src1, VR256:$src2)),
            (VPANDYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (and VR256:$src1, VR256:$src2)),
            (VPANDYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (and VR256:$src1, VR256:$src2)),
            (VPANDYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v32i8 (or VR256:$src1, VR256:$src2)),
            (VPORYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (or VR256:$src1, VR256:$src2)),
            (VPORYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (or VR256:$src1, VR256:$src2)),
            (VPORYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v32i8 (xor VR256:$src1, VR256:$src2)),
            (VPXORYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (xor VR256:$src1, VR256:$src2)),
            (VPXORYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (xor VR256:$src1, VR256:$src2)),
            (VPXORYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v32i8 (X86andnp VR256:$src1, VR256:$src2)),
            (VPANDNYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (X86andnp VR256:$src1, VR256:$src2)),
            (VPANDNYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (X86andnp VR256:$src1, VR256:$src2)),
            (VPANDNYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(and VR256:$src1, (loadv32i8 addr:$src2)),
            (VPANDYrm VR256:$src1, addr:$src2)>;
  def : Pat<(and VR256:$src1, (loadv16i16 addr:$src2)),
            (VPANDYrm VR256:$src1, addr:$src2)>;
  def : Pat<(and VR256:$src1, (loadv8i32 addr:$src2)),
            (VPANDYrm VR256:$src1, addr:$src2)>;

  def : Pat<(or VR256:$src1, (loadv32i8 addr:$src2)),
            (VPORYrm VR256:$src1, addr:$src2)>;
  def : Pat<(or VR256:$src1, (loadv16i16 addr:$src2)),
            (VPORYrm VR256:$src1, addr:$src2)>;
  def : Pat<(or VR256:$src1, (loadv8i32 addr:$src2)),
            (VPORYrm VR256:$src1, addr:$src2)>;

  def : Pat<(xor VR256:$src1, (loadv32i8 addr:$src2)),
            (VPXORYrm VR256:$src1, addr:$src2)>;
  def : Pat<(xor VR256:$src1, (loadv16i16 addr:$src2)),
            (VPXORYrm VR256:$src1, addr:$src2)>;
  def : Pat<(xor VR256:$src1, (loadv8i32 addr:$src2)),
            (VPXORYrm VR256:$src1, addr:$src2)>;

  def : Pat<(X86andnp VR256:$src1, (loadv32i8 addr:$src2)),
            (VPANDNYrm VR256:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR256:$src1, (loadv16i16 addr:$src2)),
            (VPANDNYrm VR256:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR256:$src1, (loadv8i32 addr:$src2)),
            (VPANDNYrm VR256:$src1, addr:$src2)>;
}

// If only AVX1 is supported, we need to handle integer operations with
// floating point instructions since the integer versions aren't available.
let Predicates = [HasAVX1Only] in {
  def : Pat<(v32i8 (and VR256:$src1, VR256:$src2)),
            (VANDPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (and VR256:$src1, VR256:$src2)),
            (VANDPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (and VR256:$src1, VR256:$src2)),
            (VANDPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v4i64 (and VR256:$src1, VR256:$src2)),
            (VANDPSYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v32i8 (or VR256:$src1, VR256:$src2)),
            (VORPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (or VR256:$src1, VR256:$src2)),
            (VORPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (or VR256:$src1, VR256:$src2)),
            (VORPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v4i64 (or VR256:$src1, VR256:$src2)),
            (VORPSYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v32i8 (xor VR256:$src1, VR256:$src2)),
            (VXORPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (xor VR256:$src1, VR256:$src2)),
            (VXORPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (xor VR256:$src1, VR256:$src2)),
            (VXORPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v4i64 (xor VR256:$src1, VR256:$src2)),
            (VXORPSYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v32i8 (X86andnp VR256:$src1, VR256:$src2)),
            (VANDNPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (X86andnp VR256:$src1, VR256:$src2)),
            (VANDNPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (X86andnp VR256:$src1, VR256:$src2)),
            (VANDNPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v4i64 (X86andnp VR256:$src1, VR256:$src2)),
            (VANDNPSYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(and VR256:$src1, (loadv32i8 addr:$src2)),
            (VANDPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(and VR256:$src1, (loadv16i16 addr:$src2)),
            (VANDPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(and VR256:$src1, (loadv8i32 addr:$src2)),
            (VANDPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(and VR256:$src1, (loadv4i64 addr:$src2)),
            (VANDPSYrm VR256:$src1, addr:$src2)>;

  def : Pat<(or VR256:$src1, (loadv32i8 addr:$src2)),
            (VORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(or VR256:$src1, (loadv16i16 addr:$src2)),
            (VORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(or VR256:$src1, (loadv8i32 addr:$src2)),
            (VORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(or VR256:$src1, (loadv4i64 addr:$src2)),
            (VORPSYrm VR256:$src1, addr:$src2)>;

  def : Pat<(xor VR256:$src1, (loadv32i8 addr:$src2)),
            (VXORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(xor VR256:$src1, (loadv16i16 addr:$src2)),
            (VXORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(xor VR256:$src1, (loadv8i32 addr:$src2)),
            (VXORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(xor VR256:$src1, (loadv4i64 addr:$src2)),
            (VXORPSYrm VR256:$src1, addr:$src2)>;

  def : Pat<(X86andnp VR256:$src1, (loadv32i8 addr:$src2)),
            (VANDNPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR256:$src1, (loadv16i16 addr:$src2)),
            (VANDNPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR256:$src1, (loadv8i32 addr:$src2)),
            (VANDNPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR256:$src1, (loadv4i64 addr:$src2)),
            (VANDNPSYrm VR256:$src1, addr:$src2)>;
}

let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)),
            (VPANDrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (and VR128:$src1, VR128:$src2)),
            (VPANDrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (and VR128:$src1, VR128:$src2)),
            (VPANDrr VR128:$src1, VR128:$src2)>;

  def : Pat<(v16i8 (or VR128:$src1, VR128:$src2)),
            (VPORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (or VR128:$src1, VR128:$src2)),
            (VPORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (or VR128:$src1, VR128:$src2)),
            (VPORrr VR128:$src1, VR128:$src2)>;

  def : Pat<(v16i8 (xor VR128:$src1, VR128:$src2)),
            (VPXORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (xor VR128:$src1, VR128:$src2)),
            (VPXORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (xor VR128:$src1, VR128:$src2)),
            (VPXORrr VR128:$src1, VR128:$src2)>;

  def : Pat<(v16i8 (X86andnp VR128:$src1, VR128:$src2)),
            (VPANDNrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (X86andnp VR128:$src1, VR128:$src2)),
            (VPANDNrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (X86andnp VR128:$src1, VR128:$src2)),
            (VPANDNrr VR128:$src1, VR128:$src2)>;

  def : Pat<(and VR128:$src1, (loadv16i8 addr:$src2)),
            (VPANDrm VR128:$src1, addr:$src2)>;
  def : Pat<(and VR128:$src1, (loadv8i16 addr:$src2)),
            (VPANDrm VR128:$src1, addr:$src2)>;
  def : Pat<(and VR128:$src1, (loadv4i32 addr:$src2)),
            (VPANDrm VR128:$src1, addr:$src2)>;

  def : Pat<(or VR128:$src1, (loadv16i8 addr:$src2)),
            (VPORrm VR128:$src1, addr:$src2)>;
  def : Pat<(or VR128:$src1, (loadv8i16 addr:$src2)),
            (VPORrm VR128:$src1, addr:$src2)>;
  def : Pat<(or VR128:$src1, (loadv4i32 addr:$src2)),
            (VPORrm VR128:$src1, addr:$src2)>;

  def : Pat<(xor VR128:$src1, (loadv16i8 addr:$src2)),
            (VPXORrm VR128:$src1, addr:$src2)>;
  def : Pat<(xor VR128:$src1, (loadv8i16 addr:$src2)),
            (VPXORrm VR128:$src1, addr:$src2)>;
  def : Pat<(xor VR128:$src1, (loadv4i32 addr:$src2)),
            (VPXORrm VR128:$src1, addr:$src2)>;

  def : Pat<(X86andnp VR128:$src1, (loadv16i8 addr:$src2)),
            (VPANDNrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR128:$src1, (loadv8i16 addr:$src2)),
            (VPANDNrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR128:$src1, (loadv4i32 addr:$src2)),
            (VPANDNrm VR128:$src1, addr:$src2)>;
}

let Predicates = [UseSSE2] in {
  def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)),
            (PANDrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (and VR128:$src1, VR128:$src2)),
            (PANDrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (and VR128:$src1, VR128:$src2)),
            (PANDrr VR128:$src1, VR128:$src2)>;

  def : Pat<(v16i8 (or VR128:$src1, VR128:$src2)),
            (PORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (or VR128:$src1, VR128:$src2)),
            (PORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (or VR128:$src1, VR128:$src2)),
            (PORrr VR128:$src1, VR128:$src2)>;

  def : Pat<(v16i8 (xor VR128:$src1, VR128:$src2)),
            (PXORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (xor VR128:$src1, VR128:$src2)),
            (PXORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (xor VR128:$src1, VR128:$src2)),
            (PXORrr VR128:$src1, VR128:$src2)>;

  def : Pat<(v16i8 (X86andnp VR128:$src1, VR128:$src2)),
            (PANDNrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (X86andnp VR128:$src1, VR128:$src2)),
            (PANDNrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (X86andnp VR128:$src1, VR128:$src2)),
            (PANDNrr VR128:$src1, VR128:$src2)>;

  def : Pat<(and VR128:$src1, (memopv16i8 addr:$src2)),
            (PANDrm VR128:$src1, addr:$src2)>;
  def : Pat<(and VR128:$src1, (memopv8i16 addr:$src2)),
            (PANDrm VR128:$src1, addr:$src2)>;
  def : Pat<(and VR128:$src1, (memopv4i32 addr:$src2)),
            (PANDrm VR128:$src1, addr:$src2)>;

  def : Pat<(or VR128:$src1, (memopv16i8 addr:$src2)),
            (PORrm VR128:$src1, addr:$src2)>;
  def : Pat<(or VR128:$src1, (memopv8i16 addr:$src2)),
            (PORrm VR128:$src1, addr:$src2)>;
  def : Pat<(or VR128:$src1, (memopv4i32 addr:$src2)),
            (PORrm VR128:$src1, addr:$src2)>;

  def : Pat<(xor VR128:$src1, (memopv16i8 addr:$src2)),
            (PXORrm VR128:$src1, addr:$src2)>;
  def : Pat<(xor VR128:$src1, (memopv8i16 addr:$src2)),
            (PXORrm VR128:$src1, addr:$src2)>;
  def : Pat<(xor VR128:$src1, (memopv4i32 addr:$src2)),
            (PXORrm VR128:$src1, addr:$src2)>;

  def : Pat<(X86andnp VR128:$src1, (memopv16i8 addr:$src2)),
            (PANDNrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR128:$src1, (memopv8i16 addr:$src2)),
            (PANDNrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR128:$src1, (memopv4i32 addr:$src2)),
            (PANDNrm VR128:$src1, addr:$src2)>;
}

// Patterns for packed operations when no integer types are available.
def : Pat<(v4f32 (X86fand VR128:$src1, VR128:$src2)),
          (ANDPSrr VR128:$src1, VR128:$src2)>;
def : Pat<(v4f32 (X86for VR128:$src1, VR128:$src2)),
          (ORPSrr VR128:$src1, VR128:$src2)>;
def : Pat<(v4f32 (X86fxor VR128:$src1, VR128:$src2)),
          (XORPSrr VR128:$src1, VR128:$src2)>;
def : Pat<(v4f32 (X86fandn VR128:$src1, VR128:$src2)),
          (ANDNPSrr VR128:$src1, VR128:$src2)>;

def : Pat<(X86fand VR128:$src1, (memopv4f32 addr:$src2)),
          (ANDPSrm VR128:$src1, addr:$src2)>;
def : Pat<(X86for VR128:$src1, (memopv4f32 addr:$src2)),
          (ORPSrm VR128:$src1, addr:$src2)>;
def : Pat<(X86fxor VR128:$src1, (memopv4f32 addr:$src2)),
          (XORPSrm VR128:$src1, addr:$src2)>;
def : Pat<(X86fandn VR128:$src1, (memopv4f32 addr:$src2)),
          (ANDNPSrm VR128:$src1, addr:$src2)>;

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Arithmetic Instructions
//===----------------------------------------------------------------------===//

/// basic_sse12_fp_binop_xxx - SSE 1 & 2 binops come in both scalar and
/// vector forms.
///
/// In addition, there is a special variant of the scalar form here to
/// represent the associated intrinsic operation. This form is unlike the
/// plain scalar form, in that it takes an entire vector (instead of a scalar)
/// and leaves the top elements unmodified (therefore these cannot be commuted).
///
/// These three forms can each be reg+reg or reg+mem.
///

/// FIXME: once all 256-bit intrinsics are matched, clean up and refactor the
/// classes below
multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr,
                                  SDPatternOperator OpNode, X86SchedWriteSizes sched> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
  let Predicates = [HasAVX, NoVLX] in {
  defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
                                   VR128, v4f32, f128mem, loadv4f32,
                                   SSEPackedSingle, sched.PS.XMM, 0>, PS, VEX_4V, VEX_WIG;
  defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
                                   VR128, v2f64, f128mem, loadv2f64,
                                   SSEPackedDouble, sched.PD.XMM, 0>, PD, VEX_4V, VEX_WIG;

  defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"),
                                    OpNode, VR256, v8f32, f256mem, loadv8f32,
                                    SSEPackedSingle, sched.PS.YMM, 0>, PS, VEX_4V, VEX_L, VEX_WIG;
  defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"),
                                    OpNode, VR256, v4f64, f256mem, loadv4f64,
                                    SSEPackedDouble, sched.PD.YMM, 0>, PD, VEX_4V, VEX_L, VEX_WIG;
  }

  let Constraints = "$src1 = $dst" in {
  defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
                            v4f32, f128mem, memopv4f32, SSEPackedSingle,
                            sched.PS.XMM>, PS;
  defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
                            v2f64, f128mem, memopv2f64, SSEPackedDouble,
                            sched.PD.XMM>, PD;
  }
}
}

multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
                                  X86SchedWriteSizes sched> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
  defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
                                   OpNode, FR32, f32mem, SSEPackedSingle, sched.PS.Scl, 0>,
                                   XS, VEX_4V, VEX_LIG, VEX_WIG;
  defm V#NAME#SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
                                   OpNode, FR64, f64mem, SSEPackedDouble, sched.PD.Scl, 0>,
                                   XD, VEX_4V, VEX_LIG, VEX_WIG;

  let Constraints = "$src1 = $dst" in {
  defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
                            OpNode, FR32, f32mem, SSEPackedSingle,
                            sched.PS.Scl>, XS;
  defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
                            OpNode, FR64, f64mem, SSEPackedDouble,
                            sched.PD.Scl>, XD;
  }
}
}

multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
                                      SDPatternOperator OpNode,
                                      X86SchedWriteSizes sched> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
  defm V#NAME#SS : sse12_fp_scalar_int<opc, OpNode, VR128, v4f32,
                     !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
                     SSEPackedSingle, sched.PS.Scl, 0>, XS, VEX_4V, VEX_LIG, VEX_WIG;
  defm V#NAME#SD : sse12_fp_scalar_int<opc, OpNode, VR128, v2f64,
                     !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
                     SSEPackedDouble, sched.PD.Scl, 0>, XD, VEX_4V, VEX_LIG, VEX_WIG;

  let Constraints = "$src1 = $dst" in {
  defm SS : sse12_fp_scalar_int<opc, OpNode, VR128, v4f32,
              !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
              SSEPackedSingle, sched.PS.Scl>, XS;
  defm SD : sse12_fp_scalar_int<opc, OpNode, VR128, v2f64,
              !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
              SSEPackedDouble, sched.PD.Scl>, XD;
  }
}
}

// Binary Arithmetic instructions
defm ADD : basic_sse12_fp_binop_p<0x58, "add", any_fadd, SchedWriteFAddSizes>,
           basic_sse12_fp_binop_s<0x58, "add", any_fadd, SchedWriteFAddSizes>,
           basic_sse12_fp_binop_s_int<0x58, "add", null_frag, SchedWriteFAddSizes>;
defm MUL : basic_sse12_fp_binop_p<0x59, "mul", any_fmul, SchedWriteFMulSizes>,
           basic_sse12_fp_binop_s<0x59, "mul", any_fmul, SchedWriteFMulSizes>,
           basic_sse12_fp_binop_s_int<0x59, "mul", null_frag, SchedWriteFMulSizes>;
let isCommutable = 0 in {
  defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", any_fsub, SchedWriteFAddSizes>,
             basic_sse12_fp_binop_s<0x5C, "sub", any_fsub, SchedWriteFAddSizes>,
             basic_sse12_fp_binop_s_int<0x5C, "sub", null_frag, SchedWriteFAddSizes>;
  defm DIV : basic_sse12_fp_binop_p<0x5E, "div", any_fdiv, SchedWriteFDivSizes>,
             basic_sse12_fp_binop_s<0x5E, "div", any_fdiv, SchedWriteFDivSizes>,
             basic_sse12_fp_binop_s_int<0x5E, "div", null_frag, SchedWriteFDivSizes>;
  defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s_int<0x5F, "max", X86fmaxs, SchedWriteFCmpSizes>;
  defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s_int<0x5D, "min", X86fmins, SchedWriteFCmpSizes>;
}

let isCodeGenOnly = 1 in {
  defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>;
  defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SchedWriteFCmpSizes>;
}
                                          SchedWriteFAddSizes>;
defm MUL : basic_sse12_fp_binop_p<0x59, "mul", any_fmul, SchedWriteFMulSizes>,
           basic_sse12_fp_binop_s<0x59, "mul", any_fmul, SchedWriteFMulSizes>,
           basic_sse12_fp_binop_s_int<0x59, "mul", null_frag, SchedWriteFMulSizes>;
let isCommutable = 0 in {
  defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", any_fsub, SchedWriteFAddSizes>,
             basic_sse12_fp_binop_s<0x5C, "sub", any_fsub, SchedWriteFAddSizes>,
             basic_sse12_fp_binop_s_int<0x5C, "sub", null_frag, SchedWriteFAddSizes>;
  defm DIV : basic_sse12_fp_binop_p<0x5E, "div", any_fdiv, SchedWriteFDivSizes>,
             basic_sse12_fp_binop_s<0x5E, "div", any_fdiv, SchedWriteFDivSizes>,
             basic_sse12_fp_binop_s_int<0x5E, "div", null_frag, SchedWriteFDivSizes>;
  defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s_int<0x5F, "max", X86fmaxs, SchedWriteFCmpSizes>;
  defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s_int<0x5D, "min", X86fmins, SchedWriteFCmpSizes>;
}

let isCodeGenOnly = 1 in {
  defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>;
  defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SchedWriteFCmpSizes>;
}

// Patterns used to select SSE scalar fp arithmetic instructions from
// either:
//
// (1) a scalar fp operation followed by a blend
//
// The effect is that the backend no longer emits unnecessary vector
// insert instructions immediately after SSE scalar fp instructions
// like addss or mulss.
//
// For example, given the following code:
//   __m128 foo(__m128 A, __m128 B) {
//     A[0] += B[0];
//     return A;
//   }
//
// Previously we generated:
//   addss %xmm0, %xmm1
//   movss %xmm1, %xmm0
//
// We now generate:
//   addss %xmm1, %xmm0
//
// (2) a vector packed single/double fp operation followed by a vector insert
//
// The effect is that the backend converts the packed fp instruction
// followed by a vector insert into a single SSE scalar fp instruction.
//
// For example, given the following code:
//   __m128 foo(__m128 A, __m128 B) {
//     __m128 C = A + B;
//     return (__m128) {C[0], A[1], A[2], A[3]};
//   }
//
// Previously we generated:
//   addps %xmm0, %xmm1
//   movss %xmm1, %xmm0
//
// We now generate:
//   addss %xmm1, %xmm0

// TODO: Some canonicalization in lowering would simplify the number of
// patterns we have to try to match.
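// As a sketch of what the first pattern below matches (derived from the
// scalar_math_patterns multiclass itself, using ADDSS as the example), case
// (1) corresponds to a DAG of the form:
//   (X86Movss VR128:$dst,
//             (scalar_to_vector (any_fadd (extractelt VR128:$dst, 0),
//                                         FR32:$src)))
// which is selected to ADDSSrr_Int after copying $src into VR128.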
2766multiclass scalar_math_patterns<SDPatternOperator Op, string OpcPrefix, SDNode Move, 2767 ValueType VT, ValueType EltTy, 2768 RegisterClass RC, PatFrag ld_frag, 2769 Predicate BasePredicate> { 2770 let Predicates = [BasePredicate] in { 2771 // extracted scalar math op with insert via movss/movsd 2772 def : Pat<(VT (Move (VT VR128:$dst), 2773 (VT (scalar_to_vector 2774 (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))), 2775 RC:$src))))), 2776 (!cast<Instruction>(OpcPrefix#rr_Int) VT:$dst, 2777 (VT (COPY_TO_REGCLASS RC:$src, VR128)))>; 2778 def : Pat<(VT (Move (VT VR128:$dst), 2779 (VT (scalar_to_vector 2780 (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))), 2781 (ld_frag addr:$src)))))), 2782 (!cast<Instruction>(OpcPrefix#rm_Int) VT:$dst, addr:$src)>; 2783 } 2784 2785 // Repeat for AVX versions of the instructions. 2786 let Predicates = [UseAVX] in { 2787 // extracted scalar math op with insert via movss/movsd 2788 def : Pat<(VT (Move (VT VR128:$dst), 2789 (VT (scalar_to_vector 2790 (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))), 2791 RC:$src))))), 2792 (!cast<Instruction>("V"#OpcPrefix#rr_Int) VT:$dst, 2793 (VT (COPY_TO_REGCLASS RC:$src, VR128)))>; 2794 def : Pat<(VT (Move (VT VR128:$dst), 2795 (VT (scalar_to_vector 2796 (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))), 2797 (ld_frag addr:$src)))))), 2798 (!cast<Instruction>("V"#OpcPrefix#rm_Int) VT:$dst, addr:$src)>; 2799 } 2800} 2801 2802defm : scalar_math_patterns<any_fadd, "ADDSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>; 2803defm : scalar_math_patterns<any_fsub, "SUBSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>; 2804defm : scalar_math_patterns<any_fmul, "MULSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>; 2805defm : scalar_math_patterns<any_fdiv, "DIVSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>; 2806 2807defm : scalar_math_patterns<any_fadd, "ADDSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>; 2808defm : scalar_math_patterns<any_fsub, "SUBSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>; 2809defm : scalar_math_patterns<any_fmul, "MULSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>; 2810defm : scalar_math_patterns<any_fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>; 2811 2812/// Unop Arithmetic 2813/// In addition, we also have a special variant of the scalar form here to 2814/// represent the associated intrinsic operation. This form is unlike the 2815/// plain scalar form, in that it takes an entire vector (instead of a 2816/// scalar) and leaves the top elements undefined. 2817/// 2818/// And, we have a special variant form for a full-vector intrinsic form. 2819 2820/// sse_fp_unop_s - SSE1 unops in scalar form 2821/// For the non-AVX defs, we need $src1 to be tied to $dst because 2822/// the HW instructions are 2 operand / destructive. 
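/// For example, in "rcpss %xmm1, %xmm0" the destination %xmm0 is also a
/// source: only element 0 is recomputed and the upper elements of %xmm0 are
/// left intact, which is what the tied constraint below models.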
2823multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, 2824 X86MemOperand x86memop, Operand intmemop, 2825 SDPatternOperator OpNode, Domain d, 2826 X86FoldableSchedWrite sched, Predicate target> { 2827 let isCodeGenOnly = 1, hasSideEffects = 0 in { 2828 def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1), 2829 !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"), 2830 [(set RC:$dst, (OpNode RC:$src1))], d>, Sched<[sched]>, 2831 Requires<[target]>; 2832 let mayLoad = 1 in 2833 def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1), 2834 !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"), 2835 [(set RC:$dst, (OpNode (load addr:$src1)))], d>, 2836 Sched<[sched.Folded]>, 2837 Requires<[target, OptForSize]>; 2838 } 2839 2840 let hasSideEffects = 0, Constraints = "$src1 = $dst", ExeDomain = d in { 2841 def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), 2842 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>, 2843 Sched<[sched]>; 2844 let mayLoad = 1 in 2845 def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, intmemop:$src2), 2846 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>, 2847 Sched<[sched.Folded, sched.ReadAfterFold]>; 2848 } 2849 2850} 2851 2852multiclass sse_fp_unop_s_intr<ValueType vt, PatFrags mem_frags, 2853 Intrinsic Intr, Predicate target> { 2854 let Predicates = [target] in { 2855 // These are unary operations, but they are modeled as having 2 source operands 2856 // because the high elements of the destination are unchanged in SSE. 2857 def : Pat<(Intr VR128:$src), 2858 (!cast<Instruction>(NAME#r_Int) VR128:$src, VR128:$src)>; 2859 } 2860 // We don't want to fold scalar loads into these instructions unless 2861 // optimizing for size. This is because the folded instruction will have a 2862 // partial register update, while the unfolded sequence will not, e.g. 2863 // movss mem, %xmm0 2864 // rcpss %xmm0, %xmm0 2865 // which has a clobber before the rcp, vs. 
2866 // rcpss mem, %xmm0 2867 let Predicates = [target, OptForSize] in { 2868 def : Pat<(Intr (mem_frags addr:$src2)), 2869 (!cast<Instruction>(NAME#m_Int) 2870 (vt (IMPLICIT_DEF)), addr:$src2)>; 2871 } 2872} 2873 2874multiclass avx_fp_unop_s_intr<ValueType vt, PatFrags mem_frags, 2875 Intrinsic Intr, Predicate target> { 2876 let Predicates = [target] in { 2877 def : Pat<(Intr VR128:$src), 2878 (!cast<Instruction>(NAME#r_Int) VR128:$src, 2879 VR128:$src)>; 2880 } 2881 let Predicates = [target, OptForSize] in { 2882 def : Pat<(Intr (mem_frags addr:$src2)), 2883 (!cast<Instruction>(NAME#m_Int) 2884 (vt (IMPLICIT_DEF)), addr:$src2)>; 2885 } 2886} 2887 2888multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, 2889 ValueType ScalarVT, X86MemOperand x86memop, 2890 Operand intmemop, SDPatternOperator OpNode, Domain d, 2891 X86FoldableSchedWrite sched, Predicate target> { 2892 let isCodeGenOnly = 1, hasSideEffects = 0 in { 2893 def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), 2894 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 2895 [], d>, Sched<[sched]>; 2896 let mayLoad = 1 in 2897 def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 2898 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 2899 [], d>, Sched<[sched.Folded, sched.ReadAfterFold]>; 2900 } 2901 let hasSideEffects = 0, ExeDomain = d in { 2902 def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), 2903 (ins VR128:$src1, VR128:$src2), 2904 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 2905 []>, Sched<[sched]>; 2906 let mayLoad = 1 in 2907 def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), 2908 (ins VR128:$src1, intmemop:$src2), 2909 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 2910 []>, Sched<[sched.Folded, sched.ReadAfterFold]>; 2911 } 2912 2913 // We don't want to fold scalar loads into these instructions unless 2914 // optimizing for size. This is because the folded instruction will have a 2915 // partial register update, while the unfolded sequence will not, e.g. 2916 // vmovss mem, %xmm0 2917 // vrcpss %xmm0, %xmm0, %xmm0 2918 // which has a clobber before the rcp, vs. 2919 // vrcpss mem, %xmm0, %xmm0 2920 // TODO: In theory, we could fold the load, and avoid the stall caused by 2921 // the partial register store, either in BreakFalseDeps or with smarter RA. 2922 let Predicates = [target] in { 2923 def : Pat<(OpNode RC:$src), (!cast<Instruction>(NAME#r) 2924 (ScalarVT (IMPLICIT_DEF)), RC:$src)>; 2925 } 2926 let Predicates = [target, OptForSize] in { 2927 def : Pat<(ScalarVT (OpNode (load addr:$src))), 2928 (!cast<Instruction>(NAME#m) (ScalarVT (IMPLICIT_DEF)), 2929 addr:$src)>; 2930 } 2931} 2932 2933/// sse1_fp_unop_p - SSE1 unops in packed form. 
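/// A typical C-level use of one of these (illustrative, assuming
/// <xmmintrin.h>):
///   __m128 approx_recip(__m128 x) { return _mm_rcp_ps(x); } // rcpps
/// The packed forms rewrite every lane of the destination, so no operand
/// tying is needed here.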
2934multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode, 2935 X86SchedWriteWidths sched, list<Predicate> prds> { 2936let Predicates = prds in { 2937 def V#NAME#PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 2938 !strconcat("v", OpcodeStr, 2939 "ps\t{$src, $dst|$dst, $src}"), 2940 [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>, 2941 VEX, Sched<[sched.XMM]>, VEX_WIG; 2942 def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 2943 !strconcat("v", OpcodeStr, 2944 "ps\t{$src, $dst|$dst, $src}"), 2945 [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))]>, 2946 VEX, Sched<[sched.XMM.Folded]>, VEX_WIG; 2947 def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), 2948 !strconcat("v", OpcodeStr, 2949 "ps\t{$src, $dst|$dst, $src}"), 2950 [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))]>, 2951 VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG; 2952 def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), 2953 !strconcat("v", OpcodeStr, 2954 "ps\t{$src, $dst|$dst, $src}"), 2955 [(set VR256:$dst, (OpNode (loadv8f32 addr:$src)))]>, 2956 VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG; 2957} 2958 2959 def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 2960 !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), 2961 [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>, 2962 Sched<[sched.XMM]>; 2963 def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 2964 !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), 2965 [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))]>, 2966 Sched<[sched.XMM.Folded]>; 2967} 2968 2969/// sse2_fp_unop_p - SSE2 unops in vector forms. 2970multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr, 2971 SDPatternOperator OpNode, X86SchedWriteWidths sched> { 2972let Predicates = [HasAVX, NoVLX] in { 2973 def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 2974 !strconcat("v", OpcodeStr, 2975 "pd\t{$src, $dst|$dst, $src}"), 2976 [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>, 2977 VEX, Sched<[sched.XMM]>, VEX_WIG; 2978 def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 2979 !strconcat("v", OpcodeStr, 2980 "pd\t{$src, $dst|$dst, $src}"), 2981 [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))]>, 2982 VEX, Sched<[sched.XMM.Folded]>, VEX_WIG; 2983 def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), 2984 !strconcat("v", OpcodeStr, 2985 "pd\t{$src, $dst|$dst, $src}"), 2986 [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))]>, 2987 VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG; 2988 def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), 2989 !strconcat("v", OpcodeStr, 2990 "pd\t{$src, $dst|$dst, $src}"), 2991 [(set VR256:$dst, (OpNode (loadv4f64 addr:$src)))]>, 2992 VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG; 2993} 2994 2995 def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 2996 !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), 2997 [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>, 2998 Sched<[sched.XMM]>; 2999 def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 3000 !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), 3001 [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))]>, 3002 Sched<[sched.XMM.Folded]>; 3003} 3004 3005multiclass sse1_fp_unop_s_intr<string OpcodeStr, Predicate AVXTarget> { 3006 defm SS : sse_fp_unop_s_intr<v4f32, sse_load_f32, 3007 !cast<Intrinsic>("int_x86_sse_"#OpcodeStr#_ss), 3008 UseSSE1>, XS; 3009 defm V#NAME#SS : 
avx_fp_unop_s_intr<v4f32, sse_load_f32, 3010 !cast<Intrinsic>("int_x86_sse_"#OpcodeStr#_ss), 3011 AVXTarget>, 3012 XS, VEX_4V, VEX_LIG, VEX_WIG, NotMemoryFoldable; 3013} 3014 3015multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode, 3016 X86SchedWriteWidths sched, Predicate AVXTarget> { 3017 defm SS : sse_fp_unop_s<opc, OpcodeStr#ss, FR32, f32mem, 3018 ssmem, OpNode, SSEPackedSingle, sched.Scl, UseSSE1>, XS; 3019 defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr#ss, FR32, f32, 3020 f32mem, ssmem, OpNode, SSEPackedSingle, sched.Scl, AVXTarget>, 3021 XS, VEX_4V, VEX_LIG, VEX_WIG; 3022} 3023 3024multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode, 3025 X86SchedWriteWidths sched, Predicate AVXTarget> { 3026 defm SD : sse_fp_unop_s<opc, OpcodeStr#sd, FR64, f64mem, 3027 sdmem, OpNode, SSEPackedDouble, sched.Scl, UseSSE2>, XD; 3028 defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr#sd, FR64, f64, 3029 f64mem, sdmem, OpNode, SSEPackedDouble, sched.Scl, AVXTarget>, 3030 XD, VEX_4V, VEX_LIG, VEX_WIG; 3031} 3032 3033// Square root. 3034defm SQRT : sse1_fp_unop_s<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt, UseAVX>, 3035 sse1_fp_unop_p<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt, [HasAVX, NoVLX]>, 3036 sse2_fp_unop_s<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt64, UseAVX>, 3037 sse2_fp_unop_p<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt64>, SIMD_EXC; 3038 3039// Reciprocal approximations. Note that these typically require refinement 3040// in order to obtain suitable precision. 3041defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>, 3042 sse1_fp_unop_s_intr<"rsqrt", HasAVX>, 3043 sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, [HasAVX]>; 3044defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>, 3045 sse1_fp_unop_s_intr<"rcp", HasAVX>, 3046 sse1_fp_unop_p<0x53, "rcp", X86frcp, SchedWriteFRcp, [HasAVX]>; 3047 3048// There is no f64 version of the reciprocal approximation instructions. 3049 3050multiclass scalar_unary_math_patterns<SDPatternOperator OpNode, string OpcPrefix, SDNode Move, 3051 ValueType VT, Predicate BasePredicate> { 3052 let Predicates = [BasePredicate] in { 3053 def : Pat<(VT (Move VT:$dst, (scalar_to_vector 3054 (OpNode (extractelt VT:$src, 0))))), 3055 (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>; 3056 } 3057 3058 // Repeat for AVX versions of the instructions. 3059 let Predicates = [UseAVX] in { 3060 def : Pat<(VT (Move VT:$dst, (scalar_to_vector 3061 (OpNode (extractelt VT:$src, 0))))), 3062 (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>; 3063 } 3064} 3065 3066defm : scalar_unary_math_patterns<any_fsqrt, "SQRTSS", X86Movss, v4f32, UseSSE1>; 3067defm : scalar_unary_math_patterns<any_fsqrt, "SQRTSD", X86Movsd, v2f64, UseSSE2>; 3068 3069multiclass scalar_unary_math_intr_patterns<Intrinsic Intr, string OpcPrefix, 3070 SDNode Move, ValueType VT, 3071 Predicate BasePredicate> { 3072 let Predicates = [BasePredicate] in { 3073 def : Pat<(VT (Move VT:$dst, (Intr VT:$src))), 3074 (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>; 3075 } 3076 3077 // Repeat for AVX versions of the instructions. 
3078 let Predicates = [HasAVX] in { 3079 def : Pat<(VT (Move VT:$dst, (Intr VT:$src))), 3080 (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>; 3081 } 3082} 3083 3084defm : scalar_unary_math_intr_patterns<int_x86_sse_rcp_ss, "RCPSS", X86Movss, 3085 v4f32, UseSSE1>; 3086defm : scalar_unary_math_intr_patterns<int_x86_sse_rsqrt_ss, "RSQRTSS", X86Movss, 3087 v4f32, UseSSE1>; 3088 3089 3090//===----------------------------------------------------------------------===// 3091// SSE 1 & 2 - Non-temporal stores 3092//===----------------------------------------------------------------------===// 3093 3094let AddedComplexity = 400 in { // Prefer non-temporal versions 3095let Predicates = [HasAVX, NoVLX] in { 3096let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in { 3097def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs), 3098 (ins f128mem:$dst, VR128:$src), 3099 "movntps\t{$src, $dst|$dst, $src}", 3100 [(alignednontemporalstore (v4f32 VR128:$src), 3101 addr:$dst)]>, VEX, VEX_WIG; 3102def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs), 3103 (ins f128mem:$dst, VR128:$src), 3104 "movntpd\t{$src, $dst|$dst, $src}", 3105 [(alignednontemporalstore (v2f64 VR128:$src), 3106 addr:$dst)]>, VEX, VEX_WIG; 3107} // SchedRW 3108 3109let SchedRW = [SchedWriteFMoveLSNT.YMM.MR] in { 3110def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs), 3111 (ins f256mem:$dst, VR256:$src), 3112 "movntps\t{$src, $dst|$dst, $src}", 3113 [(alignednontemporalstore (v8f32 VR256:$src), 3114 addr:$dst)]>, VEX, VEX_L, VEX_WIG; 3115def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs), 3116 (ins f256mem:$dst, VR256:$src), 3117 "movntpd\t{$src, $dst|$dst, $src}", 3118 [(alignednontemporalstore (v4f64 VR256:$src), 3119 addr:$dst)]>, VEX, VEX_L, VEX_WIG; 3120} // SchedRW 3121 3122let ExeDomain = SSEPackedInt in { 3123def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs), 3124 (ins i128mem:$dst, VR128:$src), 3125 "movntdq\t{$src, $dst|$dst, $src}", 3126 [(alignednontemporalstore (v2i64 VR128:$src), 3127 addr:$dst)]>, VEX, VEX_WIG, 3128 Sched<[SchedWriteVecMoveLSNT.XMM.MR]>; 3129def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs), 3130 (ins i256mem:$dst, VR256:$src), 3131 "movntdq\t{$src, $dst|$dst, $src}", 3132 [(alignednontemporalstore (v4i64 VR256:$src), 3133 addr:$dst)]>, VEX, VEX_L, VEX_WIG, 3134 Sched<[SchedWriteVecMoveLSNT.YMM.MR]>; 3135} // ExeDomain 3136} // Predicates 3137 3138let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in { 3139def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), 3140 "movntps\t{$src, $dst|$dst, $src}", 3141 [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>; 3142def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), 3143 "movntpd\t{$src, $dst|$dst, $src}", 3144 [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)]>; 3145} // SchedRW 3146 3147let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLSNT.XMM.MR] in 3148def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), 3149 "movntdq\t{$src, $dst|$dst, $src}", 3150 [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)]>; 3151 3152let SchedRW = [WriteStoreNT] in { 3153// There is no AVX form for instructions below this point 3154def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), 3155 "movnti{l}\t{$src, $dst|$dst, $src}", 3156 [(nontemporalstore (i32 GR32:$src), addr:$dst)]>, 3157 PS, Requires<[HasSSE2]>; 3158def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), 3159 "movnti{q}\t{$src, $dst|$dst, $src}", 3160 [(nontemporalstore (i64 GR64:$src), addr:$dst)]>, 3161 
PS, Requires<[HasSSE2]>; 3162} // SchedRW = [WriteStoreNT] 3163 3164let Predicates = [HasAVX, NoVLX] in { 3165 def : Pat<(alignednontemporalstore (v8i32 VR256:$src), addr:$dst), 3166 (VMOVNTDQYmr addr:$dst, VR256:$src)>; 3167 def : Pat<(alignednontemporalstore (v16i16 VR256:$src), addr:$dst), 3168 (VMOVNTDQYmr addr:$dst, VR256:$src)>; 3169 def : Pat<(alignednontemporalstore (v16f16 VR256:$src), addr:$dst), 3170 (VMOVNTDQYmr addr:$dst, VR256:$src)>; 3171 def : Pat<(alignednontemporalstore (v32i8 VR256:$src), addr:$dst), 3172 (VMOVNTDQYmr addr:$dst, VR256:$src)>; 3173 3174 def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst), 3175 (VMOVNTDQmr addr:$dst, VR128:$src)>; 3176 def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst), 3177 (VMOVNTDQmr addr:$dst, VR128:$src)>; 3178 def : Pat<(alignednontemporalstore (v8f16 VR128:$src), addr:$dst), 3179 (VMOVNTDQmr addr:$dst, VR128:$src)>; 3180 def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst), 3181 (VMOVNTDQmr addr:$dst, VR128:$src)>; 3182} 3183 3184let Predicates = [UseSSE2] in { 3185 def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst), 3186 (MOVNTDQmr addr:$dst, VR128:$src)>; 3187 def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst), 3188 (MOVNTDQmr addr:$dst, VR128:$src)>; 3189 def : Pat<(alignednontemporalstore (v8f16 VR128:$src), addr:$dst), 3190 (MOVNTDQmr addr:$dst, VR128:$src)>; 3191 def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst), 3192 (MOVNTDQmr addr:$dst, VR128:$src)>; 3193} 3194 3195} // AddedComplexity 3196 3197//===----------------------------------------------------------------------===// 3198// SSE 1 & 2 - Prefetch and memory fence 3199//===----------------------------------------------------------------------===// 3200 3201// Prefetch intrinsic. 3202let Predicates = [HasSSEPrefetch], SchedRW = [WriteLoad] in { 3203def PREFETCHT0 : I<0x18, MRM1m, (outs), (ins i8mem:$src), 3204 "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))]>, TB; 3205def PREFETCHT1 : I<0x18, MRM2m, (outs), (ins i8mem:$src), 3206 "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))]>, TB; 3207def PREFETCHT2 : I<0x18, MRM3m, (outs), (ins i8mem:$src), 3208 "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))]>, TB; 3209def PREFETCHNTA : I<0x18, MRM0m, (outs), (ins i8mem:$src), 3210 "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))]>, TB; 3211} 3212 3213// FIXME: How should flush instruction be modeled? 3214let SchedRW = [WriteLoad] in { 3215// Flush cache 3216def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src), 3217 "clflush\t$src", [(int_x86_sse2_clflush addr:$src)]>, 3218 PS, Requires<[HasCLFLUSH]>; 3219} 3220 3221let SchedRW = [WriteNop] in { 3222// Pause. This "instruction" is encoded as "rep; nop", so even though it 3223// was introduced with SSE2, it's backward compatible. 3224def PAUSE : I<0x90, RawFrm, (outs), (ins), 3225 "pause", [(int_x86_sse2_pause)]>, OBXS; 3226} 3227 3228let SchedRW = [WriteFence] in { 3229// Load, store, and memory fence 3230// TODO: As with mfence, we may want to ease the availability of sfence/lfence 3231// to include any 64-bit target. 
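// For reference (illustrative; these builtins come from <xmmintrin.h> and
// <emmintrin.h>):
//   _mm_sfence();  // int_x86_sse_sfence  -> sfence
//   _mm_lfence();  // int_x86_sse2_lfence -> lfence
//   _mm_mfence();  // int_x86_sse2_mfence -> mfence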
def SFENCE : I<0xAE, MRM7X, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>,
             PS, Requires<[HasSSE1]>;
def LFENCE : I<0xAE, MRM5X, (outs), (ins), "lfence", [(int_x86_sse2_lfence)]>,
             PS, Requires<[HasSSE2]>;
def MFENCE : I<0xAE, MRM6X, (outs), (ins), "mfence", [(int_x86_sse2_mfence)]>,
             PS, Requires<[HasMFence]>;
} // SchedRW

def : Pat<(X86MFence), (MFENCE)>;

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Load/Store MXCSR register
//===----------------------------------------------------------------------===//

let mayLoad=1, hasSideEffects=1, Defs=[MXCSR] in
def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
                    "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
               VEX, Sched<[WriteLDMXCSR]>, VEX_WIG;
let mayStore=1, hasSideEffects=1, Uses=[MXCSR] in
def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
                    "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
               VEX, Sched<[WriteSTMXCSR]>, VEX_WIG;

let mayLoad=1, hasSideEffects=1, Defs=[MXCSR] in
def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src),
                "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
              PS, Sched<[WriteLDMXCSR]>;
let mayStore=1, hasSideEffects=1, Uses=[MXCSR] in
def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst),
                "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
              PS, Sched<[WriteSTMXCSR]>;

//===---------------------------------------------------------------------===//
// SSE2 - Move Aligned/Unaligned Packed Integer Instructions
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in { // SSE integer instructions

let hasSideEffects = 0 in {
def VMOVDQArr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "movdqa\t{$src, $dst|$dst, $src}", []>,
                Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, VEX_WIG;
def VMOVDQUrr : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "movdqu\t{$src, $dst|$dst, $src}", []>,
                Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, VEX_WIG;
def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                      "movdqa\t{$src, $dst|$dst, $src}", []>,
                 Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, VEX_WIG;
def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                      "movdqu\t{$src, $dst|$dst, $src}", []>,
                 Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, VEX_WIG;
}

// For Disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
def VMOVDQArr_REV : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movdqa\t{$src, $dst|$dst, $src}", []>,
                    Sched<[SchedWriteVecMoveLS.XMM.RR]>,
                    VEX, VEX_WIG, FoldGenData<"VMOVDQArr">;
def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
                          "movdqa\t{$src, $dst|$dst, $src}", []>,
                     Sched<[SchedWriteVecMoveLS.YMM.RR]>,
                     VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVDQAYrr">;
def VMOVDQUrr_REV : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movdqu\t{$src, $dst|$dst, $src}", []>,
                    Sched<[SchedWriteVecMoveLS.XMM.RR]>,
                    VEX, VEX_WIG, FoldGenData<"VMOVDQUrr">;
def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
                          "movdqu\t{$src, $dst|$dst, $src}", []>,
                     Sched<[SchedWriteVecMoveLS.YMM.RR]>,
                     VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVDQUYrr">;
}
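// For reference (illustrative, assuming <emmintrin.h>), the aligned and
// unaligned 128-bit integer loads select the (v)movdqa/(v)movdqu forms
// defined below:
//   __m128i a = _mm_load_si128(p);  // movdqa: p must be 16-byte aligned
//   __m128i u = _mm_loadu_si128(p); // movdqu: no alignment requirement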
3305let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1, 3306 hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in { 3307def VMOVDQArm : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 3308 "movdqa\t{$src, $dst|$dst, $src}", 3309 [(set VR128:$dst, (alignedloadv2i64 addr:$src))]>, 3310 Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, VEX_WIG; 3311def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), 3312 "movdqa\t{$src, $dst|$dst, $src}", []>, 3313 Sched<[SchedWriteVecMoveLS.YMM.RM]>, 3314 VEX, VEX_L, VEX_WIG; 3315def VMOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 3316 "vmovdqu\t{$src, $dst|$dst, $src}", 3317 [(set VR128:$dst, (loadv2i64 addr:$src))]>, 3318 Sched<[SchedWriteVecMoveLS.XMM.RM]>, 3319 XS, VEX, VEX_WIG; 3320def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), 3321 "vmovdqu\t{$src, $dst|$dst, $src}", []>, 3322 Sched<[SchedWriteVecMoveLS.YMM.RM]>, 3323 XS, VEX, VEX_L, VEX_WIG; 3324} 3325 3326let mayStore = 1, hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in { 3327def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs), 3328 (ins i128mem:$dst, VR128:$src), 3329 "movdqa\t{$src, $dst|$dst, $src}", 3330 [(alignedstore (v2i64 VR128:$src), addr:$dst)]>, 3331 Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_WIG; 3332def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs), 3333 (ins i256mem:$dst, VR256:$src), 3334 "movdqa\t{$src, $dst|$dst, $src}", []>, 3335 Sched<[SchedWriteVecMoveLS.YMM.MR]>, VEX, VEX_L, VEX_WIG; 3336def VMOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), 3337 "vmovdqu\t{$src, $dst|$dst, $src}", 3338 [(store (v2i64 VR128:$src), addr:$dst)]>, 3339 Sched<[SchedWriteVecMoveLS.XMM.MR]>, XS, VEX, VEX_WIG; 3340def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src), 3341 "vmovdqu\t{$src, $dst|$dst, $src}",[]>, 3342 Sched<[SchedWriteVecMoveLS.YMM.MR]>, XS, VEX, VEX_L, VEX_WIG; 3343} 3344 3345let SchedRW = [SchedWriteVecMoveLS.XMM.RR] in { 3346let hasSideEffects = 0 in { 3347def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 3348 "movdqa\t{$src, $dst|$dst, $src}", []>; 3349 3350def MOVDQUrr : I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 3351 "movdqu\t{$src, $dst|$dst, $src}", []>, 3352 XS, Requires<[UseSSE2]>; 3353} 3354 3355// For Disassembler 3356let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { 3357def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), 3358 "movdqa\t{$src, $dst|$dst, $src}", []>, 3359 FoldGenData<"MOVDQArr">; 3360 3361def MOVDQUrr_REV : I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), 3362 "movdqu\t{$src, $dst|$dst, $src}", []>, 3363 XS, Requires<[UseSSE2]>, FoldGenData<"MOVDQUrr">; 3364} 3365} // SchedRW 3366 3367let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1, 3368 hasSideEffects = 0, SchedRW = [SchedWriteVecMoveLS.XMM.RM] in { 3369def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 3370 "movdqa\t{$src, $dst|$dst, $src}", 3371 [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/]>; 3372def MOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 3373 "movdqu\t{$src, $dst|$dst, $src}", 3374 [/*(set VR128:$dst, (loadv2i64 addr:$src))*/]>, 3375 XS, Requires<[UseSSE2]>; 3376} 3377 3378let mayStore = 1, hasSideEffects = 0, 3379 SchedRW = [SchedWriteVecMoveLS.XMM.MR] in { 3380def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), 3381 "movdqa\t{$src, $dst|$dst, $src}", 3382 [/*(alignedstore (v2i64 VR128:$src), 
addr:$dst)*/]>; 3383def MOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), 3384 "movdqu\t{$src, $dst|$dst, $src}", 3385 [/*(store (v2i64 VR128:$src), addr:$dst)*/]>, 3386 XS, Requires<[UseSSE2]>; 3387} 3388 3389} // ExeDomain = SSEPackedInt 3390 3391// Reversed version with ".s" suffix for GAS compatibility. 3392def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}", 3393 (VMOVDQArr_REV VR128:$dst, VR128:$src), 0>; 3394def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}", 3395 (VMOVDQAYrr_REV VR256:$dst, VR256:$src), 0>; 3396def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}", 3397 (VMOVDQUrr_REV VR128:$dst, VR128:$src), 0>; 3398def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}", 3399 (VMOVDQUYrr_REV VR256:$dst, VR256:$src), 0>; 3400 3401// Reversed version with ".s" suffix for GAS compatibility. 3402def : InstAlias<"movdqa.s\t{$src, $dst|$dst, $src}", 3403 (MOVDQArr_REV VR128:$dst, VR128:$src), 0>; 3404def : InstAlias<"movdqu.s\t{$src, $dst|$dst, $src}", 3405 (MOVDQUrr_REV VR128:$dst, VR128:$src), 0>; 3406 3407let Predicates = [HasAVX, NoVLX] in { 3408 // Additional patterns for other integer sizes. 3409 def : Pat<(alignedloadv4i32 addr:$src), 3410 (VMOVDQArm addr:$src)>; 3411 def : Pat<(alignedloadv8i16 addr:$src), 3412 (VMOVDQArm addr:$src)>; 3413 def : Pat<(alignedloadv8f16 addr:$src), 3414 (VMOVDQArm addr:$src)>; 3415 def : Pat<(alignedloadv16i8 addr:$src), 3416 (VMOVDQArm addr:$src)>; 3417 def : Pat<(loadv4i32 addr:$src), 3418 (VMOVDQUrm addr:$src)>; 3419 def : Pat<(loadv8i16 addr:$src), 3420 (VMOVDQUrm addr:$src)>; 3421 def : Pat<(loadv8f16 addr:$src), 3422 (VMOVDQUrm addr:$src)>; 3423 def : Pat<(loadv16i8 addr:$src), 3424 (VMOVDQUrm addr:$src)>; 3425 3426 def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst), 3427 (VMOVDQAmr addr:$dst, VR128:$src)>; 3428 def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst), 3429 (VMOVDQAmr addr:$dst, VR128:$src)>; 3430 def : Pat<(alignedstore (v8f16 VR128:$src), addr:$dst), 3431 (VMOVDQAmr addr:$dst, VR128:$src)>; 3432 def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst), 3433 (VMOVDQAmr addr:$dst, VR128:$src)>; 3434 def : Pat<(store (v4i32 VR128:$src), addr:$dst), 3435 (VMOVDQUmr addr:$dst, VR128:$src)>; 3436 def : Pat<(store (v8i16 VR128:$src), addr:$dst), 3437 (VMOVDQUmr addr:$dst, VR128:$src)>; 3438 def : Pat<(store (v8f16 VR128:$src), addr:$dst), 3439 (VMOVDQUmr addr:$dst, VR128:$src)>; 3440 def : Pat<(store (v16i8 VR128:$src), addr:$dst), 3441 (VMOVDQUmr addr:$dst, VR128:$src)>; 3442} 3443 3444//===---------------------------------------------------------------------===// 3445// SSE2 - Packed Integer Arithmetic Instructions 3446//===---------------------------------------------------------------------===// 3447 3448let ExeDomain = SSEPackedInt in { // SSE integer instructions 3449 3450/// PDI_binop_rm2 - Simple SSE2 binary operator with different src and dst types 3451multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode, 3452 ValueType DstVT, ValueType SrcVT, RegisterClass RC, 3453 PatFrag memop_frag, X86MemOperand x86memop, 3454 X86FoldableSchedWrite sched, bit Is2Addr = 1> { 3455 let isCommutable = 1 in 3456 def rr : PDI<opc, MRMSrcReg, (outs RC:$dst), 3457 (ins RC:$src1, RC:$src2), 3458 !if(Is2Addr, 3459 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3460 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3461 [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>, 3462 Sched<[sched]>; 3463 def rm : PDI<opc, MRMSrcMem, (outs RC:$dst), 3464 (ins RC:$src1, 
x86memop:$src2), 3465 !if(Is2Addr, 3466 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3467 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3468 [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), 3469 (memop_frag addr:$src2))))]>, 3470 Sched<[sched.Folded, sched.ReadAfterFold]>; 3471} 3472} // ExeDomain = SSEPackedInt 3473 3474defm PADDB : PDI_binop_all<0xFC, "paddb", add, v16i8, v32i8, 3475 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3476defm PADDW : PDI_binop_all<0xFD, "paddw", add, v8i16, v16i16, 3477 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3478defm PADDD : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32, 3479 SchedWriteVecALU, 1, NoVLX>; 3480defm PADDQ : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64, 3481 SchedWriteVecALU, 1, NoVLX>; 3482defm PADDSB : PDI_binop_all<0xEC, "paddsb", saddsat, v16i8, v32i8, 3483 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3484defm PADDSW : PDI_binop_all<0xED, "paddsw", saddsat, v8i16, v16i16, 3485 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3486defm PADDUSB : PDI_binop_all<0xDC, "paddusb", uaddsat, v16i8, v32i8, 3487 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3488defm PADDUSW : PDI_binop_all<0xDD, "paddusw", uaddsat, v8i16, v16i16, 3489 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3490defm PMULLW : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16, 3491 SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>; 3492defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16, 3493 SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>; 3494defm PMULHW : PDI_binop_all<0xE5, "pmulhw", mulhs, v8i16, v16i16, 3495 SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>; 3496defm PSUBB : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8, 3497 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; 3498defm PSUBW : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16, 3499 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; 3500defm PSUBD : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32, 3501 SchedWriteVecALU, 0, NoVLX>; 3502defm PSUBQ : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64, 3503 SchedWriteVecALU, 0, NoVLX>; 3504defm PSUBSB : PDI_binop_all<0xE8, "psubsb", ssubsat, v16i8, v32i8, 3505 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; 3506defm PSUBSW : PDI_binop_all<0xE9, "psubsw", ssubsat, v8i16, v16i16, 3507 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; 3508defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", usubsat, v16i8, v32i8, 3509 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; 3510defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", usubsat, v8i16, v16i16, 3511 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; 3512defm PMINUB : PDI_binop_all<0xDA, "pminub", umin, v16i8, v32i8, 3513 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3514defm PMINSW : PDI_binop_all<0xEA, "pminsw", smin, v8i16, v16i16, 3515 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3516defm PMAXUB : PDI_binop_all<0xDE, "pmaxub", umax, v16i8, v32i8, 3517 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3518defm PMAXSW : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16, 3519 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3520defm PAVGB : PDI_binop_all<0xE0, "pavgb", avgceilu, v16i8, v32i8, 3521 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3522defm PAVGW : PDI_binop_all<0xE3, "pavgw", avgceilu, v8i16, v16i16, 3523 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3524defm PMULUDQ : PDI_binop_all<0xF4, "pmuludq", X86pmuludq, v2i64, v4i64, 3525 SchedWriteVecIMul, 1, NoVLX>; 3526 3527let Predicates = [HasAVX, NoVLX_Or_NoBWI] in 3528defm VPMADDWD : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v4i32, v8i16, VR128, 3529 load, i128mem, SchedWriteVecIMul.XMM, 0>, 3530 VEX_4V, VEX_WIG; 3531 3532let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in 3533defm VPMADDWDY : PDI_binop_rm2<0xF5, "vpmaddwd", 
X86vpmaddwd, v8i32, v16i16, 3534 VR256, load, i256mem, SchedWriteVecIMul.YMM, 3535 0>, VEX_4V, VEX_L, VEX_WIG; 3536let Constraints = "$src1 = $dst" in 3537defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16, VR128, 3538 memop, i128mem, SchedWriteVecIMul.XMM>; 3539 3540let Predicates = [HasAVX, NoVLX_Or_NoBWI] in 3541defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128, 3542 load, i128mem, SchedWritePSADBW.XMM, 0>, 3543 VEX_4V, VEX_WIG; 3544let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in 3545defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256, 3546 load, i256mem, SchedWritePSADBW.YMM, 0>, 3547 VEX_4V, VEX_L, VEX_WIG; 3548let Constraints = "$src1 = $dst" in 3549defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128, 3550 memop, i128mem, SchedWritePSADBW.XMM>; 3551 3552//===---------------------------------------------------------------------===// 3553// SSE2 - Packed Integer Logical Instructions 3554//===---------------------------------------------------------------------===// 3555 3556multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm, 3557 string OpcodeStr, SDNode OpNode, 3558 SDNode OpNode2, RegisterClass RC, 3559 X86FoldableSchedWrite sched, 3560 X86FoldableSchedWrite schedImm, 3561 ValueType DstVT, ValueType SrcVT, 3562 PatFrag ld_frag, bit Is2Addr = 1> { 3563 // src2 is always 128-bit 3564 def rr : PDI<opc, MRMSrcReg, (outs RC:$dst), 3565 (ins RC:$src1, VR128:$src2), 3566 !if(Is2Addr, 3567 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3568 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3569 [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))]>, 3570 Sched<[sched]>; 3571 def rm : PDI<opc, MRMSrcMem, (outs RC:$dst), 3572 (ins RC:$src1, i128mem:$src2), 3573 !if(Is2Addr, 3574 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3575 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3576 [(set RC:$dst, (DstVT (OpNode RC:$src1, 3577 (SrcVT (ld_frag addr:$src2)))))]>, 3578 Sched<[sched.Folded, sched.ReadAfterFold]>; 3579 def ri : PDIi8<opc2, ImmForm, (outs RC:$dst), 3580 (ins RC:$src1, u8imm:$src2), 3581 !if(Is2Addr, 3582 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3583 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3584 [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 timm:$src2))))]>, 3585 Sched<[schedImm]>; 3586} 3587 3588multiclass PDI_binop_rmi_all<bits<8> opc, bits<8> opc2, Format ImmForm, 3589 string OpcodeStr, SDNode OpNode, 3590 SDNode OpNode2, ValueType DstVT128, 3591 ValueType DstVT256, ValueType SrcVT, 3592 X86SchedWriteWidths sched, 3593 X86SchedWriteWidths schedImm, Predicate prd> { 3594let Predicates = [HasAVX, prd] in 3595 defm V#NAME : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr), 3596 OpNode, OpNode2, VR128, sched.XMM, schedImm.XMM, 3597 DstVT128, SrcVT, load, 0>, VEX_4V, VEX_WIG; 3598let Predicates = [HasAVX2, prd] in 3599 defm V#NAME#Y : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr), 3600 OpNode, OpNode2, VR256, sched.YMM, schedImm.YMM, 3601 DstVT256, SrcVT, load, 0>, VEX_4V, VEX_L, 3602 VEX_WIG; 3603let Constraints = "$src1 = $dst" in 3604 defm NAME : PDI_binop_rmi<opc, opc2, ImmForm, OpcodeStr, OpNode, OpNode2, 3605 VR128, sched.XMM, schedImm.XMM, DstVT128, SrcVT, 3606 memop>; 3607} 3608 3609multiclass PDI_binop_ri<bits<8> opc, Format ImmForm, string OpcodeStr, 3610 SDNode OpNode, RegisterClass RC, ValueType VT, 3611 X86FoldableSchedWrite sched, bit 
Is2Addr = 1> { 3612 def ri : PDIi8<opc, ImmForm, (outs RC:$dst), (ins RC:$src1, u8imm:$src2), 3613 !if(Is2Addr, 3614 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3615 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3616 [(set RC:$dst, (VT (OpNode RC:$src1, (i8 timm:$src2))))]>, 3617 Sched<[sched]>; 3618} 3619 3620multiclass PDI_binop_ri_all<bits<8> opc, Format ImmForm, string OpcodeStr, 3621 SDNode OpNode, X86SchedWriteWidths sched> { 3622let Predicates = [HasAVX, NoVLX_Or_NoBWI] in 3623 defm V#NAME : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode, 3624 VR128, v16i8, sched.XMM, 0>, VEX_4V, VEX_WIG; 3625let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in 3626 defm V#NAME#Y : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode, 3627 VR256, v32i8, sched.YMM, 0>, 3628 VEX_4V, VEX_L, VEX_WIG; 3629let Constraints = "$src1 = $dst" in 3630 defm NAME : PDI_binop_ri<opc, ImmForm, OpcodeStr, OpNode, VR128, v16i8, 3631 sched.XMM>; 3632} 3633 3634let ExeDomain = SSEPackedInt in { 3635 defm PSLLW : PDI_binop_rmi_all<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli, 3636 v8i16, v16i16, v8i16, SchedWriteVecShift, 3637 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>; 3638 defm PSLLD : PDI_binop_rmi_all<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli, 3639 v4i32, v8i32, v4i32, SchedWriteVecShift, 3640 SchedWriteVecShiftImm, NoVLX>; 3641 defm PSLLQ : PDI_binop_rmi_all<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli, 3642 v2i64, v4i64, v2i64, SchedWriteVecShift, 3643 SchedWriteVecShiftImm, NoVLX>; 3644 3645 defm PSRLW : PDI_binop_rmi_all<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli, 3646 v8i16, v16i16, v8i16, SchedWriteVecShift, 3647 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>; 3648 defm PSRLD : PDI_binop_rmi_all<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli, 3649 v4i32, v8i32, v4i32, SchedWriteVecShift, 3650 SchedWriteVecShiftImm, NoVLX>; 3651 defm PSRLQ : PDI_binop_rmi_all<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli, 3652 v2i64, v4i64, v2i64, SchedWriteVecShift, 3653 SchedWriteVecShiftImm, NoVLX>; 3654 3655 defm PSRAW : PDI_binop_rmi_all<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai, 3656 v8i16, v16i16, v8i16, SchedWriteVecShift, 3657 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>; 3658 defm PSRAD : PDI_binop_rmi_all<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai, 3659 v4i32, v8i32, v4i32, SchedWriteVecShift, 3660 SchedWriteVecShiftImm, NoVLX>; 3661 3662 defm PSLLDQ : PDI_binop_ri_all<0x73, MRM7r, "pslldq", X86vshldq, 3663 SchedWriteShuffle>; 3664 defm PSRLDQ : PDI_binop_ri_all<0x73, MRM3r, "psrldq", X86vshrdq, 3665 SchedWriteShuffle>; 3666} // ExeDomain = SSEPackedInt 3667 3668//===---------------------------------------------------------------------===// 3669// SSE2 - Packed Integer Comparison Instructions 3670//===---------------------------------------------------------------------===// 3671 3672defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8, 3673 SchedWriteVecALU, 1, TruePredicate>; 3674defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16, 3675 SchedWriteVecALU, 1, TruePredicate>; 3676defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32, 3677 SchedWriteVecALU, 1, TruePredicate>; 3678defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8, 3679 SchedWriteVecALU, 0, TruePredicate>; 3680defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16, 3681 SchedWriteVecALU, 0, TruePredicate>; 3682defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32, 3683 SchedWriteVecALU, 0, TruePredicate>; 
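// These comparisons produce an all-ones (-1) or all-zeros element per lane,
// e.g. (illustrative, assuming <emmintrin.h>):
//   __m128i m = _mm_cmpgt_epi16(a, b); // pcmpgtw: 0xFFFF where a > b (signed)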
3684 3685//===---------------------------------------------------------------------===// 3686// SSE2 - Packed Integer Shuffle Instructions 3687//===---------------------------------------------------------------------===// 3688 3689let ExeDomain = SSEPackedInt in { 3690multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256, 3691 SDNode OpNode, X86SchedWriteWidths sched, 3692 Predicate prd> { 3693let Predicates = [HasAVX, prd] in { 3694 def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst), 3695 (ins VR128:$src1, u8imm:$src2), 3696 !strconcat("v", OpcodeStr, 3697 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3698 [(set VR128:$dst, 3699 (vt128 (OpNode VR128:$src1, (i8 timm:$src2))))]>, 3700 VEX, Sched<[sched.XMM]>, VEX_WIG; 3701 def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst), 3702 (ins i128mem:$src1, u8imm:$src2), 3703 !strconcat("v", OpcodeStr, 3704 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3705 [(set VR128:$dst, 3706 (vt128 (OpNode (load addr:$src1), 3707 (i8 timm:$src2))))]>, VEX, 3708 Sched<[sched.XMM.Folded]>, VEX_WIG; 3709} 3710 3711let Predicates = [HasAVX2, prd] in { 3712 def V#NAME#Yri : Ii8<0x70, MRMSrcReg, (outs VR256:$dst), 3713 (ins VR256:$src1, u8imm:$src2), 3714 !strconcat("v", OpcodeStr, 3715 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3716 [(set VR256:$dst, 3717 (vt256 (OpNode VR256:$src1, (i8 timm:$src2))))]>, 3718 VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG; 3719 def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst), 3720 (ins i256mem:$src1, u8imm:$src2), 3721 !strconcat("v", OpcodeStr, 3722 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3723 [(set VR256:$dst, 3724 (vt256 (OpNode (load addr:$src1), 3725 (i8 timm:$src2))))]>, VEX, VEX_L, 3726 Sched<[sched.YMM.Folded]>, VEX_WIG; 3727} 3728 3729let Predicates = [UseSSE2] in { 3730 def ri : Ii8<0x70, MRMSrcReg, 3731 (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2), 3732 !strconcat(OpcodeStr, 3733 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3734 [(set VR128:$dst, 3735 (vt128 (OpNode VR128:$src1, (i8 timm:$src2))))]>, 3736 Sched<[sched.XMM]>; 3737 def mi : Ii8<0x70, MRMSrcMem, 3738 (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2), 3739 !strconcat(OpcodeStr, 3740 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3741 [(set VR128:$dst, 3742 (vt128 (OpNode (memop addr:$src1), 3743 (i8 timm:$src2))))]>, 3744 Sched<[sched.XMM.Folded]>; 3745} 3746} 3747} // ExeDomain = SSEPackedInt 3748 3749defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd, 3750 SchedWriteShuffle, NoVLX>, PD; 3751defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw, 3752 SchedWriteShuffle, NoVLX_Or_NoBWI>, XS; 3753defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw, 3754 SchedWriteShuffle, NoVLX_Or_NoBWI>, XD; 3755 3756//===---------------------------------------------------------------------===// 3757// Packed Integer Pack Instructions (SSE & AVX) 3758//===---------------------------------------------------------------------===// 3759 3760let ExeDomain = SSEPackedInt in { 3761multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT, 3762 ValueType ArgVT, SDNode OpNode, RegisterClass RC, 3763 X86MemOperand x86memop, X86FoldableSchedWrite sched, 3764 PatFrag ld_frag, bit Is2Addr = 1> { 3765 def rr : PDI<opc, MRMSrcReg, 3766 (outs RC:$dst), (ins RC:$src1, RC:$src2), 3767 !if(Is2Addr, 3768 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3769 !strconcat(OpcodeStr, 3770 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3771 [(set RC:$dst, 3772 (OutVT (OpNode (ArgVT RC:$src1), 
RC:$src2)))]>, 3773 Sched<[sched]>; 3774 def rm : PDI<opc, MRMSrcMem, 3775 (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 3776 !if(Is2Addr, 3777 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3778 !strconcat(OpcodeStr, 3779 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3780 [(set RC:$dst, 3781 (OutVT (OpNode (ArgVT RC:$src1), 3782 (ld_frag addr:$src2))))]>, 3783 Sched<[sched.Folded, sched.ReadAfterFold]>; 3784} 3785 3786multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT, 3787 ValueType ArgVT, SDNode OpNode, RegisterClass RC, 3788 X86MemOperand x86memop, X86FoldableSchedWrite sched, 3789 PatFrag ld_frag, bit Is2Addr = 1> { 3790 def rr : SS48I<opc, MRMSrcReg, 3791 (outs RC:$dst), (ins RC:$src1, RC:$src2), 3792 !if(Is2Addr, 3793 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3794 !strconcat(OpcodeStr, 3795 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3796 [(set RC:$dst, 3797 (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>, 3798 Sched<[sched]>; 3799 def rm : SS48I<opc, MRMSrcMem, 3800 (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 3801 !if(Is2Addr, 3802 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3803 !strconcat(OpcodeStr, 3804 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3805 [(set RC:$dst, 3806 (OutVT (OpNode (ArgVT RC:$src1), 3807 (ld_frag addr:$src2))))]>, 3808 Sched<[sched.Folded, sched.ReadAfterFold]>; 3809} 3810 3811let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { 3812 defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss, VR128, 3813 i128mem, SchedWriteShuffle.XMM, load, 0>, 3814 VEX_4V, VEX_WIG; 3815 defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss, VR128, 3816 i128mem, SchedWriteShuffle.XMM, load, 0>, 3817 VEX_4V, VEX_WIG; 3818 3819 defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus, VR128, 3820 i128mem, SchedWriteShuffle.XMM, load, 0>, 3821 VEX_4V, VEX_WIG; 3822 defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus, VR128, 3823 i128mem, SchedWriteShuffle.XMM, load, 0>, 3824 VEX_4V, VEX_WIG; 3825} 3826 3827let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { 3828 defm VPACKSSWBY : sse2_pack<0x63, "vpacksswb", v32i8, v16i16, X86Packss, VR256, 3829 i256mem, SchedWriteShuffle.YMM, load, 0>, 3830 VEX_4V, VEX_L, VEX_WIG; 3831 defm VPACKSSDWY : sse2_pack<0x6B, "vpackssdw", v16i16, v8i32, X86Packss, VR256, 3832 i256mem, SchedWriteShuffle.YMM, load, 0>, 3833 VEX_4V, VEX_L, VEX_WIG; 3834 3835 defm VPACKUSWBY : sse2_pack<0x67, "vpackuswb", v32i8, v16i16, X86Packus, VR256, 3836 i256mem, SchedWriteShuffle.YMM, load, 0>, 3837 VEX_4V, VEX_L, VEX_WIG; 3838 defm VPACKUSDWY : sse4_pack<0x2B, "vpackusdw", v16i16, v8i32, X86Packus, VR256, 3839 i256mem, SchedWriteShuffle.YMM, load, 0>, 3840 VEX_4V, VEX_L, VEX_WIG; 3841} 3842 3843let Constraints = "$src1 = $dst" in { 3844 defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss, VR128, 3845 i128mem, SchedWriteShuffle.XMM, memop>; 3846 defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss, VR128, 3847 i128mem, SchedWriteShuffle.XMM, memop>; 3848 3849 defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus, VR128, 3850 i128mem, SchedWriteShuffle.XMM, memop>; 3851 3852 defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus, VR128, 3853 i128mem, SchedWriteShuffle.XMM, memop>; 3854} 3855} // ExeDomain = SSEPackedInt 3856 3857//===---------------------------------------------------------------------===// 3858// SSE2 - Packed Integer Unpack Instructions 
3859//===---------------------------------------------------------------------===// 3860 3861let ExeDomain = SSEPackedInt in { 3862multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt, 3863 SDNode OpNode, RegisterClass RC, X86MemOperand x86memop, 3864 X86FoldableSchedWrite sched, PatFrag ld_frag, 3865 bit Is2Addr = 1> { 3866 def rr : PDI<opc, MRMSrcReg, 3867 (outs RC:$dst), (ins RC:$src1, RC:$src2), 3868 !if(Is2Addr, 3869 !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"), 3870 !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3871 [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>, 3872 Sched<[sched]>; 3873 def rm : PDI<opc, MRMSrcMem, 3874 (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 3875 !if(Is2Addr, 3876 !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"), 3877 !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3878 [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>, 3879 Sched<[sched.Folded, sched.ReadAfterFold]>; 3880} 3881 3882let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { 3883 defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl, VR128, 3884 i128mem, SchedWriteShuffle.XMM, load, 0>, 3885 VEX_4V, VEX_WIG; 3886 defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl, VR128, 3887 i128mem, SchedWriteShuffle.XMM, load, 0>, 3888 VEX_4V, VEX_WIG; 3889 defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh, VR128, 3890 i128mem, SchedWriteShuffle.XMM, load, 0>, 3891 VEX_4V, VEX_WIG; 3892 defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh, VR128, 3893 i128mem, SchedWriteShuffle.XMM, load, 0>, 3894 VEX_4V, VEX_WIG; 3895} 3896 3897let Predicates = [HasAVX, NoVLX] in { 3898 defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, VR128, 3899 i128mem, SchedWriteShuffle.XMM, load, 0>, 3900 VEX_4V, VEX_WIG; 3901 defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, VR128, 3902 i128mem, SchedWriteShuffle.XMM, load, 0>, 3903 VEX_4V, VEX_WIG; 3904 defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh, VR128, 3905 i128mem, SchedWriteShuffle.XMM, load, 0>, 3906 VEX_4V, VEX_WIG; 3907 defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh, VR128, 3908 i128mem, SchedWriteShuffle.XMM, load, 0>, 3909 VEX_4V, VEX_WIG; 3910} 3911 3912let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { 3913 defm VPUNPCKLBWY : sse2_unpack<0x60, "vpunpcklbw", v32i8, X86Unpckl, VR256, 3914 i256mem, SchedWriteShuffle.YMM, load, 0>, 3915 VEX_4V, VEX_L, VEX_WIG; 3916 defm VPUNPCKLWDY : sse2_unpack<0x61, "vpunpcklwd", v16i16, X86Unpckl, VR256, 3917 i256mem, SchedWriteShuffle.YMM, load, 0>, 3918 VEX_4V, VEX_L, VEX_WIG; 3919 defm VPUNPCKHBWY : sse2_unpack<0x68, "vpunpckhbw", v32i8, X86Unpckh, VR256, 3920 i256mem, SchedWriteShuffle.YMM, load, 0>, 3921 VEX_4V, VEX_L, VEX_WIG; 3922 defm VPUNPCKHWDY : sse2_unpack<0x69, "vpunpckhwd", v16i16, X86Unpckh, VR256, 3923 i256mem, SchedWriteShuffle.YMM, load, 0>, 3924 VEX_4V, VEX_L, VEX_WIG; 3925} 3926 3927let Predicates = [HasAVX2, NoVLX] in { 3928 defm VPUNPCKLDQY : sse2_unpack<0x62, "vpunpckldq", v8i32, X86Unpckl, VR256, 3929 i256mem, SchedWriteShuffle.YMM, load, 0>, 3930 VEX_4V, VEX_L, VEX_WIG; 3931 defm VPUNPCKLQDQY : sse2_unpack<0x6C, "vpunpcklqdq", v4i64, X86Unpckl, VR256, 3932 i256mem, SchedWriteShuffle.YMM, load, 0>, 3933 VEX_4V, VEX_L, VEX_WIG; 3934 defm VPUNPCKHDQY : sse2_unpack<0x6A, "vpunpckhdq", v8i32, X86Unpckh, VR256, 3935 i256mem, SchedWriteShuffle.YMM, load, 0>, 3936 VEX_4V, VEX_L, VEX_WIG; 3937 defm VPUNPCKHQDQY : 
let Constraints = "$src1 = $dst" in {
  defm PUNPCKLBW  : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl, VR128,
                                i128mem, SchedWriteShuffle.XMM, memop>;
  defm PUNPCKLWD  : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl, VR128,
                                i128mem, SchedWriteShuffle.XMM, memop>;
  defm PUNPCKLDQ  : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl, VR128,
                                i128mem, SchedWriteShuffle.XMM, memop>;
  defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl, VR128,
                                i128mem, SchedWriteShuffle.XMM, memop>;

  defm PUNPCKHBW  : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh, VR128,
                                i128mem, SchedWriteShuffle.XMM, memop>;
  defm PUNPCKHWD  : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh, VR128,
                                i128mem, SchedWriteShuffle.XMM, memop>;
  defm PUNPCKHDQ  : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh, VR128,
                                i128mem, SchedWriteShuffle.XMM, memop>;
  defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh, VR128,
                                i128mem, SchedWriteShuffle.XMM, memop>;
}
} // ExeDomain = SSEPackedInt

//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Extract and Insert
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in {
multiclass sse2_pinsrw<bit Is2Addr = 1> {
  def rr : Ii8<0xC4, MRMSrcReg,
       (outs VR128:$dst), (ins VR128:$src1, GR32orGR64:$src2, u8imm:$src3),
       !if(Is2Addr,
           "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
           "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
       [(set VR128:$dst,
         (X86pinsrw VR128:$src1, GR32orGR64:$src2, timm:$src3))]>,
       Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
  def rm : Ii8<0xC4, MRMSrcMem,
      (outs VR128:$dst), (ins VR128:$src1, i16mem:$src2, u8imm:$src3),
      !if(Is2Addr,
          "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
          "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
      [(set VR128:$dst,
        (X86pinsrw VR128:$src1, (extloadi16 addr:$src2), timm:$src3))]>,
      Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
}

// Extract
let Predicates = [HasAVX, NoBWI] in
def VPEXTRWrr : Ii8<0xC5, MRMSrcReg,
                    (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
                    "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
                                            timm:$src2))]>,
                PD, VEX, VEX_WIG, Sched<[WriteVecExtract]>;
def PEXTRWrr : PDIi8<0xC5, MRMSrcReg,
                     (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
                     "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
                                             timm:$src2))]>,
               Sched<[WriteVecExtract]>;

// Insert
let Predicates = [HasAVX, NoBWI] in
defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V, VEX_WIG;

let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in
defm PINSRW : sse2_pinsrw, PD;

} // ExeDomain = SSEPackedInt

// Always select FP16 instructions if available.
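// SSE2 has no dedicated FP16 moves, so f16 loads, stores, and bitcasts are
// routed through the integer PINSRW/PEXTRW path below. The negative
// AddedComplexity keeps these patterns from beating native FP16 instructions
// when those are legal.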
let Predicates = [UseSSE2], AddedComplexity = -10 in {
  def : Pat<(f16 (load addr:$src)), (COPY_TO_REGCLASS (PINSRWrm (v8i16 (IMPLICIT_DEF)), addr:$src, 0), FR16)>;
  def : Pat<(store f16:$src, addr:$dst), (MOV16mr addr:$dst, (EXTRACT_SUBREG (PEXTRWrr (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0), sub_16bit))>;
  def : Pat<(i16 (bitconvert f16:$src)), (EXTRACT_SUBREG (PEXTRWrr (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0), sub_16bit)>;
  def : Pat<(f16 (bitconvert i16:$src)), (COPY_TO_REGCLASS (PINSRWrr (v8i16 (IMPLICIT_DEF)), (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit), 0), FR16)>;
}

let Predicates = [HasAVX, NoBWI] in {
  def : Pat<(f16 (load addr:$src)), (COPY_TO_REGCLASS (VPINSRWrm (v8i16 (IMPLICIT_DEF)), addr:$src, 0), FR16)>;
  def : Pat<(i16 (bitconvert f16:$src)), (EXTRACT_SUBREG (VPEXTRWrr (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0), sub_16bit)>;
  def : Pat<(f16 (bitconvert i16:$src)), (COPY_TO_REGCLASS (VPINSRWrr (v8i16 (IMPLICIT_DEF)), (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit), 0), FR16)>;
}

//===---------------------------------------------------------------------===//
// SSE2 - Packed Mask Creation
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in {

def VPMOVMSKBrr  : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
                        (ins VR128:$src),
                        "pmovmskb\t{$src, $dst|$dst, $src}",
                        [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>,
                        Sched<[WriteVecMOVMSK]>, VEX, VEX_WIG;

let Predicates = [HasAVX2] in {
def VPMOVMSKBYrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
                        (ins VR256:$src),
                        "pmovmskb\t{$src, $dst|$dst, $src}",
                        [(set GR32orGR64:$dst, (X86movmsk (v32i8 VR256:$src)))]>,
                        Sched<[WriteVecMOVMSKY]>, VEX, VEX_L, VEX_WIG;
}

def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src),
                     "pmovmskb\t{$src, $dst|$dst, $src}",
                     [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>,
                     Sched<[WriteVecMOVMSK]>;

} // ExeDomain = SSEPackedInt

//===---------------------------------------------------------------------===//
// SSE2 - Conditional Store
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLS.XMM.MR] in {
// As VEX does not have separate instruction contexts for address size
// overrides, VMASKMOVDQU and VMASKMOVDQU64 would have a decode conflict.
// Prefer VMASKMOVDQU64.
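// maskmovdqu performs a byte-granular masked store of $src to [EDI/RDI]: only
// the bytes whose corresponding mask byte has its top bit set are written.
// E.g. "maskmovdqu xmm1, xmm2" stores the bytes of xmm1 selected by xmm2.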
let Uses = [EDI], Predicates = [HasAVX], isAsmParserOnly = 1 in
def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs),
                       (ins VR128:$src, VR128:$mask),
                       "maskmovdqu\t{$mask, $src|$src, $mask}",
                       [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>,
                       VEX, VEX_WIG;
let Uses = [RDI], Predicates = [HasAVX,In64BitMode] in
def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs),
                         (ins VR128:$src, VR128:$mask),
                         "maskmovdqu\t{$mask, $src|$src, $mask}",
                         [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>,
                         VEX, VEX_WIG;

let Uses = [EDI], Predicates = [UseSSE2] in
def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
                     "maskmovdqu\t{$mask, $src|$src, $mask}",
                     [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>;
let Uses = [RDI], Predicates = [UseSSE2,In64BitMode] in
def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
                       "maskmovdqu\t{$mask, $src|$src, $mask}",
                       [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>;

} // ExeDomain = SSEPackedInt

//===---------------------------------------------------------------------===//
// SSE2 - Move Doubleword/Quadword
//===---------------------------------------------------------------------===//

//===---------------------------------------------------------------------===//
// Move Int Doubleword to Packed Double Int
//
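// movd/movq from a GPR place the value in element 0 and zero the remaining
// elements, which is what the X86vzmovl patterns further down rely on.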
let ExeDomain = SSEPackedInt in {
def VMOVDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v4i32 (scalar_to_vector GR32:$src)))]>,
                        VEX, Sched<[WriteVecMoveFromGpr]>;
def VMOVDI2PDIrm : VS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
                        VEX, Sched<[WriteVecLoad]>;
def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
                          "movq\t{$src, $dst|$dst, $src}",
                          [(set VR128:$dst,
                            (v2i64 (scalar_to_vector GR64:$src)))]>,
                          VEX, Sched<[WriteVecMoveFromGpr]>;
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
def VMOV64toPQIrm : VRS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                          "movq\t{$src, $dst|$dst, $src}", []>,
                          VEX, Sched<[WriteVecLoad]>;
let isCodeGenOnly = 1 in
def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
                         "movq\t{$src, $dst|$dst, $src}",
                         [(set FR64:$dst, (bitconvert GR64:$src))]>,
                         VEX, Sched<[WriteVecMoveFromGpr]>;

def MOVDI2PDIrr : S2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (scalar_to_vector GR32:$src)))]>,
                      Sched<[WriteVecMoveFromGpr]>;
def MOVDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
                      Sched<[WriteVecLoad]>;
def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v2i64 (scalar_to_vector GR64:$src)))]>,
                        Sched<[WriteVecMoveFromGpr]>;
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
def MOV64toPQIrm : RS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                        "movq\t{$src, $dst|$dst, $src}", []>,
                        Sched<[WriteVecLoad]>;
let isCodeGenOnly = 1 in
def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
                       "movq\t{$src, $dst|$dst, $src}",
                       [(set FR64:$dst, (bitconvert GR64:$src))]>,
                       Sched<[WriteVecMoveFromGpr]>;
} // ExeDomain = SSEPackedInt

//===---------------------------------------------------------------------===//
// Move Int Doubleword to Single Scalar
//
let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
  def VMOVDI2SSrr  : VS2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
                          "movd\t{$src, $dst|$dst, $src}",
                          [(set FR32:$dst, (bitconvert GR32:$src))]>,
                          VEX, Sched<[WriteVecMoveFromGpr]>;

  def MOVDI2SSrr  : S2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set FR32:$dst, (bitconvert GR32:$src))]>,
                        Sched<[WriteVecMoveFromGpr]>;

} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1

//===---------------------------------------------------------------------===//
// Move Packed Doubleword Int to Packed Double Int
//
let ExeDomain = SSEPackedInt in {
def VMOVPDI2DIrr  : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
                         "movd\t{$src, $dst|$dst, $src}",
                         [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
                                          (iPTR 0)))]>, VEX,
                         Sched<[WriteVecMoveToGpr]>;
def VMOVPDI2DImr  : VS2I<0x7E, MRMDestMem, (outs),
                         (ins i32mem:$dst, VR128:$src),
                         "movd\t{$src, $dst|$dst, $src}",
                         [(store (i32 (extractelt (v4i32 VR128:$src),
                                       (iPTR 0))), addr:$dst)]>,
                         VEX, Sched<[WriteVecStore]>;
def MOVPDI2DIrr   : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
                                         (iPTR 0)))]>,
                        Sched<[WriteVecMoveToGpr]>;
def MOVPDI2DImr   : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(store (i32 (extractelt (v4i32 VR128:$src),
                                      (iPTR 0))), addr:$dst)]>,
                        Sched<[WriteVecStore]>;
} // ExeDomain = SSEPackedInt

//===---------------------------------------------------------------------===//
// Move Packed Doubleword Int first element to Doubleword Int
//
let ExeDomain = SSEPackedInt in {
let SchedRW = [WriteVecMoveToGpr] in {
def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
                          "movq\t{$src, $dst|$dst, $src}",
                          [(set GR64:$dst, (extractelt (v2i64 VR128:$src),
                                                       (iPTR 0)))]>,
                          VEX;

def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                        [(set GR64:$dst, (extractelt (v2i64 VR128:$src),
                                                     (iPTR 0)))]>;
} //SchedRW

let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
def VMOVPQIto64mr : VRS2I<0x7E, MRMDestMem, (outs),
                          (ins i64mem:$dst, VR128:$src),
                          "movq\t{$src, $dst|$dst, $src}", []>,
                          VEX, Sched<[WriteVecStore]>;
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
def MOVPQIto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                        "movq\t{$src, $dst|$dst, $src}", []>,
                        Sched<[WriteVecStore]>;
} // ExeDomain = SSEPackedInt

//===---------------------------------------------------------------------===//
// Bitcast FR64 <-> GR64
//
let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
  def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
                           "movq\t{$src, $dst|$dst, $src}",
                           [(set GR64:$dst, (bitconvert FR64:$src))]>,
                           VEX, Sched<[WriteVecMoveToGpr]>;

  def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
                         "movq\t{$src, $dst|$dst, $src}",
                         [(set GR64:$dst, (bitconvert FR64:$src))]>,
                         Sched<[WriteVecMoveToGpr]>;
} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1

//===---------------------------------------------------------------------===//
// Move Scalar Single to Double Int
//
let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
  def VMOVSS2DIrr  : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
                          "movd\t{$src, $dst|$dst, $src}",
                          [(set GR32:$dst, (bitconvert FR32:$src))]>,
                          VEX, Sched<[WriteVecMoveToGpr]>;
  def MOVSS2DIrr  : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set GR32:$dst, (bitconvert FR32:$src))]>,
                        Sched<[WriteVecMoveToGpr]>;
} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
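// In the patterns below, SUBREG_TO_REG with a zero index asserts that the
// instruction already zeroed the upper bits, so a 128-bit (v)movd/(v)movq can
// implement a zero-extending load into a 256-bit register.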
let Predicates = [UseAVX] in {
  def : Pat<(v4i32 (scalar_to_vector (i32 (anyext GR8:$src)))),
            (VMOVDI2PDIrr (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
                                              GR8:$src, sub_8bit)))>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
            (VMOVDI2PDIrr GR32:$src)>;

  def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
            (VMOV64toPQIrr GR64:$src)>;

  // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
  // These instructions also write zeros in the high part of a 256-bit register.
  def : Pat<(v4i32 (X86vzload32 addr:$src)),
            (VMOVDI2PDIrm addr:$src)>;
  def : Pat<(v8i32 (X86vzload32 addr:$src)),
            (SUBREG_TO_REG (i64 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>;
}

let Predicates = [UseSSE2] in {
  def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
            (MOVDI2PDIrr GR32:$src)>;

  def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
            (MOV64toPQIrr GR64:$src)>;
  def : Pat<(v4i32 (X86vzload32 addr:$src)),
            (MOVDI2PDIrm addr:$src)>;
}

// Before the MC layer of LLVM existed, clang emitted "movd" assembly instead
// of "movq" due to a MacOS parsing limitation. In order to parse old assembly,
// we add these aliases.
def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
                (MOV64toPQIrr VR128:$dst, GR64:$src), 0>;
def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
                (MOVPQIto64rr GR64:$dst, VR128:$src), 0>;
// Allow "vmovd" but print "vmovq" since we don't need compatibility for AVX.
def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
                (VMOV64toPQIrr VR128:$dst, GR64:$src), 0>;
def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
                (VMOVPQIto64rr GR64:$dst, VR128:$src), 0>;

//===---------------------------------------------------------------------===//
// SSE2 - Move Quadword
//===---------------------------------------------------------------------===//

//===---------------------------------------------------------------------===//
// Move Quadword Int to Packed Quadword Int
//

let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLoad] in {
def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                     "vmovq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
                     VEX, Requires<[UseAVX]>, VEX_WIG;
def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                    "movq\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst,
                      (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>,
                    XS, Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix
} // ExeDomain, SchedRW

//===---------------------------------------------------------------------===//
// Move Packed Quadword Int to Quadword Int
//
let ExeDomain = SSEPackedInt, SchedRW = [WriteVecStore] in {
def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                        [(store (i64 (extractelt (v2i64 VR128:$src),
                                      (iPTR 0))), addr:$dst)]>,
                        VEX, VEX_WIG;
def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                      "movq\t{$src, $dst|$dst, $src}",
                      [(store (i64 (extractelt (v2i64 VR128:$src),
                                    (iPTR 0))), addr:$dst)]>;
} // ExeDomain, SchedRW

// For disassembler only
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    SchedRW = [SchedWriteVecLogic.XMM] in {
def VMOVPQI2QIrr : VS2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                        "movq\t{$src, $dst|$dst, $src}", []>, VEX, VEX_WIG;
def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                      "movq\t{$src, $dst|$dst, $src}", []>;
}

def : InstAlias<"vmovq.s\t{$src, $dst|$dst, $src}",
                (VMOVPQI2QIrr VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movq.s\t{$src, $dst|$dst, $src}",
                (MOVPQI2QIrr VR128:$dst, VR128:$src), 0>;

let Predicates = [UseAVX] in {
  def : Pat<(v2i64 (X86vzload64 addr:$src)),
            (VMOVQI2PQIrm addr:$src)>;
  def : Pat<(v4i64 (X86vzload64 addr:$src)),
            (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>;

  def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst),
            (VMOVPQI2QImr addr:$dst, VR128:$src)>;
}

let Predicates = [UseSSE2] in {
  def : Pat<(v2i64 (X86vzload64 addr:$src)), (MOVQI2PQIrm addr:$src)>;

  def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst),
            (MOVPQI2QImr addr:$dst, VR128:$src)>;
}
//===---------------------------------------------------------------------===//
// Moving from XMM to XMM and clear upper 64 bits. Note, there is a bug in
// the IA32 document: movq xmm1, xmm2 does clear the high bits.
//
let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecLogic.XMM] in {
def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                         "vmovq\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
                         XS, VEX, Requires<[UseAVX]>, VEX_WIG;
def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
                        XS, Requires<[UseSSE2]>;
} // ExeDomain, SchedRW

let Predicates = [UseAVX] in {
  def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
            (VMOVZPQILo2PQIrr VR128:$src)>;
}
let Predicates = [UseSSE2] in {
  def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
            (MOVZPQILo2PQIrr VR128:$src)>;
}

let Predicates = [UseAVX] in {
  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v2f64 (VMOVZPQILo2PQIrr
                     (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)))),
             sub_xmm)>;
  def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v2i64 (VMOVZPQILo2PQIrr
                     (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)))),
             sub_xmm)>;
}

//===---------------------------------------------------------------------===//
// SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP
//===---------------------------------------------------------------------===//

multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
                              ValueType vt, RegisterClass RC, PatFrag mem_frag,
                              X86MemOperand x86memop, X86FoldableSchedWrite sched> {
def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
              !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
              [(set RC:$dst, (vt (OpNode RC:$src)))]>,
              Sched<[sched]>;
def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
              !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
              [(set RC:$dst, (OpNode (mem_frag addr:$src)))]>,
              Sched<[sched.Folded]>;
}

let Predicates = [HasAVX, NoVLX] in {
  defm VMOVSHDUP  : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
                                       v4f32, VR128, loadv4f32, f128mem,
                                       SchedWriteFShuffle.XMM>, VEX, VEX_WIG;
  defm VMOVSLDUP  : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
                                       v4f32, VR128, loadv4f32, f128mem,
                                       SchedWriteFShuffle.XMM>, VEX, VEX_WIG;
  defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
                                       v8f32, VR256, loadv8f32, f256mem,
                                       SchedWriteFShuffle.YMM>, VEX, VEX_L, VEX_WIG;
  defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
                                       v8f32, VR256, loadv8f32, f256mem,
                                       SchedWriteFShuffle.YMM>, VEX, VEX_L, VEX_WIG;
}
defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128,
                                   memopv4f32, f128mem, SchedWriteFShuffle.XMM>;
defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128,
                                   memopv4f32, f128mem, SchedWriteFShuffle.XMM>;
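// movshdup duplicates the odd-index lanes (s1,s1,s3,s3) and movsldup the
// even-index lanes (s0,s0,s2,s2). The patterns below let the same FP
// instructions match the equivalent v4i32/v8i32 shuffles.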
let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4i32 (X86Movshdup VR128:$src)),
            (VMOVSHDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movshdup (load addr:$src))),
            (VMOVSHDUPrm addr:$src)>;
  def : Pat<(v4i32 (X86Movsldup VR128:$src)),
            (VMOVSLDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movsldup (load addr:$src))),
            (VMOVSLDUPrm addr:$src)>;
  def : Pat<(v8i32 (X86Movshdup VR256:$src)),
            (VMOVSHDUPYrr VR256:$src)>;
  def : Pat<(v8i32 (X86Movshdup (load addr:$src))),
            (VMOVSHDUPYrm addr:$src)>;
  def : Pat<(v8i32 (X86Movsldup VR256:$src)),
            (VMOVSLDUPYrr VR256:$src)>;
  def : Pat<(v8i32 (X86Movsldup (load addr:$src))),
            (VMOVSLDUPYrm addr:$src)>;
}

let Predicates = [UseSSE3] in {
  def : Pat<(v4i32 (X86Movshdup VR128:$src)),
            (MOVSHDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movshdup (memop addr:$src))),
            (MOVSHDUPrm addr:$src)>;
  def : Pat<(v4i32 (X86Movsldup VR128:$src)),
            (MOVSLDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movsldup (memop addr:$src))),
            (MOVSLDUPrm addr:$src)>;
}

//===---------------------------------------------------------------------===//
// SSE3 - Replicate Double FP - MOVDDUP
//===---------------------------------------------------------------------===//

multiclass sse3_replicate_dfp<string OpcodeStr, X86SchedWriteWidths sched> {
def rr  : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
               !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
               [(set VR128:$dst, (v2f64 (X86Movddup VR128:$src)))]>,
               Sched<[sched.XMM]>;
def rm  : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
               !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
               [(set VR128:$dst,
                 (v2f64 (X86Movddup
                         (scalar_to_vector (loadf64 addr:$src)))))]>,
               Sched<[sched.XMM.Folded]>;
}

// FIXME: Merge with above classes when there are patterns for the ymm version
multiclass sse3_replicate_dfp_y<string OpcodeStr, X86SchedWriteWidths sched> {
def rr  : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
               !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
               [(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>,
               Sched<[sched.YMM]>;
def rm  : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
               !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
               [(set VR256:$dst,
                 (v4f64 (X86Movddup (loadv4f64 addr:$src))))]>,
               Sched<[sched.YMM.Folded]>;
}

let Predicates = [HasAVX, NoVLX] in {
  defm VMOVDDUP  : sse3_replicate_dfp<"vmovddup", SchedWriteFShuffle>,
                                      VEX, VEX_WIG;
  defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup", SchedWriteFShuffle>,
                                        VEX, VEX_L, VEX_WIG;
}

defm MOVDDUP : sse3_replicate_dfp<"movddup", SchedWriteFShuffle>;
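// movddup broadcasts the low double: dst = { src[0], src[0] }. Its memory form
// reads only 64 bits, so a zero-extending 64-bit load can fold into it, as the
// patterns below do.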
let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
            (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
}

let Predicates = [UseSSE3] in {
  def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
            (MOVDDUPrm addr:$src)>;
}

//===---------------------------------------------------------------------===//
// SSE3 - Move Unaligned Integer
//===---------------------------------------------------------------------===//

let Predicates = [HasAVX] in {
  def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                      "vlddqu\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>,
                      Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, VEX_WIG;
  def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                       "vlddqu\t{$src, $dst|$dst, $src}",
                       [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>,
                       Sched<[SchedWriteVecMoveLS.YMM.RM]>, VEX, VEX_L, VEX_WIG;
} // Predicates

def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   "lddqu\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>,
                   Sched<[SchedWriteVecMoveLS.XMM.RM]>;

//===---------------------------------------------------------------------===//
// SSE3 - Arithmetic
//===---------------------------------------------------------------------===//

multiclass sse3_addsub<string OpcodeStr, ValueType vt, RegisterClass RC,
                       X86MemOperand x86memop, X86FoldableSchedWrite sched,
                       PatFrag ld_frag, bit Is2Addr = 1> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
  def rr : I<0xD0, MRMSrcReg,
             (outs RC:$dst), (ins RC:$src1, RC:$src2),
             !if(Is2Addr,
                 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
             [(set RC:$dst, (vt (X86Addsub RC:$src1, RC:$src2)))]>,
             Sched<[sched]>;
  def rm : I<0xD0, MRMSrcMem,
             (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
             !if(Is2Addr,
                 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
             [(set RC:$dst, (vt (X86Addsub RC:$src1, (ld_frag addr:$src2))))]>,
             Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
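// addsub subtracts in the even lanes and adds in the odd ones, e.g. addsubps
// produces { a0-b0, a1+b1, a2-b2, a3+b3 }.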
let Predicates = [HasAVX] in {
  let ExeDomain = SSEPackedSingle in {
    defm VADDSUBPS : sse3_addsub<"vaddsubps", v4f32, VR128, f128mem,
                                 SchedWriteFAddSizes.PS.XMM, loadv4f32, 0>,
                                 XD, VEX_4V, VEX_WIG;
    defm VADDSUBPSY : sse3_addsub<"vaddsubps", v8f32, VR256, f256mem,
                                  SchedWriteFAddSizes.PS.YMM, loadv8f32, 0>,
                                  XD, VEX_4V, VEX_L, VEX_WIG;
  }
  let ExeDomain = SSEPackedDouble in {
    defm VADDSUBPD : sse3_addsub<"vaddsubpd", v2f64, VR128, f128mem,
                                 SchedWriteFAddSizes.PD.XMM, loadv2f64, 0>,
                                 PD, VEX_4V, VEX_WIG;
    defm VADDSUBPDY : sse3_addsub<"vaddsubpd", v4f64, VR256, f256mem,
                                  SchedWriteFAddSizes.PD.YMM, loadv4f64, 0>,
                                  PD, VEX_4V, VEX_L, VEX_WIG;
  }
}
let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in {
  let ExeDomain = SSEPackedSingle in
  defm ADDSUBPS : sse3_addsub<"addsubps", v4f32, VR128, f128mem,
                              SchedWriteFAddSizes.PS.XMM, memopv4f32>, XD;
  let ExeDomain = SSEPackedDouble in
  defm ADDSUBPD : sse3_addsub<"addsubpd", v2f64, VR128, f128mem,
                              SchedWriteFAddSizes.PD.XMM, memopv2f64>, PD;
}

//===---------------------------------------------------------------------===//
// SSE3 Instructions
//===---------------------------------------------------------------------===//

// Horizontal ops
multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
                   X86MemOperand x86memop, SDNode OpNode,
                   X86FoldableSchedWrite sched, PatFrag ld_frag,
                   bit Is2Addr = 1> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
  def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
       Sched<[sched]>;

  def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
       Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
                  X86MemOperand x86memop, SDNode OpNode,
                  X86FoldableSchedWrite sched, PatFrag ld_frag,
                  bit Is2Addr = 1> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
  def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
       Sched<[sched]>;

  def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
       Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
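// The horizontal ops combine adjacent pairs, e.g. haddps computes
// { a0+a1, a2+a3, b0+b1, b2+b3 } and haddpd computes { a0+a1, b0+b1 }.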
let Predicates = [HasAVX] in {
  let ExeDomain = SSEPackedSingle in {
    defm VHADDPS  : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem,
                            X86fhadd, WriteFHAdd, loadv4f32, 0>, VEX_4V, VEX_WIG;
    defm VHSUBPS  : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem,
                            X86fhsub, WriteFHAdd, loadv4f32, 0>, VEX_4V, VEX_WIG;
    defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem,
                            X86fhadd, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG;
    defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem,
                            X86fhsub, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG;
  }
  let ExeDomain = SSEPackedDouble in {
    defm VHADDPD  : S3_Int<0x7C, "vhaddpd", v2f64, VR128, f128mem,
                           X86fhadd, WriteFHAdd, loadv2f64, 0>, VEX_4V, VEX_WIG;
    defm VHSUBPD  : S3_Int<0x7D, "vhsubpd", v2f64, VR128, f128mem,
                           X86fhsub, WriteFHAdd, loadv2f64, 0>, VEX_4V, VEX_WIG;
    defm VHADDPDY : S3_Int<0x7C, "vhaddpd", v4f64, VR256, f256mem,
                           X86fhadd, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG;
    defm VHSUBPDY : S3_Int<0x7D, "vhsubpd", v4f64, VR256, f256mem,
                           X86fhsub, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG;
  }
}

let Constraints = "$src1 = $dst" in {
  let ExeDomain = SSEPackedSingle in {
    defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd,
                          WriteFHAdd, memopv4f32>;
    defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub,
                          WriteFHAdd, memopv4f32>;
  }
  let ExeDomain = SSEPackedDouble in {
    defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd,
                         WriteFHAdd, memopv2f64>;
    defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub,
                         WriteFHAdd, memopv2f64>;
  }
}

//===---------------------------------------------------------------------===//
// SSSE3 - Packed Absolute Instructions
//===---------------------------------------------------------------------===//

/// SS3I_unop_rm - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
multiclass SS3I_unop_rm<bits<8> opc, string OpcodeStr, ValueType vt,
                        SDNode OpNode, X86SchedWriteWidths sched, PatFrag ld_frag> {
  def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
                 (ins VR128:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst, (vt (OpNode VR128:$src)))]>,
                 Sched<[sched.XMM]>;

  def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
                 (ins i128mem:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst,
                   (vt (OpNode (ld_frag addr:$src))))]>,
                 Sched<[sched.XMM.Folded]>;
}

/// SS3I_unop_rm_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
multiclass SS3I_unop_rm_y<bits<8> opc, string OpcodeStr, ValueType vt,
                          SDNode OpNode, X86SchedWriteWidths sched> {
  def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
                  (ins VR256:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                  [(set VR256:$dst, (vt (OpNode VR256:$src)))]>,
                  Sched<[sched.YMM]>;

  def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
                  (ins i256mem:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                  [(set VR256:$dst,
                    (vt (OpNode (load addr:$src))))]>,
                  Sched<[sched.YMM.Folded]>;
}
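// pabs* computes per-element absolute values; note that the most negative
// input is its own result (pabsb of -128 yields 0x80).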
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  defm VPABSB : SS3I_unop_rm<0x1C, "vpabsb", v16i8, abs, SchedWriteVecALU,
                             load>, VEX, VEX_WIG;
  defm VPABSW : SS3I_unop_rm<0x1D, "vpabsw", v8i16, abs, SchedWriteVecALU,
                             load>, VEX, VEX_WIG;
}
let Predicates = [HasAVX, NoVLX] in {
  defm VPABSD : SS3I_unop_rm<0x1E, "vpabsd", v4i32, abs, SchedWriteVecALU,
                             load>, VEX, VEX_WIG;
}
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  defm VPABSB : SS3I_unop_rm_y<0x1C, "vpabsb", v32i8, abs, SchedWriteVecALU>,
                               VEX, VEX_L, VEX_WIG;
  defm VPABSW : SS3I_unop_rm_y<0x1D, "vpabsw", v16i16, abs, SchedWriteVecALU>,
                               VEX, VEX_L, VEX_WIG;
}
let Predicates = [HasAVX2, NoVLX] in {
  defm VPABSD : SS3I_unop_rm_y<0x1E, "vpabsd", v8i32, abs, SchedWriteVecALU>,
                               VEX, VEX_L, VEX_WIG;
}

defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, abs, SchedWriteVecALU,
                          memop>;
defm PABSW : SS3I_unop_rm<0x1D, "pabsw", v8i16, abs, SchedWriteVecALU,
                          memop>;
defm PABSD : SS3I_unop_rm<0x1E, "pabsd", v4i32, abs, SchedWriteVecALU,
                          memop>;

//===---------------------------------------------------------------------===//
// SSSE3 - Packed Binary Operator Instructions
//===---------------------------------------------------------------------===//

/// SS3I_binop_rm - Simple SSSE3 bin op
multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                         ValueType DstVT, ValueType OpVT, RegisterClass RC,
                         PatFrag memop_frag, X86MemOperand x86memop,
                         X86FoldableSchedWrite sched, bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr : SS38I<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode (OpVT RC:$src1), RC:$src2)))]>,
       Sched<[sched]>;
  def rm : SS38I<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst,
         (DstVT (OpNode (OpVT RC:$src1), (memop_frag addr:$src2))))]>,
       Sched<[sched.Folded, sched.ReadAfterFold]>;
}

/// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}.
multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
                             Intrinsic IntId128, X86FoldableSchedWrite sched,
                             PatFrag ld_frag, bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
       (ins VR128:$src1, VR128:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
       Sched<[sched]>;
  def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
       (ins VR128:$src1, i128mem:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst,
         (IntId128 VR128:$src1, (ld_frag addr:$src2)))]>,
       Sched<[sched.Folded, sched.ReadAfterFold]>;
}

multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
                               Intrinsic IntId256,
                               X86FoldableSchedWrite sched> {
  let isCommutable = 1 in
  def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
       (ins VR256:$src1, VR256:$src2),
       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
       [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>,
       Sched<[sched]>;
  def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
       (ins VR256:$src1, i256mem:$src2),
       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
       [(set VR256:$dst,
         (IntId256 VR256:$src1, (load addr:$src2)))]>,
       Sched<[sched.Folded, sched.ReadAfterFold]>;
}

let ImmT = NoImm, Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
let isCommutable = 0 in {
  defm VPSHUFB    : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, v16i8,
                                  VR128, load, i128mem,
                                  SchedWriteVarShuffle.XMM, 0>, VEX_4V, VEX_WIG;
  defm VPMADDUBSW : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v8i16,
                                  v16i8, VR128, load, i128mem,
                                  SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG;
}
defm VPMULHRSW    : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v8i16, v8i16,
                                  VR128, load, i128mem,
                                  SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG;
}

let ImmT = NoImm, Predicates = [HasAVX] in {
let isCommutable = 0 in {
  defm VPHADDW    : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, v8i16, VR128,
                                  load, i128mem,
                                  SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
  defm VPHADDD    : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, v4i32, VR128,
                                  load, i128mem,
                                  SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
  defm VPHSUBW    : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, v8i16, VR128,
                                  load, i128mem,
                                  SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
  defm VPHSUBD    : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, v4i32, VR128,
                                  load, i128mem,
                                  SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
  defm VPSIGNB    : SS3I_binop_rm_int<0x08, "vpsignb",
                                      int_x86_ssse3_psign_b_128,
                                      SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
  defm VPSIGNW    : SS3I_binop_rm_int<0x09, "vpsignw",
                                      int_x86_ssse3_psign_w_128,
                                      SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
  defm VPSIGND    : SS3I_binop_rm_int<0x0A, "vpsignd",
                                      int_x86_ssse3_psign_d_128,
                                      SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
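  // phaddsw/phsubsw horizontally add/subtract adjacent i16 pairs with signed
  // saturation; they are matched through their intrinsics rather than the
  // generic X86hadd/X86hsub nodes.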
  defm VPHADDSW   : SS3I_binop_rm_int<0x03, "vphaddsw",
                                      int_x86_ssse3_phadd_sw_128,
                                      SchedWritePHAdd.XMM, load, 0>, VEX_4V, VEX_WIG;
  defm VPHSUBSW   : SS3I_binop_rm_int<0x07, "vphsubsw",
                                      int_x86_ssse3_phsub_sw_128,
                                      SchedWritePHAdd.XMM, load, 0>, VEX_4V, VEX_WIG;
}
}

let ImmT = NoImm, Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
let isCommutable = 0 in {
  defm VPSHUFBY    : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, v32i8,
                                   VR256, load, i256mem,
                                   SchedWriteVarShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
  defm VPMADDUBSWY : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v16i16,
                                   v32i8, VR256, load, i256mem,
                                   SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
}
defm VPMULHRSWY    : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v16i16, v16i16,
                                   VR256, load, i256mem,
                                   SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
}

let ImmT = NoImm, Predicates = [HasAVX2] in {
let isCommutable = 0 in {
  defm VPHADDWY   : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, v16i16,
                                  VR256, load, i256mem,
                                  SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
  defm VPHADDDY   : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, v8i32, VR256,
                                  load, i256mem,
                                  SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
  defm VPHSUBWY   : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, v16i16,
                                  VR256, load, i256mem,
                                  SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
  defm VPHSUBDY   : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, v8i32, VR256,
                                  load, i256mem,
                                  SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
  defm VPSIGNB    : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b,
                                        SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
  defm VPSIGNW    : SS3I_binop_rm_int_y<0x09, "vpsignw", int_x86_avx2_psign_w,
                                        SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
  defm VPSIGND    : SS3I_binop_rm_int_y<0x0A, "vpsignd", int_x86_avx2_psign_d,
                                        SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
  defm VPHADDSW   : SS3I_binop_rm_int_y<0x03, "vphaddsw",
                                        int_x86_avx2_phadd_sw,
                                        SchedWritePHAdd.YMM>, VEX_4V, VEX_L, VEX_WIG;
  defm VPHSUBSW   : SS3I_binop_rm_int_y<0x07, "vphsubsw",
                                        int_x86_avx2_phsub_sw,
                                        SchedWritePHAdd.YMM>, VEX_4V, VEX_L, VEX_WIG;
}
}

// None of these have i8 immediate fields.
let ImmT = NoImm, Constraints = "$src1 = $dst" in {
let isCommutable = 0 in {
  defm PHADDW    : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, v8i16, VR128,
                                 memop, i128mem, SchedWritePHAdd.XMM>;
  defm PHADDD    : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, v4i32, VR128,
                                 memop, i128mem, SchedWritePHAdd.XMM>;
  defm PHSUBW    : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, v8i16, VR128,
                                 memop, i128mem, SchedWritePHAdd.XMM>;
  defm PHSUBD    : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, v4i32, VR128,
                                 memop, i128mem, SchedWritePHAdd.XMM>;
  defm PSIGNB    : SS3I_binop_rm_int<0x08, "psignb", int_x86_ssse3_psign_b_128,
                                     SchedWriteVecALU.XMM, memop>;
  defm PSIGNW    : SS3I_binop_rm_int<0x09, "psignw", int_x86_ssse3_psign_w_128,
                                     SchedWriteVecALU.XMM, memop>;
  defm PSIGND    : SS3I_binop_rm_int<0x0A, "psignd", int_x86_ssse3_psign_d_128,
                                     SchedWriteVecALU.XMM, memop>;
  defm PSHUFB    : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, v16i8, VR128,
                                 memop, i128mem, SchedWriteVarShuffle.XMM>;
  defm PHADDSW   : SS3I_binop_rm_int<0x03, "phaddsw",
                                     int_x86_ssse3_phadd_sw_128,
                                     SchedWritePHAdd.XMM, memop>;
  defm PHSUBSW   : SS3I_binop_rm_int<0x07, "phsubsw",
                                     int_x86_ssse3_phsub_sw_128,
                                     SchedWritePHAdd.XMM, memop>;
  defm PMADDUBSW : SS3I_binop_rm<0x04, "pmaddubsw", X86vpmaddubsw, v8i16,
                                 v16i8, VR128, memop, i128mem,
                                 SchedWriteVecIMul.XMM>;
}
defm PMULHRSW    : SS3I_binop_rm<0x0B, "pmulhrsw", X86mulhrs, v8i16, v8i16,
                                 VR128, memop, i128mem, SchedWriteVecIMul.XMM>;
}

//===---------------------------------------------------------------------===//
// SSSE3 - Packed Align Instruction Patterns
//===---------------------------------------------------------------------===//

multiclass ssse3_palignr<string asm, ValueType VT, RegisterClass RC,
                         PatFrag memop_frag, X86MemOperand x86memop,
                         X86FoldableSchedWrite sched, bit Is2Addr = 1> {
  let hasSideEffects = 0 in {
  def rri : SS3AI<0x0F, MRMSrcReg, (outs RC:$dst),
      (ins RC:$src1, RC:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set RC:$dst, (VT (X86PAlignr RC:$src1, RC:$src2, (i8 timm:$src3))))]>,
      Sched<[sched]>;
  let mayLoad = 1 in
  def rmi : SS3AI<0x0F, MRMSrcMem, (outs RC:$dst),
      (ins RC:$src1, x86memop:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set RC:$dst, (VT (X86PAlignr RC:$src1,
                                     (memop_frag addr:$src2),
                                     (i8 timm:$src3))))]>,
      Sched<[sched.Folded, sched.ReadAfterFold]>;
  }
}

let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
  defm VPALIGNR : ssse3_palignr<"vpalignr", v16i8, VR128, load, i128mem,
                                SchedWriteShuffle.XMM, 0>, VEX_4V, VEX_WIG;
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
  defm VPALIGNRY : ssse3_palignr<"vpalignr", v32i8, VR256, load, i256mem,
                                 SchedWriteShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
  defm PALIGNR : ssse3_palignr<"palignr", v16i8, VR128, memop, i128mem,
                               SchedWriteShuffle.XMM>;

//===---------------------------------------------------------------------===//
// SSE3 - Thread synchronization
//===---------------------------------------------------------------------===//
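// monitor arms an address monitor on DS:[EAX/RAX], with extensions in ECX and
// hints in EDX; mwait (hints in EAX, extensions in ECX) then waits for a store
// to the monitored range.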
let SchedRW = [WriteSystem] in {
let Uses = [EAX, ECX, EDX] in
def MONITOR32rrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>,
                   TB, Requires<[HasSSE3, Not64BitMode]>;
let Uses = [RAX, ECX, EDX] in
def MONITOR64rrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>,
                   TB, Requires<[HasSSE3, In64BitMode]>;

let Uses = [ECX, EAX] in
def MWAITrr : I<0x01, MRM_C9, (outs), (ins), "mwait",
                [(int_x86_sse3_mwait ECX, EAX)]>, TB, Requires<[HasSSE3]>;
} // SchedRW

def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[Not64BitMode]>;
def : InstAlias<"mwait\t{%rax, %rcx|rcx, rax}", (MWAITrr)>, Requires<[In64BitMode]>;

def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITOR32rrr)>,
      Requires<[Not64BitMode]>;
def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITOR64rrr)>,
      Requires<[In64BitMode]>;

//===----------------------------------------------------------------------===//
// SSE4.1 - Packed Move with Sign/Zero Extend
// NOTE: Any Extend is promoted to Zero Extend in X86ISelDAGToDAG.cpp
//===----------------------------------------------------------------------===//

multiclass SS41I_pmovx_rrrm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
                            RegisterClass OutRC, RegisterClass InRC,
                            X86FoldableSchedWrite sched> {
  def rr : SS48I<opc, MRMSrcReg, (outs OutRC:$dst), (ins InRC:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
                 Sched<[sched]>;

  def rm : SS48I<opc, MRMSrcMem, (outs OutRC:$dst), (ins MemOp:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
                 Sched<[sched.Folded]>;
}

multiclass SS41I_pmovx_rm_all<bits<8> opc, string OpcodeStr,
                              X86MemOperand MemOp, X86MemOperand MemYOp,
                              Predicate prd> {
  defm NAME : SS41I_pmovx_rrrm<opc, OpcodeStr, MemOp, VR128, VR128,
                               SchedWriteShuffle.XMM>;
  let Predicates = [HasAVX, prd] in
    defm V#NAME : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemOp,
                                   VR128, VR128, SchedWriteVecExtend.XMM>,
                                   VEX, VEX_WIG;
  let Predicates = [HasAVX2, prd] in
    defm V#NAME#Y : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemYOp,
                                     VR256, VR128, SchedWriteVecExtend.YMM>,
                                     VEX, VEX_L, VEX_WIG;
}

multiclass SS41I_pmovx_rm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
                          X86MemOperand MemYOp, Predicate prd> {
  defm PMOVSX#NAME : SS41I_pmovx_rm_all<opc, !strconcat("pmovsx", OpcodeStr),
                                        MemOp, MemYOp, prd>;
  defm PMOVZX#NAME : SS41I_pmovx_rm_all<!add(opc, 0x10),
                                        !strconcat("pmovzx", OpcodeStr),
                                        MemOp, MemYOp, prd>;
}

defm BW : SS41I_pmovx_rm<0x20, "bw", i64mem, i128mem, NoVLX_Or_NoBWI>;
defm WD : SS41I_pmovx_rm<0x23, "wd", i64mem, i128mem, NoVLX>;
defm DQ : SS41I_pmovx_rm<0x25, "dq", i64mem, i128mem, NoVLX>;

defm BD : SS41I_pmovx_rm<0x21, "bd", i32mem, i64mem, NoVLX>;
defm WQ : SS41I_pmovx_rm<0x24, "wq", i32mem, i64mem, NoVLX>;

defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem, NoVLX>;
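// The two letters name the source and destination element widths, so the
// defms above produce e.g. PMOVSXBWrr (i8 -> i16), VPMOVZXWQrm (i16 -> i64
// from memory), and the 256-bit VPMOVSXDQYrr variants.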
// AVX2 Patterns
multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy,
                                     SDNode ExtOp, SDNode InVecOp> {
  // Register-Register patterns
  let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
    def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))),
              (!cast<I>(OpcPrefix#BWYrr) VR128:$src)>;
  }
  let Predicates = [HasAVX2, NoVLX] in {
    def : Pat<(v8i32 (InVecOp (v16i8 VR128:$src))),
              (!cast<I>(OpcPrefix#BDYrr) VR128:$src)>;
    def : Pat<(v4i64 (InVecOp (v16i8 VR128:$src))),
              (!cast<I>(OpcPrefix#BQYrr) VR128:$src)>;

    def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))),
              (!cast<I>(OpcPrefix#WDYrr) VR128:$src)>;
    def : Pat<(v4i64 (InVecOp (v8i16 VR128:$src))),
              (!cast<I>(OpcPrefix#WQYrr) VR128:$src)>;

    def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))),
              (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>;
  }

  // Simple Register-Memory patterns
  let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
    def : Pat<(v16i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
              (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;

    def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))),
              (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
  }

  let Predicates = [HasAVX2, NoVLX] in {
    def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
              (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
    def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
              (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;

    def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
              (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
    def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
              (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;

    def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
              (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
  }

  // AVX2 Register-Memory patterns
  let Predicates = [HasAVX2, NoVLX] in {
    def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))),
              (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;

    def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
              (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
    def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
              (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
    def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
              (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;

    def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))),
              (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;

    def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
              (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
    def : Pat<(v4i64 (InVecOp (bc_v16i8 (v2i64 (X86vzload32 addr:$src))))),
              (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;

    def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
              (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
    def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
              (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
    def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
              (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
  }
}

defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", sext, sext_invec>;
defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", zext, zext_invec>;

// SSE4.1/AVX patterns.
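// The 128-bit forms consume only the low elements of the source register, so
// the in-vector nodes (sext_invec/zext_invec) are used below even for the
// register patterns; the full sext/zext nodes appear only for the AVX2 forms
// above that widen every source element.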
multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
                                SDNode ExtOp> {
  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
    def : Pat<(v8i16 (ExtOp (v16i8 VR128:$src))),
              (!cast<I>(OpcPrefix#BWrr) VR128:$src)>;
  }
  let Predicates = [HasAVX, NoVLX] in {
    def : Pat<(v4i32 (ExtOp (v16i8 VR128:$src))),
              (!cast<I>(OpcPrefix#BDrr) VR128:$src)>;
    def : Pat<(v2i64 (ExtOp (v16i8 VR128:$src))),
              (!cast<I>(OpcPrefix#BQrr) VR128:$src)>;

    def : Pat<(v4i32 (ExtOp (v8i16 VR128:$src))),
              (!cast<I>(OpcPrefix#WDrr) VR128:$src)>;
    def : Pat<(v2i64 (ExtOp (v8i16 VR128:$src))),
              (!cast<I>(OpcPrefix#WQrr) VR128:$src)>;

    def : Pat<(v2i64 (ExtOp (v4i32 VR128:$src))),
              (!cast<I>(OpcPrefix#DQrr) VR128:$src)>;
  }
  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
    def : Pat<(v8i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
              (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  }
  let Predicates = [HasAVX, NoVLX] in {
    def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
              (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
    def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
              (!cast<I>(OpcPrefix#BQrm) addr:$src)>;

    def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
              (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
    def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
              (!cast<I>(OpcPrefix#WQrm) addr:$src)>;

    def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
              (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  }
  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
    def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
              (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
    def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
              (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
    def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
              (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
    def : Pat<(v8i16 (ExtOp (loadv16i8 addr:$src))),
              (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  }
  let Predicates = [HasAVX, NoVLX] in {
    def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
              (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
    def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (X86vzload32 addr:$src))))),
              (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
    def : Pat<(v4i32 (ExtOp (loadv16i8 addr:$src))),
              (!cast<I>(OpcPrefix#BDrm) addr:$src)>;

    def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (extloadi32i16 addr:$src)))))),
              (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
    def : Pat<(v2i64 (ExtOp (loadv16i8 addr:$src))),
              (!cast<I>(OpcPrefix#BQrm) addr:$src)>;

    def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
              (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
    def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
              (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
    def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
              (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
    def : Pat<(v4i32 (ExtOp (loadv8i16 addr:$src))),
              (!cast<I>(OpcPrefix#WDrm) addr:$src)>;

    def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
              (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
    def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (X86vzload32 addr:$src))))),
              (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
    def : Pat<(v2i64 (ExtOp (loadv8i16 addr:$src))),
              (!cast<I>(OpcPrefix#WQrm) addr:$src)>;

    def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
              (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
    def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
              (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
    def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
              (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
    def : Pat<(v2i64 (ExtOp (loadv4i32 addr:$src))),
              (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  }
}
defm : SS41I_pmovx_patterns<"VPMOVSX", "s", sext_invec>;
defm : SS41I_pmovx_patterns<"VPMOVZX", "z", zext_invec>;

let Predicates = [UseSSE41] in {
  defm : SS41I_pmovx_patterns<"PMOVSX", "s", sext_invec>;
  defm : SS41I_pmovx_patterns<"PMOVZX", "z", zext_invec>;
}

//===----------------------------------------------------------------------===//
// SSE4.1 - Extract Instructions
//===----------------------------------------------------------------------===//

/// SS41I_extract8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem
multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
                   (ins VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1),
                                           timm:$src2))]>,
                   Sched<[WriteVecExtract]>;
  let hasSideEffects = 0, mayStore = 1 in
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                   (ins i8mem:$dst, VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), timm:$src2))),
                           addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}

let Predicates = [HasAVX, NoBWI] in
  defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX, VEX_WIG;

defm PEXTRB : SS41I_extract8<0x14, "pextrb">;


/// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination
multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
  def rr_REV : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
                       (ins VR128:$src1, u8imm:$src2),
                       !strconcat(OpcodeStr,
                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
                       Sched<[WriteVecExtract]>, FoldGenData<NAME#rr>;

  let hasSideEffects = 0, mayStore = 1 in
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                   (ins i16mem:$dst, VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(store (i16 (trunc (X86pextrw (v8i16 VR128:$src1), timm:$src2))),
                           addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}

let Predicates = [HasAVX, NoBWI] in
  defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX, VEX_WIG;

defm PEXTRW : SS41I_extract16<0x15, "pextrw">;

let Predicates = [UseSSE41] in
  def : Pat<(store f16:$src, addr:$dst), (PEXTRWmr addr:$dst, (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0)>;

let Predicates = [HasAVX, NoBWI] in
  def : Pat<(store f16:$src, addr:$dst), (VPEXTRWmr addr:$dst, (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0)>;

                   VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set GR32:$dst,
                         (extractelt (v4i32 VR128:$src1), imm:$src2))]>,
                   Sched<[WriteVecExtract]>;
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                   (ins i32mem:$dst, VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(store (extractelt (v4i32 VR128:$src1), imm:$src2),
                           addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}

let Predicates = [HasAVX, NoDQI] in
  defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX;

defm PEXTRD : SS41I_extract32<0x16, "pextrd">;

/// SS41I_extract64 - SSE 4.1 extract 64 bits to int reg or memory destination
multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst),
                   (ins VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set GR64:$dst,
                         (extractelt (v2i64 VR128:$src1), imm:$src2))]>,
                   Sched<[WriteVecExtract]>;
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                   (ins i64mem:$dst, VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(store (extractelt (v2i64 VR128:$src1), imm:$src2),
                           addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}

let Predicates = [HasAVX, NoDQI] in
  defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W;

defm PEXTRQ : SS41I_extract64<0x16, "pextrq">, REX_W;

/// SS41I_extractf32 - SSE 4.1 extract 32-bit fp value to int reg or memory
/// destination
multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
                   (ins VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set GR32orGR64:$dst,
                         (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))]>,
                   Sched<[WriteVecExtract]>;
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                   (ins f32mem:$dst, VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
                           addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}

let ExeDomain = SSEPackedSingle in {
  let Predicates = [UseAVX] in
    defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX, VEX_WIG;
  defm EXTRACTPS : SS41I_extractf32<0x17, "extractps">;
}

//===----------------------------------------------------------------------===//
// SSE4.1 - Insert Instructions
//===----------------------------------------------------------------------===//

multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
                   (ins VR128:$src1, GR32orGR64:$src2, u8imm:$src3),
                   !if(Is2Addr,
                       !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                       !strconcat(asm,
                                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
                   [(set VR128:$dst,
                         (X86pinsrb VR128:$src1, GR32orGR64:$src2, timm:$src3))]>,
                   Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
                   (ins VR128:$src1, i8mem:$src2, u8imm:$src3),
                   !if(Is2Addr,
                       !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                       !strconcat(asm,
                                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
                   [(set VR128:$dst,
                         (X86pinsrb VR128:$src1, (extloadi8 addr:$src2), timm:$src3))]>,
                   Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
}

let Predicates = [HasAVX, NoBWI] in {
  defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V, VEX_WIG;
  def : Pat<(X86pinsrb VR128:$src1, (i32 (anyext (i8 GR8:$src2))), timm:$src3),
            (VPINSRBrr VR128:$src1, (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
                                                   GR8:$src2, sub_8bit), timm:$src3)>;
}

let Constraints = "$src1 = $dst" in
  defm PINSRB : SS41I_insert8<0x20, "pinsrb">;

multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
                   (ins VR128:$src1, GR32:$src2, u8imm:$src3),
                   !if(Is2Addr,
                       !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                       !strconcat(asm,
                                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
                   [(set VR128:$dst,
                         (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
                   Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
                   (ins VR128:$src1, i32mem:$src2, u8imm:$src3),
                   !if(Is2Addr,
                       !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                       !strconcat(asm,
                                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
                   [(set VR128:$dst,
                         (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2), imm:$src3)))]>,
                   Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
}

let Predicates = [HasAVX, NoDQI] in
  defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V;
let Constraints = "$src1 = $dst" in
  defm PINSRD : SS41I_insert32<0x22, "pinsrd">;

multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
                   (ins VR128:$src1, GR64:$src2, u8imm:$src3),
                   !if(Is2Addr,
                       !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                       !strconcat(asm,
                                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
                   [(set VR128:$dst,
                         (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>,
                   Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
                   (ins VR128:$src1, i64mem:$src2, u8imm:$src3),
                   !if(Is2Addr,
                       !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                       !strconcat(asm,
                                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
                   [(set VR128:$dst,
                         (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2), imm:$src3)))]>,
                   Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
}

let Predicates = [HasAVX, NoDQI] in
  defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W;
let Constraints = "$src1 = $dst" in
  defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W;

// insertps has a few different modes. The first two below are optimized
// inserts that won't zero arbitrary elements in the destination vector; the
// next one matches the intrinsic and may zero arbitrary elements in the
// target vector.
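// As a worked example of the immediate encoding: bits [7:6] of the insertps
// immediate select the source element of $src2, bits [5:4] select the
// destination element, and bits [3:0] form a zero mask. So 0x4C (0b01001100)
// copies element 1 of $src2 into element 0 of $src1 and zeroes elements 2
// and 3 of the result.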
5425multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> { 5426 let isCommutable = 1 in 5427 def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), 5428 (ins VR128:$src1, VR128:$src2, u8imm:$src3), 5429 !if(Is2Addr, 5430 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5431 !strconcat(asm, 5432 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5433 [(set VR128:$dst, 5434 (X86insertps VR128:$src1, VR128:$src2, timm:$src3))]>, 5435 Sched<[SchedWriteFShuffle.XMM]>; 5436 def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), 5437 (ins VR128:$src1, f32mem:$src2, u8imm:$src3), 5438 !if(Is2Addr, 5439 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5440 !strconcat(asm, 5441 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5442 [(set VR128:$dst, 5443 (X86insertps VR128:$src1, 5444 (v4f32 (scalar_to_vector (loadf32 addr:$src2))), 5445 timm:$src3))]>, 5446 Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>; 5447} 5448 5449let ExeDomain = SSEPackedSingle in { 5450 let Predicates = [UseAVX] in 5451 defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, 5452 VEX_4V, VEX_WIG; 5453 let Constraints = "$src1 = $dst" in 5454 defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1>; 5455} 5456 5457//===----------------------------------------------------------------------===// 5458// SSE4.1 - Round Instructions 5459//===----------------------------------------------------------------------===// 5460 5461multiclass sse41_fp_unop_p<bits<8> opc, string OpcodeStr, 5462 X86MemOperand x86memop, RegisterClass RC, 5463 ValueType VT, PatFrag mem_frag, SDPatternOperator OpNode, 5464 X86FoldableSchedWrite sched> { 5465 // Intrinsic operation, reg. 5466 // Vector intrinsic operation, reg 5467let Uses = [MXCSR], mayRaiseFPException = 1 in { 5468 def r : SS4AIi8<opc, MRMSrcReg, 5469 (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2), 5470 !strconcat(OpcodeStr, 5471 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5472 [(set RC:$dst, (VT (OpNode RC:$src1, timm:$src2)))]>, 5473 Sched<[sched]>; 5474 5475 // Vector intrinsic operation, mem 5476 def m : SS4AIi8<opc, MRMSrcMem, 5477 (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2), 5478 !strconcat(OpcodeStr, 5479 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5480 [(set RC:$dst, 5481 (VT (OpNode (mem_frag addr:$src1), timm:$src2)))]>, 5482 Sched<[sched.Folded]>; 5483} 5484} 5485 5486multiclass avx_fp_unop_rm<bits<8> opcss, bits<8> opcsd, 5487 string OpcodeStr, X86FoldableSchedWrite sched> { 5488let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in { 5489 def SSr : SS4AIi8<opcss, MRMSrcReg, 5490 (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3), 5491 !strconcat(OpcodeStr, 5492 "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), 5493 []>, Sched<[sched]>; 5494 5495 let mayLoad = 1 in 5496 def SSm : SS4AIi8<opcss, MRMSrcMem, 5497 (outs FR32:$dst), (ins FR32:$src1, f32mem:$src2, i32u8imm:$src3), 5498 !strconcat(OpcodeStr, 5499 "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), 5500 []>, Sched<[sched.Folded, sched.ReadAfterFold]>; 5501} // ExeDomain = SSEPackedSingle, hasSideEffects = 0 5502 5503let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in { 5504 def SDr : SS4AIi8<opcsd, MRMSrcReg, 5505 (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3), 5506 !strconcat(OpcodeStr, 5507 "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), 5508 []>, Sched<[sched]>; 5509 5510 let mayLoad = 1 in 5511 def SDm : SS4AIi8<opcsd, 
MRMSrcMem, 5512 (outs FR64:$dst), (ins FR64:$src1, f64mem:$src2, i32u8imm:$src3), 5513 !strconcat(OpcodeStr, 5514 "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), 5515 []>, Sched<[sched.Folded, sched.ReadAfterFold]>; 5516} // ExeDomain = SSEPackedDouble, hasSideEffects = 0 5517} 5518 5519multiclass sse41_fp_unop_s<bits<8> opcss, bits<8> opcsd, 5520 string OpcodeStr, X86FoldableSchedWrite sched> { 5521let Uses = [MXCSR], mayRaiseFPException = 1 in { 5522let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in { 5523 def SSr : SS4AIi8<opcss, MRMSrcReg, 5524 (outs FR32:$dst), (ins FR32:$src1, i32u8imm:$src2), 5525 !strconcat(OpcodeStr, 5526 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5527 []>, Sched<[sched]>; 5528 5529 let mayLoad = 1 in 5530 def SSm : SS4AIi8<opcss, MRMSrcMem, 5531 (outs FR32:$dst), (ins f32mem:$src1, i32u8imm:$src2), 5532 !strconcat(OpcodeStr, 5533 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5534 []>, Sched<[sched.Folded, sched.ReadAfterFold]>; 5535} // ExeDomain = SSEPackedSingle, hasSideEffects = 0 5536 5537let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in { 5538 def SDr : SS4AIi8<opcsd, MRMSrcReg, 5539 (outs FR64:$dst), (ins FR64:$src1, i32u8imm:$src2), 5540 !strconcat(OpcodeStr, 5541 "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5542 []>, Sched<[sched]>; 5543 5544 let mayLoad = 1 in 5545 def SDm : SS4AIi8<opcsd, MRMSrcMem, 5546 (outs FR64:$dst), (ins f64mem:$src1, i32u8imm:$src2), 5547 !strconcat(OpcodeStr, 5548 "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5549 []>, Sched<[sched.Folded, sched.ReadAfterFold]>; 5550} // ExeDomain = SSEPackedDouble, hasSideEffects = 0 5551} 5552} 5553 5554multiclass sse41_fp_binop_s<bits<8> opcss, bits<8> opcsd, 5555 string OpcodeStr, X86FoldableSchedWrite sched, 5556 ValueType VT32, ValueType VT64, 5557 SDNode OpNode, bit Is2Addr = 1> { 5558let Uses = [MXCSR], mayRaiseFPException = 1 in { 5559let ExeDomain = SSEPackedSingle in { 5560 def SSr_Int : SS4AIi8<opcss, MRMSrcReg, 5561 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3), 5562 !if(Is2Addr, 5563 !strconcat(OpcodeStr, 5564 "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5565 !strconcat(OpcodeStr, 5566 "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5567 [(set VR128:$dst, (VT32 (OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>, 5568 Sched<[sched]>; 5569 5570 def SSm_Int : SS4AIi8<opcss, MRMSrcMem, 5571 (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32u8imm:$src3), 5572 !if(Is2Addr, 5573 !strconcat(OpcodeStr, 5574 "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5575 !strconcat(OpcodeStr, 5576 "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5577 [(set VR128:$dst, 5578 (OpNode VR128:$src1, (sse_load_f32 addr:$src2), timm:$src3))]>, 5579 Sched<[sched.Folded, sched.ReadAfterFold]>; 5580} // ExeDomain = SSEPackedSingle, isCodeGenOnly = 1 5581 5582let ExeDomain = SSEPackedDouble in { 5583 def SDr_Int : SS4AIi8<opcsd, MRMSrcReg, 5584 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3), 5585 !if(Is2Addr, 5586 !strconcat(OpcodeStr, 5587 "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5588 !strconcat(OpcodeStr, 5589 "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5590 [(set VR128:$dst, (VT64 (OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>, 5591 Sched<[sched]>; 5592 5593 def SDm_Int : SS4AIi8<opcsd, MRMSrcMem, 5594 (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32u8imm:$src3), 5595 !if(Is2Addr, 5596 !strconcat(OpcodeStr, 5597 "sd\t{$src3, $src2, 
$dst|$dst, $src2, $src3}"), 5598 !strconcat(OpcodeStr, 5599 "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5600 [(set VR128:$dst, 5601 (OpNode VR128:$src1, (sse_load_f64 addr:$src2), timm:$src3))]>, 5602 Sched<[sched.Folded, sched.ReadAfterFold]>; 5603} // ExeDomain = SSEPackedDouble, isCodeGenOnly = 1 5604} 5605} 5606 5607// FP round - roundss, roundps, roundsd, roundpd 5608let Predicates = [HasAVX, NoVLX] in { 5609 let ExeDomain = SSEPackedSingle, Uses = [MXCSR], mayRaiseFPException = 1 in { 5610 // Intrinsic form 5611 defm VROUNDPS : sse41_fp_unop_p<0x08, "vroundps", f128mem, VR128, v4f32, 5612 loadv4f32, X86any_VRndScale, SchedWriteFRnd.XMM>, 5613 VEX, VEX_WIG; 5614 defm VROUNDPSY : sse41_fp_unop_p<0x08, "vroundps", f256mem, VR256, v8f32, 5615 loadv8f32, X86any_VRndScale, SchedWriteFRnd.YMM>, 5616 VEX, VEX_L, VEX_WIG; 5617 } 5618 5619 let ExeDomain = SSEPackedDouble, Uses = [MXCSR], mayRaiseFPException = 1 in { 5620 defm VROUNDPD : sse41_fp_unop_p<0x09, "vroundpd", f128mem, VR128, v2f64, 5621 loadv2f64, X86any_VRndScale, SchedWriteFRnd.XMM>, 5622 VEX, VEX_WIG; 5623 defm VROUNDPDY : sse41_fp_unop_p<0x09, "vroundpd", f256mem, VR256, v4f64, 5624 loadv4f64, X86any_VRndScale, SchedWriteFRnd.YMM>, 5625 VEX, VEX_L, VEX_WIG; 5626 } 5627} 5628let Predicates = [UseAVX] in { 5629 defm VROUND : sse41_fp_binop_s<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl, 5630 v4f32, v2f64, X86RndScales, 0>, 5631 VEX_4V, VEX_LIG, VEX_WIG, SIMD_EXC; 5632 defm VROUND : avx_fp_unop_rm<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl>, 5633 VEX_4V, VEX_LIG, VEX_WIG, SIMD_EXC; 5634} 5635 5636let Predicates = [UseAVX] in { 5637 def : Pat<(X86any_VRndScale FR32:$src1, timm:$src2), 5638 (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src1, timm:$src2)>; 5639 def : Pat<(X86any_VRndScale FR64:$src1, timm:$src2), 5640 (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src1, timm:$src2)>; 5641} 5642 5643let Predicates = [UseAVX, OptForSize] in { 5644 def : Pat<(X86any_VRndScale (loadf32 addr:$src1), timm:$src2), 5645 (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>; 5646 def : Pat<(X86any_VRndScale (loadf64 addr:$src1), timm:$src2), 5647 (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>; 5648} 5649 5650let ExeDomain = SSEPackedSingle in 5651defm ROUNDPS : sse41_fp_unop_p<0x08, "roundps", f128mem, VR128, v4f32, 5652 memopv4f32, X86any_VRndScale, SchedWriteFRnd.XMM>; 5653let ExeDomain = SSEPackedDouble in 5654defm ROUNDPD : sse41_fp_unop_p<0x09, "roundpd", f128mem, VR128, v2f64, 5655 memopv2f64, X86any_VRndScale, SchedWriteFRnd.XMM>; 5656 5657defm ROUND : sse41_fp_unop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl>; 5658 5659let Constraints = "$src1 = $dst" in 5660defm ROUND : sse41_fp_binop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl, 5661 v4f32, v2f64, X86RndScales>; 5662 5663let Predicates = [UseSSE41] in { 5664 def : Pat<(X86any_VRndScale FR32:$src1, timm:$src2), 5665 (ROUNDSSr FR32:$src1, timm:$src2)>; 5666 def : Pat<(X86any_VRndScale FR64:$src1, timm:$src2), 5667 (ROUNDSDr FR64:$src1, timm:$src2)>; 5668} 5669 5670let Predicates = [UseSSE41, OptForSize] in { 5671 def : Pat<(X86any_VRndScale (loadf32 addr:$src1), timm:$src2), 5672 (ROUNDSSm addr:$src1, timm:$src2)>; 5673 def : Pat<(X86any_VRndScale (loadf64 addr:$src1), timm:$src2), 5674 (ROUNDSDm addr:$src1, timm:$src2)>; 5675} 5676 5677//===----------------------------------------------------------------------===// 5678// SSE4.1 - Packed Bit Test 5679//===----------------------------------------------------------------------===// 5680 5681// ptest instruction we'll lower to 
// this in X86ISelLowering, primarily from the Intel intrinsic that
// corresponds to it.
let Defs = [EFLAGS], Predicates = [HasAVX] in {
def VPTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
                     "vptest\t{$src2, $src1|$src1, $src2}",
                     [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
                     Sched<[SchedWriteVecTest.XMM]>, VEX, VEX_WIG;
def VPTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
                     "vptest\t{$src2, $src1|$src1, $src2}",
                     [(set EFLAGS,(X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>,
                     Sched<[SchedWriteVecTest.XMM.Folded, SchedWriteVecTest.XMM.ReadAfterFold]>,
                     VEX, VEX_WIG;

def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2),
                      "vptest\t{$src2, $src1|$src1, $src2}",
                      [(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>,
                      Sched<[SchedWriteVecTest.YMM]>, VEX, VEX_L, VEX_WIG;
def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2),
                      "vptest\t{$src2, $src1|$src1, $src2}",
                      [(set EFLAGS,(X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>,
                      Sched<[SchedWriteVecTest.YMM.Folded, SchedWriteVecTest.YMM.ReadAfterFold]>,
                      VEX, VEX_L, VEX_WIG;
}

let Defs = [EFLAGS] in {
def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
                    "ptest\t{$src2, $src1|$src1, $src2}",
                    [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
                    Sched<[SchedWriteVecTest.XMM]>;
def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
                    "ptest\t{$src2, $src1|$src1, $src2}",
                    [(set EFLAGS, (X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>,
                    Sched<[SchedWriteVecTest.XMM.Folded, SchedWriteVecTest.XMM.ReadAfterFold]>;
}

// The bit test instructions below are AVX-only.
multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
                       X86MemOperand x86memop, PatFrag mem_frag, ValueType vt,
                       X86FoldableSchedWrite sched> {
  def rr : SS48I<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
                 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
                 [(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>,
                 Sched<[sched]>, VEX;
  def rm : SS48I<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
                 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
                 [(set EFLAGS, (X86testp RC:$src1, (mem_frag addr:$src2)))]>,
                 Sched<[sched.Folded, sched.ReadAfterFold]>, VEX;
}

let Defs = [EFLAGS], Predicates = [HasAVX] in {
let ExeDomain = SSEPackedSingle in {
defm VTESTPS  : avx_bittest<0x0E, "vtestps", VR128, f128mem, loadv4f32, v4f32,
                            SchedWriteFTest.XMM>;
defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, loadv8f32, v8f32,
                            SchedWriteFTest.YMM>, VEX_L;
}
let ExeDomain = SSEPackedDouble in {
defm VTESTPD  : avx_bittest<0x0F, "vtestpd", VR128, f128mem, loadv2f64, v2f64,
                            SchedWriteFTest.XMM>;
defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, loadv4f64, v4f64,
                            SchedWriteFTest.YMM>, VEX_L;
}
}

//===----------------------------------------------------------------------===//
// SSE4.1 - Misc Instructions
//===----------------------------------------------------------------------===//

let Defs = [EFLAGS], Predicates = [HasPOPCNT] in {
  def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
                     "popcnt{w}\t{$src, $dst|$dst, $src}",
                     [(set GR16:$dst, (ctpop GR16:$src)),
(implicit EFLAGS)]>, 5753 Sched<[WritePOPCNT]>, OpSize16, XS; 5754 def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), 5755 "popcnt{w}\t{$src, $dst|$dst, $src}", 5756 [(set GR16:$dst, (ctpop (loadi16 addr:$src))), 5757 (implicit EFLAGS)]>, 5758 Sched<[WritePOPCNT.Folded]>, OpSize16, XS; 5759 5760 def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), 5761 "popcnt{l}\t{$src, $dst|$dst, $src}", 5762 [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)]>, 5763 Sched<[WritePOPCNT]>, OpSize32, XS; 5764 5765 def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), 5766 "popcnt{l}\t{$src, $dst|$dst, $src}", 5767 [(set GR32:$dst, (ctpop (loadi32 addr:$src))), 5768 (implicit EFLAGS)]>, 5769 Sched<[WritePOPCNT.Folded]>, OpSize32, XS; 5770 5771 def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), 5772 "popcnt{q}\t{$src, $dst|$dst, $src}", 5773 [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)]>, 5774 Sched<[WritePOPCNT]>, XS; 5775 def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), 5776 "popcnt{q}\t{$src, $dst|$dst, $src}", 5777 [(set GR64:$dst, (ctpop (loadi64 addr:$src))), 5778 (implicit EFLAGS)]>, 5779 Sched<[WritePOPCNT.Folded]>, XS; 5780} 5781 5782// SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16. 5783multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr, 5784 SDNode OpNode, PatFrag ld_frag, 5785 X86FoldableSchedWrite Sched> { 5786 def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), 5787 (ins VR128:$src), 5788 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 5789 [(set VR128:$dst, (v8i16 (OpNode (v8i16 VR128:$src))))]>, 5790 Sched<[Sched]>; 5791 def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), 5792 (ins i128mem:$src), 5793 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 5794 [(set VR128:$dst, 5795 (v8i16 (OpNode (ld_frag addr:$src))))]>, 5796 Sched<[Sched.Folded]>; 5797} 5798 5799// PHMIN has the same profile as PSAD, thus we use the same scheduling 5800// model, although the naming is misleading. 5801let Predicates = [HasAVX] in 5802defm VPHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "vphminposuw", 5803 X86phminpos, load, 5804 WritePHMINPOS>, VEX, VEX_WIG; 5805defm PHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "phminposuw", 5806 X86phminpos, memop, 5807 WritePHMINPOS>; 5808 5809/// SS48I_binop_rm - Simple SSE41 binary operator. 
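/// Each instantiation produces a register-register form (rr) and a form with
/// a folded memory operand (rm); e.g. the PMINSD defm below yields PMINSDrr
/// and PMINSDrm.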
5810multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, 5811 ValueType OpVT, RegisterClass RC, PatFrag memop_frag, 5812 X86MemOperand x86memop, X86FoldableSchedWrite sched, 5813 bit Is2Addr = 1> { 5814 let isCommutable = 1 in 5815 def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst), 5816 (ins RC:$src1, RC:$src2), 5817 !if(Is2Addr, 5818 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 5819 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 5820 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>, 5821 Sched<[sched]>; 5822 def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst), 5823 (ins RC:$src1, x86memop:$src2), 5824 !if(Is2Addr, 5825 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 5826 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 5827 [(set RC:$dst, 5828 (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>, 5829 Sched<[sched.Folded, sched.ReadAfterFold]>; 5830} 5831 5832let Predicates = [HasAVX, NoVLX] in { 5833 defm VPMINSD : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128, 5834 load, i128mem, SchedWriteVecALU.XMM, 0>, 5835 VEX_4V, VEX_WIG; 5836 defm VPMINUD : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128, 5837 load, i128mem, SchedWriteVecALU.XMM, 0>, 5838 VEX_4V, VEX_WIG; 5839 defm VPMAXSD : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128, 5840 load, i128mem, SchedWriteVecALU.XMM, 0>, 5841 VEX_4V, VEX_WIG; 5842 defm VPMAXUD : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128, 5843 load, i128mem, SchedWriteVecALU.XMM, 0>, 5844 VEX_4V, VEX_WIG; 5845 defm VPMULDQ : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v2i64, VR128, 5846 load, i128mem, SchedWriteVecIMul.XMM, 0>, 5847 VEX_4V, VEX_WIG; 5848} 5849let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { 5850 defm VPMINSB : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128, 5851 load, i128mem, SchedWriteVecALU.XMM, 0>, 5852 VEX_4V, VEX_WIG; 5853 defm VPMINUW : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128, 5854 load, i128mem, SchedWriteVecALU.XMM, 0>, 5855 VEX_4V, VEX_WIG; 5856 defm VPMAXSB : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128, 5857 load, i128mem, SchedWriteVecALU.XMM, 0>, 5858 VEX_4V, VEX_WIG; 5859 defm VPMAXUW : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128, 5860 load, i128mem, SchedWriteVecALU.XMM, 0>, 5861 VEX_4V, VEX_WIG; 5862} 5863 5864let Predicates = [HasAVX2, NoVLX] in { 5865 defm VPMINSDY : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256, 5866 load, i256mem, SchedWriteVecALU.YMM, 0>, 5867 VEX_4V, VEX_L, VEX_WIG; 5868 defm VPMINUDY : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256, 5869 load, i256mem, SchedWriteVecALU.YMM, 0>, 5870 VEX_4V, VEX_L, VEX_WIG; 5871 defm VPMAXSDY : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256, 5872 load, i256mem, SchedWriteVecALU.YMM, 0>, 5873 VEX_4V, VEX_L, VEX_WIG; 5874 defm VPMAXUDY : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256, 5875 load, i256mem, SchedWriteVecALU.YMM, 0>, 5876 VEX_4V, VEX_L, VEX_WIG; 5877 defm VPMULDQY : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v4i64, VR256, 5878 load, i256mem, SchedWriteVecIMul.YMM, 0>, 5879 VEX_4V, VEX_L, VEX_WIG; 5880} 5881let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { 5882 defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256, 5883 load, i256mem, SchedWriteVecALU.YMM, 0>, 5884 VEX_4V, VEX_L, VEX_WIG; 5885 defm VPMINUWY : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256, 5886 load, i256mem, SchedWriteVecALU.YMM, 0>, 5887 VEX_4V, VEX_L, VEX_WIG; 5888 defm VPMAXSBY : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256, 5889 load, i256mem, 
SchedWriteVecALU.YMM, 0>, 5890 VEX_4V, VEX_L, VEX_WIG; 5891 defm VPMAXUWY : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256, 5892 load, i256mem, SchedWriteVecALU.YMM, 0>, 5893 VEX_4V, VEX_L, VEX_WIG; 5894} 5895 5896let Constraints = "$src1 = $dst" in { 5897 defm PMINSB : SS48I_binop_rm<0x38, "pminsb", smin, v16i8, VR128, 5898 memop, i128mem, SchedWriteVecALU.XMM, 1>; 5899 defm PMINSD : SS48I_binop_rm<0x39, "pminsd", smin, v4i32, VR128, 5900 memop, i128mem, SchedWriteVecALU.XMM, 1>; 5901 defm PMINUD : SS48I_binop_rm<0x3B, "pminud", umin, v4i32, VR128, 5902 memop, i128mem, SchedWriteVecALU.XMM, 1>; 5903 defm PMINUW : SS48I_binop_rm<0x3A, "pminuw", umin, v8i16, VR128, 5904 memop, i128mem, SchedWriteVecALU.XMM, 1>; 5905 defm PMAXSB : SS48I_binop_rm<0x3C, "pmaxsb", smax, v16i8, VR128, 5906 memop, i128mem, SchedWriteVecALU.XMM, 1>; 5907 defm PMAXSD : SS48I_binop_rm<0x3D, "pmaxsd", smax, v4i32, VR128, 5908 memop, i128mem, SchedWriteVecALU.XMM, 1>; 5909 defm PMAXUD : SS48I_binop_rm<0x3F, "pmaxud", umax, v4i32, VR128, 5910 memop, i128mem, SchedWriteVecALU.XMM, 1>; 5911 defm PMAXUW : SS48I_binop_rm<0x3E, "pmaxuw", umax, v8i16, VR128, 5912 memop, i128mem, SchedWriteVecALU.XMM, 1>; 5913 defm PMULDQ : SS48I_binop_rm<0x28, "pmuldq", X86pmuldq, v2i64, VR128, 5914 memop, i128mem, SchedWriteVecIMul.XMM, 1>; 5915} 5916 5917let Predicates = [HasAVX, NoVLX] in 5918 defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128, 5919 load, i128mem, SchedWritePMULLD.XMM, 0>, 5920 VEX_4V, VEX_WIG; 5921let Predicates = [HasAVX] in 5922 defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128, 5923 load, i128mem, SchedWriteVecALU.XMM, 0>, 5924 VEX_4V, VEX_WIG; 5925 5926let Predicates = [HasAVX2, NoVLX] in 5927 defm VPMULLDY : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256, 5928 load, i256mem, SchedWritePMULLD.YMM, 0>, 5929 VEX_4V, VEX_L, VEX_WIG; 5930let Predicates = [HasAVX2] in 5931 defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256, 5932 load, i256mem, SchedWriteVecALU.YMM, 0>, 5933 VEX_4V, VEX_L, VEX_WIG; 5934 5935let Constraints = "$src1 = $dst" in { 5936 defm PMULLD : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128, 5937 memop, i128mem, SchedWritePMULLD.XMM, 1>; 5938 defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128, 5939 memop, i128mem, SchedWriteVecALU.XMM, 1>; 5940} 5941 5942/// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate 5943multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr, 5944 Intrinsic IntId, RegisterClass RC, PatFrag memop_frag, 5945 X86MemOperand x86memop, bit Is2Addr, 5946 X86FoldableSchedWrite sched> { 5947 let isCommutable = 1 in 5948 def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst), 5949 (ins RC:$src1, RC:$src2, u8imm:$src3), 5950 !if(Is2Addr, 5951 !strconcat(OpcodeStr, 5952 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5953 !strconcat(OpcodeStr, 5954 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5955 [(set RC:$dst, (IntId RC:$src1, RC:$src2, timm:$src3))]>, 5956 Sched<[sched]>; 5957 def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst), 5958 (ins RC:$src1, x86memop:$src2, u8imm:$src3), 5959 !if(Is2Addr, 5960 !strconcat(OpcodeStr, 5961 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5962 !strconcat(OpcodeStr, 5963 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5964 [(set RC:$dst, 5965 (IntId RC:$src1, (memop_frag addr:$src2), timm:$src3))]>, 5966 Sched<[sched.Folded, sched.ReadAfterFold]>; 5967} 5968 5969/// SS41I_binop_rmi - SSE 4.1 binary operator with 8-bit immediate 
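/// Structurally the same as SS41I_binop_rmi_int above, but it matches a
/// target SDNode (e.g. X86Blendi) rather than an intrinsic, so the immediate
/// arrives in the pattern as a timm operand.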
5970multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode, 5971 ValueType OpVT, RegisterClass RC, PatFrag memop_frag, 5972 X86MemOperand x86memop, bit Is2Addr, 5973 X86FoldableSchedWrite sched> { 5974 let isCommutable = 1 in 5975 def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst), 5976 (ins RC:$src1, RC:$src2, u8imm:$src3), 5977 !if(Is2Addr, 5978 !strconcat(OpcodeStr, 5979 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5980 !strconcat(OpcodeStr, 5981 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5982 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>, 5983 Sched<[sched]>; 5984 def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst), 5985 (ins RC:$src1, x86memop:$src2, u8imm:$src3), 5986 !if(Is2Addr, 5987 !strconcat(OpcodeStr, 5988 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5989 !strconcat(OpcodeStr, 5990 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5991 [(set RC:$dst, 5992 (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), timm:$src3)))]>, 5993 Sched<[sched.Folded, sched.ReadAfterFold]>; 5994} 5995 5996def BlendCommuteImm2 : SDNodeXForm<timm, [{ 5997 uint8_t Imm = N->getZExtValue() & 0x03; 5998 return getI8Imm(Imm ^ 0x03, SDLoc(N)); 5999}]>; 6000 6001def BlendCommuteImm4 : SDNodeXForm<timm, [{ 6002 uint8_t Imm = N->getZExtValue() & 0x0f; 6003 return getI8Imm(Imm ^ 0x0f, SDLoc(N)); 6004}]>; 6005 6006def BlendCommuteImm8 : SDNodeXForm<timm, [{ 6007 uint8_t Imm = N->getZExtValue() & 0xff; 6008 return getI8Imm(Imm ^ 0xff, SDLoc(N)); 6009}]>; 6010 6011// Turn a 4-bit blendi immediate to 8-bit for use with pblendw. 6012def BlendScaleImm4 : SDNodeXForm<timm, [{ 6013 uint8_t Imm = N->getZExtValue(); 6014 uint8_t NewImm = 0; 6015 for (unsigned i = 0; i != 4; ++i) { 6016 if (Imm & (1 << i)) 6017 NewImm |= 0x3 << (i * 2); 6018 } 6019 return getI8Imm(NewImm, SDLoc(N)); 6020}]>; 6021 6022// Turn a 2-bit blendi immediate to 8-bit for use with pblendw. 6023def BlendScaleImm2 : SDNodeXForm<timm, [{ 6024 uint8_t Imm = N->getZExtValue(); 6025 uint8_t NewImm = 0; 6026 for (unsigned i = 0; i != 2; ++i) { 6027 if (Imm & (1 << i)) 6028 NewImm |= 0xf << (i * 4); 6029 } 6030 return getI8Imm(NewImm, SDLoc(N)); 6031}]>; 6032 6033// Turn a 2-bit blendi immediate to 4-bit for use with pblendd. 6034def BlendScaleImm2to4 : SDNodeXForm<timm, [{ 6035 uint8_t Imm = N->getZExtValue(); 6036 uint8_t NewImm = 0; 6037 for (unsigned i = 0; i != 2; ++i) { 6038 if (Imm & (1 << i)) 6039 NewImm |= 0x3 << (i * 2); 6040 } 6041 return getI8Imm(NewImm, SDLoc(N)); 6042}]>; 6043 6044// Turn a 4-bit blendi immediate to 8-bit for use with pblendw and invert it. 6045def BlendScaleCommuteImm4 : SDNodeXForm<timm, [{ 6046 uint8_t Imm = N->getZExtValue(); 6047 uint8_t NewImm = 0; 6048 for (unsigned i = 0; i != 4; ++i) { 6049 if (Imm & (1 << i)) 6050 NewImm |= 0x3 << (i * 2); 6051 } 6052 return getI8Imm(NewImm ^ 0xff, SDLoc(N)); 6053}]>; 6054 6055// Turn a 2-bit blendi immediate to 8-bit for use with pblendw and invert it. 6056def BlendScaleCommuteImm2 : SDNodeXForm<timm, [{ 6057 uint8_t Imm = N->getZExtValue(); 6058 uint8_t NewImm = 0; 6059 for (unsigned i = 0; i != 2; ++i) { 6060 if (Imm & (1 << i)) 6061 NewImm |= 0xf << (i * 4); 6062 } 6063 return getI8Imm(NewImm ^ 0xff, SDLoc(N)); 6064}]>; 6065 6066// Turn a 2-bit blendi immediate to 4-bit for use with pblendd and invert it. 
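// For example, the 2-bit mask 0b01 scales to the 4-bit pblendd mask 0b0011,
// which then inverts to 0b1100 (0xc) so that the same lanes are selected
// after the two source operands have been swapped.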
6067def BlendScaleCommuteImm2to4 : SDNodeXForm<timm, [{ 6068 uint8_t Imm = N->getZExtValue(); 6069 uint8_t NewImm = 0; 6070 for (unsigned i = 0; i != 2; ++i) { 6071 if (Imm & (1 << i)) 6072 NewImm |= 0x3 << (i * 2); 6073 } 6074 return getI8Imm(NewImm ^ 0xf, SDLoc(N)); 6075}]>; 6076 6077let Predicates = [HasAVX] in { 6078 let isCommutable = 0 in { 6079 defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw, 6080 VR128, load, i128mem, 0, 6081 SchedWriteMPSAD.XMM>, VEX_4V, VEX_WIG; 6082 } 6083 6084let Uses = [MXCSR], mayRaiseFPException = 1 in { 6085 let ExeDomain = SSEPackedSingle in 6086 defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps, 6087 VR128, load, f128mem, 0, 6088 SchedWriteDPPS.XMM>, VEX_4V, VEX_WIG; 6089 let ExeDomain = SSEPackedDouble in 6090 defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd, 6091 VR128, load, f128mem, 0, 6092 SchedWriteDPPD.XMM>, VEX_4V, VEX_WIG; 6093 let ExeDomain = SSEPackedSingle in 6094 defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256, 6095 VR256, load, i256mem, 0, 6096 SchedWriteDPPS.YMM>, VEX_4V, VEX_L, VEX_WIG; 6097} 6098} 6099 6100let Predicates = [HasAVX2] in { 6101 let isCommutable = 0 in { 6102 defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw, 6103 VR256, load, i256mem, 0, 6104 SchedWriteMPSAD.YMM>, VEX_4V, VEX_L, VEX_WIG; 6105 } 6106} 6107 6108let Constraints = "$src1 = $dst" in { 6109 let isCommutable = 0 in { 6110 defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw, 6111 VR128, memop, i128mem, 1, 6112 SchedWriteMPSAD.XMM>; 6113 } 6114 6115 let ExeDomain = SSEPackedSingle in 6116 defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps, 6117 VR128, memop, f128mem, 1, 6118 SchedWriteDPPS.XMM>, SIMD_EXC; 6119 let ExeDomain = SSEPackedDouble in 6120 defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd, 6121 VR128, memop, f128mem, 1, 6122 SchedWriteDPPD.XMM>, SIMD_EXC; 6123} 6124 6125/// SS41I_blend_rmi - SSE 4.1 blend with 8-bit immediate 6126multiclass SS41I_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode, 6127 ValueType OpVT, RegisterClass RC, PatFrag memop_frag, 6128 X86MemOperand x86memop, bit Is2Addr, Domain d, 6129 X86FoldableSchedWrite sched, SDNodeXForm commuteXForm> { 6130let ExeDomain = d, Constraints = !if(Is2Addr, "$src1 = $dst", "") in { 6131 let isCommutable = 1 in 6132 def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst), 6133 (ins RC:$src1, RC:$src2, u8imm:$src3), 6134 !if(Is2Addr, 6135 !strconcat(OpcodeStr, 6136 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 6137 !strconcat(OpcodeStr, 6138 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 6139 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>, 6140 Sched<[sched]>; 6141 def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst), 6142 (ins RC:$src1, x86memop:$src2, u8imm:$src3), 6143 !if(Is2Addr, 6144 !strconcat(OpcodeStr, 6145 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 6146 !strconcat(OpcodeStr, 6147 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 6148 [(set RC:$dst, 6149 (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), timm:$src3)))]>, 6150 Sched<[sched.Folded, sched.ReadAfterFold]>; 6151} 6152 6153 // Pattern to commute if load is in first source. 
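  // A blend immediate selects lanes from the second operand, so once the
  // operands are swapped to fold the load, commuteXForm must invert the mask
  // (e.g. BlendCommuteImm4 turns 0b0101 into 0b1010).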
6154 def : Pat<(OpVT (OpNode (memop_frag addr:$src2), RC:$src1, timm:$src3)), 6155 (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2, 6156 (commuteXForm timm:$src3))>; 6157} 6158 6159let Predicates = [HasAVX] in { 6160 defm VBLENDPS : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v4f32, 6161 VR128, load, f128mem, 0, SSEPackedSingle, 6162 SchedWriteFBlend.XMM, BlendCommuteImm4>, 6163 VEX_4V, VEX_WIG; 6164 defm VBLENDPSY : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v8f32, 6165 VR256, load, f256mem, 0, SSEPackedSingle, 6166 SchedWriteFBlend.YMM, BlendCommuteImm8>, 6167 VEX_4V, VEX_L, VEX_WIG; 6168 defm VBLENDPD : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v2f64, 6169 VR128, load, f128mem, 0, SSEPackedDouble, 6170 SchedWriteFBlend.XMM, BlendCommuteImm2>, 6171 VEX_4V, VEX_WIG; 6172 defm VBLENDPDY : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v4f64, 6173 VR256, load, f256mem, 0, SSEPackedDouble, 6174 SchedWriteFBlend.YMM, BlendCommuteImm4>, 6175 VEX_4V, VEX_L, VEX_WIG; 6176 defm VPBLENDW : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v8i16, 6177 VR128, load, i128mem, 0, SSEPackedInt, 6178 SchedWriteBlend.XMM, BlendCommuteImm8>, 6179 VEX_4V, VEX_WIG; 6180} 6181 6182let Predicates = [HasAVX2] in { 6183 defm VPBLENDWY : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v16i16, 6184 VR256, load, i256mem, 0, SSEPackedInt, 6185 SchedWriteBlend.YMM, BlendCommuteImm8>, 6186 VEX_4V, VEX_L, VEX_WIG; 6187} 6188 6189// Emulate vXi32/vXi64 blends with vXf32/vXf64 or pblendw. 6190// ExecutionDomainFixPass will cleanup domains later on. 6191let Predicates = [HasAVX1Only] in { 6192def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), timm:$src3), 6193 (VBLENDPDYrri VR256:$src1, VR256:$src2, timm:$src3)>; 6194def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), timm:$src3), 6195 (VBLENDPDYrmi VR256:$src1, addr:$src2, timm:$src3)>; 6196def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, timm:$src3), 6197 (VBLENDPDYrmi VR256:$src1, addr:$src2, (BlendCommuteImm4 timm:$src3))>; 6198 6199// Use pblendw for 128-bit integer to keep it in the integer domain and prevent 6200// it from becoming movsd via commuting under optsize. 6201def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3), 6202 (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 timm:$src3))>; 6203def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), timm:$src3), 6204 (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 timm:$src3))>; 6205def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, timm:$src3), 6206 (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 timm:$src3))>; 6207 6208def : Pat<(X86Blendi (v8i32 VR256:$src1), (v8i32 VR256:$src2), timm:$src3), 6209 (VBLENDPSYrri VR256:$src1, VR256:$src2, timm:$src3)>; 6210def : Pat<(X86Blendi VR256:$src1, (loadv8i32 addr:$src2), timm:$src3), 6211 (VBLENDPSYrmi VR256:$src1, addr:$src2, timm:$src3)>; 6212def : Pat<(X86Blendi (loadv8i32 addr:$src2), VR256:$src1, timm:$src3), 6213 (VBLENDPSYrmi VR256:$src1, addr:$src2, (BlendCommuteImm8 timm:$src3))>; 6214 6215// Use pblendw for 128-bit integer to keep it in the integer domain and prevent 6216// it from becoming movss via commuting under optsize. 
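// For example, BlendScaleImm4 widens the v4i32 mask 0b0101 to the 8-bit
// pblendw mask 0b00110011 (0x33), duplicating each dword-mask bit across the
// two words of that dword.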
6217def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), timm:$src3), 6218 (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 timm:$src3))>; 6219def : Pat<(X86Blendi VR128:$src1, (loadv4i32 addr:$src2), timm:$src3), 6220 (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>; 6221def : Pat<(X86Blendi (loadv4i32 addr:$src2), VR128:$src1, timm:$src3), 6222 (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>; 6223} 6224 6225defm BLENDPS : SS41I_blend_rmi<0x0C, "blendps", X86Blendi, v4f32, 6226 VR128, memop, f128mem, 1, SSEPackedSingle, 6227 SchedWriteFBlend.XMM, BlendCommuteImm4>; 6228defm BLENDPD : SS41I_blend_rmi<0x0D, "blendpd", X86Blendi, v2f64, 6229 VR128, memop, f128mem, 1, SSEPackedDouble, 6230 SchedWriteFBlend.XMM, BlendCommuteImm2>; 6231defm PBLENDW : SS41I_blend_rmi<0x0E, "pblendw", X86Blendi, v8i16, 6232 VR128, memop, i128mem, 1, SSEPackedInt, 6233 SchedWriteBlend.XMM, BlendCommuteImm8>; 6234 6235let Predicates = [UseSSE41] in { 6236// Use pblendw for 128-bit integer to keep it in the integer domain and prevent 6237// it from becoming movss via commuting under optsize. 6238def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3), 6239 (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 timm:$src3))>; 6240def : Pat<(X86Blendi VR128:$src1, (memopv2i64 addr:$src2), timm:$src3), 6241 (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 timm:$src3))>; 6242def : Pat<(X86Blendi (memopv2i64 addr:$src2), VR128:$src1, timm:$src3), 6243 (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 timm:$src3))>; 6244 6245def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), timm:$src3), 6246 (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 timm:$src3))>; 6247def : Pat<(X86Blendi VR128:$src1, (memopv4i32 addr:$src2), timm:$src3), 6248 (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>; 6249def : Pat<(X86Blendi (memopv4i32 addr:$src2), VR128:$src1, timm:$src3), 6250 (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>; 6251} 6252 6253// For insertion into the zero index (low half) of a 256-bit vector, it is 6254// more efficient to generate a blend with immediate instead of an insert*128. 
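// For example, in the v4f64 pattern below the immediate 0x3 takes the two low
// f64 lanes from the widened 128-bit source and the two high lanes from
// $src1.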
let Predicates = [HasAVX] in {
def : Pat<(insert_subvector (v4f64 VR256:$src1), (v2f64 VR128:$src2), (iPTR 0)),
          (VBLENDPDYrri VR256:$src1,
                        (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0x3)>;
def : Pat<(insert_subvector (v8f32 VR256:$src1), (v4f32 VR128:$src2), (iPTR 0)),
          (VBLENDPSYrri VR256:$src1,
                        (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0xf)>;

def : Pat<(insert_subvector (loadv4f64 addr:$src2), (v2f64 VR128:$src1), (iPTR 0)),
          (VBLENDPDYrmi (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
                                       VR128:$src1, sub_xmm), addr:$src2, 0xc)>;
def : Pat<(insert_subvector (loadv8f32 addr:$src2), (v4f32 VR128:$src1), (iPTR 0)),
          (VBLENDPSYrmi (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
}

/// SS41I_quaternary_avx - AVX SSE 4.1 with 4 operands
multiclass SS41I_quaternary_avx<bits<8> opc, string OpcodeStr, RegisterClass RC,
                                X86MemOperand x86memop, ValueType VT,
                                PatFrag mem_frag, SDNode OpNode,
                                X86FoldableSchedWrite sched> {
  def rr : Ii8Reg<opc, MRMSrcReg, (outs RC:$dst),
                  (ins RC:$src1, RC:$src2, RC:$src3),
                  !strconcat(OpcodeStr,
                             "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                  [(set RC:$dst, (VT (OpNode RC:$src3, RC:$src2, RC:$src1)))],
                  SSEPackedInt>, TAPD, VEX_4V,
                  Sched<[sched]>;

  def rm : Ii8Reg<opc, MRMSrcMem, (outs RC:$dst),
                  (ins RC:$src1, x86memop:$src2, RC:$src3),
                  !strconcat(OpcodeStr,
                             "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                  [(set RC:$dst,
                        (OpNode RC:$src3, (mem_frag addr:$src2),
                                RC:$src1))], SSEPackedInt>, TAPD, VEX_4V,
                  Sched<[sched.Folded, sched.ReadAfterFold,
                         // x86memop:$src2
                         ReadDefault, ReadDefault, ReadDefault, ReadDefault,
                         ReadDefault,
                         // RC:$src3
                         sched.ReadAfterFold]>;
}

let Predicates = [HasAVX] in {
let ExeDomain = SSEPackedDouble in {
defm VBLENDVPD  : SS41I_quaternary_avx<0x4B, "vblendvpd", VR128, f128mem,
                                       v2f64, loadv2f64, X86Blendv,
                                       SchedWriteFVarBlend.XMM>;
defm VBLENDVPDY : SS41I_quaternary_avx<0x4B, "vblendvpd", VR256, f256mem,
                                       v4f64, loadv4f64, X86Blendv,
                                       SchedWriteFVarBlend.YMM>, VEX_L;
} // ExeDomain = SSEPackedDouble
let ExeDomain = SSEPackedSingle in {
defm VBLENDVPS  : SS41I_quaternary_avx<0x4A, "vblendvps", VR128, f128mem,
                                       v4f32, loadv4f32, X86Blendv,
                                       SchedWriteFVarBlend.XMM>;
defm VBLENDVPSY : SS41I_quaternary_avx<0x4A, "vblendvps", VR256, f256mem,
                                       v8f32, loadv8f32, X86Blendv,
                                       SchedWriteFVarBlend.YMM>, VEX_L;
} // ExeDomain = SSEPackedSingle
defm VPBLENDVB : SS41I_quaternary_avx<0x4C, "vpblendvb", VR128, i128mem,
                                      v16i8, loadv16i8, X86Blendv,
                                      SchedWriteVarBlend.XMM>;
}

let Predicates = [HasAVX2] in {
defm VPBLENDVBY : SS41I_quaternary_avx<0x4C, "vpblendvb", VR256, i256mem,
                                       v32i8, loadv32i8, X86Blendv,
                                       SchedWriteVarBlend.YMM>, VEX_L;
}

let Predicates = [HasAVX] in {
  def : Pat<(v4i32 (X86Blendv (v4i32 VR128:$mask), (v4i32 VR128:$src1),
                              (v4i32 VR128:$src2))),
            (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v2i64 (X86Blendv (v2i64 VR128:$mask), (v2i64 VR128:$src1),
                              (v2i64 VR128:$src2))),
            (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v8i32 (X86Blendv (v8i32 VR256:$mask), (v8i32 VR256:$src1),
                              (v8i32 VR256:$src2))),
            (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
  def : Pat<(v4i64 (X86Blendv (v4i64 VR256:$mask), (v4i64 VR256:$src1),
                              (v4i64 VR256:$src2))),
            (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
}

// Prefer a movss or movsd over a blendps when optimizing for size. These
// were changed to use blends because blends have better throughput on
// Sandy Bridge and Haswell, but movs[s/d] are 1-2 byte shorter instructions.
let Predicates = [HasAVX, OptForSpeed] in {
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;

  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
            (VBLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
  def : Pat<(v4f32 (X86Movss VR128:$src1, (loadv4f32 addr:$src2))),
            (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
  def : Pat<(v4f32 (X86Movss (loadv4f32 addr:$src2), VR128:$src1)),
            (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;

  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
            (VBLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
  def : Pat<(v2f64 (X86Movsd VR128:$src1, (loadv2f64 addr:$src2))),
            (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
  def : Pat<(v2f64 (X86Movsd (loadv2f64 addr:$src2), VR128:$src1)),
            (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;

  // Move low f32 and clear high bits.
  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
                           (v4f32 (VBLENDPSrri (v4f32 (V_SET0)),
                                  (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)),
                                  (i8 1))), sub_xmm)>;
  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
                           (v4i32 (VPBLENDWrri (v4i32 (V_SET0)),
                                  (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)),
                                  (i8 3))), sub_xmm)>;
}

// Prefer a movss or movsd over a blendps when optimizing for size. These
// were changed to use blends because blends have better throughput on
// Sandy Bridge and Haswell, but movs[s/d] are 1-2 byte shorter instructions.
let Predicates = [UseSSE41, OptForSpeed] in {
  // With SSE41 we can use blends for these patterns.
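  // For example, (BLENDPSrri (V_SET0), $src, (i8 1)) below keeps only element
  // 0 of $src and takes elements 1-3 from the zero vector, which is exactly
  // the X86vzmovl semantics.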
6385 def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), 6386 (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>; 6387 def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), 6388 (PBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>; 6389 6390 def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)), 6391 (BLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>; 6392 def : Pat<(v4f32 (X86Movss VR128:$src1, (memopv4f32 addr:$src2))), 6393 (BLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>; 6394 def : Pat<(v4f32 (X86Movss (memopv4f32 addr:$src2), VR128:$src1)), 6395 (BLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>; 6396 6397 def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)), 6398 (BLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>; 6399 def : Pat<(v2f64 (X86Movsd VR128:$src1, (memopv2f64 addr:$src2))), 6400 (BLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>; 6401 def : Pat<(v2f64 (X86Movsd (memopv2f64 addr:$src2), VR128:$src1)), 6402 (BLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>; 6403} 6404 6405 6406/// SS41I_ternary - SSE 4.1 ternary operator 6407let Uses = [XMM0], Constraints = "$src1 = $dst" in { 6408 multiclass SS41I_ternary<bits<8> opc, string OpcodeStr, ValueType VT, 6409 PatFrag mem_frag, X86MemOperand x86memop, 6410 SDNode OpNode, X86FoldableSchedWrite sched> { 6411 def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst), 6412 (ins VR128:$src1, VR128:$src2), 6413 !strconcat(OpcodeStr, 6414 "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"), 6415 [(set VR128:$dst, 6416 (VT (OpNode XMM0, VR128:$src2, VR128:$src1)))]>, 6417 Sched<[sched]>; 6418 6419 def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst), 6420 (ins VR128:$src1, x86memop:$src2), 6421 !strconcat(OpcodeStr, 6422 "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"), 6423 [(set VR128:$dst, 6424 (OpNode XMM0, (mem_frag addr:$src2), VR128:$src1))]>, 6425 Sched<[sched.Folded, sched.ReadAfterFold]>; 6426 } 6427} 6428 6429let ExeDomain = SSEPackedDouble in 6430defm BLENDVPD : SS41I_ternary<0x15, "blendvpd", v2f64, memopv2f64, f128mem, 6431 X86Blendv, SchedWriteFVarBlend.XMM>; 6432let ExeDomain = SSEPackedSingle in 6433defm BLENDVPS : SS41I_ternary<0x14, "blendvps", v4f32, memopv4f32, f128mem, 6434 X86Blendv, SchedWriteFVarBlend.XMM>; 6435defm PBLENDVB : SS41I_ternary<0x10, "pblendvb", v16i8, memopv16i8, i128mem, 6436 X86Blendv, SchedWriteVarBlend.XMM>; 6437 6438// Aliases with the implicit xmm0 argument 6439def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}", 6440 (BLENDVPDrr0 VR128:$dst, VR128:$src2), 0>; 6441def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}", 6442 (BLENDVPDrm0 VR128:$dst, f128mem:$src2), 0>; 6443def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}", 6444 (BLENDVPSrr0 VR128:$dst, VR128:$src2), 0>; 6445def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}", 6446 (BLENDVPSrm0 VR128:$dst, f128mem:$src2), 0>; 6447def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}", 6448 (PBLENDVBrr0 VR128:$dst, VR128:$src2), 0>; 6449def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}", 6450 (PBLENDVBrm0 VR128:$dst, i128mem:$src2), 0>; 6451 6452let Predicates = [UseSSE41] in { 6453 def : Pat<(v4i32 (X86Blendv (v4i32 XMM0), (v4i32 VR128:$src1), 6454 (v4i32 VR128:$src2))), 6455 (BLENDVPSrr0 VR128:$src2, VR128:$src1)>; 6456 def : Pat<(v2i64 (X86Blendv (v2i64 XMM0), (v2i64 VR128:$src1), 6457 (v2i64 VR128:$src2))), 6458 (BLENDVPDrr0 VR128:$src2, VR128:$src1)>; 6459} 6460 6461let AddedComplexity = 400 in { // Prefer non-temporal versions 6462 6463let Predicates = [HasAVX, NoVLX] in 6464def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 6465 "vmovntdqa\t{$src, 
$dst|$dst, $src}", []>, 6466 Sched<[SchedWriteVecMoveLSNT.XMM.RM]>, VEX, VEX_WIG; 6467let Predicates = [HasAVX2, NoVLX] in 6468def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), 6469 "vmovntdqa\t{$src, $dst|$dst, $src}", []>, 6470 Sched<[SchedWriteVecMoveLSNT.YMM.RM]>, VEX, VEX_L, VEX_WIG; 6471def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 6472 "movntdqa\t{$src, $dst|$dst, $src}", []>, 6473 Sched<[SchedWriteVecMoveLSNT.XMM.RM]>; 6474 6475let Predicates = [HasAVX2, NoVLX] in { 6476 def : Pat<(v8f32 (alignednontemporalload addr:$src)), 6477 (VMOVNTDQAYrm addr:$src)>; 6478 def : Pat<(v4f64 (alignednontemporalload addr:$src)), 6479 (VMOVNTDQAYrm addr:$src)>; 6480 def : Pat<(v4i64 (alignednontemporalload addr:$src)), 6481 (VMOVNTDQAYrm addr:$src)>; 6482 def : Pat<(v8i32 (alignednontemporalload addr:$src)), 6483 (VMOVNTDQAYrm addr:$src)>; 6484 def : Pat<(v16i16 (alignednontemporalload addr:$src)), 6485 (VMOVNTDQAYrm addr:$src)>; 6486 def : Pat<(v16f16 (alignednontemporalload addr:$src)), 6487 (VMOVNTDQAYrm addr:$src)>; 6488 def : Pat<(v32i8 (alignednontemporalload addr:$src)), 6489 (VMOVNTDQAYrm addr:$src)>; 6490} 6491 6492let Predicates = [HasAVX, NoVLX] in { 6493 def : Pat<(v4f32 (alignednontemporalload addr:$src)), 6494 (VMOVNTDQArm addr:$src)>; 6495 def : Pat<(v2f64 (alignednontemporalload addr:$src)), 6496 (VMOVNTDQArm addr:$src)>; 6497 def : Pat<(v2i64 (alignednontemporalload addr:$src)), 6498 (VMOVNTDQArm addr:$src)>; 6499 def : Pat<(v4i32 (alignednontemporalload addr:$src)), 6500 (VMOVNTDQArm addr:$src)>; 6501 def : Pat<(v8i16 (alignednontemporalload addr:$src)), 6502 (VMOVNTDQArm addr:$src)>; 6503 def : Pat<(v8f16 (alignednontemporalload addr:$src)), 6504 (VMOVNTDQArm addr:$src)>; 6505 def : Pat<(v16i8 (alignednontemporalload addr:$src)), 6506 (VMOVNTDQArm addr:$src)>; 6507} 6508 6509let Predicates = [UseSSE41] in { 6510 def : Pat<(v4f32 (alignednontemporalload addr:$src)), 6511 (MOVNTDQArm addr:$src)>; 6512 def : Pat<(v2f64 (alignednontemporalload addr:$src)), 6513 (MOVNTDQArm addr:$src)>; 6514 def : Pat<(v2i64 (alignednontemporalload addr:$src)), 6515 (MOVNTDQArm addr:$src)>; 6516 def : Pat<(v4i32 (alignednontemporalload addr:$src)), 6517 (MOVNTDQArm addr:$src)>; 6518 def : Pat<(v8i16 (alignednontemporalload addr:$src)), 6519 (MOVNTDQArm addr:$src)>; 6520 def : Pat<(v8f16 (alignednontemporalload addr:$src)), 6521 (MOVNTDQArm addr:$src)>; 6522 def : Pat<(v16i8 (alignednontemporalload addr:$src)), 6523 (MOVNTDQArm addr:$src)>; 6524} 6525 6526} // AddedComplexity 6527 6528//===----------------------------------------------------------------------===// 6529// SSE4.2 - Compare Instructions 6530//===----------------------------------------------------------------------===// 6531 6532/// SS42I_binop_rm - Simple SSE 4.2 binary operator 6533multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, 6534 ValueType OpVT, RegisterClass RC, PatFrag memop_frag, 6535 X86MemOperand x86memop, X86FoldableSchedWrite sched, 6536 bit Is2Addr = 1> { 6537 def rr : SS428I<opc, MRMSrcReg, (outs RC:$dst), 6538 (ins RC:$src1, RC:$src2), 6539 !if(Is2Addr, 6540 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 6541 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 6542 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>, 6543 Sched<[sched]>; 6544 def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst), 6545 (ins RC:$src1, x86memop:$src2), 6546 !if(Is2Addr, 6547 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 6548 
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 6549 [(set RC:$dst, 6550 (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>, 6551 Sched<[sched.Folded, sched.ReadAfterFold]>; 6552} 6553 6554let Predicates = [HasAVX] in 6555 defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128, 6556 load, i128mem, SchedWriteVecALU.XMM, 0>, 6557 VEX_4V, VEX_WIG; 6558 6559let Predicates = [HasAVX2] in 6560 defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256, 6561 load, i256mem, SchedWriteVecALU.YMM, 0>, 6562 VEX_4V, VEX_L, VEX_WIG; 6563 6564let Constraints = "$src1 = $dst" in 6565 defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128, 6566 memop, i128mem, SchedWriteVecALU.XMM>; 6567 6568//===----------------------------------------------------------------------===// 6569// SSE4.2 - String/text Processing Instructions 6570//===----------------------------------------------------------------------===// 6571 6572multiclass pcmpistrm_SS42AI<string asm> { 6573 def rr : SS42AI<0x62, MRMSrcReg, (outs), 6574 (ins VR128:$src1, VR128:$src2, u8imm:$src3), 6575 !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"), 6576 []>, Sched<[WritePCmpIStrM]>; 6577 let mayLoad = 1 in 6578 def rm :SS42AI<0x62, MRMSrcMem, (outs), 6579 (ins VR128:$src1, i128mem:$src2, u8imm:$src3), 6580 !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"), 6581 []>, Sched<[WritePCmpIStrM.Folded, WritePCmpIStrM.ReadAfterFold]>; 6582} 6583 6584let Defs = [XMM0, EFLAGS], hasSideEffects = 0 in { 6585 let Predicates = [HasAVX] in 6586 defm VPCMPISTRM : pcmpistrm_SS42AI<"vpcmpistrm">, VEX, VEX_WIG; 6587 defm PCMPISTRM : pcmpistrm_SS42AI<"pcmpistrm"> ; 6588} 6589 6590multiclass SS42AI_pcmpestrm<string asm> { 6591 def rr : SS42AI<0x60, MRMSrcReg, (outs), 6592 (ins VR128:$src1, VR128:$src3, u8imm:$src5), 6593 !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"), 6594 []>, Sched<[WritePCmpEStrM]>; 6595 let mayLoad = 1 in 6596 def rm : SS42AI<0x60, MRMSrcMem, (outs), 6597 (ins VR128:$src1, i128mem:$src3, u8imm:$src5), 6598 !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"), 6599 []>, Sched<[WritePCmpEStrM.Folded, WritePCmpEStrM.ReadAfterFold]>; 6600} 6601 6602let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in { 6603 let Predicates = [HasAVX] in 6604 defm VPCMPESTRM : SS42AI_pcmpestrm<"vpcmpestrm">, VEX, VEX_WIG; 6605 defm PCMPESTRM : SS42AI_pcmpestrm<"pcmpestrm">; 6606} 6607 6608multiclass SS42AI_pcmpistri<string asm> { 6609 def rr : SS42AI<0x63, MRMSrcReg, (outs), 6610 (ins VR128:$src1, VR128:$src2, u8imm:$src3), 6611 !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"), 6612 []>, Sched<[WritePCmpIStrI]>; 6613 let mayLoad = 1 in 6614 def rm : SS42AI<0x63, MRMSrcMem, (outs), 6615 (ins VR128:$src1, i128mem:$src2, u8imm:$src3), 6616 !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"), 6617 []>, Sched<[WritePCmpIStrI.Folded, WritePCmpIStrI.ReadAfterFold]>; 6618} 6619 6620let Defs = [ECX, EFLAGS], hasSideEffects = 0 in { 6621 let Predicates = [HasAVX] in 6622 defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX, VEX_WIG; 6623 defm PCMPISTRI : SS42AI_pcmpistri<"pcmpistri">; 6624} 6625 6626multiclass SS42AI_pcmpestri<string asm> { 6627 def rr : SS42AI<0x61, MRMSrcReg, (outs), 6628 (ins VR128:$src1, VR128:$src3, u8imm:$src5), 6629 !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"), 6630 []>, Sched<[WritePCmpEStrI]>; 6631 let mayLoad = 1 in 6632 def rm : SS42AI<0x61, MRMSrcMem, (outs), 6633 (ins 
VR128:$src1, i128mem:$src3, u8imm:$src5),
    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
    []>, Sched<[WritePCmpEStrI.Folded, WritePCmpEStrI.ReadAfterFold]>;
}

let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
  let Predicates = [HasAVX] in
  defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX, VEX_WIG;
  defm PCMPESTRI : SS42AI_pcmpestri<"pcmpestri">;
}

//===----------------------------------------------------------------------===//
// SSE4.2 - CRC Instructions
//===----------------------------------------------------------------------===//

// No CRC instructions have AVX equivalents

// CRC intrinsic instruction classes.
// These instructions come only in rr and rm forms; the only difference
// between the forms is the size of r and m.
class SS42I_crc32r<bits<8> opc, string asm, RegisterClass RCOut,
                   RegisterClass RCIn, SDPatternOperator Int> :
  CRC32I<opc, MRMSrcReg, (outs RCOut:$dst), (ins RCOut:$src1, RCIn:$src2),
         !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
         [(set RCOut:$dst, (Int RCOut:$src1, RCIn:$src2))]>,
  Sched<[WriteCRC32]>;

class SS42I_crc32m<bits<8> opc, string asm, RegisterClass RCOut,
                   X86MemOperand x86memop, SDPatternOperator Int> :
  CRC32I<opc, MRMSrcMem, (outs RCOut:$dst), (ins RCOut:$src1, x86memop:$src2),
         !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
         [(set RCOut:$dst, (Int RCOut:$src1, (load addr:$src2)))]>,
  Sched<[WriteCRC32.Folded, WriteCRC32.ReadAfterFold]>;

let Constraints = "$src1 = $dst" in {
  def CRC32r32m8 : SS42I_crc32m<0xF0, "crc32{b}", GR32, i8mem,
                                int_x86_sse42_crc32_32_8>;
  def CRC32r32r8 : SS42I_crc32r<0xF0, "crc32{b}", GR32, GR8,
                                int_x86_sse42_crc32_32_8>;
  def CRC32r32m16 : SS42I_crc32m<0xF1, "crc32{w}", GR32, i16mem,
                                 int_x86_sse42_crc32_32_16>, OpSize16;
  def CRC32r32r16 : SS42I_crc32r<0xF1, "crc32{w}", GR32, GR16,
                                 int_x86_sse42_crc32_32_16>, OpSize16;
  def CRC32r32m32 : SS42I_crc32m<0xF1, "crc32{l}", GR32, i32mem,
                                 int_x86_sse42_crc32_32_32>, OpSize32;
  def CRC32r32r32 : SS42I_crc32r<0xF1, "crc32{l}", GR32, GR32,
                                 int_x86_sse42_crc32_32_32>, OpSize32;
  def CRC32r64m64 : SS42I_crc32m<0xF1, "crc32{q}", GR64, i64mem,
                                 int_x86_sse42_crc32_64_64>, REX_W;
  def CRC32r64r64 : SS42I_crc32r<0xF1, "crc32{q}", GR64, GR64,
                                 int_x86_sse42_crc32_64_64>, REX_W;
  let hasSideEffects = 0 in {
    let mayLoad = 1 in
    def CRC32r64m8 : SS42I_crc32m<0xF0, "crc32{b}", GR64, i8mem,
                                  null_frag>, REX_W;
    def CRC32r64r8 : SS42I_crc32r<0xF0, "crc32{b}", GR64, GR8,
                                  null_frag>, REX_W;
  }
}

//===----------------------------------------------------------------------===//
// SHA-NI Instructions
//===----------------------------------------------------------------------===//

// FIXME: Is there a better scheduler class for SHA than WriteVecIMul?
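// Illustrative note for the UsesXMM0 asm strings below (a sketch, not
// normative): sha256rnds2 implicitly reads XMM0 as its round-key operand, so
// its AT&T form prints the fixed register, e.g.
//   sha256rnds2 %xmm0, %xmm2, %xmm1
// while the remaining SHA ops are plain two-operand forms, e.g.
//   sha1msg1 %xmm2, %xmm1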
6698multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId, 6699 X86FoldableSchedWrite sched, bit UsesXMM0 = 0> { 6700 def rr : I<Opc, MRMSrcReg, (outs VR128:$dst), 6701 (ins VR128:$src1, VR128:$src2), 6702 !if(UsesXMM0, 6703 !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"), 6704 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")), 6705 [!if(UsesXMM0, 6706 (set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)), 6707 (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>, 6708 T8PS, Sched<[sched]>; 6709 6710 def rm : I<Opc, MRMSrcMem, (outs VR128:$dst), 6711 (ins VR128:$src1, i128mem:$src2), 6712 !if(UsesXMM0, 6713 !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"), 6714 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")), 6715 [!if(UsesXMM0, 6716 (set VR128:$dst, (IntId VR128:$src1, 6717 (memop addr:$src2), XMM0)), 6718 (set VR128:$dst, (IntId VR128:$src1, 6719 (memop addr:$src2))))]>, T8PS, 6720 Sched<[sched.Folded, sched.ReadAfterFold]>; 6721} 6722 6723let Constraints = "$src1 = $dst", Predicates = [HasSHA] in { 6724 def SHA1RNDS4rri : Ii8<0xCC, MRMSrcReg, (outs VR128:$dst), 6725 (ins VR128:$src1, VR128:$src2, u8imm:$src3), 6726 "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}", 6727 [(set VR128:$dst, 6728 (int_x86_sha1rnds4 VR128:$src1, VR128:$src2, 6729 (i8 timm:$src3)))]>, TAPS, 6730 Sched<[SchedWriteVecIMul.XMM]>; 6731 def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst), 6732 (ins VR128:$src1, i128mem:$src2, u8imm:$src3), 6733 "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}", 6734 [(set VR128:$dst, 6735 (int_x86_sha1rnds4 VR128:$src1, 6736 (memop addr:$src2), 6737 (i8 timm:$src3)))]>, TAPS, 6738 Sched<[SchedWriteVecIMul.XMM.Folded, 6739 SchedWriteVecIMul.XMM.ReadAfterFold]>; 6740 6741 defm SHA1NEXTE : SHAI_binop<0xC8, "sha1nexte", int_x86_sha1nexte, 6742 SchedWriteVecIMul.XMM>; 6743 defm SHA1MSG1 : SHAI_binop<0xC9, "sha1msg1", int_x86_sha1msg1, 6744 SchedWriteVecIMul.XMM>; 6745 defm SHA1MSG2 : SHAI_binop<0xCA, "sha1msg2", int_x86_sha1msg2, 6746 SchedWriteVecIMul.XMM>; 6747 6748 let Uses=[XMM0] in 6749 defm SHA256RNDS2 : SHAI_binop<0xCB, "sha256rnds2", int_x86_sha256rnds2, 6750 SchedWriteVecIMul.XMM, 1>; 6751 6752 defm SHA256MSG1 : SHAI_binop<0xCC, "sha256msg1", int_x86_sha256msg1, 6753 SchedWriteVecIMul.XMM>; 6754 defm SHA256MSG2 : SHAI_binop<0xCD, "sha256msg2", int_x86_sha256msg2, 6755 SchedWriteVecIMul.XMM>; 6756} 6757 6758// Aliases with explicit %xmm0 6759def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}", 6760 (SHA256RNDS2rr VR128:$dst, VR128:$src2), 0>; 6761def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}", 6762 (SHA256RNDS2rm VR128:$dst, i128mem:$src2), 0>; 6763 6764//===----------------------------------------------------------------------===// 6765// AES-NI Instructions 6766//===----------------------------------------------------------------------===// 6767 6768multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr, 6769 Intrinsic IntId, PatFrag ld_frag, 6770 bit Is2Addr = 0, RegisterClass RC = VR128, 6771 X86MemOperand MemOp = i128mem> { 6772 let AsmString = OpcodeStr# 6773 !if(Is2Addr, "\t{$src2, $dst|$dst, $src2}", 6774 "\t{$src2, $src1, $dst|$dst, $src1, $src2}") in { 6775 def rr : AES8I<opc, MRMSrcReg, (outs RC:$dst), 6776 (ins RC:$src1, RC:$src2), "", 6777 [(set RC:$dst, (IntId RC:$src1, RC:$src2))]>, 6778 Sched<[WriteAESDecEnc]>; 6779 def rm : AES8I<opc, MRMSrcMem, (outs RC:$dst), 6780 (ins RC:$src1, MemOp:$src2), "", 6781 [(set RC:$dst, (IntId RC:$src1, (ld_frag addr:$src2)))]>, 6782 
Sched<[WriteAESDecEnc.Folded, WriteAESDecEnc.ReadAfterFold]>; 6783 } 6784} 6785 6786// Perform One Round of an AES Encryption/Decryption Flow 6787let Predicates = [HasAVX, NoVLX_Or_NoVAES, HasAES] in { 6788 defm VAESENC : AESI_binop_rm_int<0xDC, "vaesenc", 6789 int_x86_aesni_aesenc, load>, VEX_4V, VEX_WIG; 6790 defm VAESENCLAST : AESI_binop_rm_int<0xDD, "vaesenclast", 6791 int_x86_aesni_aesenclast, load>, VEX_4V, VEX_WIG; 6792 defm VAESDEC : AESI_binop_rm_int<0xDE, "vaesdec", 6793 int_x86_aesni_aesdec, load>, VEX_4V, VEX_WIG; 6794 defm VAESDECLAST : AESI_binop_rm_int<0xDF, "vaesdeclast", 6795 int_x86_aesni_aesdeclast, load>, VEX_4V, VEX_WIG; 6796} 6797 6798let Predicates = [NoVLX, HasVAES] in { 6799 defm VAESENCY : AESI_binop_rm_int<0xDC, "vaesenc", 6800 int_x86_aesni_aesenc_256, load, 0, VR256, 6801 i256mem>, VEX_4V, VEX_L, VEX_WIG; 6802 defm VAESENCLASTY : AESI_binop_rm_int<0xDD, "vaesenclast", 6803 int_x86_aesni_aesenclast_256, load, 0, VR256, 6804 i256mem>, VEX_4V, VEX_L, VEX_WIG; 6805 defm VAESDECY : AESI_binop_rm_int<0xDE, "vaesdec", 6806 int_x86_aesni_aesdec_256, load, 0, VR256, 6807 i256mem>, VEX_4V, VEX_L, VEX_WIG; 6808 defm VAESDECLASTY : AESI_binop_rm_int<0xDF, "vaesdeclast", 6809 int_x86_aesni_aesdeclast_256, load, 0, VR256, 6810 i256mem>, VEX_4V, VEX_L, VEX_WIG; 6811} 6812 6813let Constraints = "$src1 = $dst" in { 6814 defm AESENC : AESI_binop_rm_int<0xDC, "aesenc", 6815 int_x86_aesni_aesenc, memop, 1>; 6816 defm AESENCLAST : AESI_binop_rm_int<0xDD, "aesenclast", 6817 int_x86_aesni_aesenclast, memop, 1>; 6818 defm AESDEC : AESI_binop_rm_int<0xDE, "aesdec", 6819 int_x86_aesni_aesdec, memop, 1>; 6820 defm AESDECLAST : AESI_binop_rm_int<0xDF, "aesdeclast", 6821 int_x86_aesni_aesdeclast, memop, 1>; 6822} 6823 6824// Perform the AES InvMixColumn Transformation 6825let Predicates = [HasAVX, HasAES] in { 6826 def VAESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst), 6827 (ins VR128:$src1), 6828 "vaesimc\t{$src1, $dst|$dst, $src1}", 6829 [(set VR128:$dst, 6830 (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>, 6831 VEX, VEX_WIG; 6832 def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst), 6833 (ins i128mem:$src1), 6834 "vaesimc\t{$src1, $dst|$dst, $src1}", 6835 [(set VR128:$dst, (int_x86_aesni_aesimc (load addr:$src1)))]>, 6836 Sched<[WriteAESIMC.Folded]>, VEX, VEX_WIG; 6837} 6838def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst), 6839 (ins VR128:$src1), 6840 "aesimc\t{$src1, $dst|$dst, $src1}", 6841 [(set VR128:$dst, 6842 (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>; 6843def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst), 6844 (ins i128mem:$src1), 6845 "aesimc\t{$src1, $dst|$dst, $src1}", 6846 [(set VR128:$dst, (int_x86_aesni_aesimc (memop addr:$src1)))]>, 6847 Sched<[WriteAESIMC.Folded]>; 6848 6849// AES Round Key Generation Assist 6850let Predicates = [HasAVX, HasAES] in { 6851 def VAESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst), 6852 (ins VR128:$src1, u8imm:$src2), 6853 "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", 6854 [(set VR128:$dst, 6855 (int_x86_aesni_aeskeygenassist VR128:$src1, timm:$src2))]>, 6856 Sched<[WriteAESKeyGen]>, VEX, VEX_WIG; 6857 def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst), 6858 (ins i128mem:$src1, u8imm:$src2), 6859 "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", 6860 [(set VR128:$dst, 6861 (int_x86_aesni_aeskeygenassist (load addr:$src1), timm:$src2))]>, 6862 Sched<[WriteAESKeyGen.Folded]>, VEX, VEX_WIG; 6863} 6864def AESKEYGENASSIST128rr : AESAI<0xDF, 
MRMSrcReg, (outs VR128:$dst), 6865 (ins VR128:$src1, u8imm:$src2), 6866 "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", 6867 [(set VR128:$dst, 6868 (int_x86_aesni_aeskeygenassist VR128:$src1, timm:$src2))]>, 6869 Sched<[WriteAESKeyGen]>; 6870def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst), 6871 (ins i128mem:$src1, u8imm:$src2), 6872 "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", 6873 [(set VR128:$dst, 6874 (int_x86_aesni_aeskeygenassist (memop addr:$src1), timm:$src2))]>, 6875 Sched<[WriteAESKeyGen.Folded]>; 6876 6877//===----------------------------------------------------------------------===// 6878// PCLMUL Instructions 6879//===----------------------------------------------------------------------===// 6880 6881// Immediate transform to help with commuting. 6882def PCLMULCommuteImm : SDNodeXForm<timm, [{ 6883 uint8_t Imm = N->getZExtValue(); 6884 return getI8Imm((uint8_t)((Imm >> 4) | (Imm << 4)), SDLoc(N)); 6885}]>; 6886 6887// SSE carry-less Multiplication instructions 6888let Predicates = [NoAVX, HasPCLMUL] in { 6889 let Constraints = "$src1 = $dst" in { 6890 let isCommutable = 1 in 6891 def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst), 6892 (ins VR128:$src1, VR128:$src2, u8imm:$src3), 6893 "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}", 6894 [(set VR128:$dst, 6895 (int_x86_pclmulqdq VR128:$src1, VR128:$src2, timm:$src3))]>, 6896 Sched<[WriteCLMul]>; 6897 6898 def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst), 6899 (ins VR128:$src1, i128mem:$src2, u8imm:$src3), 6900 "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}", 6901 [(set VR128:$dst, 6902 (int_x86_pclmulqdq VR128:$src1, (memop addr:$src2), 6903 timm:$src3))]>, 6904 Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>; 6905 } // Constraints = "$src1 = $dst" 6906 6907 def : Pat<(int_x86_pclmulqdq (memop addr:$src2), VR128:$src1, 6908 (i8 timm:$src3)), 6909 (PCLMULQDQrm VR128:$src1, addr:$src2, 6910 (PCLMULCommuteImm timm:$src3))>; 6911} // Predicates = [NoAVX, HasPCLMUL] 6912 6913// SSE aliases 6914foreach HI = ["hq","lq"] in 6915foreach LO = ["hq","lq"] in { 6916 def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}", 6917 (PCLMULQDQrr VR128:$dst, VR128:$src, 6918 !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>; 6919 def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}", 6920 (PCLMULQDQrm VR128:$dst, i128mem:$src, 6921 !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>; 6922} 6923 6924// AVX carry-less Multiplication instructions 6925multiclass vpclmulqdq<RegisterClass RC, X86MemOperand MemOp, 6926 PatFrag LdFrag, Intrinsic IntId> { 6927 let isCommutable = 1 in 6928 def rr : PCLMULIi8<0x44, MRMSrcReg, (outs RC:$dst), 6929 (ins RC:$src1, RC:$src2, u8imm:$src3), 6930 "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 6931 [(set RC:$dst, 6932 (IntId RC:$src1, RC:$src2, timm:$src3))]>, 6933 Sched<[WriteCLMul]>; 6934 6935 def rm : PCLMULIi8<0x44, MRMSrcMem, (outs RC:$dst), 6936 (ins RC:$src1, MemOp:$src2, u8imm:$src3), 6937 "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 6938 [(set RC:$dst, 6939 (IntId RC:$src1, (LdFrag addr:$src2), timm:$src3))]>, 6940 Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>; 6941 6942 // We can commute a load in the first operand by swapping the sources and 6943 // rotating the immediate. 
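  // Worked example: imm8 bit 0 picks the qword of the first source and bit 4
  // the qword of the second, so 0x01 multiplies src1[127:64] by src2[63:0].
  // After the sources are swapped, rotating the nibbles gives
  // (0x01 >> 4) | (0x01 << 4) == 0x10, which multiplies the same two qwords.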
6944 def : Pat<(IntId (LdFrag addr:$src2), RC:$src1, (i8 timm:$src3)), 6945 (!cast<Instruction>(NAME#"rm") RC:$src1, addr:$src2, 6946 (PCLMULCommuteImm timm:$src3))>; 6947} 6948 6949let Predicates = [HasAVX, NoVLX_Or_NoVPCLMULQDQ, HasPCLMUL] in 6950defm VPCLMULQDQ : vpclmulqdq<VR128, i128mem, load, 6951 int_x86_pclmulqdq>, VEX_4V, VEX_WIG; 6952 6953let Predicates = [NoVLX, HasVPCLMULQDQ] in 6954defm VPCLMULQDQY : vpclmulqdq<VR256, i256mem, load, 6955 int_x86_pclmulqdq_256>, VEX_4V, VEX_L, VEX_WIG; 6956 6957multiclass vpclmulqdq_aliases_impl<string InstStr, RegisterClass RC, 6958 X86MemOperand MemOp, string Hi, string Lo> { 6959 def : InstAlias<"vpclmul"#Hi#Lo#"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}", 6960 (!cast<Instruction>(InstStr # "rr") RC:$dst, RC:$src1, RC:$src2, 6961 !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>; 6962 def : InstAlias<"vpclmul"#Hi#Lo#"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}", 6963 (!cast<Instruction>(InstStr # "rm") RC:$dst, RC:$src1, MemOp:$src2, 6964 !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>; 6965} 6966 6967multiclass vpclmulqdq_aliases<string InstStr, RegisterClass RC, 6968 X86MemOperand MemOp> { 6969 defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "hq">; 6970 defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "lq">; 6971 defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "hq">; 6972 defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "lq">; 6973} 6974 6975// AVX aliases 6976defm : vpclmulqdq_aliases<"VPCLMULQDQ", VR128, i128mem>; 6977defm : vpclmulqdq_aliases<"VPCLMULQDQY", VR256, i256mem>; 6978 6979//===----------------------------------------------------------------------===// 6980// SSE4A Instructions 6981//===----------------------------------------------------------------------===// 6982 6983let Predicates = [HasSSE4A] in { 6984 6985let ExeDomain = SSEPackedInt in { 6986let Constraints = "$src = $dst" in { 6987def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst), 6988 (ins VR128:$src, u8imm:$len, u8imm:$idx), 6989 "extrq\t{$idx, $len, $src|$src, $len, $idx}", 6990 [(set VR128:$dst, (X86extrqi VR128:$src, timm:$len, 6991 timm:$idx))]>, 6992 PD, Sched<[SchedWriteVecALU.XMM]>; 6993def EXTRQ : I<0x79, MRMSrcReg, (outs VR128:$dst), 6994 (ins VR128:$src, VR128:$mask), 6995 "extrq\t{$mask, $src|$src, $mask}", 6996 [(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src, 6997 VR128:$mask))]>, 6998 PD, Sched<[SchedWriteVecALU.XMM]>; 6999 7000def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst), 7001 (ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx), 7002 "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}", 7003 [(set VR128:$dst, (X86insertqi VR128:$src, VR128:$src2, 7004 timm:$len, timm:$idx))]>, 7005 XD, Sched<[SchedWriteVecALU.XMM]>; 7006def INSERTQ : I<0x79, MRMSrcReg, (outs VR128:$dst), 7007 (ins VR128:$src, VR128:$mask), 7008 "insertq\t{$mask, $src|$src, $mask}", 7009 [(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src, 7010 VR128:$mask))]>, 7011 XD, Sched<[SchedWriteVecALU.XMM]>; 7012} 7013} // ExeDomain = SSEPackedInt 7014 7015// Non-temporal (unaligned) scalar stores. 
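// For example (illustrative): an FR32 value is first pinned into VR128 via
// COPY_TO_REGCLASS and then stored as "movntss %xmm0, (%rdi)"; unlike
// movntps, these scalar stores do not require an aligned address.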
7016let AddedComplexity = 400 in { // Prefer non-temporal versions 7017let hasSideEffects = 0, mayStore = 1, SchedRW = [SchedWriteFMoveLSNT.Scl.MR] in { 7018def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src), 7019 "movntss\t{$src, $dst|$dst, $src}", []>, XS; 7020 7021def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), 7022 "movntsd\t{$src, $dst|$dst, $src}", []>, XD; 7023} // SchedRW 7024 7025def : Pat<(nontemporalstore FR32:$src, addr:$dst), 7026 (MOVNTSS addr:$dst, (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>; 7027 7028def : Pat<(nontemporalstore FR64:$src, addr:$dst), 7029 (MOVNTSD addr:$dst, (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>; 7030 7031} // AddedComplexity 7032} // HasSSE4A 7033 7034//===----------------------------------------------------------------------===// 7035// AVX Instructions 7036//===----------------------------------------------------------------------===// 7037 7038//===----------------------------------------------------------------------===// 7039// VBROADCAST - Load from memory and broadcast to all elements of the 7040// destination operand 7041// 7042class avx_broadcast_rm<bits<8> opc, string OpcodeStr, RegisterClass RC, 7043 X86MemOperand x86memop, ValueType VT, 7044 PatFrag bcast_frag, SchedWrite Sched> : 7045 AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), 7046 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 7047 [(set RC:$dst, (VT (bcast_frag addr:$src)))]>, 7048 Sched<[Sched]>, VEX; 7049 7050// AVX2 adds register forms 7051class avx2_broadcast_rr<bits<8> opc, string OpcodeStr, RegisterClass RC, 7052 ValueType ResVT, ValueType OpVT, SchedWrite Sched> : 7053 AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src), 7054 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 7055 [(set RC:$dst, (ResVT (X86VBroadcast (OpVT VR128:$src))))]>, 7056 Sched<[Sched]>, VEX; 7057 7058let ExeDomain = SSEPackedSingle, Predicates = [HasAVX, NoVLX] in { 7059 def VBROADCASTSSrm : avx_broadcast_rm<0x18, "vbroadcastss", VR128, 7060 f32mem, v4f32, X86VBroadcastld32, 7061 SchedWriteFShuffle.XMM.Folded>; 7062 def VBROADCASTSSYrm : avx_broadcast_rm<0x18, "vbroadcastss", VR256, 7063 f32mem, v8f32, X86VBroadcastld32, 7064 SchedWriteFShuffle.XMM.Folded>, VEX_L; 7065} 7066let ExeDomain = SSEPackedDouble, Predicates = [HasAVX, NoVLX] in 7067def VBROADCASTSDYrm : avx_broadcast_rm<0x19, "vbroadcastsd", VR256, f64mem, 7068 v4f64, X86VBroadcastld64, 7069 SchedWriteFShuffle.XMM.Folded>, VEX_L; 7070 7071let ExeDomain = SSEPackedSingle, Predicates = [HasAVX2, NoVLX] in { 7072 def VBROADCASTSSrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR128, 7073 v4f32, v4f32, SchedWriteFShuffle.XMM>; 7074 def VBROADCASTSSYrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR256, 7075 v8f32, v4f32, WriteFShuffle256>, VEX_L; 7076} 7077let ExeDomain = SSEPackedDouble, Predicates = [HasAVX2, NoVLX] in 7078def VBROADCASTSDYrr : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256, 7079 v4f64, v2f64, WriteFShuffle256>, VEX_L; 7080 7081//===----------------------------------------------------------------------===// 7082// VBROADCAST*128 - Load from memory and broadcast 128-bit vector to both 7083// halves of a 256-bit vector. 
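// For example (a sketch of the semantics): "vbroadcastf128 (%rax), %ymm0"
// reads 16 bytes and replicates them into both 128-bit lanes of ymm0.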
7084// 7085let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX2] in 7086def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst), 7087 (ins i128mem:$src), 7088 "vbroadcasti128\t{$src, $dst|$dst, $src}", []>, 7089 Sched<[WriteShuffleLd]>, VEX, VEX_L; 7090 7091let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX], 7092 ExeDomain = SSEPackedSingle in 7093def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst), 7094 (ins f128mem:$src), 7095 "vbroadcastf128\t{$src, $dst|$dst, $src}", []>, 7096 Sched<[SchedWriteFShuffle.XMM.Folded]>, VEX, VEX_L; 7097 7098let Predicates = [HasAVX, NoVLX] in { 7099def : Pat<(v4f64 (X86SubVBroadcastld128 addr:$src)), 7100 (VBROADCASTF128 addr:$src)>; 7101def : Pat<(v8f32 (X86SubVBroadcastld128 addr:$src)), 7102 (VBROADCASTF128 addr:$src)>; 7103// NOTE: We're using FP instructions here, but execution domain fixing can 7104// convert to integer when profitable. 7105def : Pat<(v4i64 (X86SubVBroadcastld128 addr:$src)), 7106 (VBROADCASTF128 addr:$src)>; 7107def : Pat<(v8i32 (X86SubVBroadcastld128 addr:$src)), 7108 (VBROADCASTF128 addr:$src)>; 7109def : Pat<(v16i16 (X86SubVBroadcastld128 addr:$src)), 7110 (VBROADCASTF128 addr:$src)>; 7111def : Pat<(v16f16 (X86SubVBroadcastld128 addr:$src)), 7112 (VBROADCASTF128 addr:$src)>; 7113def : Pat<(v32i8 (X86SubVBroadcastld128 addr:$src)), 7114 (VBROADCASTF128 addr:$src)>; 7115} 7116 7117//===----------------------------------------------------------------------===// 7118// VPERM2F128 - Permute Floating-Point Values in 128-bit chunks 7119// 7120 7121let ExeDomain = SSEPackedSingle in { 7122let isCommutable = 1 in 7123def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst), 7124 (ins VR256:$src1, VR256:$src2, u8imm:$src3), 7125 "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>, 7126 VEX_4V, VEX_L, Sched<[WriteFShuffle256]>; 7127def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst), 7128 (ins VR256:$src1, f256mem:$src2, u8imm:$src3), 7129 "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>, 7130 VEX_4V, VEX_L, Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>; 7131} 7132 7133// Immediate transform to help with commuting. 7134def Perm2XCommuteImm : SDNodeXForm<timm, [{ 7135 return getI8Imm(N->getZExtValue() ^ 0x22, SDLoc(N)); 7136}]>; 7137 7138multiclass vperm2x128_lowering<string InstrStr, ValueType VT, PatFrag memop_frag> { 7139 def : Pat<(VT (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 timm:$imm))), 7140 (!cast<Instruction>(InstrStr#rr) VR256:$src1, VR256:$src2, timm:$imm)>; 7141 def : Pat<(VT (X86VPerm2x128 VR256:$src1, (memop_frag addr:$src2), (i8 timm:$imm))), 7142 (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2, timm:$imm)>; 7143 // Pattern with load in other operand. 
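  // Worked example: imm 0x20 selects src1[127:0] for the low lane and
  // src2[127:0] for the high lane. With the sources commuted, XORing bits 1
  // and 5 (0x20 ^ 0x22 == 0x02) selects the same data from the swapped
  // operands.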
7144 def : Pat<(VT (X86VPerm2x128 (memop_frag addr:$src2), VR256:$src1, (i8 timm:$imm))), 7145 (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2, 7146 (Perm2XCommuteImm timm:$imm))>; 7147} 7148 7149let Predicates = [HasAVX] in { 7150 defm : vperm2x128_lowering<"VPERM2F128", v4f64, loadv4f64>; 7151 defm : vperm2x128_lowering<"VPERM2F128", v8f32, loadv8f32>; 7152} 7153 7154let Predicates = [HasAVX1Only] in { 7155 defm : vperm2x128_lowering<"VPERM2F128", v4i64, loadv4i64>; 7156 defm : vperm2x128_lowering<"VPERM2F128", v8i32, loadv8i32>; 7157 defm : vperm2x128_lowering<"VPERM2F128", v16i16, loadv16i16>; 7158 defm : vperm2x128_lowering<"VPERM2F128", v16f16, loadv16f16>; 7159 defm : vperm2x128_lowering<"VPERM2F128", v32i8, loadv32i8>; 7160} 7161 7162//===----------------------------------------------------------------------===// 7163// VINSERTF128 - Insert packed floating-point values 7164// 7165let hasSideEffects = 0, ExeDomain = SSEPackedSingle in { 7166def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst), 7167 (ins VR256:$src1, VR128:$src2, u8imm:$src3), 7168 "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 7169 []>, Sched<[WriteFShuffle256]>, VEX_4V, VEX_L; 7170let mayLoad = 1 in 7171def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst), 7172 (ins VR256:$src1, f128mem:$src2, u8imm:$src3), 7173 "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 7174 []>, Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>, VEX_4V, VEX_L; 7175} 7176 7177// To create a 256-bit all ones value, we should produce VCMPTRUEPS 7178// with YMM register containing zero. 7179// FIXME: Avoid producing vxorps to clear the fake inputs. 7180let Predicates = [HasAVX1Only] in { 7181def : Pat<(v8i32 immAllOnesV), (VCMPPSYrri (AVX_SET0), (AVX_SET0), 0xf)>; 7182} 7183 7184multiclass vinsert_lowering<string InstrStr, string PermStr, 7185 ValueType From, ValueType To, 7186 PatFrag frommemop_frag, PatFrag tomemop_frag> { 7187 def : Pat<(vinsert128_insert:$ins (To VR256:$src1), (From VR128:$src2), 7188 (iPTR imm)), 7189 (!cast<Instruction>(InstrStr#rr) VR256:$src1, VR128:$src2, 7190 (INSERT_get_vinsert128_imm VR256:$ins))>; 7191 def : Pat<(vinsert128_insert:$ins (To VR256:$src1), 7192 (From (frommemop_frag addr:$src2)), 7193 (iPTR imm)), 7194 (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2, 7195 (INSERT_get_vinsert128_imm VR256:$ins))>; 7196 // Folding "To" vector - convert to perm2x128 and commute inputs. 
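  // Illustrative sketch: vinsertf128 cannot fold a load of its 256-bit "To"
  // operand, so the insert is re-expressed as a perm2x128 whose register
  // operand is the 128-bit value widened with INSERT_SUBREG and whose memory
  // operand is the original 256-bit load, using the commuted lane-select
  // immediate.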
  def : Pat<(vinsert128_insert:$ins (To (tomemop_frag addr:$src1)),
                                    (From VR128:$src2),
                                    (iPTR imm)),
            (!cast<Instruction>(PermStr#rm)
              (INSERT_SUBREG (To (IMPLICIT_DEF)), VR128:$src2, sub_xmm),
              addr:$src1, (INSERT_get_vperm2x128_commutedimm VR256:$ins))>;
}

let Predicates = [HasAVX, NoVLX] in {
  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v4f32, v8f32, loadv4f32, loadv8f32>;
  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v2f64, v4f64, loadv2f64, loadv4f64>;
}

let Predicates = [HasAVX1Only] in {
  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v2i64, v4i64, loadv2i64, loadv4i64>;
  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v4i32, v8i32, loadv4i32, loadv8i32>;
  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v8i16, v16i16, loadv8i16, loadv16i16>;
  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v8f16, v16f16, loadv8f16, loadv16f16>;
  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v16i8, v32i8, loadv16i8, loadv32i8>;
}

//===----------------------------------------------------------------------===//
// VEXTRACTF128 - Extract packed floating-point values
//
let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst),
          (ins VR256:$src1, u8imm:$src2),
          "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
          []>, Sched<[WriteFShuffle256]>, VEX, VEX_L;
let mayStore = 1 in
def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
          (ins f128mem:$dst, VR256:$src1, u8imm:$src2),
          "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
          []>, Sched<[WriteFStoreX]>, VEX, VEX_L;
}

multiclass vextract_lowering<string InstrStr, ValueType From, ValueType To> {
  def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
            (To (!cast<Instruction>(InstrStr#rr)
                                    (From VR256:$src1),
                                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;
  def : Pat<(store (To (vextract128_extract:$ext (From VR256:$src1),
                                                 (iPTR imm))), addr:$dst),
            (!cast<Instruction>(InstrStr#mr) addr:$dst, VR256:$src1,
                                             (EXTRACT_get_vextract128_imm VR128:$ext))>;
}

// AVX1 patterns
let Predicates = [HasAVX, NoVLX] in {
  defm : vextract_lowering<"VEXTRACTF128", v8f32, v4f32>;
  defm : vextract_lowering<"VEXTRACTF128", v4f64, v2f64>;
}

let Predicates = [HasAVX1Only] in {
  defm : vextract_lowering<"VEXTRACTF128", v4i64, v2i64>;
  defm : vextract_lowering<"VEXTRACTF128", v8i32, v4i32>;
  defm : vextract_lowering<"VEXTRACTF128", v16i16, v8i16>;
  defm : vextract_lowering<"VEXTRACTF128", v16f16, v8f16>;
  defm : vextract_lowering<"VEXTRACTF128", v32i8, v16i8>;
}

//===----------------------------------------------------------------------===//
// VMASKMOV - Conditional SIMD Packed Loads and Stores
//
multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
                          Intrinsic IntLd, Intrinsic IntLd256,
                          Intrinsic IntSt, Intrinsic IntSt256,
                          X86SchedWriteMaskMove schedX,
                          X86SchedWriteMaskMove schedY> {
  def rm  : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, f128mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR128:$dst, (IntLd addr:$src2,
VR128:$src1))]>, 7272 VEX_4V, Sched<[schedX.RM]>; 7273 def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst), 7274 (ins VR256:$src1, f256mem:$src2), 7275 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7276 [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>, 7277 VEX_4V, VEX_L, Sched<[schedY.RM]>; 7278 def mr : AVX8I<opc_mr, MRMDestMem, (outs), 7279 (ins f128mem:$dst, VR128:$src1, VR128:$src2), 7280 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7281 [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>, 7282 VEX_4V, Sched<[schedX.MR]>; 7283 def Ymr : AVX8I<opc_mr, MRMDestMem, (outs), 7284 (ins f256mem:$dst, VR256:$src1, VR256:$src2), 7285 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7286 [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, 7287 VEX_4V, VEX_L, Sched<[schedY.MR]>; 7288} 7289 7290let ExeDomain = SSEPackedSingle in 7291defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps", 7292 int_x86_avx_maskload_ps, 7293 int_x86_avx_maskload_ps_256, 7294 int_x86_avx_maskstore_ps, 7295 int_x86_avx_maskstore_ps_256, 7296 WriteFMaskMove32, WriteFMaskMove32Y>; 7297let ExeDomain = SSEPackedDouble in 7298defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd", 7299 int_x86_avx_maskload_pd, 7300 int_x86_avx_maskload_pd_256, 7301 int_x86_avx_maskstore_pd, 7302 int_x86_avx_maskstore_pd_256, 7303 WriteFMaskMove64, WriteFMaskMove64Y>; 7304 7305//===----------------------------------------------------------------------===// 7306// AVX_VNNI 7307//===----------------------------------------------------------------------===// 7308let Predicates = [HasAVXVNNI, NoVLX_Or_NoVNNI], Constraints = "$src1 = $dst", 7309 ExplicitVEXPrefix = 1, checkVEXPredicate = 1 in 7310multiclass avx_vnni_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, 7311 bit IsCommutable> { 7312 let isCommutable = IsCommutable in 7313 def rr : AVX8I<opc, MRMSrcReg, (outs VR128:$dst), 7314 (ins VR128:$src1, VR128:$src2, VR128:$src3), 7315 !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 7316 [(set VR128:$dst, (v4i32 (OpNode VR128:$src1, 7317 VR128:$src2, VR128:$src3)))]>, 7318 VEX_4V, Sched<[SchedWriteVecIMul.XMM]>; 7319 7320 def rm : AVX8I<opc, MRMSrcMem, (outs VR128:$dst), 7321 (ins VR128:$src1, VR128:$src2, i128mem:$src3), 7322 !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 7323 [(set VR128:$dst, (v4i32 (OpNode VR128:$src1, VR128:$src2, 7324 (loadv4i32 addr:$src3))))]>, 7325 VEX_4V, Sched<[SchedWriteVecIMul.XMM]>; 7326 7327 let isCommutable = IsCommutable in 7328 def Yrr : AVX8I<opc, MRMSrcReg, (outs VR256:$dst), 7329 (ins VR256:$src1, VR256:$src2, VR256:$src3), 7330 !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 7331 [(set VR256:$dst, (v8i32 (OpNode VR256:$src1, 7332 VR256:$src2, VR256:$src3)))]>, 7333 VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.XMM]>; 7334 7335 def Yrm : AVX8I<opc, MRMSrcMem, (outs VR256:$dst), 7336 (ins VR256:$src1, VR256:$src2, i256mem:$src3), 7337 !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 7338 [(set VR256:$dst, (v8i32 (OpNode VR256:$src1, VR256:$src2, 7339 (loadv8i32 addr:$src3))))]>, 7340 VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.XMM]>; 7341} 7342 7343defm VPDPBUSD : avx_vnni_rm<0x50, "vpdpbusd", X86Vpdpbusd, 0>; 7344defm VPDPBUSDS : avx_vnni_rm<0x51, "vpdpbusds", X86Vpdpbusds, 0>; 7345defm VPDPWSSD : avx_vnni_rm<0x52, "vpdpwssd", X86Vpdpwssd, 1>; 7346defm VPDPWSSDS : avx_vnni_rm<0x53, "vpdpwssds", X86Vpdpwssds, 1>; 7347 7348def X86vpmaddwd_su : PatFrag<(ops node:$lhs, 
node:$rhs),
                              (X86vpmaddwd node:$lhs, node:$rhs), [{
  return N->hasOneUse();
}]>;

let Predicates = [HasAVXVNNI, NoVLX_Or_NoVNNI] in {
  def : Pat<(v8i32 (add VR256:$src1,
                        (X86vpmaddwd_su VR256:$src2, VR256:$src3))),
            (VPDPWSSDYrr VR256:$src1, VR256:$src2, VR256:$src3)>;
  def : Pat<(v8i32 (add VR256:$src1,
                        (X86vpmaddwd_su VR256:$src2, (load addr:$src3)))),
            (VPDPWSSDYrm VR256:$src1, VR256:$src2, addr:$src3)>;
  def : Pat<(v4i32 (add VR128:$src1,
                        (X86vpmaddwd_su VR128:$src2, VR128:$src3))),
            (VPDPWSSDrr VR128:$src1, VR128:$src2, VR128:$src3)>;
  def : Pat<(v4i32 (add VR128:$src1,
                        (X86vpmaddwd_su VR128:$src2, (load addr:$src3)))),
            (VPDPWSSDrm VR128:$src1, VR128:$src2, addr:$src3)>;
}

//===----------------------------------------------------------------------===//
// VPERMIL - Permute Single and Double Floating-Point Values
//

multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
                      RegisterClass RC, X86MemOperand x86memop_f,
                      X86MemOperand x86memop_i,
                      ValueType f_vt, ValueType i_vt,
                      X86FoldableSchedWrite sched,
                      X86FoldableSchedWrite varsched> {
  let Predicates = [HasAVX, NoVLX] in {
    def rr : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst),
                   (ins RC:$src1, RC:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1, (i_vt RC:$src2))))]>, VEX_4V,
                   Sched<[varsched]>;
    def rm : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst),
                   (ins RC:$src1, x86memop_i:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1,
                                  (i_vt (load addr:$src2)))))]>, VEX_4V,
                   Sched<[varsched.Folded, sched.ReadAfterFold]>;

    def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
                     (ins RC:$src1, u8imm:$src2),
                     !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     [(set RC:$dst, (f_vt (X86VPermilpi RC:$src1, (i8 timm:$src2))))]>, VEX,
                     Sched<[sched]>;
    def mi : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst),
                     (ins x86memop_f:$src1, u8imm:$src2),
                     !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     [(set RC:$dst,
                       (f_vt (X86VPermilpi (load addr:$src1), (i8 timm:$src2))))]>, VEX,
                     Sched<[sched.Folded]>;
  }// Predicates = [HasAVX, NoVLX]
}

let ExeDomain = SSEPackedSingle in {
  defm VPERMILPS  : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem,
                               v4f32, v4i32, SchedWriteFShuffle.XMM,
                               SchedWriteFVarShuffle.XMM>;
  defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem,
                               v8f32, v8i32, SchedWriteFShuffle.YMM,
                               SchedWriteFVarShuffle.YMM>, VEX_L;
}
let ExeDomain = SSEPackedDouble in {
  defm VPERMILPD  : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem,
                               v2f64, v2i64, SchedWriteFShuffle.XMM,
                               SchedWriteFVarShuffle.XMM>;
  defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
                               v4f64, v4i64, SchedWriteFShuffle.YMM,
                               SchedWriteFVarShuffle.YMM>, VEX_L;
}

//===----------------------------------------------------------------------===//
// VZERO - Zero YMM registers
// Note: These instructions do not affect YMM16-YMM31.
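// For example (illustrative): issuing vzeroupper before a stretch of legacy
// SSE code zeroes bits 255:128 of YMM0-YMM15, avoiding AVX-to-SSE transition
// penalties while leaving the low 128 bits intact.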
7425// 7426 7427let SchedRW = [WriteSystem] in { 7428let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7, 7429 YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in { 7430 // Zero All YMM registers 7431 def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall", 7432 [(int_x86_avx_vzeroall)]>, PS, VEX, VEX_L, 7433 Requires<[HasAVX]>, VEX_WIG; 7434 7435 // Zero Upper bits of YMM registers 7436 def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper", 7437 [(int_x86_avx_vzeroupper)]>, PS, VEX, 7438 Requires<[HasAVX]>, VEX_WIG; 7439} // Defs 7440} // SchedRW 7441 7442//===----------------------------------------------------------------------===// 7443// Half precision conversion instructions 7444// 7445 7446multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop, 7447 X86FoldableSchedWrite sched> { 7448 def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src), 7449 "vcvtph2ps\t{$src, $dst|$dst, $src}", 7450 [(set RC:$dst, (X86any_cvtph2ps VR128:$src))]>, 7451 T8PD, VEX, Sched<[sched]>; 7452 let hasSideEffects = 0, mayLoad = 1 in 7453 def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), 7454 "vcvtph2ps\t{$src, $dst|$dst, $src}", 7455 []>, T8PD, VEX, Sched<[sched.Folded]>; 7456} 7457 7458multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, 7459 SchedWrite RR, SchedWrite MR> { 7460 def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst), 7461 (ins RC:$src1, i32u8imm:$src2), 7462 "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", 7463 [(set VR128:$dst, (X86any_cvtps2ph RC:$src1, timm:$src2))]>, 7464 TAPD, VEX, Sched<[RR]>; 7465 let hasSideEffects = 0, mayStore = 1 in 7466 def mr : Ii8<0x1D, MRMDestMem, (outs), 7467 (ins x86memop:$dst, RC:$src1, i32u8imm:$src2), 7468 "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, 7469 TAPD, VEX, Sched<[MR]>; 7470} 7471 7472let Predicates = [HasF16C, NoVLX] in { 7473 defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem, WriteCvtPH2PS>, SIMD_EXC; 7474 defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, WriteCvtPH2PSY>, VEX_L, SIMD_EXC; 7475 defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem, WriteCvtPS2PH, 7476 WriteCvtPS2PHSt>, SIMD_EXC; 7477 defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, WriteCvtPS2PHY, 7478 WriteCvtPS2PHYSt>, VEX_L, SIMD_EXC; 7479 7480 // Pattern match vcvtph2ps of a scalar i64 load. 
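  // Only 8 bytes are read here: the four f16 inputs of the 128-bit form
  // occupy 64 bits, which is why the XMM variant above uses f64mem and why a
  // scalar i64 load can be matched directly.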
7481 def : Pat<(v4f32 (X86any_cvtph2ps (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))), 7482 (VCVTPH2PSrm addr:$src)>; 7483 def : Pat<(v4f32 (X86any_cvtph2ps (bc_v8i16 7484 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), 7485 (VCVTPH2PSrm addr:$src)>; 7486 def : Pat<(v8f32 (X86any_cvtph2ps (loadv8i16 addr:$src))), 7487 (VCVTPH2PSYrm addr:$src)>; 7488 7489 def : Pat<(store (f64 (extractelt 7490 (bc_v2f64 (v8i16 (X86any_cvtps2ph VR128:$src1, timm:$src2))), 7491 (iPTR 0))), addr:$dst), 7492 (VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>; 7493 def : Pat<(store (i64 (extractelt 7494 (bc_v2i64 (v8i16 (X86any_cvtps2ph VR128:$src1, timm:$src2))), 7495 (iPTR 0))), addr:$dst), 7496 (VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>; 7497 def : Pat<(store (v8i16 (X86any_cvtps2ph VR256:$src1, timm:$src2)), addr:$dst), 7498 (VCVTPS2PHYmr addr:$dst, VR256:$src1, timm:$src2)>; 7499} 7500 7501//===----------------------------------------------------------------------===// 7502// AVX2 Instructions 7503//===----------------------------------------------------------------------===// 7504 7505/// AVX2_blend_rmi - AVX2 blend with 8-bit immediate 7506multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode, 7507 ValueType OpVT, X86FoldableSchedWrite sched, 7508 RegisterClass RC, 7509 X86MemOperand x86memop, SDNodeXForm commuteXForm> { 7510 let isCommutable = 1 in 7511 def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst), 7512 (ins RC:$src1, RC:$src2, u8imm:$src3), 7513 !strconcat(OpcodeStr, 7514 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), 7515 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>, 7516 Sched<[sched]>, VEX_4V; 7517 def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst), 7518 (ins RC:$src1, x86memop:$src2, u8imm:$src3), 7519 !strconcat(OpcodeStr, 7520 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), 7521 [(set RC:$dst, 7522 (OpVT (OpNode RC:$src1, (load addr:$src2), timm:$src3)))]>, 7523 Sched<[sched.Folded, sched.ReadAfterFold]>, VEX_4V; 7524 7525 // Pattern to commute if load is in first source. 
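  // Worked example: a blend immediate selects from the second source wherever
  // a mask bit is set, so commuting the sources inverts the mask; for the
  // four-element form, BlendCommuteImm4 turns 0x03 into 0x0C and the same
  // elements are still selected.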
7526 def : Pat<(OpVT (OpNode (load addr:$src2), RC:$src1, timm:$src3)), 7527 (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2, 7528 (commuteXForm timm:$src3))>; 7529} 7530 7531let Predicates = [HasAVX2] in { 7532defm VPBLENDD : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v4i32, 7533 SchedWriteBlend.XMM, VR128, i128mem, 7534 BlendCommuteImm4>; 7535defm VPBLENDDY : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v8i32, 7536 SchedWriteBlend.YMM, VR256, i256mem, 7537 BlendCommuteImm8>, VEX_L; 7538 7539def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), timm:$src3), 7540 (VPBLENDDYrri VR256:$src1, VR256:$src2, (BlendScaleImm4 timm:$src3))>; 7541def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), timm:$src3), 7542 (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>; 7543def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, timm:$src3), 7544 (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>; 7545 7546def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3), 7547 (VPBLENDDrri VR128:$src1, VR128:$src2, (BlendScaleImm2to4 timm:$src3))>; 7548def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), timm:$src3), 7549 (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleImm2to4 timm:$src3))>; 7550def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, timm:$src3), 7551 (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2to4 timm:$src3))>; 7552} 7553 7554// For insertion into the zero index (low half) of a 256-bit vector, it is 7555// more efficient to generate a blend with immediate instead of an insert*128. 7556// NOTE: We're using FP instructions here, but execution domain fixing should 7557// take care of using integer instructions when profitable. 7558let Predicates = [HasAVX] in { 7559def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR 0)), 7560 (VBLENDPSYrri VR256:$src1, 7561 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 7562 VR128:$src2, sub_xmm), 0xf)>; 7563def : Pat<(insert_subvector (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR 0)), 7564 (VBLENDPSYrri VR256:$src1, 7565 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 7566 VR128:$src2, sub_xmm), 0xf)>; 7567def : Pat<(insert_subvector (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR 0)), 7568 (VBLENDPSYrri VR256:$src1, 7569 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 7570 VR128:$src2, sub_xmm), 0xf)>; 7571def : Pat<(insert_subvector (v16f16 VR256:$src1), (v8f16 VR128:$src2), (iPTR 0)), 7572 (VBLENDPSYrri VR256:$src1, 7573 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 7574 VR128:$src2, sub_xmm), 0xf)>; 7575def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)), 7576 (VBLENDPSYrri VR256:$src1, 7577 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 7578 VR128:$src2, sub_xmm), 0xf)>; 7579 7580def : Pat<(insert_subvector (loadv8i32 addr:$src2), (v4i32 VR128:$src1), (iPTR 0)), 7581 (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 7582 VR128:$src1, sub_xmm), addr:$src2, 0xf0)>; 7583def : Pat<(insert_subvector (loadv4i64 addr:$src2), (v2i64 VR128:$src1), (iPTR 0)), 7584 (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 7585 VR128:$src1, sub_xmm), addr:$src2, 0xf0)>; 7586def : Pat<(insert_subvector (loadv16i16 addr:$src2), (v8i16 VR128:$src1), (iPTR 0)), 7587 (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 7588 VR128:$src1, sub_xmm), addr:$src2, 0xf0)>; 7589def : Pat<(insert_subvector (loadv16f16 addr:$src2), (v8f16 VR128:$src1), (iPTR 0)), 7590 (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 7591 VR128:$src1, sub_xmm), addr:$src2, 0xf0)>; 
7592def : Pat<(insert_subvector (loadv32i8 addr:$src2), (v16i8 VR128:$src1), (iPTR 0)), 7593 (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 7594 VR128:$src1, sub_xmm), addr:$src2, 0xf0)>; 7595} 7596 7597//===----------------------------------------------------------------------===// 7598// VPBROADCAST - Load from memory and broadcast to all elements of the 7599// destination operand 7600// 7601multiclass avx2_broadcast<bits<8> opc, string OpcodeStr, 7602 X86MemOperand x86memop, PatFrag bcast_frag, 7603 ValueType OpVT128, ValueType OpVT256, Predicate prd> { 7604 let Predicates = [HasAVX2, prd] in { 7605 def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 7606 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 7607 [(set VR128:$dst, 7608 (OpVT128 (X86VBroadcast (OpVT128 VR128:$src))))]>, 7609 Sched<[SchedWriteShuffle.XMM]>, VEX; 7610 def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src), 7611 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 7612 [(set VR128:$dst, 7613 (OpVT128 (bcast_frag addr:$src)))]>, 7614 Sched<[SchedWriteShuffle.XMM.Folded]>, VEX; 7615 def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), 7616 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 7617 [(set VR256:$dst, 7618 (OpVT256 (X86VBroadcast (OpVT128 VR128:$src))))]>, 7619 Sched<[WriteShuffle256]>, VEX, VEX_L; 7620 def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src), 7621 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 7622 [(set VR256:$dst, 7623 (OpVT256 (bcast_frag addr:$src)))]>, 7624 Sched<[SchedWriteShuffle.XMM.Folded]>, VEX, VEX_L; 7625 7626 // Provide aliases for broadcast from the same register class that 7627 // automatically does the extract. 7628 def : Pat<(OpVT256 (X86VBroadcast (OpVT256 VR256:$src))), 7629 (!cast<Instruction>(NAME#"Yrr") 7630 (OpVT128 (EXTRACT_SUBREG (OpVT256 VR256:$src),sub_xmm)))>; 7631 } 7632} 7633 7634defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, X86VBroadcastld8, 7635 v16i8, v32i8, NoVLX_Or_NoBWI>; 7636defm VPBROADCASTW : avx2_broadcast<0x79, "vpbroadcastw", i16mem, X86VBroadcastld16, 7637 v8i16, v16i16, NoVLX_Or_NoBWI>; 7638defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, X86VBroadcastld32, 7639 v4i32, v8i32, NoVLX>; 7640defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, X86VBroadcastld64, 7641 v2i64, v4i64, NoVLX>; 7642 7643let Predicates = [HasAVX2, NoVLX] in { 7644 // Provide fallback in case the load node that is used in the patterns above 7645 // is used by additional users, which prevents the pattern selection. 
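// For example (a sketch): a v4f32 splat of a value already live in FR32 is
// selected as the AVX2 register form "vbroadcastss %xmm0, %xmm0" instead of
// forcing the value back through memory.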
7646 def : Pat<(v4f32 (X86VBroadcast FR32:$src)), 7647 (VBROADCASTSSrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>; 7648 def : Pat<(v8f32 (X86VBroadcast FR32:$src)), 7649 (VBROADCASTSSYrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>; 7650 def : Pat<(v4f64 (X86VBroadcast FR64:$src)), 7651 (VBROADCASTSDYrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>; 7652} 7653 7654let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { 7655 def : Pat<(v16i8 (X86VBroadcast GR8:$src)), 7656 (VPBROADCASTBrr (VMOVDI2PDIrr 7657 (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), 7658 GR8:$src, sub_8bit))))>; 7659 def : Pat<(v32i8 (X86VBroadcast GR8:$src)), 7660 (VPBROADCASTBYrr (VMOVDI2PDIrr 7661 (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), 7662 GR8:$src, sub_8bit))))>; 7663 7664 def : Pat<(v8i16 (X86VBroadcast GR16:$src)), 7665 (VPBROADCASTWrr (VMOVDI2PDIrr 7666 (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), 7667 GR16:$src, sub_16bit))))>; 7668 def : Pat<(v16i16 (X86VBroadcast GR16:$src)), 7669 (VPBROADCASTWYrr (VMOVDI2PDIrr 7670 (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), 7671 GR16:$src, sub_16bit))))>; 7672 7673 def : Pat<(v8f16 (X86VBroadcastld16 addr:$src)), 7674 (VPBROADCASTWrm addr:$src)>; 7675 def : Pat<(v16f16 (X86VBroadcastld16 addr:$src)), 7676 (VPBROADCASTWYrm addr:$src)>; 7677 7678 def : Pat<(v8f16 (X86VBroadcast (v8f16 VR128:$src))), 7679 (VPBROADCASTWrr VR128:$src)>; 7680 def : Pat<(v16f16 (X86VBroadcast (v8f16 VR128:$src))), 7681 (VPBROADCASTWYrr VR128:$src)>; 7682 7683 def : Pat<(v8f16 (X86VBroadcast (f16 FR16:$src))), 7684 (VPBROADCASTWrr (COPY_TO_REGCLASS FR16:$src, VR128))>; 7685 def : Pat<(v16f16 (X86VBroadcast (f16 FR16:$src))), 7686 (VPBROADCASTWYrr (COPY_TO_REGCLASS FR16:$src, VR128))>; 7687} 7688let Predicates = [HasAVX2, NoVLX] in { 7689 def : Pat<(v4i32 (X86VBroadcast GR32:$src)), 7690 (VPBROADCASTDrr (VMOVDI2PDIrr GR32:$src))>; 7691 def : Pat<(v8i32 (X86VBroadcast GR32:$src)), 7692 (VPBROADCASTDYrr (VMOVDI2PDIrr GR32:$src))>; 7693 def : Pat<(v2i64 (X86VBroadcast GR64:$src)), 7694 (VPBROADCASTQrr (VMOV64toPQIrr GR64:$src))>; 7695 def : Pat<(v4i64 (X86VBroadcast GR64:$src)), 7696 (VPBROADCASTQYrr (VMOV64toPQIrr GR64:$src))>; 7697} 7698 7699// AVX1 broadcast patterns 7700let Predicates = [HasAVX1Only] in { 7701def : Pat<(v8i32 (X86VBroadcastld32 addr:$src)), 7702 (VBROADCASTSSYrm addr:$src)>; 7703def : Pat<(v4i64 (X86VBroadcastld64 addr:$src)), 7704 (VBROADCASTSDYrm addr:$src)>; 7705def : Pat<(v4i32 (X86VBroadcastld32 addr:$src)), 7706 (VBROADCASTSSrm addr:$src)>; 7707} 7708 7709 // Provide fallback in case the load node that is used in the patterns above 7710 // is used by additional users, which prevents the pattern selection. 
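// For example (a sketch): a v2f64 splat of an f64 already in a register is
// lowered to "vmovddup %xmm0, %xmm0" rather than re-reading the value
// through a broadcast load.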
7711let Predicates = [HasAVX, NoVLX] in { 7712 // 128bit broadcasts: 7713 def : Pat<(v2f64 (X86VBroadcast f64:$src)), 7714 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>; 7715 def : Pat<(v2f64 (X86VBroadcastld64 addr:$src)), 7716 (VMOVDDUPrm addr:$src)>; 7717 7718 def : Pat<(v2f64 (X86VBroadcast v2f64:$src)), 7719 (VMOVDDUPrr VR128:$src)>; 7720} 7721 7722let Predicates = [HasAVX1Only] in { 7723 def : Pat<(v4f32 (X86VBroadcast FR32:$src)), 7724 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)>; 7725 def : Pat<(v8f32 (X86VBroadcast FR32:$src)), 7726 (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), 7727 (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), sub_xmm), 7728 (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), 1)>; 7729 def : Pat<(v8f32 (X86VBroadcast v4f32:$src)), 7730 (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), 7731 (v4f32 (VPERMILPSri VR128:$src, 0)), sub_xmm), 7732 (v4f32 (VPERMILPSri VR128:$src, 0)), 1)>; 7733 def : Pat<(v4f64 (X86VBroadcast FR64:$src)), 7734 (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), 7735 (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), sub_xmm), 7736 (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), 1)>; 7737 def : Pat<(v4f64 (X86VBroadcast v2f64:$src)), 7738 (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), 7739 (v2f64 (VMOVDDUPrr VR128:$src)), sub_xmm), 7740 (v2f64 (VMOVDDUPrr VR128:$src)), 1)>; 7741 7742 def : Pat<(v4i32 (X86VBroadcast GR32:$src)), 7743 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)>; 7744 def : Pat<(v8i32 (X86VBroadcast GR32:$src)), 7745 (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 7746 (v4i32 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)), sub_xmm), 7747 (v4i32 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)), 1)>; 7748 def : Pat<(v4i64 (X86VBroadcast GR64:$src)), 7749 (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), 7750 (v4i32 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)), sub_xmm), 7751 (v4i32 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)), 1)>; 7752 7753 def : Pat<(v2i64 (X86VBroadcast i64:$src)), 7754 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)>; 7755 def : Pat<(v2i64 (X86VBroadcastld64 addr:$src)), 7756 (VMOVDDUPrm addr:$src)>; 7757} 7758 7759//===----------------------------------------------------------------------===// 7760// VPERM - Permute instructions 7761// 7762 7763multiclass avx2_perm<bits<8> opc, string OpcodeStr, 7764 ValueType OpVT, X86FoldableSchedWrite Sched, 7765 X86MemOperand memOp> { 7766 let Predicates = [HasAVX2, NoVLX] in { 7767 def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), 7768 (ins VR256:$src1, VR256:$src2), 7769 !strconcat(OpcodeStr, 7770 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7771 [(set VR256:$dst, 7772 (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>, 7773 Sched<[Sched]>, VEX_4V, VEX_L; 7774 def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), 7775 (ins VR256:$src1, memOp:$src2), 7776 !strconcat(OpcodeStr, 7777 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7778 [(set VR256:$dst, 7779 (OpVT (X86VPermv VR256:$src1, 7780 (load addr:$src2))))]>, 7781 Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX_4V, VEX_L; 7782 } 7783} 7784 7785defm VPERMD : avx2_perm<0x36, "vpermd", v8i32, WriteVarShuffle256, i256mem>; 7786let ExeDomain = SSEPackedSingle in 7787defm VPERMPS : avx2_perm<0x16, "vpermps", v8f32, WriteFVarShuffle256, f256mem>; 7788 7789multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, 7790 ValueType OpVT, X86FoldableSchedWrite Sched, 7791 X86MemOperand memOp> { 7792 
  let Predicates = [HasAVX2, NoVLX] in {
  def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst),
                     (ins VR256:$src1, u8imm:$src2),
                     !strconcat(OpcodeStr,
                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     [(set VR256:$dst,
                       (OpVT (X86VPermi VR256:$src1, (i8 timm:$src2))))]>,
                     Sched<[Sched]>, VEX, VEX_L;
  def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst),
                     (ins memOp:$src1, u8imm:$src2),
                     !strconcat(OpcodeStr,
                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     [(set VR256:$dst,
                       (OpVT (X86VPermi (mem_frag addr:$src1),
                              (i8 timm:$src2))))]>,
                     Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX, VEX_L;
  }
}

defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64,
                            WriteShuffle256, i256mem>, VEX_W;
let ExeDomain = SSEPackedDouble in
defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64,
                             WriteFShuffle256, f256mem>, VEX_W;

//===----------------------------------------------------------------------===//
// VPERM2I128 - Permute Integer vector Values in 128-bit chunks
//
let isCommutable = 1 in
def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
                            (ins VR256:$src1, VR256:$src2, u8imm:$src3),
                            "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
                            Sched<[WriteShuffle256]>, VEX_4V, VEX_L;
def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
                            (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
                            "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
                            Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;

let Predicates = [HasAVX2] in {
  defm : vperm2x128_lowering<"VPERM2I128", v4i64,  loadv4i64>;
  defm : vperm2x128_lowering<"VPERM2I128", v8i32,  loadv8i32>;
  defm : vperm2x128_lowering<"VPERM2I128", v16i16, loadv16i16>;
  defm : vperm2x128_lowering<"VPERM2I128", v16f16, loadv16f16>;
  defm : vperm2x128_lowering<"VPERM2I128", v32i8,  loadv32i8>;
}

//===----------------------------------------------------------------------===//
// VINSERTI128 - Insert packed integer values
//
let hasSideEffects = 0 in {
def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst),
                             (ins VR256:$src1, VR128:$src2, u8imm:$src3),
                             "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
                             []>, Sched<[WriteShuffle256]>, VEX_4V, VEX_L;
let mayLoad = 1 in
def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
                             (ins VR256:$src1, i128mem:$src2, u8imm:$src3),
                             "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
                             []>, Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;
}

let Predicates = [HasAVX2, NoVLX] in {
  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v2i64, v4i64, loadv2i64, loadv4i64>;
  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v4i32, v8i32, loadv4i32, loadv8i32>;
  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v8i16, v16i16, loadv8i16, loadv16i16>;
  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v8f16, v16f16, loadv8f16, loadv16f16>;
  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v16i8, v32i8, loadv16i8, loadv32i8>;
}

//===----------------------------------------------------------------------===//
// VEXTRACTI128 - Extract packed integer values
//
def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg, (outs VR128:$dst),
                              (ins VR256:$src1, u8imm:$src2),
                              "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                              Sched<[WriteShuffle256]>, VEX, VEX_L;
let hasSideEffects = 0, mayStore = 1 in
def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs),
                              (ins i128mem:$dst, VR256:$src1, u8imm:$src2),
                              "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                              Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_L;

let Predicates = [HasAVX2, NoVLX] in {
  defm : vextract_lowering<"VEXTRACTI128", v4i64,  v2i64>;
  defm : vextract_lowering<"VEXTRACTI128", v8i32,  v4i32>;
  defm : vextract_lowering<"VEXTRACTI128", v16i16, v8i16>;
  defm : vextract_lowering<"VEXTRACTI128", v16f16, v8f16>;
  defm : vextract_lowering<"VEXTRACTI128", v32i8,  v16i8>;
}

//===----------------------------------------------------------------------===//
// VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores
//
multiclass avx2_pmovmask<string OpcodeStr,
                         Intrinsic IntLd128, Intrinsic IntLd256,
                         Intrinsic IntSt128, Intrinsic IntSt256,
                         X86SchedWriteMaskMove schedX,
                         X86SchedWriteMaskMove schedY> {
  def rm  : AVX28I<0x8c, MRMSrcMem, (outs VR128:$dst),
                   (ins VR128:$src1, i128mem:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>,
                   VEX_4V, Sched<[schedX.RM]>;
  def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst),
                   (ins VR256:$src1, i256mem:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
                   VEX_4V, VEX_L, Sched<[schedY.RM]>;
  def mr  : AVX28I<0x8e, MRMDestMem, (outs),
                   (ins i128mem:$dst, VR128:$src1, VR128:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>,
                   VEX_4V, Sched<[schedX.MR]>;
  def Ymr : AVX28I<0x8e, MRMDestMem, (outs),
                   (ins i256mem:$dst, VR256:$src1, VR256:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
                   VEX_4V, VEX_L, Sched<[schedY.MR]>;
}

defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd",
                                int_x86_avx2_maskload_d,
                                int_x86_avx2_maskload_d_256,
                                int_x86_avx2_maskstore_d,
                                int_x86_avx2_maskstore_d_256,
                                WriteVecMaskMove32, WriteVecMaskMove32Y>;
defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
                                int_x86_avx2_maskload_q,
                                int_x86_avx2_maskload_q_256,
                                int_x86_avx2_maskstore_q,
                                int_x86_avx2_maskstore_q_256,
                                WriteVecMaskMove64, WriteVecMaskMove64Y>, VEX_W;

multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT,
                            ValueType MaskVT> {
  // masked store
  def: Pat<(masked_store (VT RC:$src), addr:$ptr, (MaskVT RC:$mask)),
           (!cast<Instruction>(InstrStr#"mr") addr:$ptr, RC:$mask, RC:$src)>;
  // masked load
  def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask), undef)),
           (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
  def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask),
                            (VT immAllZerosV))),
           (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
}
let Predicates = [HasAVX] in {
  defm : maskmov_lowering<"VMASKMOVPS", VR128, v4f32, v4i32>;
let Predicates = [HasAVX] in {
  defm : maskmov_lowering<"VMASKMOVPS",  VR128, v4f32, v4i32>;
  defm : maskmov_lowering<"VMASKMOVPD",  VR128, v2f64, v2i64>;
  defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8f32, v8i32>;
  defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4f64, v4i64>;
}
let Predicates = [HasAVX1Only] in {
  // Integer (i32/i64) masked loads/stores are not available on AVX1;
  // use the ps/pd versions instead.
  defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8i32, v8i32>;
  defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4i64, v4i64>;
  defm : maskmov_lowering<"VMASKMOVPS",  VR128, v4i32, v4i32>;
  defm : maskmov_lowering<"VMASKMOVPD",  VR128, v2i64, v2i64>;
}
let Predicates = [HasAVX2] in {
  defm : maskmov_lowering<"VPMASKMOVDY", VR256, v8i32, v8i32>;
  defm : maskmov_lowering<"VPMASKMOVQY", VR256, v4i64, v4i64>;
  defm : maskmov_lowering<"VPMASKMOVD",  VR128, v4i32, v4i32>;
  defm : maskmov_lowering<"VPMASKMOVQ",  VR128, v2i64, v2i64>;
}

//===----------------------------------------------------------------------===//
// Variable Bit Shifts
//
multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType vt128, ValueType vt256> {
  def rr  : AVX28I<opc, MRMSrcReg, (outs VR128:$dst),
                   (ins VR128:$src1, VR128:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set VR128:$dst,
                     (vt128 (OpNode VR128:$src1, (vt128 VR128:$src2))))]>,
                   VEX_4V, Sched<[SchedWriteVarVecShift.XMM]>;
  def rm  : AVX28I<opc, MRMSrcMem, (outs VR128:$dst),
                   (ins VR128:$src1, i128mem:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set VR128:$dst,
                     (vt128 (OpNode VR128:$src1,
                                    (vt128 (load addr:$src2)))))]>,
                   VEX_4V, Sched<[SchedWriteVarVecShift.XMM.Folded,
                                  SchedWriteVarVecShift.XMM.ReadAfterFold]>;
  def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
                   (ins VR256:$src1, VR256:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set VR256:$dst,
                     (vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>,
                   VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM]>;
  def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
                   (ins VR256:$src1, i256mem:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set VR256:$dst,
                     (vt256 (OpNode VR256:$src1,
                                    (vt256 (load addr:$src2)))))]>,
                   VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM.Folded,
                                         SchedWriteVarVecShift.YMM.ReadAfterFold]>;
}

let Predicates = [HasAVX2, NoVLX] in {
  defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", X86vshlv, v4i32, v8i32>;
  defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", X86vshlv, v2i64, v4i64>, VEX_W;
  defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", X86vsrlv, v4i32, v8i32>;
  defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", X86vsrlv, v2i64, v4i64>, VEX_W;
  defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", X86vsrav, v4i32, v8i32>;
}
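
// Unlike the legacy PSLL/PSRL/PSRA forms, these take a per-element shift
// count. Out-of-range counts (>= the element width) produce 0 for the
// logical shifts, while vpsravd fills the element with its sign bit, e.g.:
//   vpsllvd %xmm1, %xmm0, %xmm2  // xmm2[i] = xmm1[i] < 32
//                                //           ? xmm0[i] << xmm1[i] : 0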
//===----------------------------------------------------------------------===//
// VGATHER - GATHER Operations

// FIXME: Improve scheduling of gather instructions.
multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256,
                       X86MemOperand memop128, X86MemOperand memop256> {
let mayLoad = 1, hasSideEffects = 0 in {
  def rm  : AVX28I<opc, MRMSrcMem4VOp3, (outs VR128:$dst, VR128:$mask_wb),
                   (ins VR128:$src1, memop128:$src2, VR128:$mask),
                   !strconcat(OpcodeStr,
                              "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
                   []>, VEX, Sched<[WriteLoad, WriteVecMaskedGatherWriteback]>;
  def Yrm : AVX28I<opc, MRMSrcMem4VOp3, (outs RC256:$dst, RC256:$mask_wb),
                   (ins RC256:$src1, memop256:$src2, RC256:$mask),
                   !strconcat(OpcodeStr,
                              "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
                   []>, VEX, VEX_L, Sched<[WriteLoad, WriteVecMaskedGatherWriteback]>;
}
}

let Predicates = [HasAVX2] in {
  let mayLoad = 1, hasSideEffects = 0, Constraints
    = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
    in {
    defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq",
                                  VR256, vx128mem, vx256mem>, VEX_W;
    defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq",
                                  VR256, vx128mem, vy256mem>, VEX_W;
    defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd",
                                  VR256, vx128mem, vy256mem>;
    defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd",
                                  VR128, vx64mem, vy128mem>;

    let ExeDomain = SSEPackedDouble in {
      defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd",
                                    VR256, vx128mem, vx256mem>, VEX_W;
      defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd",
                                    VR256, vx128mem, vy256mem>, VEX_W;
    }

    let ExeDomain = SSEPackedSingle in {
      defm VGATHERDPS : avx2_gather<0x92, "vgatherdps",
                                    VR256, vx128mem, vy256mem>;
      defm VGATHERQPS : avx2_gather<0x93, "vgatherqps",
                                    VR128, vx64mem, vy128mem>;
    }
  }
}
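
// Gathers produce two results: the loaded data and the mask register.
// Hardware zeroes each mask element once the corresponding element has
// been loaded, so $mask_wb is modeled as a second output tied to $mask.
// Both outputs are @earlyclobber because the ISA requires the destination,
// index, and mask registers to be pairwise distinct.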
//===----------------------------------------------------------------------===//
// GFNI instructions
//===----------------------------------------------------------------------===//

multiclass GF2P8MULB_rm<string OpcodeStr, ValueType OpVT,
                        RegisterClass RC, PatFrag MemOpFrag,
                        X86MemOperand X86MemOp, X86FoldableSchedWrite sched,
                        bit Is2Addr = 0> {
  let ExeDomain = SSEPackedInt,
      AsmString = !if(Is2Addr,
                      OpcodeStr#"\t{$src2, $dst|$dst, $src2}",
                      OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}") in {
    let isCommutable = 1 in
    def rr : PDI<0xCF, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), "",
                 [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1, RC:$src2)))]>,
             Sched<[sched]>, T8PD;

    def rm : PDI<0xCF, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, X86MemOp:$src2), "",
                 [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1,
                                       (MemOpFrag addr:$src2))))]>,
             Sched<[sched.Folded, sched.ReadAfterFold]>, T8PD;
  }
}

multiclass GF2P8AFFINE_rmi<bits<8> Op, string OpStr, ValueType OpVT,
                           SDNode OpNode, RegisterClass RC, PatFrag MemOpFrag,
                           X86MemOperand X86MemOp, X86FoldableSchedWrite sched,
                           bit Is2Addr = 0> {
  let AsmString = !if(Is2Addr,
                      OpStr#"\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                      OpStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}") in {
    def rri : Ii8<Op, MRMSrcReg, (outs RC:$dst),
                  (ins RC:$src1, RC:$src2, u8imm:$src3), "",
                  [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))],
                  SSEPackedInt>, Sched<[sched]>;
    def rmi : Ii8<Op, MRMSrcMem, (outs RC:$dst),
                  (ins RC:$src1, X86MemOp:$src2, u8imm:$src3), "",
                  [(set RC:$dst, (OpVT (OpNode RC:$src1,
                                        (MemOpFrag addr:$src2),
                                        timm:$src3)))], SSEPackedInt>,
                  Sched<[sched.Folded, sched.ReadAfterFold]>;
  }
}

multiclass GF2P8AFFINE_common<bits<8> Op, string OpStr, SDNode OpNode> {
  let Constraints = "$src1 = $dst",
      Predicates  = [HasGFNI, UseSSE2] in
  defm NAME : GF2P8AFFINE_rmi<Op, OpStr, v16i8, OpNode,
                              VR128, load, i128mem, SchedWriteVecIMul.XMM, 1>;
  let Predicates = [HasGFNI, HasAVX, NoVLX] in {
    defm V#NAME   : GF2P8AFFINE_rmi<Op, "v"#OpStr, v16i8, OpNode, VR128,
                                    load, i128mem, SchedWriteVecIMul.XMM>,
                                    VEX_4V, VEX_W;
    defm V#NAME#Y : GF2P8AFFINE_rmi<Op, "v"#OpStr, v32i8, OpNode, VR256,
                                    load, i256mem, SchedWriteVecIMul.YMM>,
                                    VEX_4V, VEX_L, VEX_W;
  }
}

// GF2P8MULB
let Constraints = "$src1 = $dst",
    Predicates  = [HasGFNI, UseSSE2] in
defm GF2P8MULB : GF2P8MULB_rm<"gf2p8mulb", v16i8, VR128, memop,
                              i128mem, SchedWriteVecALU.XMM, 1>;
let Predicates = [HasGFNI, HasAVX, NoVLX] in {
  defm VGF2P8MULB  : GF2P8MULB_rm<"vgf2p8mulb", v16i8, VR128, load,
                                  i128mem, SchedWriteVecALU.XMM>, VEX_4V;
  defm VGF2P8MULBY : GF2P8MULB_rm<"vgf2p8mulb", v32i8, VR256, load,
                                  i256mem, SchedWriteVecALU.YMM>, VEX_4V, VEX_L;
}
// GF2P8AFFINEINVQB, GF2P8AFFINEQB
let isCommutable = 0 in {
  defm GF2P8AFFINEINVQB : GF2P8AFFINE_common<0xCF, "gf2p8affineinvqb",
                                             X86GF2P8affineinvqb>, TAPD;
  defm GF2P8AFFINEQB    : GF2P8AFFINE_common<0xCE, "gf2p8affineqb",
                                             X86GF2P8affineqb>, TAPD;
}
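
// Rough semantics: gf2p8mulb multiplies bytes as elements of GF(2^8)
// reduced by the AES polynomial x^8 + x^4 + x^3 + x + 1 (0x11B). The
// affine forms compute A*x + b over GF(2), where each qword of src2 holds
// the 8x8 bit matrix A applied to the data bytes x from src1 and the
// immediate supplies the constant byte b; gf2p8affineinvqb first replaces
// x with its GF(2^8) multiplicative inverse (the AES S-box core).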
// AVX-IFMA
let Predicates = [HasAVXIFMA, NoVLX_Or_NoIFMA], Constraints = "$src1 = $dst",
    checkVEXPredicate = 1 in
multiclass avx_ifma_rm<bits<8> opc, string OpcodeStr, SDNode OpNode> {
  // NOTE: The SDNode has the multiply operands first with the add last.
  // This enables commuted load patterns to be autogenerated by tablegen.
  let isCommutable = 1 in {
    def rr : AVX8I<opc, MRMSrcReg, (outs VR128:$dst),
                   (ins VR128:$src1, VR128:$src2, VR128:$src3),
                   !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                   [(set VR128:$dst, (v2i64 (OpNode VR128:$src2,
                                             VR128:$src3, VR128:$src1)))]>,
                   VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;
  }
  def rm : AVX8I<opc, MRMSrcMem, (outs VR128:$dst),
                 (ins VR128:$src1, VR128:$src2, i128mem:$src3),
                 !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                 [(set VR128:$dst, (v2i64 (OpNode VR128:$src2,
                                           (loadv2i64 addr:$src3), VR128:$src1)))]>,
                 VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;
  let isCommutable = 1 in {
    def Yrr : AVX8I<opc, MRMSrcReg, (outs VR256:$dst),
                    (ins VR256:$src1, VR256:$src2, VR256:$src3),
                    !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                    [(set VR256:$dst, (v4i64 (OpNode VR256:$src2,
                                              VR256:$src3, VR256:$src1)))]>,
                    VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
  }
  def Yrm : AVX8I<opc, MRMSrcMem, (outs VR256:$dst),
                  (ins VR256:$src1, VR256:$src2, i256mem:$src3),
                  !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                  [(set VR256:$dst, (v4i64 (OpNode VR256:$src2,
                                            (loadv4i64 addr:$src3), VR256:$src1)))]>,
                  VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
}

defm VPMADD52HUQ : avx_ifma_rm<0xb5, "vpmadd52huq", x86vpmadd52h>, VEX_W, ExplicitVEXPrefix;
defm VPMADD52LUQ : avx_ifma_rm<0xb4, "vpmadd52luq", x86vpmadd52l>, VEX_W, ExplicitVEXPrefix;
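
// Sketch of the per-qword semantics (both forms take the low 52 bits of
// each multiplicand and keep a full 64-bit accumulator):
//   vpmadd52luq: dst[i] += (lo52(src2[i]) * lo52(src3[i]))[51:0]
//   vpmadd52huq: dst[i] += (lo52(src2[i]) * lo52(src3[i]))[103:52]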
// AVX-VNNI-INT8
let Constraints = "$src1 = $dst" in
multiclass avx_dotprod_rm<bits<8> Opc, string OpcodeStr, ValueType OpVT,
                          RegisterClass RC, PatFrag MemOpFrag,
                          X86MemOperand X86memop, SDNode OpNode,
                          X86FoldableSchedWrite Sched,
                          bit IsCommutable> {
  let isCommutable = IsCommutable in
  def rr : I<Opc, MRMSrcReg, (outs RC:$dst),
             (ins RC:$src1, RC:$src2, RC:$src3),
             !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
             [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>,
             VEX_4V, Sched<[Sched]>;
  def rm : I<Opc, MRMSrcMem, (outs RC:$dst),
             (ins RC:$src1, RC:$src2, X86memop:$src3),
             !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
             [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2,
                                   (MemOpFrag addr:$src3))))]>,
             VEX_4V, Sched<[Sched.Folded, Sched.ReadAfterFold]>;
}

let Predicates = [HasAVXVNNIINT8] in {
  defm VPDPBSSD   : avx_dotprod_rm<0x50, "vpdpbssd",  v4i32, VR128, loadv4i32,
                                   i128mem, X86vpdpbssd, SchedWriteVecIMul.XMM,
                                   1>, T8XD;
  defm VPDPBSSDY  : avx_dotprod_rm<0x50, "vpdpbssd",  v8i32, VR256, loadv8i32,
                                   i256mem, X86vpdpbssd, SchedWriteVecIMul.YMM,
                                   1>, VEX_L, T8XD;
  defm VPDPBUUD   : avx_dotprod_rm<0x50, "vpdpbuud",  v4i32, VR128, loadv4i32,
                                   i128mem, X86vpdpbuud, SchedWriteVecIMul.XMM,
                                   1>, T8PS;
  defm VPDPBUUDY  : avx_dotprod_rm<0x50, "vpdpbuud",  v8i32, VR256, loadv8i32,
                                   i256mem, X86vpdpbuud, SchedWriteVecIMul.YMM,
                                   1>, VEX_L, T8PS;
  defm VPDPBSSDS  : avx_dotprod_rm<0x51, "vpdpbssds", v4i32, VR128, loadv4i32,
                                   i128mem, X86vpdpbssds, SchedWriteVecIMul.XMM,
                                   1>, T8XD;
  defm VPDPBSSDSY : avx_dotprod_rm<0x51, "vpdpbssds", v8i32, VR256, loadv8i32,
                                   i256mem, X86vpdpbssds, SchedWriteVecIMul.YMM,
                                   1>, VEX_L, T8XD;
  defm VPDPBUUDS  : avx_dotprod_rm<0x51, "vpdpbuuds", v4i32, VR128, loadv4i32,
                                   i128mem, X86vpdpbuuds, SchedWriteVecIMul.XMM,
                                   1>, T8PS;
  defm VPDPBUUDSY : avx_dotprod_rm<0x51, "vpdpbuuds", v8i32, VR256, loadv8i32,
                                   i256mem, X86vpdpbuuds, SchedWriteVecIMul.YMM,
                                   1>, VEX_L, T8PS;
  defm VPDPBSUD   : avx_dotprod_rm<0x50, "vpdpbsud",  v4i32, VR128, loadv4i32,
                                   i128mem, X86vpdpbsud, SchedWriteVecIMul.XMM,
                                   0>, T8XS;
  defm VPDPBSUDY  : avx_dotprod_rm<0x50, "vpdpbsud",  v8i32, VR256, loadv8i32,
                                   i256mem, X86vpdpbsud, SchedWriteVecIMul.YMM,
                                   0>, VEX_L, T8XS;
  defm VPDPBSUDS  : avx_dotprod_rm<0x51, "vpdpbsuds", v4i32, VR128, loadv4i32,
                                   i128mem, X86vpdpbsuds, SchedWriteVecIMul.XMM,
                                   0>, T8XS;
  defm VPDPBSUDSY : avx_dotprod_rm<0x51, "vpdpbsuds", v8i32, VR256, loadv8i32,
                                   i256mem, X86vpdpbsuds, SchedWriteVecIMul.YMM,
                                   0>, VEX_L, T8XS;
}

// AVX-NE-CONVERT
multiclass AVX_NE_CONVERT_BASE<bits<8> Opcode, string OpcodeStr,
                               X86MemOperand MemOp128, X86MemOperand MemOp256> {
  def rm : I<Opcode, MRMSrcMem, (outs VR128:$dst), (ins MemOp128:$src),
             !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
             [(set VR128:$dst,
               (!cast<Intrinsic>("int_x86_"#OpcodeStr#"128") addr:$src))]>,
             Sched<[WriteCvtPH2PS]>, VEX;
  def Yrm : I<Opcode, MRMSrcMem, (outs VR256:$dst), (ins MemOp256:$src),
              !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
              [(set VR256:$dst,
                (!cast<Intrinsic>("int_x86_"#OpcodeStr#"256") addr:$src))]>,
              Sched<[WriteCvtPH2PSY]>, VEX, VEX_L;
}

multiclass VCVTNEPS2BF16_BASE {
  def rr : I<0x72, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
             "vcvtneps2bf16\t{$src, $dst|$dst, $src}",
             [(set VR128:$dst, (int_x86_vcvtneps2bf16128 VR128:$src))]>,
             Sched<[WriteCvtPH2PS]>;
  def rm : I<0x72, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
             "vcvtneps2bf16{x}\t{$src, $dst|$dst, $src}",
             [(set VR128:$dst, (int_x86_vcvtneps2bf16128 (loadv4f32 addr:$src)))]>,
             Sched<[WriteCvtPH2PS]>;
  def Yrr : I<0x72, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
              "vcvtneps2bf16\t{$src, $dst|$dst, $src}",
              [(set VR128:$dst, (int_x86_vcvtneps2bf16256 VR256:$src))]>,
              Sched<[WriteCvtPH2PSY]>, VEX_L;
  def Yrm : I<0x72, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
              "vcvtneps2bf16{y}\t{$src, $dst|$dst, $src}",
              [(set VR128:$dst, (int_x86_vcvtneps2bf16256 (loadv8f32 addr:$src)))]>,
              Sched<[WriteCvtPH2PSY]>, VEX_L;
}

let Predicates = [HasAVXNECONVERT] in {
  defm VBCSTNEBF162PS : AVX_NE_CONVERT_BASE<0xb1, "vbcstnebf162ps", f16mem,
                                            f16mem>, T8XS;
  defm VBCSTNESH2PS   : AVX_NE_CONVERT_BASE<0xb1, "vbcstnesh2ps", f16mem,
                                            f16mem>, T8PD;
  defm VCVTNEEBF162PS : AVX_NE_CONVERT_BASE<0xb0, "vcvtneebf162ps", f128mem,
                                            f256mem>, T8XS;
  defm VCVTNEEPH2PS   : AVX_NE_CONVERT_BASE<0xb0, "vcvtneeph2ps", f128mem,
                                            f256mem>, T8PD;
  defm VCVTNEOBF162PS : AVX_NE_CONVERT_BASE<0xb0, "vcvtneobf162ps", f128mem,
                                            f256mem>, T8XD;
  defm VCVTNEOPH2PS   : AVX_NE_CONVERT_BASE<0xb0, "vcvtneoph2ps", f128mem,
                                            f256mem>, T8PS;
  let checkVEXPredicate = 1 in
  defm VCVTNEPS2BF16  : VCVTNEPS2BF16_BASE, VEX, T8XS, ExplicitVEXPrefix;
}

def : InstAlias<"vcvtneps2bf16x\t{$src, $dst|$dst, $src}",
                (VCVTNEPS2BF16rr VR128:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtneps2bf16y\t{$src, $dst|$dst, $src}",
                (VCVTNEPS2BF16Yrr VR128:$dst, VR256:$src), 0, "att">;
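
// The destination of vcvtneps2bf16 is an xmm register for both source
// widths, so the bare AT&T mnemonic cannot distinguish a 128-bit source
// in memory from a 256-bit one; the "{x}"/"{y}" suffixes in the memory
// forms disambiguate, and the aliases above accept the suffixed spellings
// for the register forms as well.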