//=- ARMSchedCyclone.td - AArch64 Cyclone Scheduling Defs ----*- tablegen -*-=//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the machine model for AArch64 Cyclone to support
// instruction scheduling and other instruction cost heuristics.
//
//===----------------------------------------------------------------------===//

// Top-level machine model parameters for Cyclone.
def CycloneModel : SchedMachineModel {
  // Up to 6 micro-ops are dispatched each cycle.
  let IssueWidth = 6;
  // Sized after the reorder buffer.
  let MicroOpBufferSize = 192;
  // Optimistic estimate of load latency.
  let LoadLatency = 4;
  // Typical mispredictions cost 14-19 cycles.
  let MispredictPenalty = 16;
}

//===----------------------------------------------------------------------===//
// Processor resources: each kind of unit on Cyclone and how many exist.

// Integer pipes: 4 in total.
def CyUnitI : ProcResource<4> {
  let BufferSize = 48;
}

// Branch units: 2, living on I[0..1].
def CyUnitB : ProcResource<2> {
  let BufferSize = 24;
  let Super = CyUnitI;
}

// Indirect-branch unit: 1, living on I[0].
def CyUnitBR : ProcResource<1> {
  let Super = CyUnitB;
}

// Shifter pipes: 2, living on I[2..3].
// Consuming a CyUnitIS also consumes a CyUnitI.
def CyUnitIS : ProcResource<2> {
  let BufferSize = 24;
  let Super = CyUnitI;
}

// Multiply pipe: 1, living on I[0].
def CyUnitIM : ProcResource<1> {
  let BufferSize = 32;
  let Super = CyUnitBR;
}

// Divide pipe: 1, living on I[1].
def CyUnitID : ProcResource<1> {
  let BufferSize = 16;
  let Super = CyUnitB;
}

// Integer division unit: 1. It is driven by the ID pipe, but the pipe itself
// is only occupied for one cycle at issue and one more cycle at writeback.
def CyUnitIntDiv : ProcResource<1>;

// 2 ld/st pipes.
def CyUnitLS : ProcResource<2> {
  let BufferSize = 28;
}

// FP/vector pipes: 3 in total.
def CyUnitV : ProcResource<3> {
  let BufferSize = 48;
}

// FP/vector arithmetic and multiply pipes: 2, living on V[0-1].
def CyUnitVM : ProcResource<2> {
  let BufferSize = 32;
  let Super = CyUnitV;
}

// FP/vector division and square-root pipe: 1, living on V[2].
def CyUnitVD : ProcResource<1> {
  let BufferSize = 16;
  let Super = CyUnitV;
}

// FP compare pipe: 1, living on V[0].
def CyUnitVC : ProcResource<1> {
  let BufferSize = 16;
  let Super = CyUnitVM;
}

// FP division/square-root units: 2. They are driven by the VD pipe, but the
// pipe itself is only occupied for one cycle at issue and one at writeback.
def CyUnitFloatDiv : ProcResource<2>;

//===----------------------------------------------------------------------===//
// Scheduler read/write resources and latencies for Cyclone.
// The layout follows sections 7.7-7.9 of the Tuning Guide v1.0.1.

let SchedModel = CycloneModel in {

//---
// 7.8.1. Moves
//---

// One nop micro-op (uX): no resources, zero latency.
def WriteX : SchedWriteRes<[]> {
  let Latency = 0;
}

// Moving zero is handled as a register rename (to machine register zero),
// so the move turns into a single nop micro-op.
// MOVZ Rd, #0
// AND Rd, Rzr, #imm
def WriteZPred : SchedPredicate<[{TII->isGPRZero(MI)}]>;
def WriteImmZ : SchedWriteVariant<[
  SchedVar<WriteZPred,  [WriteX]>,
  SchedVar<NoSchedPred, [WriteImm]>
]>;
def : InstRW<[WriteImmZ], (instrs MOVZWi, MOVZXi, ANDWri, ANDXri)>;

// Move GPR is a register rename and single nop micro-op.
// ORR Xd, XZR, Xm
// ADD Xd, Xn, #0
def WriteIMovPred : SchedPredicate<[{TII->isGPRCopy(MI)}]>;
def WriteVMovPred : SchedPredicate<[{TII->isFPRCopy(MI)}]>;
def WriteMov : SchedWriteVariant<[
  SchedVar<WriteIMovPred, [WriteX]>,
  SchedVar<WriteVMovPred, [WriteX]>,
  SchedVar<NoSchedPred,   [WriteI]>
]>;
def : InstRW<[WriteMov], (instrs COPY, ORRXrr, ADDXrr)>;

// A move of a non-zero immediate executes as an integer ALU op.
// MOVN,MOVZ,MOVK
def : WriteRes<WriteImm, [CyUnitI]>;

//---
// 7.8.2-7.8.5. Arithmetic and Logical, Comparison, Conditional,
//              Shifts and Bitfield Operations
//---

// Everything below maps onto a plain integer pipe:
//
// ADR,ADRP
// ADD(S)ri,SUB(S)ri,AND(S)ri,EORri,ORRri
// ADD(S)rr,SUB(S)rr,AND(S)rr,BIC(S)rr,EONrr,EORrr,ORNrr,ORRrr
// ADC(S),SBC(S)
// Aliases: CMN, CMP, TST
//
// Conditional operations.
// CCMNi,CCMPi,CCMNr,CCMPr,
// CSEL,CSINC,CSINV,CSNEG
//
// Bit counting and reversal operations.
// CLS,CLZ,RBIT,REV,REV16,REV32
def : WriteRes<WriteI, [CyUnitI]>;

// An ADD with a shifted register operand is one micro-op which holds a
// shift pipeline for two cycles.
// ADD(S)rs,SUB(S)rs,AND(S)rs,BIC(S)rs,EONrs,EORrs,ORNrs,ORRrs
// EXAMPLE: ADDrs Xn, Xm LSL #imm
def : WriteRes<WriteISReg, [CyUnitIS]> {
  let ResourceCycles = [2];
  let Latency = 2;
}

// An ADD with an extended register operand behaves like the shifted
// register form.
// ADD(S)re,SUB(S)re
// EXAMPLE: ADDXre Xn, Xm, UXTB #1
def : WriteRes<WriteIEReg, [CyUnitIS]> {
  let ResourceCycles = [2];
  let Latency = 2;
}

// Variable shifts and bitfield operations.
// ASRV,LSLV,LSRV,RORV,BFM,SBFM,UBFM
def : WriteRes<WriteIS, [CyUnitIS]>;

// EXTR Shifts a pair of registers and requires two micro-ops.
// The second micro-op is delayed, as modeled by ReadExtrHi.
// EXTR Xn, Xm, #imm
def : WriteRes<WriteExtr, [CyUnitIS, CyUnitIS]> {
  let NumMicroOps = 2;
  let Latency = 2;
}

// The first register read of EXTR happens one cycle late, which in effect
// shortens the latency of whatever wrote that register.
// EXTR Xn, Xm, #imm
def : ReadAdvance<ReadExtrHi, 1>;

//---
// 7.8.6. Multiplies
//---

// MUL and MNEG are aliases of MADD and MSUB.
// MADDW,MSUBW,SMADDL,SMSUBL,UMADDL,UMSUBL
def : WriteRes<WriteIM32, [CyUnitIM]> {
  let Latency = 4;
}

// MADDX,MSUBX,SMULH,UMULH
def : WriteRes<WriteIM64, [CyUnitIM]> {
  let Latency = 5;
}

//---
// 7.8.7. Divide
//---

// A 32-bit divide needs 7-13 cycles; 10 cycles covers a 20-bit quotient.
// The ID pipe is held for 2 cycles (issue and writeback).
// SDIVW,UDIVW
def : WriteRes<WriteID32, [CyUnitID, CyUnitIntDiv]> {
  let ResourceCycles = [2, 10];
  let Latency = 10;
}

// A 64-bit divide needs 7-21 cycles; 13 cycles covers a 32-bit quotient.
// The ID pipe is held for 2 cycles (issue and writeback).
// SDIVX,UDIVX
def : WriteRes<WriteID64, [CyUnitID, CyUnitIntDiv]> {
  let ResourceCycles = [2, 13];
  let Latency = 13;
}

//---
// 7.8.8,7.8.10. Load/Store, single element
//---

// An integer load has 4 cycles of latency and occupies one LS unit for a
// single cycle.
def : WriteRes<WriteLD, [CyUnitLS]> {
  let Latency = 4;
}

// Store-to-load forwarding takes 4 cycles.
//
// Note: this latency is part of the store-exclusive sequence. General
// heuristics, however, should not model a dependence between a store and a
// later may-alias load, because hardware speculation works.
def : WriteRes<WriteST, [CyUnitLS]> {
  let Latency = 4;
}

// Load from base address plus an optionally scaled register offset.
// Rt latency is latency WriteIS + WriteLD.
// EXAMPLE: LDR Xn, Xm [, lsl 3]
def CyWriteLDIdx : SchedWriteVariant<[
  // Load whose register offset is scaled.
  SchedVar<ScaledIdxPred, [WriteIS, WriteLD]>,
  // Load with a plain register offset.
  SchedVar<NoSchedPred,   [WriteLD]>
]>;
// Bind the generic AArch64 type to the Cyclone variant.
def : SchedAlias<WriteLDIdx, CyWriteLDIdx>;

// EXAMPLE: STR Xn, Xm [, lsl 3]
def CyWriteSTIdx : SchedWriteVariant<[
  // Store whose register offset is scaled.
  SchedVar<ScaledIdxPred, [WriteIS, WriteST]>,
  // Store with a plain register offset.
  SchedVar<NoSchedPred,   [WriteST]>
]>;
// Bind the generic AArch64 type to the Cyclone variant.
def : SchedAlias<WriteSTIdx, CyWriteSTIdx>;

// The (unshifted) base register Xn is read by the second micro-op, one cycle
// later.
// EXAMPLE: LDR Xn, Xm [, lsl 3]
def ReadBaseRS : SchedReadAdvance<1>;
def CyReadAdrBase : SchedReadVariant<[
  // Base register is read after the offset has been shifted.
  SchedVar<ScaledIdxPred, [ReadBaseRS]>,
  // Base register is read with no shift involved.
  SchedVar<NoSchedPred,   [ReadDefault]>
]>;
// Bind the generic AArch64 type to the Cyclone variant.
def : SchedAlias<ReadAdrBase, CyReadAdrBase>;

//---
// 7.8.9,7.8.11. Load/Store, paired
//---

// Pre/post increment of the address is a simple one-cycle ALU op.
def : WriteRes<WriteAdr, [CyUnitI]>;

// The high register write of LDP is fused with the load, yet a nop micro-op
// is still left over.
def : WriteRes<WriteLDHi, []> {
  let Latency = 4;
}

// STP is a vector op plus a store; the QQ form is the exception and is
// simply two stores.
def : SchedAlias<WriteSTP, WriteVSTShuffle>;
def : InstRW<[WriteST, WriteST], (instrs STPQi)>;

//---
// 7.8.13. Branches
//---

// A branch is a single micro-op.
// The cost of a misprediction is a property of the SchedMachineModel.
def : WriteRes<WriteBr, [CyUnitB]> {
  let Latency = 0;
}
def : WriteRes<WriteBrReg, [CyUnitBR]> {
  let Latency = 0;
}

//---
// 7.8.14. Never-issued Instructions, Barrier and Hint Operations
//---

// NOP,SEV,SEVL,WFE,WFI,YIELD
def : WriteRes<WriteHint, []> {
  let Latency = 0;
}
// ISB
def : InstRW<[WriteI], (instrs ISB)>;
// CLREX,DMB,DSB
def : WriteRes<WriteBarrier, [CyUnitLS]>;

// System instructions are given an invalid latency, since the latency of
// other operations across them has no meaning.
def : WriteRes<WriteSys, []> {
  let Latency = -1;
}

//===----------------------------------------------------------------------===//
// 7.9 Vector Unit Instructions

// A simple vector operation takes 2 cycles.
def : WriteRes<WriteV, [CyUnitV]> {
  let Latency = 2;
}

// Cyclone-specific vector op types with longer latencies.
def CyWriteV3 : SchedWriteRes<[CyUnitV]> {
  let Latency = 3;
}
def CyWriteV4 : SchedWriteRes<[CyUnitV]> {
  let Latency = 4;
}
def CyWriteV5 : SchedWriteRes<[CyUnitV]> {
  let Latency = 5;
}
def CyWriteV6 : SchedWriteRes<[CyUnitV]> {
  let Latency = 6;
}

// A simple floating-point operation takes 2 cycles.
def : WriteRes<WriteF, [CyUnitV]> {
  let Latency = 2;
}

//---
// 7.9.1 Vector Moves
//---

// TODO: Add Cyclone-specific zero-cycle zeros. LLVM currently
// generates expensive int-float conversion instead:
// FMOVDi Dd, #0.0
// FMOVv2f64ns Vd.2d, #0.0

// FMOVSi,FMOVDi
def : WriteRes<WriteFImm, [CyUnitV]> {
  let Latency = 2;
}

// MOVI,MVNI are WriteV
// FMOVv2f32ns,FMOVv2f64ns,FMOVv4f32ns are WriteV

// An FPR move is a register rename and a single nop micro-op.
// ORR.16b Vd,Vn,Vn
// COPY is covered earlier by the WriteMov variant.
def WriteVMov : SchedWriteVariant<[
  SchedVar<WriteVMovPred, [WriteX]>,
  SchedVar<NoSchedPred,   [WriteV]>
]>;
def : InstRW<[WriteVMov], (instrs ORRv16i8)>;

// FMOVSr,FMOVDr are WriteF.

// MOV V,V is a WriteV.

// CPY D,V[x] is a WriteV

// INS V[x],V[y] is a WriteV.

// FMOVWSr,FMOVXDr,FMOVXDHighr
def : WriteRes<WriteFCopy, [CyUnitLS]> {
  let Latency = 5;
}

// FMOVSWr,FMOVDXr
def : InstRW<[WriteLD], (instrs FMOVSWr, FMOVDXr, FMOVDXHighr)>;

// INS V[x],R
def CyWriteCopyToFPR : WriteSequence<[WriteVLD, WriteV]>;
def : InstRW<[CyWriteCopyToFPR], (instregex "INSv")>;

// SMOV,UMOV R,V[x]
def CyWriteCopyToGPR : WriteSequence<[WriteLD, WriteI]>;
def : InstRW<[CyWriteCopyToGPR], (instregex "SMOVv", "UMOVv")>;

// DUP V,R
def : InstRW<[CyWriteCopyToFPR], (instregex "DUPv")>;

// DUP V,V[x] is a WriteV.

//---
// 7.9.2 Integer Arithmetic, Logical, and Comparisons
//---

// BIC,ORR V,#imm are WriteV

def : InstRW<[CyWriteV3], (instregex "ABSv")>;

// MVN,NEG,NOT are WriteV

def : InstRW<[CyWriteV3], (instregex "SQABSv", "SQNEGv")>;

// ADDP is a WriteV.
def CyWriteVADDLP : SchedWriteRes<[CyUnitV]> {
  let Latency = 2;
}
def : InstRW<[CyWriteVADDLP], (instregex "SADDLPv", "UADDLPv")>;

def : InstRW<[CyWriteV3],
             (instregex "ADDVv", "SMAXVv", "UMAXVv", "SMINVv", "UMINVv")>;

def : InstRW<[CyWriteV3], (instregex "SADDLV", "UADDLV")>;

// ADD,SUB are WriteV

// Declared ahead of its InstRW so the accumulate read-advance can name it.
def CyWriteVABD : SchedWriteRes<[CyUnitV]> {
  let Latency = 3;
}

// Add/Diff and accumulate uses the vector multiply unit.
def CyWriteVAccum : SchedWriteRes<[CyUnitVM]> {
  let Latency = 3;
}
def CyReadVAccum : SchedReadAdvance<1, [CyWriteVAccum,
                                        CyWriteVADDLP,
                                        CyWriteVABD]>;

def : InstRW<[CyWriteVAccum, CyReadVAccum], (instregex "SADALP", "UADALP")>;

def : InstRW<[CyWriteVAccum, CyReadVAccum],
             (instregex "SABAv", "UABAv", "SABALv", "UABALv")>;

def : InstRW<[CyWriteV3],
             (instregex "SQADDv", "SQSUBv", "UQADDv", "UQSUBv")>;

def : InstRW<[CyWriteV3], (instregex "SUQADDv", "USQADDv")>;

def : InstRW<[CyWriteV4],
             (instregex "ADDHNv", "RADDHNv", "RSUBHNv", "SUBHNv")>;

// The following are left as WriteV:
// AND,BIC,CMTST,EOR,ORN,ORR
// ADDP
// SHADD,SHSUB,SRHADD,UHADD,UHSUB,URHADD
// SADDL,SSUBL,UADDL,USUBL
// SADDW,SSUBW,UADDW,USUBW

def : InstRW<[CyWriteV3],
             (instregex "CMEQv", "CMGEv", "CMGTv",
                        "CMLEv", "CMLTv",
                        "CMHIv", "CMHSv")>;

def : InstRW<[CyWriteV3],
             (instregex "SMAXv", "SMINv", "UMAXv", "UMINv",
                        "SMAXPv", "SMINPv", "UMAXPv", "UMINPv")>;

def : InstRW<[CyWriteVABD],
             (instregex "SABDv", "UABDv", "SABDLv", "UABDLv")>;

//---
// 7.9.3 Floating Point Arithmetic and Comparisons
//---

// FABS,FNEG are WriteF

def : InstRW<[CyWriteV4], (instrs FADDPv2i32p)>;
def : InstRW<[CyWriteV5], (instrs FADDPv2i64p)>;

def : InstRW<[CyWriteV3],
             (instregex "FMAXPv2i", "FMAXNMPv2i", "FMINPv2i", "FMINNMPv2i")>;

def : InstRW<[CyWriteV4],
             (instregex "FMAXVv", "FMAXNMVv", "FMINVv", "FMINNMVv")>;

def : InstRW<[CyWriteV4],
             (instrs FADDSrr, FADDv2f32, FADDv4f32,
                     FSUBSrr, FSUBv2f32, FSUBv4f32,
                     FADDPv2f32, FADDPv4f32,
                     FABD32, FABDv2f32, FABDv4f32)>;
def : InstRW<[CyWriteV5],
             (instrs FADDDrr, FADDv2f64,
                     FSUBDrr, FSUBv2f64,
                     FADDPv2f64,
                     FABD64, FABDv2f64)>;

def : InstRW<[CyWriteV3], (instregex "FCMEQ", "FCMGT", "FCMLE", "FCMLT")>;

def : InstRW<[CyWriteV3],
             (instregex "FACGE", "FACGT",
                        "FMAXS", "FMAXD", "FMAXv",
                        "FMINS", "FMIND", "FMINv",
                        "FMAXNMS", "FMAXNMD", "FMAXNMv",
                        "FMINNMS", "FMINNMD", "FMINNMv",
                        "FMAXPv2f", "FMAXPv4f",
                        "FMINPv2f", "FMINPv4f",
                        "FMAXNMPv2f", "FMAXNMPv4f",
                        "FMINNMPv2f", "FMINNMPv4f")>;

// FCMP,FCMPE,FCCMP,FCCMPE
def : WriteRes<WriteFCmp, [CyUnitVC]> {
  let Latency = 4;
}

// FCSEL is a WriteF.

//---
// 7.9.4 Shifts and Bitfield Operations
//---

// SHL is a WriteV

def CyWriteVSHR : SchedWriteRes<[CyUnitV]> {
  let Latency = 2;
}
def : InstRW<[CyWriteVSHR], (instregex "SSHRv", "USHRv")>;

def CyWriteVSRSHR : SchedWriteRes<[CyUnitV]> {
  let Latency = 3;
}
def : InstRW<[CyWriteVSRSHR], (instregex "SRSHRv", "URSHRv")>;

// Shift-and-accumulate goes through the vector multiply unit.
def CyWriteVShiftAcc : SchedWriteRes<[CyUnitVM]> {
  let Latency = 3;
}
def CyReadVShiftAcc : SchedReadAdvance<1, [CyWriteVShiftAcc,
                                           CyWriteVSHR,
                                           CyWriteVSRSHR]>;
def : InstRW<[CyWriteVShiftAcc, CyReadVShiftAcc],
             (instregex "SRSRAv", "SSRAv", "URSRAv", "USRAv")>;

// SSHL,USHL are WriteV.

def : InstRW<[CyWriteV3], (instregex "SRSHLv", "URSHLv")>;

// SQSHL,SQSHLU,UQSHL are WriteV.

def : InstRW<[CyWriteV3], (instregex "SQRSHLv", "UQRSHLv")>;

// The following are left as WriteV:
// SHLL,SSHLL,USHLL
// SLI,SRI
// BIF,BIT,BSL
// EXT
// CLS,CLZ,CNT,RBIT,REV16,REV32,REV64,XTN
// XTN2

def : InstRW<[CyWriteV4],
             (instregex "RSHRNv", "SHRNv",
                        "SQRSHRNv", "SQRSHRUNv", "SQSHRNv", "SQSHRUNv",
                        "UQRSHRNv", "UQSHRNv", "SQXTNv", "SQXTUNv", "UQXTNv")>;

//---
// 7.9.5 Multiplication
//---

def CyWriteVMul : SchedWriteRes<[CyUnitVM]> {
  let Latency = 4;
}
def : InstRW<[CyWriteVMul],
             (instregex "MULv", "SMULLv", "UMULLv",
                        "SQDMULLv", "SQDMULHv", "SQRDMULHv")>;

// FMUL,FMULX,FNMUL default to WriteFMul.
def : WriteRes<WriteFMul, [CyUnitVM]> {
  let Latency = 4;
}

// 64-bit FP multiplies listed here take one extra cycle.
def CyWriteV64Mul : SchedWriteRes<[CyUnitVM]> {
  let Latency = 5;
}
def : InstRW<[CyWriteV64Mul],
             (instrs FMULDrr, FMULv2f64, FMULv2i64_indexed,
                     FNMULDrr, FMULX64, FMULXv2f64, FMULXv2i64_indexed)>;

// Integer multiply-accumulate: the read is advanced by one cycle relative to
// the listed multiply writes.
def CyReadVMulAcc : SchedReadAdvance<1, [CyWriteVMul, CyWriteV64Mul]>;
def : InstRW<[CyWriteVMul, CyReadVMulAcc],
             (instregex "MLA", "MLS", "SMLAL", "SMLSL", "UMLAL", "UMLSL",
                        "SQDMLAL", "SQDMLSL")>;

// Fused FP multiply-add writes (8 and 10 cycles), each paired with a read
// advance that applies only to its own write type.
def CyWriteSMul : SchedWriteRes<[CyUnitVM]> {
  let Latency = 8;
}
def CyWriteDMul : SchedWriteRes<[CyUnitVM]> {
  let Latency = 10;
}
def CyReadSMul : SchedReadAdvance<4, [CyWriteSMul]>;
def CyReadDMul : SchedReadAdvance<5, [CyWriteDMul]>;

def : InstRW<[CyWriteSMul, CyReadSMul],
             (instrs FMADDSrrr, FMSUBSrrr, FNMADDSrrr, FNMSUBSrrr,
                     FMLAv2f32, FMLAv4f32,
                     FMLAv1i32_indexed, FMLAv1i64_indexed, FMLAv2i32_indexed)>;
def : InstRW<[CyWriteDMul, CyReadDMul],
             (instrs FMADDDrrr, FMSUBDrrr, FNMADDDrrr, FNMSUBDrrr,
                     FMLAv2f64, FMLAv2i64_indexed,
                     FMLSv2f64, FMLSv2i64_indexed)>;

def CyWritePMUL : SchedWriteRes<[CyUnitVD]> {
  let Latency = 3;
}
def : InstRW<[CyWritePMUL], (instregex "PMULv", "PMULLv")>;

//---
// 7.9.6 Divide and Square Root
//---

// FDIV,FSQRT
// TODO: Add 64-bit variant with 19 cycle latency.
// TODO: Specialize FSQRT for longer latency.
// The VD pipe is held for 2 cycles and a float-divide unit for 17 cycles.
def : WriteRes<WriteFDiv, [CyUnitVD, CyUnitFloatDiv]> {
  let Latency = 17;
  let ResourceCycles = [2, 17];
}

def : InstRW<[CyWriteV4],
             (instregex "FRECPEv", "FRECPXv", "URECPEv", "URSQRTEv")>;

def WriteFRSQRTE : SchedWriteRes<[CyUnitVM]> { let Latency = 4; }
def : InstRW<[WriteFRSQRTE], (instregex "FRSQRTEv")>;

def WriteFRECPS  : SchedWriteRes<[CyUnitVM]> { let Latency = 8; }
def WriteFRSQRTS : SchedWriteRes<[CyUnitVM]> { let Latency = 10; }
def : InstRW<[WriteFRECPS],  (instregex "FRECPSv")>;
def : InstRW<[WriteFRSQRTS], (instregex "FRSQRTSv")>;

//---
// 7.9.7 Integer-FP Conversions
//---

// FCVT lengthen f16/s32
def : InstRW<[WriteV], (instrs FCVTSHr, FCVTDHr, FCVTDSr)>;

// FCVT,FCVTN,FCVTXN
// SCVTF,UCVTF V,V
// FRINT(AIMNPXZ) V,V
def : WriteRes<WriteFCvt, [CyUnitV]> { let Latency = 4; }

// SCVTF/UCVTF S/D, Rn (GPR source, FPR result) = VLD5+V4: 9 cycles.
// The regex must name the [SU]CVTF opcodes: those are the conversions whose
// result lands in an FPR. Previously CyWriteCvtToFPR was defined but never
// referenced, and these opcodes were bound to the to-GPR sequence instead.
def CyWriteCvtToFPR : WriteSequence<[WriteVLD, CyWriteV4]>;
def : InstRW<[CyWriteCvtToFPR], (instregex "[SU]CVTF[SU][WX][SD]r")>;

// FCVT* Rd, S/D (FPR source, GPR result) = V6+LD4: 10 cycles.
// Previously these opcodes were bound to CyWriteCopyToFPR, i.e. the
// INS/DUP GPR->FPR copy sequence, which models the wrong direction.
def CyWriteCvtToGPR : WriteSequence<[CyWriteV6, WriteLD]>;
def : InstRW<[CyWriteCvtToGPR], (instregex "FCVT[AMNPZ][SU][SU][WX][SD]r")>;

// FCVTL is a WriteV

//---
// 7.9.8-7.9.10 Cryptography, Data Transposition, Table Lookup
//---

def CyWriteCrypto2 : SchedWriteRes<[CyUnitVD]> { let Latency = 2; }
def : InstRW<[CyWriteCrypto2],
             (instrs AESIMCrr, AESMCrr, SHA1Hrr,
                     AESDrr, AESErr, SHA1SU1rr, SHA256SU0rr,
                     SHA1SU0rrr)>;

def CyWriteCrypto3 : SchedWriteRes<[CyUnitVD]> { let Latency = 3; }
def : InstRW<[CyWriteCrypto3], (instrs SHA256SU1rrr)>;

def CyWriteCrypto6 : SchedWriteRes<[CyUnitVD]> { let Latency = 6; }
def : InstRW<[CyWriteCrypto6],
             (instrs SHA1Crrr, SHA1Mrrr, SHA1Prrr,
                     SHA256Hrrr, SHA256H2rrr)>;

// TRN,UZP,ZIP are WriteV.

// TBL,TBX are WriteV.

//---
// 7.9.11-7.9.14 Load/Store, single element and paired
//---

// A load into the vector unit takes 5 cycles, against 4 for an integer load.
def : WriteRes<WriteVLD, [CyUnitLS]> { let Latency = 5; }

// Store-to-load forwarding takes 4 cycles.
def : WriteRes<WriteVST, [CyUnitLS]> { let Latency = 4; }

// WriteVLDPair/VSTPair sequences are expanded by the target description.

//---
// 7.9.15 Load, element operations
//---

// Def operands are matched only by the first WriteVLD and by the WriteAdr
// used for writeback. Any further WriteVLD merely consumes resources. That
// is acceptable because every loaded value has the same latency.

// Vd is read 5 cycles after the vector load issues.
def : ReadAdvance<ReadVLD, 5>;

def : InstRW<[WriteVLD], (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
def : InstRW<[WriteVLD, WriteAdr],
             (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>;

// Register writes from the load's high half are fused micro-ops.
// LD1, two registers.
def : InstRW<[WriteVLD],
             (instregex "LD1Twov(8b|4h|2s|1d)$")>;
def : InstRW<[WriteVLD, WriteAdr],
             (instregex "LD1Twov(8b|4h|2s|1d)_POST")>;
def : InstRW<[WriteVLD, WriteVLD],
             (instregex "LD1Twov(16b|8h|4s|2d)$")>;
def : InstRW<[WriteVLD, WriteAdr, WriteVLD],
             (instregex "LD1Twov(16b|8h|4s|2d)_POST")>;

// LD1, three registers.
def : InstRW<[WriteVLD, WriteVLD],
             (instregex "LD1Threev(8b|4h|2s|1d)$")>;
def : InstRW<[WriteVLD, WriteAdr, WriteVLD],
             (instregex "LD1Threev(8b|4h|2s|1d)_POST")>;
def : InstRW<[WriteVLD, WriteVLD, WriteVLD],
             (instregex "LD1Threev(16b|8h|4s|2d)$")>;
def : InstRW<[WriteVLD, WriteAdr, WriteVLD, WriteVLD],
             (instregex "LD1Threev(16b|8h|4s|2d)_POST")>;

// LD1, four registers.
def : InstRW<[WriteVLD, WriteVLD],
             (instregex "LD1Fourv(8b|4h|2s|1d)$")>;
def : InstRW<[WriteVLD, WriteAdr, WriteVLD],
             (instregex "LD1Fourv(8b|4h|2s|1d)_POST")>;
def : InstRW<[WriteVLD, WriteVLD, WriteVLD, WriteVLD],
             (instregex "LD1Fourv(16b|8h|4s|2d)$")>;
def : InstRW<[WriteVLD, WriteAdr, WriteVLD, WriteVLD, WriteVLD],
             (instregex "LD1Fourv(16b|8h|4s|2d)_POST")>;

// LD1, single lane. ReadVLD covers the read of the destination vector.
def : InstRW<[WriteVLDShuffle, ReadVLD],
             (instregex "LD1i(8|16|32)$")>;
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr],
             (instregex "LD1i(8|16|32)_POST")>;

def : InstRW<[WriteVLDShuffle, ReadVLD],          (instrs LD1i64)>;
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr],(instrs LD1i64_POST)>;

// LD1R, load and replicate.
def : InstRW<[WriteVLDShuffle],
             (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
def : InstRW<[WriteVLDShuffle, WriteAdr],
             (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;

// LD2, multiple structures.
def : InstRW<[WriteVLDShuffle, WriteV],
             (instregex "LD2Twov(8b|4h|2s)$")>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV],
             (instregex "LD2Twov(8b|4h|2s)_POST$")>;
def : InstRW<[WriteVLDShuffle, WriteVLDShuffle],
             (instregex "LD2Twov(16b|8h|4s|2d)$")>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle],
             (instregex "LD2Twov(16b|8h|4s|2d)_POST")>;

// LD2, single lane.
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV],
             (instregex "LD2i(8|16|32)$")>;
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV],
             (instregex "LD2i(8|16|32)_POST")>;
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV],
             (instregex "LD2i64$")>;
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV],
             (instregex "LD2i64_POST")>;

// LD2R, load and replicate.
def : InstRW<[WriteVLDShuffle, WriteV],
             (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV],
             (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>;

// LD3, multiple structures.
def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV],
             (instregex "LD3Threev(8b|4h|2s)$")>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV],
             (instregex "LD3Threev(8b|4h|2s)_POST")>;
def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteVLDShuffle],
             (instregex "LD3Threev(16b|8h|4s|2d)$")>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteVLDShuffle],
             (instregex "LD3Threev(16b|8h|4s|2d)_POST")>;

// LD3, single lane.
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV, WriteV],
             (instregex "LD3i(8|16|32)$")>;
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV, WriteV],
             (instregex "LD3i(8|16|32)_POST")>;

def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteV],
             (instregex "LD3i64$")>;
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle, WriteV],
             (instregex "LD3i64_POST")>;

// LD3R, load and replicate.
def : InstRW<[WriteVLDShuffle, WriteV, WriteV],
             (instregex "LD3Rv(8b|4h|2s|16b|8h|4s)$")>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV],
             (instregex "LD3Rv(8b|4h|2s|16b|8h|4s)_POST")>;

def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV],
             (instrs LD3Rv1d,LD3Rv2d)>;
// Both 64-bit-element post-index forms are listed, mirroring the non-POST
// entry above. The list used to name LD3Rv2d_POST twice, which left
// LD3Rv1d_POST without a Cyclone-specific entry.
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV],
             (instrs LD3Rv1d_POST,LD3Rv2d_POST)>;

// LD4, multiple structures.
def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV],
             (instregex "LD4Fourv(8b|4h|2s)$")>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV, WriteV],
             (instregex "LD4Fourv(8b|4h|2s)_POST")>;
def : InstRW<[WriteVLDPairShuffle, WriteVLDPairShuffle,
              WriteVLDPairShuffle, WriteVLDPairShuffle],
             (instregex "LD4Fourv(16b|8h|4s|2d)$")>;
def : InstRW<[WriteVLDPairShuffle, WriteAdr, WriteVLDPairShuffle,
              WriteVLDPairShuffle, WriteVLDPairShuffle],
             (instregex "LD4Fourv(16b|8h|4s|2d)_POST")>;

// LD4, single lane.
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV, WriteV, WriteV],
             (instregex "LD4i(8|16|32)$")>;
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV, WriteV, WriteV],
             (instregex "LD4i(8|16|32)_POST")>;

def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteV, WriteV],
             (instrs LD4i64)>;
// Same writes as LD4i64 plus WriteAdr for the writeback. The second WriteV
// was previously missing, which made this entry identical to LD3i64_POST.
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle, WriteV,
              WriteV],
             (instrs LD4i64_POST)>;

// LD4R, load and replicate.
def : InstRW<[WriteVLDShuffle, WriteV, WriteV, WriteV],
             (instregex "LD4Rv(8b|4h|2s|16b|8h|4s)$")>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV, WriteV],
             (instregex "LD4Rv(8b|4h|2s|16b|8h|4s)_POST")>;

def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV],
             (instrs LD4Rv1d,LD4Rv2d)>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV, WriteV],
             (instrs LD4Rv1d_POST,LD4Rv2d_POST)>;

//---
// 7.9.16 Store, element operations
//---

// Only the WriteAdr for writeback matches a def operand.
// Subsequent WriteVSTs only consume resources.

// ST1, one register.
def : InstRW<[WriteVST], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
def : InstRW<[WriteAdr, WriteVST],
             (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>;

// ST1, two registers.
def : InstRW<[WriteVSTShuffle], (instregex "ST1Twov(8b|4h|2s|1d)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle],
             (instregex "ST1Twov(8b|4h|2s|1d)_POST")>;
def : InstRW<[WriteVST, WriteVST], (instregex "ST1Twov(16b|8h|4s|2d)$")>;
def : InstRW<[WriteAdr, WriteVST, WriteVST],
             (instregex "ST1Twov(16b|8h|4s|2d)_POST")>;

// ST1, three registers.
def : InstRW<[WriteVSTShuffle, WriteVST],
             (instregex "ST1Threev(8b|4h|2s|1d)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVST],
             (instregex "ST1Threev(8b|4h|2s|1d)_POST")>;
def : InstRW<[WriteVST, WriteVST, WriteVST],
             (instregex "ST1Threev(16b|8h|4s|2d)$")>;
def : InstRW<[WriteAdr, WriteVST, WriteVST, WriteVST],
             (instregex "ST1Threev(16b|8h|4s|2d)_POST")>;

// ST1, four registers.
def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],
             (instregex "ST1Fourv(8b|4h|2s|1d)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
             (instregex "ST1Fourv(8b|4h|2s|1d)_POST")>;
def : InstRW<[WriteVST, WriteVST, WriteVST, WriteVST],
             (instregex "ST1Fourv(16b|8h|4s|2d)$")>;
def : InstRW<[WriteAdr, WriteVST, WriteVST, WriteVST, WriteVST],
             (instregex "ST1Fourv(16b|8h|4s|2d)_POST")>;

// ST1, single lane.
def : InstRW<[WriteVSTShuffle],
             (instregex "ST1i(8|16|32)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle],
             (instregex "ST1i(8|16|32)_POST")>;

def : InstRW<[WriteVSTShuffle],
             (instrs ST1i64)>;
def : InstRW<[WriteAdr, WriteVSTShuffle],
             (instrs ST1i64_POST)>;

// ST2, multiple structures.
def : InstRW<[WriteVSTShuffle], (instregex "ST2Twov(8b|4h|2s)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle],
             (instregex "ST2Twov(8b|4h|2s)_POST")>;
def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],
             (instregex "ST2Twov(16b|8h|4s|2d)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
             (instregex "ST2Twov(16b|8h|4s|2d)_POST")>;

// ST2, single lane.
def : InstRW<[WriteVSTShuffle],
             (instregex "ST2i(8|16|32)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle],
             (instregex "ST2i(8|16|32)_POST")>;
def : InstRW<[WriteVSTShuffle],
             (instrs ST2i64)>;
def : InstRW<[WriteAdr, WriteVSTShuffle],
             (instrs ST2i64_POST)>;

// ST3, multiple structures.
def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],
             (instregex "ST3Threev(8b|4h|2s)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
             (instregex "ST3Threev(8b|4h|2s)_POST")>;
def : InstRW<[WriteVSTShuffle, WriteVSTShuffle, WriteVSTShuffle],
             (instregex "ST3Threev(16b|8h|4s|2d)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle, WriteVSTShuffle],
             (instregex "ST3Threev(16b|8h|4s|2d)_POST")>;

// ST3, single lane.
def : InstRW<[WriteVSTShuffle],
             (instregex "ST3i(8|16|32)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle],
             (instregex "ST3i(8|16|32)_POST")>;

def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],
             (instrs ST3i64)>;
def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
             (instrs ST3i64_POST)>;

// ST4, multiple structures.
def : InstRW<[WriteVSTPairShuffle, WriteVSTPairShuffle],
             (instregex "ST4Fourv(8b|4h|2s|1d)$")>;
def : InstRW<[WriteAdr, WriteVSTPairShuffle, WriteVSTPairShuffle],
             (instregex "ST4Fourv(8b|4h|2s|1d)_POST")>;
def : InstRW<[WriteVSTPairShuffle, WriteVSTPairShuffle,
              WriteVSTPairShuffle, WriteVSTPairShuffle],
             (instregex "ST4Fourv(16b|8h|4s|2d)$")>;
def : InstRW<[WriteAdr, WriteVSTPairShuffle, WriteVSTPairShuffle,
              WriteVSTPairShuffle, WriteVSTPairShuffle],
             (instregex "ST4Fourv(16b|8h|4s|2d)_POST")>;

// ST4, single lane.
def : InstRW<[WriteVSTPairShuffle],
             (instregex "ST4i(8|16|32)$")>;
def : InstRW<[WriteAdr, WriteVSTPairShuffle],
             (instregex "ST4i(8|16|32)_POST")>;

def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],
             (instrs ST4i64)>;
def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
             (instrs ST4i64_POST)>;

//---
// Unused SchedRead types
//---

// Each of these reads is given a zero-cycle advance.
def : ReadAdvance<ReadI,     0>;
def : ReadAdvance<ReadISReg, 0>;
def : ReadAdvance<ReadIEReg, 0>;
def : ReadAdvance<ReadIM,    0>;
def : ReadAdvance<ReadIMA,   0>;
def : ReadAdvance<ReadID,    0>;

} // SchedModel = CycloneModel