//=- AArch64SchedCyclone.td - Cyclone Scheduling Definitions -*- tablegen -*-=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the machine model for AArch64 Cyclone to support
// instruction scheduling and other instruction cost heuristics.
//
//===----------------------------------------------------------------------===//

// Top-level machine model record for Cyclone.
def CycloneModel : SchedMachineModel {
  // Six micro-ops are dispatched per cycle.
  let IssueWidth = 6;
  // Sized from the reorder buffer.
  let MicroOpBufferSize = 192;
  // Optimistic load latency.
  let LoadLatency = 4;
  // Mispredictions typically cost 14-19 cycles.
  let MispredictPenalty = 16;
  let CompleteModel = 1;

  list<Predicate> UnsupportedFeatures = SVEUnsupported.F;
}

//===----------------------------------------------------------------------===//
// Processor resources available on Cyclone, and how many of each.

// Integer pipes: 4 in total.
def CyUnitI : ProcResource<4> {
  let BufferSize = 48;
}

// Branch units: 2, mapped onto I[0..1].
def CyUnitB : ProcResource<2> {
  let BufferSize = 24;
  let Super = CyUnitI;
}

// Indirect-branch unit: 1, mapped onto I[0].
def CyUnitBR : ProcResource<1> {
  let Super = CyUnitB;
}

// Shifter pipes: 2, mapped onto I[2..3].
// Consuming a CyUnitIS also consumes a CyUnitI.
def CyUnitIS : ProcResource<2> {
  let BufferSize = 24;
  let Super = CyUnitI;
}

// Multiply pipe: 1, mapped onto I[0].
def CyUnitIM : ProcResource<1> {
  let BufferSize = 32;
  let Super = CyUnitBR;
}

// Divide pipe: 1, mapped onto I[1].
def CyUnitID : ProcResource<1> {
  let BufferSize = 16;
  let Super = CyUnitB;
}

// Integer division unit: 1. It is driven by the ID pipe, but only
// consumes the pipe for one cycle at issue and another cycle at writeback.
def CyUnitIntDiv : ProcResource<1>;

// Load/store pipes: 2.
def CyUnitLS : ProcResource<2> {
  let BufferSize = 28;
}

// FP/vector pipes: 3.
def CyUnitV : ProcResource<3> {
  let BufferSize = 48;
}
// FP/vector arithmetic and multiply pipes: 2, mapped onto V[0-1].
def CyUnitVM : ProcResource<2> {
  let BufferSize = 32;
  let Super = CyUnitV;
}
// FP/vector division/sqrt pipe: 1, mapped onto V[2].
def CyUnitVD : ProcResource<1> {
  let BufferSize = 16;
  let Super = CyUnitV;
}
// FP compare pipe: 1, mapped onto V[0].
def CyUnitVC : ProcResource<1> {
  let BufferSize = 16;
  let Super = CyUnitVM;
}

// FP division/square-root units: 2. These are driven by the VD pipe,
// but only consume the pipe for one cycle at issue and a cycle at writeback.
def CyUnitFloatDiv : ProcResource<2>;

//===----------------------------------------------------------------------===//
// Scheduler read/write resources and latencies on Cyclone.
// This mirrors sections 7.7-7.9 of the Tuning Guide v1.0.1.

let SchedModel = CycloneModel in {

//---
// 7.8.1. Moves
//---

// A single nop micro-op (uX): no resources, zero latency.
def WriteX : SchedWriteRes<[]> {
  let Latency = 0;
}

// Move zero is a register rename (to machine register zero).
// The move is replaced by a single nop micro-op.
//   MOVZ Rd, #0
//   AND Rd, Rzr, #imm
def WriteZPred : SchedPredicate<[{TII->isGPRZero(*MI)}]>;
def WriteImmZ : SchedWriteVariant<[
  SchedVar<WriteZPred,  [WriteX]>,
  SchedVar<NoSchedPred, [WriteImm]>
]>;
def : InstRW<[WriteImmZ], (instrs MOVZWi,MOVZXi,ANDWri,ANDXri)>;

// Move GPR is a register rename and single nop micro-op.
//   ORR Xd, XZR, Xm
//   ADD Xd, Xn, #0
// NOTE(review): the ADD example above is an immediate form, while the InstRW
// below lists ADDXrr -- confirm which opcode is intended.
def WriteIMovPred : SchedPredicate<[{TII->isGPRCopy(*MI)}]>;
def WriteVMovPred : SchedPredicate<[{TII->isFPRCopy(*MI)}]>;
def WriteMov : SchedWriteVariant<[
  SchedVar<WriteIMovPred, [WriteX]>,
  SchedVar<WriteVMovPred, [WriteX]>,
  SchedVar<NoSchedPred,   [WriteI]>
]>;
def : InstRW<[WriteMov], (instrs COPY,ORRXrr,ADDXrr)>;

// Moving a non-zero immediate is an integer ALU op.
//   MOVN,MOVZ,MOVK
def : WriteRes<WriteImm, [CyUnitI]>;

//---
// 7.8.2-7.8.5. Arithmetic and Logical, Comparison, Conditional,
//              Shifts and Bitfield Operations
//---

// Plain integer ALU operations (WriteI) use one integer pipe:
//   ADR,ADRP
//   ADD(S)ri,SUB(S)ri,AND(S)ri,EORri,ORRri
//   ADD(S)rr,SUB(S)rr,AND(S)rr,BIC(S)rr,EONrr,EORrr,ORNrr,ORRrr
//   ADC(S),SBC(S)
//   Aliases: CMN, CMP, TST
//
// Conditional operations.
//   CCMNi,CCMPi,CCMNr,CCMPr,
//   CSEL,CSINC,CSINV,CSNEG
//
// Bit counting and reversal operations.
//   CLS,CLZ,RBIT,REV,REV16,REV32
def : WriteRes<WriteI, [CyUnitI]>;

// ADD with shifted register operand is a single micro-op that
// consumes a shift pipeline for two cycles.
//   ADD(S)rs,SUB(S)rs,AND(S)rs,BIC(S)rs,EONrs,EORrs,ORNrs,ORRrs
//   EXAMPLE: ADDrs Xn, Xm LSL #imm
def : WriteRes<WriteISReg, [CyUnitIS]> {
  let ResourceCycles = [2];
  let Latency = 2;
}

// ADD with extended register operand is the same as shifted reg operand.
//   ADD(S)re,SUB(S)re
//   EXAMPLE: ADDXre Xn, Xm, UXTB #1
def : WriteRes<WriteIEReg, [CyUnitIS]> {
  let ResourceCycles = [2];
  let Latency = 2;
}

// Variable shift and bitfield operations.
//   ASRV,LSLV,LSRV,RORV,BFM,SBFM,UBFM
def : WriteRes<WriteIS, [CyUnitIS]>;

// EXTR Shifts a pair of registers and requires two micro-ops.
// The second micro-op is delayed, as modeled by ReadExtrHi.
//   EXTR Xn, Xm, #imm
def : WriteRes<WriteExtr, [CyUnitIS, CyUnitIS]> {
  let NumMicroOps = 2;
  let Latency = 2;
}

// EXTR's first register read is delayed by one cycle, effectively
// shortening its writer's latency.
//   EXTR Xn, Xm, #imm
def : ReadAdvance<ReadExtrHi, 1>;

//---
// 7.8.6. Multiplies
//---

// MUL/MNEG are aliases for MADD/MSUB.
//   MADDW,MSUBW,SMADDL,SMSUBL,UMADDL,UMSUBL
def : WriteRes<WriteIM32, [CyUnitIM]> { let Latency = 4; }
//   MADDX,MSUBX,SMULH,UMULH
def : WriteRes<WriteIM64, [CyUnitIM]> { let Latency = 5; }

//---
// 7.8.7. Divide
//---

// 32-bit divide takes 7-13 cycles. 10 cycles covers a 20-bit quotient.
// The ID pipe is consumed for 2 cycles: issue and writeback.
//   SDIVW,UDIVW
def : WriteRes<WriteID32, [CyUnitID, CyUnitIntDiv]> {
  let ResourceCycles = [2, 10];
  let Latency = 10;
}
// 64-bit divide takes 7-21 cycles. 13 cycles covers a 32-bit quotient.
// The ID pipe is consumed for 2 cycles: issue and writeback.
//   SDIVX,UDIVX
def : WriteRes<WriteID64, [CyUnitID, CyUnitIntDiv]> {
  let ResourceCycles = [2, 13];
  let Latency = 13;
}

//---
// 7.8.8,7.8.10. Load/Store, single element
//---

// Integer loads take 4 cycles and use one LS unit for one cycle.
def : WriteRes<WriteLD, [CyUnitLS]> { let Latency = 4; }

// Store-load forwarding is 4 cycles.
//
// Note: The store-exclusive sequence incorporates this
// latency. However, general heuristics should not model the
// dependence between a store and subsequent may-alias load because
// hardware speculation works.
def : WriteRes<WriteST, [CyUnitLS]> { let Latency = 4; }

// Load from base address plus an optionally scaled register offset.
// Rt latency is latency WriteIS + WriteLD.
//   EXAMPLE: LDR Xn, Xm [, lsl 3]
def CyWriteLDIdx : SchedWriteVariant<[
  // Load from scaled register.
  SchedVar<ScaledIdxPred, [WriteIS, WriteLD]>,
  // Load from register offset.
  SchedVar<NoSchedPred,   [WriteLD]>
]>;
// Map AArch64->Cyclone type.
def : SchedAlias<WriteLDIdx, CyWriteLDIdx>;

//   EXAMPLE: STR Xn, Xm [, lsl 3]
def CyWriteSTIdx : SchedWriteVariant<[
  // Store to scaled register.
  SchedVar<ScaledIdxPred, [WriteIS, WriteST]>,
  // Store to register offset.
  SchedVar<NoSchedPred,   [WriteST]>
]>;
// Map AArch64->Cyclone type.
def : SchedAlias<WriteSTIdx, CyWriteSTIdx>;

// Read the (unshifted) base register Xn in the second micro-op one cycle later.
//   EXAMPLE: LDR Xn, Xm [, lsl 3]
def ReadBaseRS : SchedReadAdvance<1>;
def CyReadAdrBase : SchedReadVariant<[
  // Read base reg after shifting offset.
  SchedVar<ScaledIdxPred, [ReadBaseRS]>,
  // Read base reg with no shift.
  SchedVar<NoSchedPred,   [ReadDefault]>
]>;
// Map AArch64->Cyclone type.
def : SchedAlias<ReadAdrBase, CyReadAdrBase>;

//---
// 7.8.9,7.8.11. Load/Store, paired
//---

// Address pre/post increment is a simple ALU op with one cycle latency.
def : WriteRes<WriteAdr, [CyUnitI]>;

// LDP high register write is fused with the load, but a nop micro-op remains.
def : WriteRes<WriteLDHi, []> { let Latency = 4; }

// STP is a vector op and store, except for QQ, which is just two stores.
def : SchedAlias<WriteSTP, WriteVSTShuffle>;
def : InstRW<[WriteST, WriteST], (instrs STPQi)>;

//---
// 7.8.13. Branches
//---

// Branches take a single micro-op.
// The misprediction penalty is defined as a SchedMachineModel property.
def : WriteRes<WriteBr, [CyUnitB]> {
  let Latency = 0;
}
def : WriteRes<WriteBrReg, [CyUnitBR]> {
  let Latency = 0;
}

//---
// 7.8.14. Never-issued Instructions, Barrier and Hint Operations
//---

// NOP,SEV,SEVL,WFE,WFI,YIELD
def : WriteRes<WriteHint, []> {
  let Latency = 0;
}
// ISB
def : InstRW<[WriteI], (instrs ISB)>;
// CLREX,DMB,DSB
def : WriteRes<WriteBarrier, [CyUnitLS]>;

// System instructions get an invalid latency because the latency of
// other operations across them is meaningless.
def : WriteRes<WriteSys, []> {
  let Latency = -1;
}

//===----------------------------------------------------------------------===//
// 7.9 Vector Unit Instructions

// Simple vector operations take 2 cycles.
def : WriteRes<WriteV, [CyUnitV]> {
  let Latency = 2;
}

// Longer-latency vector op types for Cyclone (3 to 6 cycles).
def CyWriteV3 : SchedWriteRes<[CyUnitV]> { let Latency = 3; }
def CyWriteV4 : SchedWriteRes<[CyUnitV]> { let Latency = 4; }
def CyWriteV5 : SchedWriteRes<[CyUnitV]> { let Latency = 5; }
def CyWriteV6 : SchedWriteRes<[CyUnitV]> { let Latency = 6; }

// Simple floating-point operations take 2 cycles.
def : WriteRes<WriteF, [CyUnitV]> {
  let Latency = 2;
}

//---
// 7.9.1 Vector Moves
//---

// TODO: Add Cyclone-specific zero-cycle zeros. LLVM currently
// generates expensive int-float conversion instead:
//   FMOVDi Dd, #0.0
//   FMOVv2f64ns Vd.2d, #0.0

// FMOVSi,FMOVDi
def : WriteRes<WriteFImm, [CyUnitV]> {
  let Latency = 2;
}

// MOVI,MVNI are WriteV
// FMOVv2f32ns,FMOVv2f64ns,FMOVv4f32ns are WriteV

// Move FPR is a register rename and single nop micro-op.
//   ORR.16b Vd,Vn,Vn
// COPY is handled above in the WriteMov Variant.
def WriteVMov : SchedWriteVariant<[
  SchedVar<WriteVMovPred, [WriteX]>,
  SchedVar<NoSchedPred,   [WriteV]>
]>;
def : InstRW<[WriteVMov], (instrs ORRv16i8)>;

// FMOVSr,FMOVDr are WriteF.

// MOV V,V is a WriteV.

// CPY D,V[x] is a WriteV

// INS V[x],V[y] is a WriteV.

// FMOVWSr,FMOVXDr,FMOVXDHighr
def : WriteRes<WriteFCopy, [CyUnitLS]> { let Latency = 5; }

// FMOVSWr,FMOVDXr
def : InstRW<[WriteLD], (instrs FMOVSWr,FMOVDXr,FMOVDXHighr)>;

// INS V[x],R
def CyWriteCopyToFPR : WriteSequence<[WriteVLD, WriteV]>;
def : InstRW<[CyWriteCopyToFPR], (instregex "INSv")>;

// SMOV,UMOV R,V[x]
def CyWriteCopyToGPR : WriteSequence<[WriteLD, WriteI]>;
def : InstRW<[CyWriteCopyToGPR], (instregex "SMOVv","UMOVv")>;

// DUP V,R
def : InstRW<[CyWriteCopyToFPR], (instregex "DUPv")>;

// DUP V,V[x] is a WriteV.

//---
// 7.9.2 Integer Arithmetic, Logical, and Comparisons
//---

// BIC,ORR V,#imm are WriteV

def : InstRW<[CyWriteV3], (instregex "ABSv")>;

// MVN,NEG,NOT are WriteV

def : InstRW<[CyWriteV3], (instregex "SQABSv","SQNEGv")>;

// ADDP is a WriteV.
def CyWriteVADDLP : SchedWriteRes<[CyUnitV]> {
  let Latency = 2;
}
def : InstRW<[CyWriteVADDLP], (instregex "SADDLPv","UADDLPv")>;

// Across-lanes add/min/max reductions.
def : InstRW<[CyWriteV3], (instregex "ADDVv","SMAXVv","UMAXVv","SMINVv","UMINVv")>;

def : InstRW<[CyWriteV3], (instregex "SADDLV","UADDLV")>;

// ADD,SUB are WriteV

// Forward declare.
def CyWriteVABD : SchedWriteRes<[CyUnitV]> {
  let Latency = 3;
}

// Add/Diff and accumulate uses the vector multiply unit.
// Accumulating add/abs-diff write: 3 cycles on a CyUnitVM pipe.
def CyWriteVAccum : SchedWriteRes<[CyUnitVM]> {
  let Latency = 3;
}
// The accumulator operand is read one cycle late when produced by one of the
// listed writes.
def CyReadVAccum : SchedReadAdvance<1, [CyWriteVAccum, CyWriteVADDLP, CyWriteVABD]>;

def : InstRW<[CyWriteVAccum, CyReadVAccum], (instregex "SADALP","UADALP")>;

def : InstRW<[CyWriteVAccum, CyReadVAccum],
             (instregex "SABAv","UABAv","SABALv","UABALv")>;

// Saturating add/sub.
def : InstRW<[CyWriteV3], (instregex "SQADDv","SQSUBv","UQADDv","UQSUBv")>;

def : InstRW<[CyWriteV3], (instregex "SUQADDv","USQADDv")>;

// Narrowing high-half add/sub.
def : InstRW<[CyWriteV4], (instregex "ADDHNv","RADDHNv", "RSUBHNv", "SUBHNv")>;

// WriteV includes:
// AND,BIC,CMTST,EOR,ORN,ORR
// ADDP
// SHADD,SHSUB,SRHADD,UHADD,UHSUB,URHADD
// SADDL,SSUBL,UADDL,USUBL
// SADDW,SSUBW,UADDW,USUBW

// Integer vector compares.
def : InstRW<[CyWriteV3],
             (instregex "CMEQv","CMGEv","CMGTv",
                        "CMLEv","CMLTv",
                        "CMHIv","CMHSv")>;

// Integer vector min/max, element-wise and pairwise.
def : InstRW<[CyWriteV3],
             (instregex "SMAXv","SMINv","UMAXv","UMINv",
                        "SMAXPv","SMINPv","UMAXPv","UMINPv")>;

// Absolute difference, including the long forms.
def : InstRW<[CyWriteVABD],
             (instregex "SABDv","UABDv",
                        "SABDLv","UABDLv")>;

//---
// 7.9.3 Floating Point Arithmetic and Comparisons
//---

// FABS,FNEG are WriteF

def : InstRW<[CyWriteV4], (instrs FADDPv2i32p)>;
def : InstRW<[CyWriteV5], (instrs FADDPv2i64p)>;

def : InstRW<[CyWriteV3],
             (instregex "FMAXPv2i","FMAXNMPv2i",
                        "FMINPv2i","FMINNMPv2i")>;

def : InstRW<[CyWriteV4], (instregex "FMAXVv","FMAXNMVv","FMINVv","FMINNMVv")>;

// 32-bit element FP add/sub/abs-diff: 4 cycles.
def : InstRW<[CyWriteV4],
             (instrs FADDSrr,FADDv2f32,FADDv4f32,
                     FSUBSrr,FSUBv2f32,FSUBv4f32,
                     FADDPv2f32,FADDPv4f32,
                     FABD32,FABDv2f32,FABDv4f32)>;
// 64-bit element FP add/sub/abs-diff: 5 cycles.
def : InstRW<[CyWriteV5],
             (instrs FADDDrr,FADDv2f64,
                     FSUBDrr,FSUBv2f64,
                     FADDPv2f64,
                     FABD64,FABDv2f64)>;

// NOTE(review): FCMGE is not listed here although the integer compare list
// above includes CMGE -- confirm whether the omission is intentional.
def : InstRW<[CyWriteV3], (instregex "FCMEQ","FCMGT","FCMLE","FCMLT")>;

// FP absolute compares and min/max, scalar, vector and pairwise.
def : InstRW<[CyWriteV3],
             (instregex "FACGE","FACGT",
                        "FMAXS","FMAXD","FMAXv",
                        "FMINS","FMIND","FMINv",
                        "FMAXNMS","FMAXNMD","FMAXNMv",
                        "FMINNMS","FMINNMD","FMINNMv",
                        "FMAXPv2f","FMAXPv4f",
                        "FMINPv2f","FMINPv4f",
                        "FMAXNMPv2f","FMAXNMPv4f",
                        "FMINNMPv2f","FMINNMPv4f")>;

// FCMP,FCMPE,FCCMP,FCCMPE
def : WriteRes<WriteFCmp, [CyUnitVC]> {
  let Latency = 4;
}

// FCSEL is a WriteF.

//---
// 7.9.4 Shifts and Bitfield Operations
//---

// SHL is a WriteV

def CyWriteVSHR : SchedWriteRes<[CyUnitV]> {
  let Latency = 2;
}
def : InstRW<[CyWriteVSHR], (instregex "SSHRv","USHRv")>;

def CyWriteVSRSHR : SchedWriteRes<[CyUnitV]> {
  let Latency = 3;
}
def : InstRW<[CyWriteVSRSHR], (instregex "SRSHRv","URSHRv")>;

// Shift and accumulate uses the vector multiply unit.
def CyWriteVShiftAcc : SchedWriteRes<[CyUnitVM]> {
  let Latency = 3;
}
def CyReadVShiftAcc : SchedReadAdvance<1, [CyWriteVShiftAcc, CyWriteVSHR, CyWriteVSRSHR]>;
def : InstRW<[CyWriteVShiftAcc, CyReadVShiftAcc],
             (instregex "SRSRAv","SSRAv","URSRAv","USRAv")>;

// SSHL,USHL are WriteV.

def : InstRW<[CyWriteV3], (instregex "SRSHLv","URSHLv")>;

// SQSHL,SQSHLU,UQSHL are WriteV.

def : InstRW<[CyWriteV3], (instregex "SQRSHLv","UQRSHLv")>;

// WriteV includes:
// SHLL,SSHLL,USHLL
// SLI,SRI
// BIF,BIT,BSL
// EXT
// CLS,CLZ,CNT,RBIT,REV16,REV32,REV64,XTN
// XTN2

// Narrowing shifts and saturating narrows.
def : InstRW<[CyWriteV4],
             (instregex "RSHRNv","SHRNv",
                        "SQRSHRNv","SQRSHRUNv","SQSHRNv","SQSHRUNv",
                        "UQRSHRNv","UQSHRNv","SQXTNv","SQXTUNv","UQXTNv")>;

//---
// 7.9.5 Multiplication
//---

def CyWriteVMul : SchedWriteRes<[CyUnitVM]> {
  let Latency = 4;
}
def : InstRW<[CyWriteVMul],
             (instregex "MULv","SMULLv","UMULLv",
                        "SQDMULLv","SQDMULHv","SQRDMULHv")>;

// FMUL,FMULX,FNMUL default to WriteFMul.
// Default FP multiply write: 4 cycles on a CyUnitVM pipe.
def : WriteRes<WriteFMul, [CyUnitVM]> {
  let Latency = 4;
}

// 64-bit FP multiplies take one cycle longer.
def CyWriteV64Mul : SchedWriteRes<[CyUnitVM]> {
  let Latency = 5;
}
def : InstRW<[CyWriteV64Mul],
             (instrs FMULDrr,FMULv2f64,FMULv2i64_indexed,
                     FNMULDrr,FMULX64,FMULXv2f64,FMULXv2i64_indexed)>;

// Integer multiply-accumulate: the accumulator read is advanced by one cycle
// when it comes from another vector multiply.
def CyReadVMulAcc : SchedReadAdvance<1, [CyWriteVMul, CyWriteV64Mul]>;
def : InstRW<[CyWriteVMul, CyReadVMulAcc],
             (instregex "MLA","MLS","SMLAL","SMLSL","UMLAL","UMLSL",
                        "SQDMLAL","SQDMLSL")>;

// FP fused multiply-add: 8 cycles (single) / 10 cycles (double), with the
// read advanced by 4 / 5 cycles when chained from the same write type.
def CyWriteSMul : SchedWriteRes<[CyUnitVM]> {
  let Latency = 8;
}
def CyWriteDMul : SchedWriteRes<[CyUnitVM]> {
  let Latency = 10;
}
def CyReadSMul : SchedReadAdvance<4, [CyWriteSMul]>;
def CyReadDMul : SchedReadAdvance<5, [CyWriteDMul]>;

// NOTE(review): FMLAv1i64_indexed appears in this single-precision list and
// no 32-bit FMLS forms are listed -- confirm against the tuning guide.
def : InstRW<[CyWriteSMul, CyReadSMul],
             (instrs FMADDSrrr,FMSUBSrrr,FNMADDSrrr,FNMSUBSrrr,
                     FMLAv2f32,FMLAv4f32,
                     FMLAv1i32_indexed,FMLAv1i64_indexed,FMLAv2i32_indexed)>;
def : InstRW<[CyWriteDMul, CyReadDMul],
             (instrs FMADDDrrr,FMSUBDrrr,FNMADDDrrr,FNMSUBDrrr,
                     FMLAv2f64,FMLAv2i64_indexed,
                     FMLSv2f64,FMLSv2i64_indexed)>;

// Polynomial multiplies: 3 cycles on the CyUnitVD pipe.
def CyWritePMUL : SchedWriteRes<[CyUnitVD]> {
  let Latency = 3;
}
def : InstRW<[CyWritePMUL], (instregex "PMULv", "PMULLv")>;

//---
// 7.9.6 Divide and Square Root
//---

// FDIV,FSQRT
// TODO: Add 64-bit variant with 19 cycle latency.
// TODO: Specialize FSQRT for longer latency.
// WriteFDiv: 17-cycle latency. The VD pipe is held for 2 cycles while a
// CyUnitFloatDiv unit is held for the full 17 cycles.
def : WriteRes<WriteFDiv, [CyUnitVD, CyUnitFloatDiv]> {
  let Latency = 17;
  let ResourceCycles = [2, 17];
}

// Reciprocal and reciprocal-square-root estimates.
def : InstRW<[CyWriteV4], (instregex "FRECPEv","FRECPXv","URECPEv","URSQRTEv")>;

def WriteFRSQRTE : SchedWriteRes<[CyUnitVM]> { let Latency = 4; }
def : InstRW<[WriteFRSQRTE], (instregex "FRSQRTEv")>;

// Reciprocal and reciprocal-square-root step instructions.
def WriteFRECPS  : SchedWriteRes<[CyUnitVM]> { let Latency = 8; }
def WriteFRSQRTS : SchedWriteRes<[CyUnitVM]> { let Latency = 10; }
def : InstRW<[WriteFRECPS],  (instregex "FRECPSv")>;
def : InstRW<[WriteFRSQRTS], (instregex "FRSQRTSv")>;

//---
// 7.9.7 Integer-FP Conversions
//---

// FCVT lengthen f16/s32
def : InstRW<[WriteV], (instrs FCVTSHr,FCVTDHr,FCVTDSr)>;

// FCVT,FCVTN,FCVTXN
// SCVTF,UCVTF V,V
// FRINT(AIMNPXZ) V,V
def : WriteRes<WriteFCvt, [CyUnitV]> {let Latency = 4;}

// SCVT/UCVT S/D, Rd = VLD5+V4: 9 cycles.
// Integer-to-FP conversions from a GPR: a GPR->FPR transfer followed by a
// 4-cycle vector op. These previously used CyWriteCvtToGPR, and
// CyWriteCvtToFPR was defined but unused; the mapping now follows the
// cycle breakdown stated above.
def CyWriteCvtToFPR : WriteSequence<[WriteVLD, CyWriteV4]>;
def : InstRW<[CyWriteCvtToFPR], (instregex "[SU]CVTF[SU][WX][SD]r")>;

// FCVT Rd, S/D = V6+LD4: 10 cycles
// FP-to-integer conversions into a GPR: a 6-cycle vector op followed by an
// FPR->GPR transfer. These previously used CyWriteCopyToFPR, which models a
// copy in the opposite direction.
def CyWriteCvtToGPR : WriteSequence<[CyWriteV6, WriteLD]>;
def : InstRW<[CyWriteCvtToGPR], (instregex "FCVT[AMNPZ][SU][SU][WX][SD]r")>;

// FCVTL is a WriteV

//---
// 7.9.8-7.9.10 Cryptography, Data Transposition, Table Lookup
//---

// Crypto operations issue on the CyUnitVD pipe with 2, 3 or 6 cycle latency.
def CyWriteCrypto2 : SchedWriteRes<[CyUnitVD]> {let Latency = 2;}
def : InstRW<[CyWriteCrypto2], (instrs AESIMCrr, AESMCrr, SHA1Hrr,
                                       AESDrr, AESErr, SHA1SU1rr, SHA256SU0rr,
                                       SHA1SU0rrr)>;

def CyWriteCrypto3 : SchedWriteRes<[CyUnitVD]> {let Latency = 3;}
def : InstRW<[CyWriteCrypto3], (instrs SHA256SU1rrr)>;

def CyWriteCrypto6 : SchedWriteRes<[CyUnitVD]> {let Latency = 6;}
def : InstRW<[CyWriteCrypto6], (instrs SHA1Crrr, SHA1Mrrr, SHA1Prrr,
                                       SHA256Hrrr,SHA256H2rrr)>;

// TRN,UZP,ZIP are WriteV.

// TBL,TBX are WriteV.

//---
// 7.9.11-7.9.14 Load/Store, single element and paired
//---

// Loading into the vector unit takes 5 cycles vs 4 for integer loads.
def : WriteRes<WriteVLD, [CyUnitLS]> { let Latency = 5; }

// Store-load forwarding is 4 cycles.
def : WriteRes<WriteVST, [CyUnitLS]> { let Latency = 4; }

// WriteVLDPair/VSTPair sequences are expanded by the target description.

//---
// 7.9.15 Load, element operations
//---

// Only the first WriteVLD and WriteAdr for writeback matches def operands.
// Subsequent WriteVLDs consume resources. Since all loaded values have the
// same latency, this is acceptable.

// Vd is read 5 cycles after issuing the vector load.
def : ReadAdvance<ReadVLD, 5>;

// LD1, one register.
def : InstRW<[WriteVLD], (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
def : InstRW<[WriteVLD, WriteAdr], (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>;

// Register writes from the load's high half are fused micro-ops.
// LD1, two registers.
def : InstRW<[WriteVLD],
             (instregex "LD1Twov(8b|4h|2s|1d)$")>;
def : InstRW<[WriteVLD, WriteAdr],
             (instregex "LD1Twov(8b|4h|2s|1d)_POST")>;
def : InstRW<[WriteVLD, WriteVLD],
             (instregex "LD1Twov(16b|8h|4s|2d)$")>;
def : InstRW<[WriteVLD, WriteAdr, WriteVLD],
             (instregex "LD1Twov(16b|8h|4s|2d)_POST")>;

// LD1, three registers.
def : InstRW<[WriteVLD, WriteVLD],
             (instregex "LD1Threev(8b|4h|2s|1d)$")>;
def : InstRW<[WriteVLD, WriteAdr, WriteVLD],
             (instregex "LD1Threev(8b|4h|2s|1d)_POST")>;
def : InstRW<[WriteVLD, WriteVLD, WriteVLD],
             (instregex "LD1Threev(16b|8h|4s|2d)$")>;
def : InstRW<[WriteVLD, WriteAdr, WriteVLD, WriteVLD],
             (instregex "LD1Threev(16b|8h|4s|2d)_POST")>;

// LD1, four registers.
def : InstRW<[WriteVLD, WriteVLD],
             (instregex "LD1Fourv(8b|4h|2s|1d)$")>;
def : InstRW<[WriteVLD, WriteAdr, WriteVLD],
             (instregex "LD1Fourv(8b|4h|2s|1d)_POST")>;
def : InstRW<[WriteVLD, WriteVLD, WriteVLD, WriteVLD],
             (instregex "LD1Fourv(16b|8h|4s|2d)$")>;
def : InstRW<[WriteVLD, WriteAdr, WriteVLD, WriteVLD, WriteVLD],
             (instregex "LD1Fourv(16b|8h|4s|2d)_POST")>;

// LD1, single lane. ReadVLD is listed because the lane forms also read Vd.
def : InstRW<[WriteVLDShuffle, ReadVLD],
             (instregex "LD1i(8|16|32)$")>;
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr],
             (instregex "LD1i(8|16|32)_POST")>;

def : InstRW<[WriteVLDShuffle, ReadVLD],          (instrs LD1i64)>;
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr],(instrs LD1i64_POST)>;

// LD1R, load and replicate.
def : InstRW<[WriteVLDShuffle],
             (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
def : InstRW<[WriteVLDShuffle, WriteAdr],
             (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;

// LD2, two registers.
def : InstRW<[WriteVLDShuffle, WriteV],
             (instregex "LD2Twov(8b|4h|2s)$")>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV],
             (instregex "LD2Twov(8b|4h|2s)_POST$")>;
def : InstRW<[WriteVLDShuffle, WriteVLDShuffle],
             (instregex "LD2Twov(16b|8h|4s|2d)$")>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle],
             (instregex "LD2Twov(16b|8h|4s|2d)_POST")>;

// LD2, single lane.
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV],
             (instregex "LD2i(8|16|32)$")>;
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV],
             (instregex "LD2i(8|16|32)_POST")>;
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV],
             (instregex "LD2i64$")>;
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV],
             (instregex "LD2i64_POST")>;

// LD2R, load and replicate.
def : InstRW<[WriteVLDShuffle, WriteV],
             (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV],
             (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>;

// LD3, three registers.
def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV],
             (instregex "LD3Threev(8b|4h|2s)$")>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV],
             (instregex "LD3Threev(8b|4h|2s)_POST")>;
def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteVLDShuffle],
             (instregex "LD3Threev(16b|8h|4s|2d)$")>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteVLDShuffle],
             (instregex "LD3Threev(16b|8h|4s|2d)_POST")>;

// LD3, single lane.
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV, WriteV],
             (instregex "LD3i(8|16|32)$")>;
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV, WriteV],
             (instregex "LD3i(8|16|32)_POST")>;

def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteV],
             (instregex "LD3i64$")>;
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle, WriteV],
             (instregex "LD3i64_POST")>;

// LD3R, load and replicate.
def : InstRW<[WriteVLDShuffle, WriteV, WriteV],
             (instregex "LD3Rv(8b|4h|2s|16b|8h|4s)$")>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV],
             (instregex "LD3Rv(8b|4h|2s|16b|8h|4s)_POST")>;

def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV],
             (instrs LD3Rv1d,LD3Rv2d)>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV],
             (instrs LD3Rv1d_POST,LD3Rv2d_POST)>;

// LD4, four registers.
def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV],
             (instregex "LD4Fourv(8b|4h|2s)$")>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV, WriteV],
             (instregex "LD4Fourv(8b|4h|2s)_POST")>;
def : InstRW<[WriteVLDPairShuffle, WriteVLDPairShuffle,
              WriteVLDPairShuffle, WriteVLDPairShuffle],
             (instregex "LD4Fourv(16b|8h|4s|2d)$")>;
def : InstRW<[WriteVLDPairShuffle, WriteAdr, WriteVLDPairShuffle,
              WriteVLDPairShuffle, WriteVLDPairShuffle],
             (instregex "LD4Fourv(16b|8h|4s|2d)_POST")>;

// LD4, single lane.
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV, WriteV, WriteV],
             (instregex "LD4i(8|16|32)$")>;
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV, WriteV, WriteV],
             (instregex "LD4i(8|16|32)_POST")>;

// The post-indexed form is the base form's write list plus WriteAdr. It
// previously carried one fewer WriteV than LD4i64 (the same list as
// LD3i64_POST), which under-counted the resources of the writeback variant.
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteV, WriteV],
             (instrs LD4i64)>;
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle, WriteV,
              WriteV],
             (instrs LD4i64_POST)>;

// LD4R, load and replicate.
def : InstRW<[WriteVLDShuffle, WriteV, WriteV, WriteV],
             (instregex "LD4Rv(8b|4h|2s|16b|8h|4s)$")>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV, WriteV],
             (instregex "LD4Rv(8b|4h|2s|16b|8h|4s)_POST")>;

def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV],
             (instrs LD4Rv1d,LD4Rv2d)>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV, WriteV],
             (instrs LD4Rv1d_POST,LD4Rv2d_POST)>;

//---
// 7.9.16 Store, element operations
//---

// Only the WriteAdr for writeback matches a def operand.
// Subsequent WriteVSTs only consume resources.

// ST1, one register.
def : InstRW<[WriteVST], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
def : InstRW<[WriteAdr, WriteVST], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>;

// ST1, two registers.
def : InstRW<[WriteVSTShuffle], (instregex "ST1Twov(8b|4h|2s|1d)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST1Twov(8b|4h|2s|1d)_POST")>;
def : InstRW<[WriteVST, WriteVST], (instregex "ST1Twov(16b|8h|4s|2d)$")>;
def : InstRW<[WriteAdr, WriteVST, WriteVST], (instregex "ST1Twov(16b|8h|4s|2d)_POST")>;

// ST1, three registers.
def : InstRW<[WriteVSTShuffle, WriteVST], (instregex "ST1Threev(8b|4h|2s|1d)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVST], (instregex "ST1Threev(8b|4h|2s|1d)_POST")>;
def : InstRW<[WriteVST, WriteVST, WriteVST], (instregex "ST1Threev(16b|8h|4s|2d)$")>;
def : InstRW<[WriteAdr, WriteVST, WriteVST, WriteVST], (instregex "ST1Threev(16b|8h|4s|2d)_POST")>;

// ST1, four registers.
def : InstRW<[WriteVSTShuffle, WriteVSTShuffle], (instregex "ST1Fourv(8b|4h|2s|1d)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle], (instregex "ST1Fourv(8b|4h|2s|1d)_POST")>;
def : InstRW<[WriteVST, WriteVST, WriteVST, WriteVST], (instregex "ST1Fourv(16b|8h|4s|2d)$")>;
def : InstRW<[WriteAdr, WriteVST, WriteVST, WriteVST, WriteVST], (instregex "ST1Fourv(16b|8h|4s|2d)_POST")>;

// ST1, single lane.
def : InstRW<[WriteVSTShuffle], (instregex "ST1i(8|16|32)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST1i(8|16|32)_POST")>;

def : InstRW<[WriteVSTShuffle], (instrs ST1i64)>;
def : InstRW<[WriteAdr, WriteVSTShuffle], (instrs ST1i64_POST)>;

// ST2, two registers.
def : InstRW<[WriteVSTShuffle], (instregex "ST2Twov(8b|4h|2s)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST2Twov(8b|4h|2s)_POST")>;
def : InstRW<[WriteVSTShuffle, WriteVSTShuffle], (instregex "ST2Twov(16b|8h|4s|2d)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle], (instregex "ST2Twov(16b|8h|4s|2d)_POST")>;

// ST2, single lane.
def : InstRW<[WriteVSTShuffle], (instregex "ST2i(8|16|32)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST2i(8|16|32)_POST")>;
def : InstRW<[WriteVSTShuffle], (instrs ST2i64)>;
def : InstRW<[WriteAdr, WriteVSTShuffle], (instrs ST2i64_POST)>;

// ST3, three registers.
def : InstRW<[WriteVSTShuffle, WriteVSTShuffle], (instregex "ST3Threev(8b|4h|2s)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle], (instregex "ST3Threev(8b|4h|2s)_POST")>;
def : InstRW<[WriteVSTShuffle, WriteVSTShuffle, WriteVSTShuffle],
             (instregex "ST3Threev(16b|8h|4s|2d)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle, WriteVSTShuffle],
             (instregex "ST3Threev(16b|8h|4s|2d)_POST")>;

// ST3, single lane.
def : InstRW<[WriteVSTShuffle], (instregex "ST3i(8|16|32)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST3i(8|16|32)_POST")>;

def : InstRW<[WriteVSTShuffle, WriteVSTShuffle], (instrs ST3i64)>;
def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle], (instrs ST3i64_POST)>;

// ST4, four registers.
def : InstRW<[WriteVSTPairShuffle, WriteVSTPairShuffle],
             (instregex "ST4Fourv(8b|4h|2s|1d)$")>;
def : InstRW<[WriteAdr, WriteVSTPairShuffle, WriteVSTPairShuffle],
             (instregex "ST4Fourv(8b|4h|2s|1d)_POST")>;
def : InstRW<[WriteVSTPairShuffle, WriteVSTPairShuffle,
              WriteVSTPairShuffle, WriteVSTPairShuffle],
             (instregex "ST4Fourv(16b|8h|4s|2d)$")>;
def : InstRW<[WriteAdr, WriteVSTPairShuffle, WriteVSTPairShuffle,
              WriteVSTPairShuffle, WriteVSTPairShuffle],
             (instregex "ST4Fourv(16b|8h|4s|2d)_POST")>;

// ST4, single lane.
def : InstRW<[WriteVSTPairShuffle], (instregex "ST4i(8|16|32)$")>;
def : InstRW<[WriteAdr, WriteVSTPairShuffle], (instregex "ST4i(8|16|32)_POST")>;

def : InstRW<[WriteVSTShuffle, WriteVSTShuffle], (instrs ST4i64)>;
def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle], (instrs ST4i64_POST)>;

// Atomic operations are not supported.
// WriteAtomic is flagged as unsupported in this model.
def : WriteRes<WriteAtomic, []> {
  let Unsupported = 1;
}

//---
// Unused SchedRead types
//---

// Each of these reads is given a ReadAdvance of zero cycles.
def : ReadAdvance<ReadI,     0>;
def : ReadAdvance<ReadISReg, 0>;
def : ReadAdvance<ReadIEReg, 0>;
def : ReadAdvance<ReadIM,    0>;
def : ReadAdvance<ReadIMA,   0>;
def : ReadAdvance<ReadID,    0>;

} // SchedModel = CycloneModel