//=- AArch64SchedCyclone.td - Cyclone Scheduling Definitions -*- tablegen -*-=//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the machine model for AArch64 Cyclone to support
// instruction scheduling and other instruction cost heuristics.
//
//===----------------------------------------------------------------------===//

// Global parameters of the Cyclone machine model.
def CycloneModel : SchedMachineModel {
  let IssueWidth = 6;          // Six micro-ops are dispatched each cycle.
  let MicroOpBufferSize = 192; // Sized after the reorder buffer.
  let LoadLatency = 4;         // Optimistic load latency.
  let MispredictPenalty = 16;  // Typical penalties are 14-19 cycles.
  let CompleteModel = 1;
}

//===----------------------------------------------------------------------===//
// Processor resources available on Cyclone, and how many of each exist.

// Integer pipes: 4.
def CyUnitI : ProcResource<4> {
  let BufferSize = 48;
}

// Branch units: 2, on I[0..1].
def CyUnitB : ProcResource<2> {
  let Super = CyUnitI;
  let BufferSize = 24;
}

// Indirect-branch unit: 1, on I[0].
def CyUnitBR : ProcResource<1> {
  let Super = CyUnitB;
}

// Shifter pipes: 2, on I[2..3].
// An instruction that consumes a CyUnitIS also consumes a CyUnitI.
def CyUnitIS : ProcResource<2> {
  let Super = CyUnitI;
  let BufferSize = 24;
}

// Multiply pipe: 1, on I[0].
def CyUnitIM : ProcResource<1> {
  let Super = CyUnitBR;
  let BufferSize = 32;
}

// Divide pipe: 1, on I[1].
def CyUnitID : ProcResource<1> {
  let Super = CyUnitB;
  let BufferSize = 16;
}

// Integer division unit: 1. It is driven by the ID pipe, but the pipe itself
// is only occupied for one cycle at issue and one more cycle at writeback.
def CyUnitIntDiv : ProcResource<1>;

// 2 ld/st pipes.
def CyUnitLS : ProcResource<2> {
  let BufferSize = 28;
}

// FP/vector pipes: 3.
def CyUnitV : ProcResource<3> {
  let BufferSize = 48;
}
// FP/vector arithmetic and multiply pipes: 2, on V[0-1].
def CyUnitVM : ProcResource<2> {
  let Super = CyUnitV;
  let BufferSize = 32;
}
// FP/vector division and square-root pipe: 1, on V[2].
def CyUnitVD : ProcResource<1> {
  let Super = CyUnitV;
  let BufferSize = 16;
}
// FP compare pipe: 1, on V[0].
def CyUnitVC : ProcResource<1> {
  let Super = CyUnitVM;
  let BufferSize = 16;
}

// FP division/square-root units: 2. They are driven by the VD pipe, but the
// pipe itself is only occupied for one cycle at issue and one at writeback.
def CyUnitFloatDiv : ProcResource<2>;

//===----------------------------------------------------------------------===//
// Scheduler read/write resources and latencies on Cyclone.
// This mirrors sections 7.7-7.9 of the Tuning Guide v1.0.1.

let SchedModel = CycloneModel in {

//---
// 7.8.1. Moves
//---

// A lone nop micro-op (uX): no resources, zero latency.
def WriteX : SchedWriteRes<[]> { let Latency = 0; }

// Moving zero is a register rename (to machine register zero), so the move
// is replaced by a single nop micro-op.
//   MOVZ Rd, #0
//   AND Rd, Rzr, #imm
def WriteZPred : SchedPredicate<[{TII->isGPRZero(*MI)}]>;
def WriteImmZ : SchedWriteVariant<[
  SchedVar<WriteZPred,  [WriteX]>,
  SchedVar<NoSchedPred, [WriteImm]>]>;
def : InstRW<[WriteImmZ], (instrs MOVZWi,MOVZXi,ANDWri,ANDXri)>;

// Move GPR is a register rename and single nop micro-op.
//   ORR Xd, XZR, Xm
//   ADD Xd, Xn, #0
// NOTE(review): the examples above are an ORR with XZR and an ADD with a #0
// immediate, while the opcodes listed below are ORRXrr/ADDXrr - confirm these
// are the opcodes the copy predicates actually recognize.
def WriteIMovPred : SchedPredicate<[{TII->isGPRCopy(*MI)}]>;
def WriteVMovPred : SchedPredicate<[{TII->isFPRCopy(*MI)}]>;
def WriteMov : SchedWriteVariant<[
  SchedVar<WriteIMovPred, [WriteX]>,
  SchedVar<WriteVMovPred, [WriteX]>,
  SchedVar<NoSchedPred,   [WriteI]>]>;
def : InstRW<[WriteMov], (instrs COPY,ORRXrr,ADDXrr)>;

// Moving a non-zero immediate is an integer ALU op.
//   MOVN,MOVZ,MOVK
def : WriteRes<WriteImm, [CyUnitI]>;

//---
// 7.8.2-7.8.5. Arithmetic and Logical, Comparison, Conditional,
//              Shifts and Bitfield Operations
//---

// Plain integer ALU operations:
//   ADR,ADRP
//   ADD(S)ri,SUB(S)ri,AND(S)ri,EORri,ORRri
//   ADD(S)rr,SUB(S)rr,AND(S)rr,BIC(S)rr,EONrr,EORrr,ORNrr,ORRrr
//   ADC(S),SBC(S)
//   Aliases: CMN, CMP, TST
//
// Conditional operations:
//   CCMNi,CCMPi,CCMNr,CCMPr,
//   CSEL,CSINC,CSINV,CSNEG
//
// Bit counting and reversal operations:
//   CLS,CLZ,RBIT,REV,REV16,REV32
def : WriteRes<WriteI, [CyUnitI]>;

// An ADD with a shifted register operand is one micro-op that occupies a
// shift pipeline for two cycles.
//   ADD(S)rs,SUB(S)rs,AND(S)rs,BIC(S)rs,EONrs,EORrs,ORNrs,ORRrs
//   EXAMPLE: ADDrs Xn, Xm LSL #imm
def : WriteRes<WriteISReg, [CyUnitIS]> {
  let Latency = 2;
  let ResourceCycles = [2];
}

// An ADD with an extended register operand behaves like the shifted form.
//   ADD(S)re,SUB(S)re
//   EXAMPLE: ADDXre Xn, Xm, UXTB #1
def : WriteRes<WriteIEReg, [CyUnitIS]> {
  let Latency = 2;
  let ResourceCycles = [2];
}

// Variable shift and bitfield operations.
//   ASRV,LSLV,LSRV,RORV,BFM,SBFM,UBFM
def : WriteRes<WriteIS, [CyUnitIS]>;

// EXTR Shifts a pair of registers and requires two micro-ops.
// The second micro-op is delayed, as modeled by ReadExtrHi.
//   EXTR Xn, Xm, #imm
def : WriteRes<WriteExtr, [CyUnitIS, CyUnitIS]> {
  let Latency = 2;
  let NumMicroOps = 2;
}

// The first register read of EXTR happens one cycle late, which effectively
// shortens the latency of whatever wrote that register.
//   EXTR Xn, Xm, #imm
def : ReadAdvance<ReadExtrHi, 1>;

//---
// 7.8.6. Multiplies
//---

// MUL/MNEG are aliases for MADD/MSUB.
//   MADDW,MSUBW,SMADDL,SMSUBL,UMADDL,UMSUBL
def : WriteRes<WriteIM32, [CyUnitIM]> { let Latency = 4; }
//   MADDX,MSUBX,SMULH,UMULH
def : WriteRes<WriteIM64, [CyUnitIM]> { let Latency = 5; }

//---
// 7.8.7. Divide
//---

// A 32-bit divide takes 7-13 cycles; 10 cycles covers a 20-bit quotient.
// The ID pipe is held for 2 cycles: issue and writeback.
//   SDIVW,UDIVW
def : WriteRes<WriteID32, [CyUnitID, CyUnitIntDiv]> {
  let Latency = 10;
  let ResourceCycles = [2, 10];
}
// A 64-bit divide takes 7-21 cycles; 13 cycles covers a 32-bit quotient.
// The ID pipe is held for 2 cycles: issue and writeback.
//   SDIVX,UDIVX
def : WriteRes<WriteID64, [CyUnitID, CyUnitIntDiv]> {
  let Latency = 13;
  let ResourceCycles = [2, 13];
}

//---
// 7.8.8,7.8.10. Load/Store, single element
//---

// Integer loads have 4-cycle latency and occupy one LS unit for one cycle.
def : WriteRes<WriteLD, [CyUnitLS]> { let Latency = 4; }

// Store-load forwarding is 4 cycles.
//
// Note: The store-exclusive sequence incorporates this
// latency. However, general heuristics should not model the
// dependence between a store and subsequent may-alias load because
// hardware speculation works.
def : WriteRes<WriteST, [CyUnitLS]> { let Latency = 4; }

// Load from base address plus an optionally scaled register offset.
// Rt latency is latency WriteIS + WriteLD.
//   EXAMPLE: LDR Xn, Xm [, lsl 3]
def CyWriteLDIdx : SchedWriteVariant<[
  SchedVar<ScaledIdxPred, [WriteIS, WriteLD]>, // Load from scaled register.
  SchedVar<NoSchedPred,   [WriteLD]>]>;        // Load from register offset.
def : SchedAlias<WriteLDIdx, CyWriteLDIdx>;    // Map AArch64->Cyclone type.

//   EXAMPLE: STR Xn, Xm [, lsl 3]
def CyWriteSTIdx : SchedWriteVariant<[
  SchedVar<ScaledIdxPred, [WriteIS, WriteST]>, // Store to scaled register.
  SchedVar<NoSchedPred,   [WriteST]>]>;        // Store to register offset.
def : SchedAlias<WriteSTIdx, CyWriteSTIdx>;    // Map AArch64->Cyclone type.

// The (unshifted) base register Xn is read by the second micro-op, one cycle
// later.
//   EXAMPLE: LDR Xn, Xm [, lsl 3]
def ReadBaseRS : SchedReadAdvance<1>;
def CyReadAdrBase : SchedReadVariant<[
  SchedVar<ScaledIdxPred, [ReadBaseRS]>,   // Read base reg after shifting offset.
  SchedVar<NoSchedPred,   [ReadDefault]>]>; // Read base reg with no shift.
def : SchedAlias<ReadAdrBase, CyReadAdrBase>; // Map AArch64->Cyclone type.

//---
// 7.8.9,7.8.11. Load/Store, paired
//---

// Address pre/post increment is a simple ALU op with one cycle latency.
def : WriteRes<WriteAdr, [CyUnitI]>;

// The LDP high-register write is fused with the load, but a nop micro-op
// remains.
def : WriteRes<WriteLDHi, []> { let Latency = 4; }

// STP is a vector op and store, except for QQ, which is just two stores.
def : SchedAlias<WriteSTP, WriteVSTShuffle>;
def : InstRW<[WriteST, WriteST], (instrs STPQi)>;

//---
// 7.8.13. Branches
//---

// Branches take a single micro-op.
// The misprediction penalty is defined as a SchedMachineModel property.
def : WriteRes<WriteBr,    [CyUnitB]>  { let Latency = 0; }
def : WriteRes<WriteBrReg, [CyUnitBR]> { let Latency = 0; }

//---
// 7.8.14. Never-issued Instructions, Barrier and Hint Operations
//---

// NOP,SEV,SEVL,WFE,WFI,YIELD
def : WriteRes<WriteHint, []> { let Latency = 0; }
// ISB
def : InstRW<[WriteI], (instrs ISB)>;
// CLREX,DMB,DSB
def : WriteRes<WriteBarrier, [CyUnitLS]>;

// System instructions get an invalid latency because the latency of
// other operations across them is meaningless.
def : WriteRes<WriteSys, []> { let Latency = -1; }

//===----------------------------------------------------------------------===//
// 7.9 Vector Unit Instructions

// Simple vector operations take 2 cycles.
def : WriteRes<WriteV, [CyUnitV]> { let Latency = 2; }

// Longer-latency vector write types used by Cyclone (3 to 6 cycles).
def CyWriteV3 : SchedWriteRes<[CyUnitV]> { let Latency = 3; }
def CyWriteV4 : SchedWriteRes<[CyUnitV]> { let Latency = 4; }
def CyWriteV5 : SchedWriteRes<[CyUnitV]> { let Latency = 5; }
def CyWriteV6 : SchedWriteRes<[CyUnitV]> { let Latency = 6; }

// Simple floating-point operations take 2 cycles.
def : WriteRes<WriteF, [CyUnitV]> { let Latency = 2; }

//---
// 7.9.1 Vector Moves
//---

// TODO: Add Cyclone-specific zero-cycle zeros. LLVM currently
// generates expensive int-float conversion instead:
//   FMOVDi Dd, #0.0
//   FMOVv2f64ns Vd.2d, #0.0

// FMOVSi,FMOVDi
def : WriteRes<WriteFImm, [CyUnitV]> { let Latency = 2; }

// MOVI,MVNI are WriteV
// FMOVv2f32ns,FMOVv2f64ns,FMOVv4f32ns are WriteV

// Move FPR is a register rename and single nop micro-op.
//   ORR.16b Vd,Vn,Vn
// COPY is handled above in the WriteMov Variant.
def WriteVMov : SchedWriteVariant<[
  SchedVar<WriteVMovPred, [WriteX]>,
  SchedVar<NoSchedPred,   [WriteV]>]>;
def : InstRW<[WriteVMov], (instrs ORRv16i8)>;

// FMOVSr,FMOVDr are WriteF.

// MOV V,V is a WriteV.

// CPY D,V[x] is a WriteV

// INS V[x],V[y] is a WriteV.

// FMOVWSr,FMOVXDr,FMOVXDHighr
def : WriteRes<WriteFCopy, [CyUnitLS]> { let Latency = 5; }

// FMOVSWr,FMOVDXr,FMOVDXHighr
def : InstRW<[WriteLD], (instrs FMOVSWr,FMOVDXr,FMOVDXHighr)>;

// INS V[x],R
// NOTE(review): the "INSv" and "DUPv" patterns used in this section are not
// restricted to the GPR-source forms; confirm whether the lane forms that
// nearby comments describe as WriteV are meant to be covered as well.
def CyWriteCopyToFPR : WriteSequence<[WriteVLD, WriteV]>;
def : InstRW<[CyWriteCopyToFPR], (instregex "INSv")>;

// SMOV,UMOV R,V[x]
def CyWriteCopyToGPR : WriteSequence<[WriteLD, WriteI]>;
def : InstRW<[CyWriteCopyToGPR], (instregex "SMOVv","UMOVv")>;

// DUP V,R
def : InstRW<[CyWriteCopyToFPR], (instregex "DUPv")>;

// DUP V,V[x] is a WriteV.

//---
// 7.9.2 Integer Arithmetic, Logical, and Comparisons
//---

// BIC,ORR V,#imm are WriteV

def : InstRW<[CyWriteV3], (instregex "ABSv")>;

// MVN,NEG,NOT are WriteV

def : InstRW<[CyWriteV3], (instregex "SQABSv","SQNEGv")>;

// ADDP is a WriteV.
def CyWriteVADDLP : SchedWriteRes<[CyUnitV]> { let Latency = 2; }
def : InstRW<[CyWriteVADDLP], (instregex "SADDLPv","UADDLPv")>;

def : InstRW<[CyWriteV3],
             (instregex "ADDVv","SMAXVv","UMAXVv","SMINVv","UMINVv")>;

def : InstRW<[CyWriteV3], (instregex "SADDLV","UADDLV")>;

// ADD,SUB are WriteV

// Declared ahead of its InstRW so the accumulate read-advance can name it.
def CyWriteVABD : SchedWriteRes<[CyUnitV]> { let Latency = 3; }

// Add/Diff and accumulate uses the vector multiply unit.
def CyWriteVAccum : SchedWriteRes<[CyUnitVM]> { let Latency = 3; }
// The accumulator operand is read one cycle late when it was produced by an
// accumulate, a pairwise add-long, or an absolute-difference write.
def CyReadVAccum : SchedReadAdvance<1,
                     [CyWriteVAccum, CyWriteVADDLP, CyWriteVABD]>;

def : InstRW<[CyWriteVAccum, CyReadVAccum],
             (instregex "SADALP","UADALP")>;

def : InstRW<[CyWriteVAccum, CyReadVAccum],
             (instregex "SABAv","UABAv","SABALv","UABALv")>;

def : InstRW<[CyWriteV3], (instregex "SQADDv","SQSUBv","UQADDv","UQSUBv")>;

def : InstRW<[CyWriteV3], (instregex "SUQADDv","USQADDv")>;

def : InstRW<[CyWriteV4], (instregex "ADDHNv","RADDHNv", "RSUBHNv", "SUBHNv")>;

// WriteV includes:
// AND,BIC,CMTST,EOR,ORN,ORR
// ADDP
// SHADD,SHSUB,SRHADD,UHADD,UHSUB,URHADD
// SADDL,SSUBL,UADDL,USUBL
// SADDW,SSUBW,UADDW,USUBW

def : InstRW<[CyWriteV3], (instregex "CMEQv","CMGEv","CMGTv",
                                     "CMLEv","CMLTv",
                                     "CMHIv","CMHSv")>;

def : InstRW<[CyWriteV3], (instregex "SMAXv","SMINv","UMAXv","UMINv",
                                     "SMAXPv","SMINPv","UMAXPv","UMINPv")>;

def : InstRW<[CyWriteVABD], (instregex "SABDv","UABDv",
                                       "SABDLv","UABDLv")>;

//---
// 7.9.3 Floating Point Arithmetic and Comparisons
//---

// FABS,FNEG are WriteF

def : InstRW<[CyWriteV4], (instrs FADDPv2i32p)>;
def : InstRW<[CyWriteV5], (instrs FADDPv2i64p)>;

def : InstRW<[CyWriteV3], (instregex "FMAXPv2i","FMAXNMPv2i",
                                     "FMINPv2i","FMINNMPv2i")>;

def : InstRW<[CyWriteV4], (instregex "FMAXVv","FMAXNMVv","FMINVv","FMINNMVv")>;

// Single-precision add/sub/abs-diff: 4 cycles.
def : InstRW<[CyWriteV4], (instrs FADDSrr,FADDv2f32,FADDv4f32,
                                  FSUBSrr,FSUBv2f32,FSUBv4f32,
                                  FADDPv2f32,FADDPv4f32,
                                  FABD32,FABDv2f32,FABDv4f32)>;
// Double-precision add/sub/abs-diff: 5 cycles.
def : InstRW<[CyWriteV5], (instrs FADDDrr,FADDv2f64,
                                  FSUBDrr,FSUBv2f64,
                                  FADDPv2f64,
                                  FABD64,FABDv2f64)>;

// NOTE(review): FCMGE is not in this list although the integer compare list
// above includes CMGE - confirm whether the omission is intentional.
def : InstRW<[CyWriteV3], (instregex "FCMEQ","FCMGT","FCMLE","FCMLT")>;

def : InstRW<[CyWriteV3], (instregex "FACGE","FACGT",
                                     "FMAXS","FMAXD","FMAXv",
                                     "FMINS","FMIND","FMINv",
                                     "FMAXNMS","FMAXNMD","FMAXNMv",
                                     "FMINNMS","FMINNMD","FMINNMv",
                                     "FMAXPv2f","FMAXPv4f",
                                     "FMINPv2f","FMINPv4f",
                                     "FMAXNMPv2f","FMAXNMPv4f",
                                     "FMINNMPv2f","FMINNMPv4f")>;

// FCMP,FCMPE,FCCMP,FCCMPE
def : WriteRes<WriteFCmp, [CyUnitVC]> { let Latency = 4; }

// FCSEL is a WriteF.

//---
// 7.9.4 Shifts and Bitfield Operations
//---

// SHL is a WriteV

def CyWriteVSHR : SchedWriteRes<[CyUnitV]> { let Latency = 2; }
def : InstRW<[CyWriteVSHR], (instregex "SSHRv","USHRv")>;

def CyWriteVSRSHR : SchedWriteRes<[CyUnitV]> { let Latency = 3; }
def : InstRW<[CyWriteVSRSHR], (instregex "SRSHRv","URSHRv")>;

// Shift and accumulate uses the vector multiply unit.
def CyWriteVShiftAcc : SchedWriteRes<[CyUnitVM]> { let Latency = 3; }
def CyReadVShiftAcc : SchedReadAdvance<1,
                        [CyWriteVShiftAcc, CyWriteVSHR, CyWriteVSRSHR]>;
def : InstRW<[CyWriteVShiftAcc, CyReadVShiftAcc],
             (instregex "SRSRAv","SSRAv","URSRAv","USRAv")>;

// SSHL,USHL are WriteV.

def : InstRW<[CyWriteV3], (instregex "SRSHLv","URSHLv")>;

// SQSHL,SQSHLU,UQSHL are WriteV.

def : InstRW<[CyWriteV3], (instregex "SQRSHLv","UQRSHLv")>;

// WriteV includes:
// SHLL,SSHLL,USHLL
// SLI,SRI
// BIF,BIT,BSL
// EXT
// CLS,CLZ,CNT,RBIT,REV16,REV32,REV64,XTN
// XTN2

def : InstRW<[CyWriteV4],
             (instregex "RSHRNv","SHRNv",
                        "SQRSHRNv","SQRSHRUNv","SQSHRNv","SQSHRUNv",
                        "UQRSHRNv","UQSHRNv","SQXTNv","SQXTUNv","UQXTNv")>;

//---
// 7.9.5 Multiplication
//---

def CyWriteVMul : SchedWriteRes<[CyUnitVM]> { let Latency = 4; }
def : InstRW<[CyWriteVMul], (instregex "MULv","SMULLv","UMULLv",
                                       "SQDMULLv","SQDMULHv","SQRDMULHv")>;

// FMUL,FMULX,FNMUL default to WriteFMul.
def : WriteRes<WriteFMul, [CyUnitVM]> { let Latency = 4; }

// 64-bit FP multiplies take one extra cycle.
def CyWriteV64Mul : SchedWriteRes<[CyUnitVM]> { let Latency = 5; }
def : InstRW<[CyWriteV64Mul], (instrs FMULDrr,FMULv2f64,FMULv2i64_indexed,
                                      FNMULDrr,FMULX64,FMULXv2f64,
                                      FMULXv2i64_indexed)>;

// Integer multiply-accumulate: the accumulator is read one cycle late when it
// comes from another vector multiply.
def CyReadVMulAcc : SchedReadAdvance<1, [CyWriteVMul, CyWriteV64Mul]>;
def : InstRW<[CyWriteVMul, CyReadVMulAcc],
             (instregex "MLA","MLS","SMLAL","SMLSL","UMLAL","UMLSL",
                        "SQDMLAL","SQDMLSL")>;

// FP multiply-accumulate: 8 cycles (single) / 10 cycles (double), with the
// read of a value written by the same write type advanced by 4 / 5 cycles.
def CyWriteSMul : SchedWriteRes<[CyUnitVM]> { let Latency = 8; }
def CyWriteDMul : SchedWriteRes<[CyUnitVM]> { let Latency = 10; }
def CyReadSMul : SchedReadAdvance<4, [CyWriteSMul]>;
def CyReadDMul : SchedReadAdvance<5, [CyWriteDMul]>;

// NOTE(review): FMLAv1i64_indexed sits in this list next to the 32-bit
// variants, and no FMLS counterparts are listed here while the list below
// includes FMLSv2f64 - confirm both are intended.
def : InstRW<[CyWriteSMul, CyReadSMul],
             (instrs FMADDSrrr,FMSUBSrrr,FNMADDSrrr,FNMSUBSrrr,
                     FMLAv2f32,FMLAv4f32,
                     FMLAv1i32_indexed,FMLAv1i64_indexed,FMLAv2i32_indexed)>;
def : InstRW<[CyWriteDMul, CyReadDMul],
             (instrs FMADDDrrr,FMSUBDrrr,FNMADDDrrr,FNMSUBDrrr,
                     FMLAv2f64,FMLAv2i64_indexed,
                     FMLSv2f64,FMLSv2i64_indexed)>;

def CyWritePMUL : SchedWriteRes<[CyUnitVD]> { let Latency = 3; }
def : InstRW<[CyWritePMUL], (instregex "PMULv", "PMULLv")>;

//---
// 7.9.6 Divide and Square Root
//---

// FDIV,FSQRT
// TODO: Add 64-bit variant with 19 cycle latency.
// TODO: Specialize FSQRT for longer latency.
def : WriteRes<WriteFDiv, [CyUnitVD, CyUnitFloatDiv]> {
  let Latency = 17;
  let ResourceCycles = [2, 17];
}

def : InstRW<[CyWriteV4], (instregex "FRECPEv","FRECPXv","URECPEv","URSQRTEv")>;

def WriteFRSQRTE : SchedWriteRes<[CyUnitVM]> { let Latency = 4; }
def : InstRW<[WriteFRSQRTE], (instregex "FRSQRTEv")>;

def WriteFRECPS  : SchedWriteRes<[CyUnitVM]> { let Latency = 8; }
def WriteFRSQRTS : SchedWriteRes<[CyUnitVM]> { let Latency = 10; }
def : InstRW<[WriteFRECPS],  (instregex "FRECPSv")>;
def : InstRW<[WriteFRSQRTS], (instregex "FRSQRTSv")>;

//---
// 7.9.7 Integer-FP Conversions
//---

// FCVT lengthen f16/s32
def : InstRW<[WriteV], (instrs FCVTSHr,FCVTDHr,FCVTDSr)>;

// FCVT,FCVTN,FCVTXN
// SCVTF,UCVTF V,V
// FRINT(AIMNPXZ) V,V
def : WriteRes<WriteFCvt, [CyUnitV]> { let Latency = 4; }

// SCVTF/UCVTF S/D, Rd = VLD5+V4: 9 cycles.
// The integer source is first moved into the vector unit (WriteVLD) and then
// converted (CyWriteV4), so the [SU]CVTF patterns use the to-FPR sequence.
def CyWriteCvtToFPR : WriteSequence<[WriteVLD, CyWriteV4]>;
def : InstRW<[CyWriteCvtToFPR], (instregex "[SU]CVTF[SU][WX][SD]r")>;

// FCVT Rd, S/D = V6+LD4: 10 cycles
// The value is converted in the vector unit (CyWriteV6) and then moved to a
// GPR (WriteLD), so the FCVT* patterns use the to-GPR sequence.
def CyWriteCvtToGPR : WriteSequence<[CyWriteV6, WriteLD]>;
def : InstRW<[CyWriteCvtToGPR], (instregex "FCVT[AMNPZ][SU][SU][WX][SD]r")>;

// FCVTL is a WriteV

//---
// 7.9.8-7.9.10 Cryptography, Data Transposition, Table Lookup
//---

def CyWriteCrypto2 : SchedWriteRes<[CyUnitVD]> { let Latency = 2; }
def : InstRW<[CyWriteCrypto2], (instrs AESIMCrr, AESMCrr, SHA1Hrr,
                                       AESDrr, AESErr, SHA1SU1rr, SHA256SU0rr,
                                       SHA1SU0rrr)>;

def CyWriteCrypto3 : SchedWriteRes<[CyUnitVD]> { let Latency = 3; }
def : InstRW<[CyWriteCrypto3], (instrs SHA256SU1rrr)>;

def CyWriteCrypto6 : SchedWriteRes<[CyUnitVD]> { let Latency = 6; }
def : InstRW<[CyWriteCrypto6], (instrs SHA1Crrr, SHA1Mrrr, SHA1Prrr,
                                       SHA256Hrrr,SHA256H2rrr)>;

// TRN,UZP,ZIP are WriteV.

// TBL,TBX are WriteV.

//---
// 7.9.11-7.9.14 Load/Store, single element and paired
//---

// A load into the vector unit takes 5 cycles, versus 4 for an integer load.
def : WriteRes<WriteVLD, [CyUnitLS]> { let Latency = 5; }

// Store-load forwarding is 4 cycles.
def : WriteRes<WriteVST, [CyUnitLS]> { let Latency = 4; }

// WriteVLDPair/VSTPair sequences are expanded by the target description.

//---
// 7.9.15 Load, element operations
//---

// Only the first WriteVLD and WriteAdr for writeback matches def operands.
// Subsequent WriteVLDs consume resources. Since all loaded values have the
// same latency, this is acceptable.

// Vd is read 5 cycles after issuing the vector load.
def : ReadAdvance<ReadVLD, 5>;

def : InstRW<[WriteVLD],
             (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
def : InstRW<[WriteVLD, WriteAdr],
             (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>;

// Register writes from the load's high half are fused micro-ops.
// LD1, two registers.
def : InstRW<[WriteVLD],
             (instregex "LD1Twov(8b|4h|2s|1d)$")>;
def : InstRW<[WriteVLD, WriteAdr],
             (instregex "LD1Twov(8b|4h|2s|1d)_POST")>;
def : InstRW<[WriteVLD, WriteVLD],
             (instregex "LD1Twov(16b|8h|4s|2d)$")>;
def : InstRW<[WriteVLD, WriteAdr, WriteVLD],
             (instregex "LD1Twov(16b|8h|4s|2d)_POST")>;

// LD1, three registers.
def : InstRW<[WriteVLD, WriteVLD],
             (instregex "LD1Threev(8b|4h|2s|1d)$")>;
def : InstRW<[WriteVLD, WriteAdr, WriteVLD],
             (instregex "LD1Threev(8b|4h|2s|1d)_POST")>;
def : InstRW<[WriteVLD, WriteVLD, WriteVLD],
             (instregex "LD1Threev(16b|8h|4s|2d)$")>;
def : InstRW<[WriteVLD, WriteAdr, WriteVLD, WriteVLD],
             (instregex "LD1Threev(16b|8h|4s|2d)_POST")>;

// LD1, four registers.
def : InstRW<[WriteVLD, WriteVLD],
             (instregex "LD1Fourv(8b|4h|2s|1d)$")>;
def : InstRW<[WriteVLD, WriteAdr, WriteVLD],
             (instregex "LD1Fourv(8b|4h|2s|1d)_POST")>;
def : InstRW<[WriteVLD, WriteVLD, WriteVLD, WriteVLD],
             (instregex "LD1Fourv(16b|8h|4s|2d)$")>;
def : InstRW<[WriteVLD, WriteAdr, WriteVLD, WriteVLD, WriteVLD],
             (instregex "LD1Fourv(16b|8h|4s|2d)_POST")>;

// LD1, single lane.
def : InstRW<[WriteVLDShuffle, ReadVLD],
             (instregex "LD1i(8|16|32)$")>;
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr],
             (instregex "LD1i(8|16|32)_POST")>;

def : InstRW<[WriteVLDShuffle, ReadVLD],          (instrs LD1i64)>;
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr], (instrs LD1i64_POST)>;

// LD1R, load and replicate.
def : InstRW<[WriteVLDShuffle],
             (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
def : InstRW<[WriteVLDShuffle, WriteAdr],
             (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;

// LD2, two registers.
def : InstRW<[WriteVLDShuffle, WriteV],
             (instregex "LD2Twov(8b|4h|2s)$")>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV],
             (instregex "LD2Twov(8b|4h|2s)_POST$")>;
def : InstRW<[WriteVLDShuffle, WriteVLDShuffle],
             (instregex "LD2Twov(16b|8h|4s|2d)$")>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle],
             (instregex "LD2Twov(16b|8h|4s|2d)_POST")>;

// LD2, single lane.
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV],
             (instregex "LD2i(8|16|32)$")>;
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV],
             (instregex "LD2i(8|16|32)_POST")>;
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV],
             (instregex "LD2i64$")>;
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV],
             (instregex "LD2i64_POST")>;

// LD2R, load and replicate.
def : InstRW<[WriteVLDShuffle, WriteV],
             (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV],
             (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>;

// LD3, three registers.
def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV],
             (instregex "LD3Threev(8b|4h|2s)$")>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV],
             (instregex "LD3Threev(8b|4h|2s)_POST")>;
def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteVLDShuffle],
             (instregex "LD3Threev(16b|8h|4s|2d)$")>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteVLDShuffle],
             (instregex "LD3Threev(16b|8h|4s|2d)_POST")>;

// LD3, single lane.
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV, WriteV],
             (instregex "LD3i(8|16|32)$")>;
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV, WriteV],
             (instregex "LD3i(8|16|32)_POST")>;

def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteV],
             (instregex "LD3i64$")>;
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle, WriteV],
             (instregex "LD3i64_POST")>;

// LD3R, load and replicate.
def : InstRW<[WriteVLDShuffle, WriteV, WriteV],
             (instregex "LD3Rv(8b|4h|2s|16b|8h|4s)$")>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV],
             (instregex "LD3Rv(8b|4h|2s|16b|8h|4s)_POST")>;

def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV],
             (instrs LD3Rv1d,LD3Rv2d)>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV],
             (instrs LD3Rv1d_POST,LD3Rv2d_POST)>;

// LD4, four registers.
def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV],
             (instregex "LD4Fourv(8b|4h|2s)$")>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV, WriteV],
             (instregex "LD4Fourv(8b|4h|2s)_POST")>;
def : InstRW<[WriteVLDPairShuffle, WriteVLDPairShuffle,
              WriteVLDPairShuffle, WriteVLDPairShuffle],
             (instregex "LD4Fourv(16b|8h|4s|2d)$")>;
def : InstRW<[WriteVLDPairShuffle, WriteAdr, WriteVLDPairShuffle,
              WriteVLDPairShuffle, WriteVLDPairShuffle],
             (instregex "LD4Fourv(16b|8h|4s|2d)_POST")>;

// LD4, single lane.
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV, WriteV, WriteV],
             (instregex "LD4i(8|16|32)$")>;
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV, WriteV, WriteV],
             (instregex "LD4i(8|16|32)_POST")>;

// The post-indexed form carries the same writes as LD4i64 plus WriteAdr for
// the writeback, matching every other base/_POST pair in this section.
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteV, WriteV],
             (instrs LD4i64)>;
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle, WriteV,
              WriteV],
             (instrs LD4i64_POST)>;

// LD4R, load and replicate.
def : InstRW<[WriteVLDShuffle, WriteV, WriteV, WriteV],
             (instregex "LD4Rv(8b|4h|2s|16b|8h|4s)$")>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV, WriteV],
             (instregex "LD4Rv(8b|4h|2s|16b|8h|4s)_POST")>;

def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV],
             (instrs LD4Rv1d,LD4Rv2d)>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV, WriteV],
             (instrs LD4Rv1d_POST,LD4Rv2d_POST)>;

//---
// 7.9.16 Store, element operations
//---

// Only the WriteAdr for writeback matches a def operand.
// Subsequent WriteVSTs only consume resources.

// ST1, one register.
def : InstRW<[WriteVST],
             (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
def : InstRW<[WriteAdr, WriteVST],
             (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>;

// ST1, two registers.
def : InstRW<[WriteVSTShuffle],
             (instregex "ST1Twov(8b|4h|2s|1d)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle],
             (instregex "ST1Twov(8b|4h|2s|1d)_POST")>;
def : InstRW<[WriteVST, WriteVST],
             (instregex "ST1Twov(16b|8h|4s|2d)$")>;
def : InstRW<[WriteAdr, WriteVST, WriteVST],
             (instregex "ST1Twov(16b|8h|4s|2d)_POST")>;

// ST1, three registers.
def : InstRW<[WriteVSTShuffle, WriteVST],
             (instregex "ST1Threev(8b|4h|2s|1d)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVST],
             (instregex "ST1Threev(8b|4h|2s|1d)_POST")>;
def : InstRW<[WriteVST, WriteVST, WriteVST],
             (instregex "ST1Threev(16b|8h|4s|2d)$")>;
def : InstRW<[WriteAdr, WriteVST, WriteVST, WriteVST],
             (instregex "ST1Threev(16b|8h|4s|2d)_POST")>;

// ST1, four registers.
def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],
             (instregex "ST1Fourv(8b|4h|2s|1d)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
             (instregex "ST1Fourv(8b|4h|2s|1d)_POST")>;
def : InstRW<[WriteVST, WriteVST, WriteVST, WriteVST],
             (instregex "ST1Fourv(16b|8h|4s|2d)$")>;
def : InstRW<[WriteAdr, WriteVST, WriteVST, WriteVST, WriteVST],
             (instregex "ST1Fourv(16b|8h|4s|2d)_POST")>;

// ST1, single lane.
def : InstRW<[WriteVSTShuffle],           (instregex "ST1i(8|16|32)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST1i(8|16|32)_POST")>;

def : InstRW<[WriteVSTShuffle],           (instrs ST1i64)>;
def : InstRW<[WriteAdr, WriteVSTShuffle], (instrs ST1i64_POST)>;

// ST2, two registers.
def : InstRW<[WriteVSTShuffle],
             (instregex "ST2Twov(8b|4h|2s)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle],
             (instregex "ST2Twov(8b|4h|2s)_POST")>;
def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],
             (instregex "ST2Twov(16b|8h|4s|2d)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
             (instregex "ST2Twov(16b|8h|4s|2d)_POST")>;

// ST2, single lane.
def : InstRW<[WriteVSTShuffle],           (instregex "ST2i(8|16|32)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST2i(8|16|32)_POST")>;
def : InstRW<[WriteVSTShuffle],           (instrs ST2i64)>;
def : InstRW<[WriteAdr, WriteVSTShuffle], (instrs ST2i64_POST)>;

// ST3, three registers.
def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],
             (instregex "ST3Threev(8b|4h|2s)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
             (instregex "ST3Threev(8b|4h|2s)_POST")>;
def : InstRW<[WriteVSTShuffle, WriteVSTShuffle, WriteVSTShuffle],
             (instregex "ST3Threev(16b|8h|4s|2d)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle, WriteVSTShuffle],
             (instregex "ST3Threev(16b|8h|4s|2d)_POST")>;

// ST3, single lane.
def : InstRW<[WriteVSTShuffle],           (instregex "ST3i(8|16|32)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST3i(8|16|32)_POST")>;

def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],           (instrs ST3i64)>;
def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle], (instrs ST3i64_POST)>;

// ST4, four registers.
def : InstRW<[WriteVSTPairShuffle, WriteVSTPairShuffle],
             (instregex "ST4Fourv(8b|4h|2s|1d)$")>;
def : InstRW<[WriteAdr, WriteVSTPairShuffle, WriteVSTPairShuffle],
             (instregex "ST4Fourv(8b|4h|2s|1d)_POST")>;
def : InstRW<[WriteVSTPairShuffle, WriteVSTPairShuffle,
              WriteVSTPairShuffle, WriteVSTPairShuffle],
             (instregex "ST4Fourv(16b|8h|4s|2d)$")>;
def : InstRW<[WriteAdr, WriteVSTPairShuffle, WriteVSTPairShuffle,
              WriteVSTPairShuffle, WriteVSTPairShuffle],
             (instregex "ST4Fourv(16b|8h|4s|2d)_POST")>;

// ST4, single lane.
def : InstRW<[WriteVSTPairShuffle],           (instregex "ST4i(8|16|32)$")>;
def : InstRW<[WriteAdr, WriteVSTPairShuffle], (instregex "ST4i(8|16|32)_POST")>;

def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],           (instrs ST4i64)>;
def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle], (instrs ST4i64_POST)>;

// Atomic operations are not supported.
def : WriteRes<WriteAtomic, []> {
  let Unsupported = 1;
}

//---
// Unused SchedRead types
//---

// These generic read types get no Cyclone-specific advance (0 cycles).
def : ReadAdvance<ReadI,     0>;
def : ReadAdvance<ReadISReg, 0>;
def : ReadAdvance<ReadIEReg, 0>;
def : ReadAdvance<ReadIM,    0>;
def : ReadAdvance<ReadIMA,   0>;
def : ReadAdvance<ReadID,    0>;

} // SchedModel = CycloneModel