//===-- PPCScheduleP7.td - PPC P7 Scheduling Definitions ---*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the itinerary class data for the POWER7 processor.
//
//===----------------------------------------------------------------------===//

// Primary reference:
// IBM POWER7 multicore server processor
// B. Sinharoy, et al.
// IBM J. Res. & Dev. (55) 3. May/June 2011.

// Scheduling for the P7 involves tracking two types of resources:
//  1. The dispatch bundle slots
//  2. The functional unit resources

// Dispatch units: P7_DU1 through P7_DU6, one FuncUnit record per slot.
foreach i = [1, 2, 3, 4, 5, 6] in
  def P7_DU#i : FuncUnit;

// Load/Store pipelines.
def P7_LS1 : FuncUnit; // pipeline 1
def P7_LS2 : FuncUnit; // pipeline 2

// FX pipelines.
def P7_FX1 : FuncUnit; // pipeline 1
def P7_FX2 : FuncUnit; // pipeline 2

// VS pipelines.
def P7_VS1 : FuncUnit; // pipeline 1 (vector integer ops. always here)
def P7_VS2 : FuncUnit; // pipeline 2 (128-bit stores and perms. here)

def P7_CRU : FuncUnit; // CR unit (CR logicals and move-from-SPRs)
def P7_BRU : FuncUnit; // BR unit

// Notes:
// Each LSU pipeline can also execute FX add and logical instructions.
// Each LSU pipeline can complete a load or store in one cycle.
//
// Each store is broken into two parts, AGEN goes to the LSU while a
// "data steering" op. goes to the FXU or VSU.
//
// FX loads have a two cycle load-to-use latency (so one "bubble" cycle).
// VSU loads have a three cycle load-to-use latency (so two "bubble" cycles).
//
// Frequent FX ops. take only one cycle and results can be used again in the
// next cycle (there is a self-bypass). Getting results from the other FX
// pipeline takes an additional cycle.
//
// The VSU XS is similar to the POWER6, but with a pipeline length of 2 cycles
// (instead of 3 cycles on the POWER6). VSU XS handles vector FX-style ops.
// Dispatch of an instruction to VS1 that uses four single prec. inputs
// (either to a float or XC op). prevents dispatch in that cycle to VS2 of any
// floating point instruction.
//
// The VSU PM is similar to the POWER6, but with a pipeline length of 3 cycles
// (instead of 4 cycles on the POWER6). vsel is handled by the PM pipeline
// (unlike on the POWER6).
//
// FMA from the VSUs can forward results in 6 cycles. VS1 XS and vector FP
// share the same write-back, and have a 5-cycle latency difference, so the
// IFU/IDU will not dispatch an XS instruction 5 cycles after a vector FP
// op. has been dispatched to VS1.
//
// Three cycles after an L1 cache hit, a dependent VSU instruction can issue.
//
// Instruction dispatch groups have (at most) four non-branch instructions, and
// two branches. Unlike on the POWER4/5, a branch does not automatically
// end the dispatch group, but a second branch must be the last in the group.

// Itinerary table for the POWER7.
//
// Each InstrItinData entry gives, for one itinerary class:
//   * the pipeline stages: InstrStage<cycles, [candidate units], next-delay>.
//     A trailing 0 means the following stage starts in the same cycle, which
//     is how a dispatch slot is reserved together with the first execution
//     unit.
//   * the operand cycle list that follows the stage list.
//
// An entry that lists P7_DU1, P7_DU2, ... as separate stages reserves all of
// those dispatch slots; an entry that lists them inside one stage needs any
// one of them.
def P7Itineraries : ProcessorItineraries<
  [P7_DU1, P7_DU2, P7_DU3, P7_DU4, P7_DU5, P7_DU6,
   P7_LS1, P7_LS2, P7_FX1, P7_FX2, P7_VS1, P7_VS2, P7_CRU, P7_BRU], [], [
  //===--- Fixed-point -----------------------------------------------------===//
  InstrItinData<IIC_IntSimple,
                [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>,
                 InstrStage<1, [P7_FX1, P7_FX2, P7_LS1, P7_LS2]>],
                [1, 1, 1]>,
  InstrItinData<IIC_IntGeneral,
                [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>,
                 InstrStage<1, [P7_FX1, P7_FX2]>],
                [1, 1, 1]>,
  InstrItinData<IIC_IntISEL,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_FX1, P7_FX2], 0>,
                 InstrStage<1, [P7_BRU]>],
                [1, 1, 1, 1]>,
  InstrItinData<IIC_IntCompare,
                [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>,
                 InstrStage<1, [P7_FX1, P7_FX2]>],
                [1, 1, 1]>,
  // FIXME: Add record-form itinerary data.
  InstrItinData<IIC_IntDivW,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_DU2], 0>,
                 InstrStage<36, [P7_FX1, P7_FX2]>],
                [36, 1, 1]>,
  InstrItinData<IIC_IntDivD,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_DU2], 0>,
                 InstrStage<68, [P7_FX1, P7_FX2]>],
                [68, 1, 1]>,
  InstrItinData<IIC_IntMulHW,
                [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>,
                 InstrStage<1, [P7_FX1, P7_FX2]>],
                [4, 1, 1]>,
  InstrItinData<IIC_IntMulHWU,
                [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>,
                 InstrStage<1, [P7_FX1, P7_FX2]>],
                [4, 1, 1]>,
  InstrItinData<IIC_IntMulHD,
                [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>,
                 InstrStage<1, [P7_FX1, P7_FX2]>],
                [4, 1, 1]>,
  InstrItinData<IIC_IntMulLI,
                [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>,
                 InstrStage<1, [P7_FX1, P7_FX2]>],
                [4, 1, 1]>,
  InstrItinData<IIC_IntRotate,
                [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>,
                 InstrStage<1, [P7_FX1, P7_FX2]>],
                [1, 1, 1]>,
  InstrItinData<IIC_IntRotateD,
                [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>,
                 InstrStage<1, [P7_FX1, P7_FX2]>],
                [1, 1, 1]>,
  InstrItinData<IIC_IntRotateDI,
                [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>,
                 InstrStage<1, [P7_FX1, P7_FX2]>],
                [1, 1, 1]>,
  InstrItinData<IIC_IntShift,
                [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>,
                 InstrStage<1, [P7_FX1, P7_FX2]>],
                [1, 1, 1]>,
  InstrItinData<IIC_IntTrapW,
                [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>,
                 InstrStage<1, [P7_FX1, P7_FX2]>],
                [1, 1]>,
  InstrItinData<IIC_IntTrapD,
                [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>,
                 InstrStage<1, [P7_FX1, P7_FX2]>],
                [1, 1]>,

  //===--- Branch / CR -----------------------------------------------------===//
  InstrItinData<IIC_BrB,
                [InstrStage<1, [P7_DU5, P7_DU6], 0>,
                 InstrStage<1, [P7_BRU]>],
                [3, 1, 1]>,
  InstrItinData<IIC_BrCR,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_CRU]>],
                [3, 1, 1]>,
  InstrItinData<IIC_BrMCR,
                [InstrStage<1, [P7_DU5, P7_DU6], 0>,
                 InstrStage<1, [P7_BRU]>],
                [3, 1, 1]>,
  // IIC_BrMCRX is defined once, further down (the "mtcr" entry).

  //===--- Loads -----------------------------------------------------------===//
  InstrItinData<IIC_LdStLoad,
                [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>,
                 InstrStage<1, [P7_LS1, P7_LS2]>],
                [2, 1, 1]>,
  InstrItinData<IIC_LdStLoadUpd,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_DU2], 0>,
                 InstrStage<1, [P7_LS1, P7_LS2], 0>,
                 InstrStage<1, [P7_FX1, P7_FX2]>],
                [2, 2, 1, 1]>,
  InstrItinData<IIC_LdStLoadUpdX,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_DU2], 0>,
                 InstrStage<1, [P7_DU3], 0>,
                 InstrStage<1, [P7_DU4], 0>,
                 InstrStage<1, [P7_FX1, P7_FX2]>,
                 InstrStage<1, [P7_LS1, P7_LS2], 0>,
                 InstrStage<1, [P7_FX1, P7_FX2]>],
                [3, 3, 1, 1]>,
  InstrItinData<IIC_LdStLD,
                [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>,
                 InstrStage<1, [P7_LS1, P7_LS2]>],
                [2, 1, 1]>,
  InstrItinData<IIC_LdStLDU,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_DU2], 0>,
                 InstrStage<1, [P7_LS1, P7_LS2], 0>,
                 InstrStage<1, [P7_FX1, P7_FX2]>],
                [2, 2, 1, 1]>,
  InstrItinData<IIC_LdStLDUX,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_DU2], 0>,
                 InstrStage<1, [P7_DU3], 0>,
                 InstrStage<1, [P7_DU4], 0>,
                 InstrStage<1, [P7_FX1, P7_FX2]>,
                 InstrStage<1, [P7_LS1, P7_LS2], 0>,
                 InstrStage<1, [P7_FX1, P7_FX2]>],
                [3, 3, 1, 1]>,
  InstrItinData<IIC_LdStLFD,
                [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>,
                 InstrStage<1, [P7_LS1, P7_LS2]>],
                [3, 1, 1]>,
  InstrItinData<IIC_LdStLVecX,
                [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>,
                 InstrStage<1, [P7_LS1, P7_LS2]>],
                [3, 1, 1]>,
  InstrItinData<IIC_LdStLFDU,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_DU2], 0>,
                 InstrStage<1, [P7_LS1, P7_LS2], 0>,
                 InstrStage<1, [P7_FX1, P7_FX2]>],
                [3, 3, 1, 1]>,
  InstrItinData<IIC_LdStLFDUX,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_DU2], 0>,
                 InstrStage<1, [P7_LS1, P7_LS2], 0>,
                 InstrStage<1, [P7_FX1, P7_FX2]>],
                [3, 3, 1, 1]>,
  InstrItinData<IIC_LdStLHA,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_DU2], 0>,
                 InstrStage<1, [P7_LS1, P7_LS2]>,
                 InstrStage<1, [P7_FX1, P7_FX2]>],
                [3, 1, 1]>,
  InstrItinData<IIC_LdStLHAU,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_DU2], 0>,
                 InstrStage<1, [P7_LS1, P7_LS2], 0>,
                 InstrStage<1, [P7_FX1, P7_FX2]>,
                 InstrStage<1, [P7_FX1, P7_FX2]>],
                [4, 4, 1, 1]>,
  InstrItinData<IIC_LdStLHAUX,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_DU2], 0>,
                 InstrStage<1, [P7_DU3], 0>,
                 InstrStage<1, [P7_DU4], 0>,
                 InstrStage<1, [P7_FX1, P7_FX2]>,
                 InstrStage<1, [P7_LS1, P7_LS2], 0>,
                 InstrStage<1, [P7_FX1, P7_FX2]>,
                 InstrStage<1, [P7_FX1, P7_FX2]>],
                [4, 4, 1, 1]>,
  InstrItinData<IIC_LdStLWA,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_DU2], 0>,
                 InstrStage<1, [P7_LS1, P7_LS2]>,
                 InstrStage<1, [P7_FX1, P7_FX2]>],
                [3, 1, 1]>,
  InstrItinData<IIC_LdStLWARX,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_DU2], 0>,
                 InstrStage<1, [P7_DU3], 0>,
                 InstrStage<1, [P7_DU4], 0>,
                 InstrStage<1, [P7_LS1, P7_LS2]>],
                [3, 1, 1]>,
  InstrItinData<IIC_LdStLDARX,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_DU2], 0>,
                 InstrStage<1, [P7_DU3], 0>,
                 InstrStage<1, [P7_DU4], 0>,
                 InstrStage<1, [P7_LS1, P7_LS2]>],
                [3, 1, 1]>,
  InstrItinData<IIC_LdStLMW,
                [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>,
                 InstrStage<1, [P7_LS1, P7_LS2]>],
                [2, 1, 1]>,

  //===--- Stores ----------------------------------------------------------===//
  InstrItinData<IIC_LdStStore,
                [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>,
                 InstrStage<1, [P7_LS1, P7_LS2], 0>,
                 InstrStage<1, [P7_FX1, P7_FX2]>],
                [1, 1, 1]>,
  InstrItinData<IIC_LdStSTD,
                [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>,
                 InstrStage<1, [P7_LS1, P7_LS2], 0>,
                 InstrStage<1, [P7_FX1, P7_FX2]>],
                [1, 1, 1]>,
  InstrItinData<IIC_LdStSTU,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_DU2], 0>,
                 InstrStage<1, [P7_LS1, P7_LS2], 0>,
                 InstrStage<1, [P7_FX1, P7_FX2]>,
                 InstrStage<1, [P7_FX1, P7_FX2]>],
                [2, 1, 1, 1]>,
  InstrItinData<IIC_LdStSTUX,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_DU2], 0>,
                 InstrStage<1, [P7_DU3], 0>,
                 InstrStage<1, [P7_DU4], 0>,
                 InstrStage<1, [P7_LS1, P7_LS2], 0>,
                 InstrStage<1, [P7_FX1, P7_FX2]>,
                 InstrStage<1, [P7_FX1, P7_FX2]>],
                [2, 1, 1, 1]>,
  InstrItinData<IIC_LdStSTFD,
                [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>,
                 InstrStage<1, [P7_LS1, P7_LS2], 0>,
                 InstrStage<1, [P7_VS1, P7_VS2]>],
                [1, 1, 1]>,
  InstrItinData<IIC_LdStSTFDU,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_DU2], 0>,
                 InstrStage<1, [P7_LS1, P7_LS2], 0>,
                 InstrStage<1, [P7_FX1, P7_FX2], 0>,
                 InstrStage<1, [P7_VS1, P7_VS2]>],
                [2, 1, 1, 1]>,
  InstrItinData<IIC_LdStSTVEBX,
                [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>,
                 InstrStage<1, [P7_LS1, P7_LS2], 0>,
                 InstrStage<1, [P7_VS2]>],
                [1, 1, 1]>,
  InstrItinData<IIC_LdStSTDCX,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_DU2], 0>,
                 InstrStage<1, [P7_DU3], 0>,
                 InstrStage<1, [P7_DU4], 0>,
                 InstrStage<1, [P7_LS1, P7_LS2]>],
                [1, 1, 1]>,
  InstrItinData<IIC_LdStSTWCX,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_DU2], 0>,
                 InstrStage<1, [P7_DU3], 0>,
                 InstrStage<1, [P7_DU4], 0>,
                 InstrStage<1, [P7_LS1, P7_LS2]>],
                [1, 1, 1]>,

  //===--- CR / SPR moves --------------------------------------------------===//
  // mtcr. This is the single IIC_BrMCRX entry. The table used to carry a
  // second, earlier IIC_BrMCRX entry (DU5/DU6 dispatch, BRU, [3, 1, 1]); when
  // a class is listed twice only the last entry is kept for the processor, so
  // that earlier entry never took effect and has been removed.
  InstrItinData<IIC_BrMCRX,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_DU2], 0>,
                 InstrStage<1, [P7_DU3], 0>,
                 InstrStage<1, [P7_DU4], 0>,
                 InstrStage<1, [P7_CRU]>,
                 InstrStage<1, [P7_FX1, P7_FX2]>],
                [3, 1]>,
  InstrItinData<IIC_SprMFCR,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_CRU]>],
                [6, 1]>,
  InstrItinData<IIC_SprMFCRF,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_CRU]>],
                [3, 1]>,
  // mtctr
  InstrItinData<IIC_SprMTSPR,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_FX1]>],
                [4, 1]>,

  //===--- Scalar floating point -------------------------------------------===//
  InstrItinData<IIC_FPGeneral,
                [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>,
                 InstrStage<1, [P7_VS1, P7_VS2]>],
                [5, 1, 1]>,
  InstrItinData<IIC_FPAddSub,
                [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>,
                 InstrStage<1, [P7_VS1, P7_VS2]>],
                [5, 1, 1]>,
  InstrItinData<IIC_FPCompare,
                [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>,
                 InstrStage<1, [P7_VS1, P7_VS2]>],
                [8, 1, 1]>,
  InstrItinData<IIC_FPDivD,
                [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>,
                 InstrStage<1, [P7_VS1, P7_VS2]>],
                [33, 1, 1]>,
  InstrItinData<IIC_FPDivS,
                [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>,
                 InstrStage<1, [P7_VS1, P7_VS2]>],
                [27, 1, 1]>,
  InstrItinData<IIC_FPSqrtD,
                [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>,
                 InstrStage<1, [P7_VS1, P7_VS2]>],
                [44, 1, 1]>,
  InstrItinData<IIC_FPSqrtS,
                [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>,
                 InstrStage<1, [P7_VS1, P7_VS2]>],
                [32, 1, 1]>,
  InstrItinData<IIC_FPFused,
                [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>,
                 InstrStage<1, [P7_VS1, P7_VS2]>],
                [5, 1, 1, 1]>,
  InstrItinData<IIC_FPRes,
                [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>,
                 InstrStage<1, [P7_VS1, P7_VS2]>],
                [5, 1, 1]>,

  //===--- Vector ----------------------------------------------------------===//
  InstrItinData<IIC_VecGeneral,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_VS1]>],
                [2, 1, 1]>,
  InstrItinData<IIC_VecVSL,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_VS1]>],
                [2, 1, 1]>,
  InstrItinData<IIC_VecVSR,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_VS1]>],
                [2, 1, 1]>,
  InstrItinData<IIC_VecFP,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_VS1, P7_VS2]>],
                [6, 1, 1]>,
  InstrItinData<IIC_VecFPCompare,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_VS1, P7_VS2]>],
                [6, 1, 1]>,
  InstrItinData<IIC_VecFPRound,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_VS1, P7_VS2]>],
                [6, 1, 1]>,
  InstrItinData<IIC_VecComplex,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_VS1]>],
                [7, 1, 1]>,
  InstrItinData<IIC_VecPerm,
                [InstrStage<1, [P7_DU1, P7_DU2], 0>,
                 InstrStage<1, [P7_VS2]>],
                [3, 1, 1]>
]>;

// ===---------------------------------------------------------------------===//
// P7 machine model for scheduling and other instruction cost heuristics.

def P7Model : SchedMachineModel {
  let IssueWidth = 6;  // 4 (non-branch) instructions are dispatched per cycle.
                       // Note that the dispatch bundle size is 6 (including
                       // branches), but the total internal issue bandwidth per
                       // cycle (from all queues) is 8.

  let LoadLatency = 3; // Optimistic load latency assuming bypass.
                       // This is overridden by OperandCycles if the
                       // Itineraries are queried instead.
  let MispredictPenalty = 16;

  // Try to make sure we have at least 10 dispatch groups in a loop.
  let LoopMicroOpBufferSize = 40;

  // The model does not claim to cover every instruction.
  let CompleteModel = 0;

  // Per-class stage and operand-cycle data comes from the table above.
  let Itineraries = P7Itineraries;
}