Andrew Scull | 5e1ddfa | 2018-08-14 10:06:54 +0100 | [diff] [blame] | 1 | //===-- llvm/MC/MCSchedule.h - Scheduling -----------------------*- C++ -*-===// |
| 2 | // |
Andrew Walbran | 16937d0 | 2019-10-22 13:54:20 +0100 | [diff] [blame] | 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
Andrew Scull | 5e1ddfa | 2018-08-14 10:06:54 +0100 | [diff] [blame] | 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | // This file defines the classes used to describe a subtarget's machine model |
| 10 | // for scheduling and other instruction cost heuristics. |
| 11 | // |
| 12 | //===----------------------------------------------------------------------===// |
| 13 | |
| 14 | #ifndef LLVM_MC_MCSCHEDULE_H |
| 15 | #define LLVM_MC_MCSCHEDULE_H |
| 16 | |
| 17 | #include "llvm/ADT/Optional.h" |
Andrew Scull | cdfcccc | 2018-10-05 20:58:37 +0100 | [diff] [blame] | 18 | #include "llvm/Config/llvm-config.h" |
Andrew Scull | 5e1ddfa | 2018-08-14 10:06:54 +0100 | [diff] [blame] | 19 | #include "llvm/Support/DataTypes.h" |
| 20 | #include <cassert> |
| 21 | |
| 22 | namespace llvm { |
| 23 | |
Olivier Deprez | f4ef2d0 | 2021-04-20 13:36:24 +0200 | [diff] [blame^] | 24 | template <typename T> class ArrayRef; |
Andrew Scull | 5e1ddfa | 2018-08-14 10:06:54 +0100 | [diff] [blame] | 25 | struct InstrItinerary; |
| 26 | class MCSubtargetInfo; |
Andrew Scull | cdfcccc | 2018-10-05 20:58:37 +0100 | [diff] [blame] | 27 | class MCInstrInfo; |
| 28 | class MCInst; |
| 29 | class InstrItineraryData; |
Andrew Scull | 5e1ddfa | 2018-08-14 10:06:54 +0100 | [diff] [blame] | 30 | |
| 31 | /// Define a kind of processor resource that will be modeled by the scheduler. |
| 32 | struct MCProcResourceDesc { |
| 33 | const char *Name; |
| 34 | unsigned NumUnits; // Number of resource of this kind |
| 35 | unsigned SuperIdx; // Index of the resources kind that contains this kind. |
| 36 | |
| 37 | // Number of resources that may be buffered. |
| 38 | // |
| 39 | // Buffered resources (BufferSize != 0) may be consumed at some indeterminate |
| 40 | // cycle after dispatch. This should be used for out-of-order cpus when |
| 41 | // instructions that use this resource can be buffered in a reservaton |
| 42 | // station. |
| 43 | // |
| 44 | // Unbuffered resources (BufferSize == 0) always consume their resource some |
| 45 | // fixed number of cycles after dispatch. If a resource is unbuffered, then |
| 46 | // the scheduler will avoid scheduling instructions with conflicting resources |
| 47 | // in the same cycle. This is for in-order cpus, or the in-order portion of |
| 48 | // an out-of-order cpus. |
| 49 | int BufferSize; |
| 50 | |
| 51 | // If the resource has sub-units, a pointer to the first element of an array |
| 52 | // of `NumUnits` elements containing the ProcResourceIdx of the sub units. |
| 53 | // nullptr if the resource does not have sub-units. |
| 54 | const unsigned *SubUnitsIdxBegin; |
| 55 | |
| 56 | bool operator==(const MCProcResourceDesc &Other) const { |
| 57 | return NumUnits == Other.NumUnits && SuperIdx == Other.SuperIdx |
| 58 | && BufferSize == Other.BufferSize; |
| 59 | } |
| 60 | }; |
| 61 | |
| 62 | /// Identify one of the processor resource kinds consumed by a particular |
| 63 | /// scheduling class for the specified number of cycles. |
| 64 | struct MCWriteProcResEntry { |
| 65 | uint16_t ProcResourceIdx; |
| 66 | uint16_t Cycles; |
| 67 | |
| 68 | bool operator==(const MCWriteProcResEntry &Other) const { |
| 69 | return ProcResourceIdx == Other.ProcResourceIdx && Cycles == Other.Cycles; |
| 70 | } |
| 71 | }; |
| 72 | |
| 73 | /// Specify the latency in cpu cycles for a particular scheduling class and def |
| 74 | /// index. -1 indicates an invalid latency. Heuristics would typically consider |
| 75 | /// an instruction with invalid latency to have infinite latency. Also identify |
| 76 | /// the WriteResources of this def. When the operand expands to a sequence of |
| 77 | /// writes, this ID is the last write in the sequence. |
| 78 | struct MCWriteLatencyEntry { |
| 79 | int16_t Cycles; |
| 80 | uint16_t WriteResourceID; |
| 81 | |
| 82 | bool operator==(const MCWriteLatencyEntry &Other) const { |
| 83 | return Cycles == Other.Cycles && WriteResourceID == Other.WriteResourceID; |
| 84 | } |
| 85 | }; |
| 86 | |
| 87 | /// Specify the number of cycles allowed after instruction issue before a |
| 88 | /// particular use operand reads its registers. This effectively reduces the |
| 89 | /// write's latency. Here we allow negative cycles for corner cases where |
| 90 | /// latency increases. This rule only applies when the entry's WriteResource |
| 91 | /// matches the write's WriteResource. |
| 92 | /// |
| 93 | /// MCReadAdvanceEntries are sorted first by operand index (UseIdx), then by |
| 94 | /// WriteResourceIdx. |
| 95 | struct MCReadAdvanceEntry { |
| 96 | unsigned UseIdx; |
| 97 | unsigned WriteResourceID; |
| 98 | int Cycles; |
| 99 | |
| 100 | bool operator==(const MCReadAdvanceEntry &Other) const { |
| 101 | return UseIdx == Other.UseIdx && WriteResourceID == Other.WriteResourceID |
| 102 | && Cycles == Other.Cycles; |
| 103 | } |
| 104 | }; |
| 105 | |
| 106 | /// Summarize the scheduling resources required for an instruction of a |
| 107 | /// particular scheduling class. |
| 108 | /// |
| 109 | /// Defined as an aggregate struct for creating tables with initializer lists. |
| 110 | struct MCSchedClassDesc { |
| 111 | static const unsigned short InvalidNumMicroOps = (1U << 14) - 1; |
| 112 | static const unsigned short VariantNumMicroOps = InvalidNumMicroOps - 1; |
| 113 | |
| 114 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 115 | const char* Name; |
| 116 | #endif |
| 117 | uint16_t NumMicroOps : 14; |
| 118 | bool BeginGroup : 1; |
| 119 | bool EndGroup : 1; |
| 120 | uint16_t WriteProcResIdx; // First index into WriteProcResTable. |
| 121 | uint16_t NumWriteProcResEntries; |
| 122 | uint16_t WriteLatencyIdx; // First index into WriteLatencyTable. |
| 123 | uint16_t NumWriteLatencyEntries; |
| 124 | uint16_t ReadAdvanceIdx; // First index into ReadAdvanceTable. |
| 125 | uint16_t NumReadAdvanceEntries; |
| 126 | |
| 127 | bool isValid() const { |
| 128 | return NumMicroOps != InvalidNumMicroOps; |
| 129 | } |
| 130 | bool isVariant() const { |
| 131 | return NumMicroOps == VariantNumMicroOps; |
| 132 | } |
| 133 | }; |
| 134 | |
Andrew Scull | cdfcccc | 2018-10-05 20:58:37 +0100 | [diff] [blame] | 135 | /// Specify the cost of a register definition in terms of number of physical |
| 136 | /// register allocated at register renaming stage. For example, AMD Jaguar. |
| 137 | /// natively supports 128-bit data types, and operations on 256-bit registers |
| 138 | /// (i.e. YMM registers) are internally split into two COPs (complex operations) |
| 139 | /// and each COP updates a physical register. Basically, on Jaguar, a YMM |
| 140 | /// register write effectively consumes two physical registers. That means, |
| 141 | /// the cost of a YMM write in the BtVer2 model is 2. |
| 142 | struct MCRegisterCostEntry { |
| 143 | unsigned RegisterClassID; |
| 144 | unsigned Cost; |
Andrew Walbran | 16937d0 | 2019-10-22 13:54:20 +0100 | [diff] [blame] | 145 | bool AllowMoveElimination; |
Andrew Scull | cdfcccc | 2018-10-05 20:58:37 +0100 | [diff] [blame] | 146 | }; |
| 147 | |
| 148 | /// A register file descriptor. |
| 149 | /// |
| 150 | /// This struct allows to describe processor register files. In particular, it |
| 151 | /// helps describing the size of the register file, as well as the cost of |
| 152 | /// allocating a register file at register renaming stage. |
| 153 | /// FIXME: this struct can be extended to provide information about the number |
| 154 | /// of read/write ports to the register file. A value of zero for field |
| 155 | /// 'NumPhysRegs' means: this register file has an unbounded number of physical |
| 156 | /// registers. |
| 157 | struct MCRegisterFileDesc { |
| 158 | const char *Name; |
| 159 | uint16_t NumPhysRegs; |
| 160 | uint16_t NumRegisterCostEntries; |
| 161 | // Index of the first cost entry in MCExtraProcessorInfo::RegisterCostTable. |
| 162 | uint16_t RegisterCostEntryIdx; |
Andrew Walbran | 16937d0 | 2019-10-22 13:54:20 +0100 | [diff] [blame] | 163 | // A value of zero means: there is no limit in the number of moves that can be |
| 164 | // eliminated every cycle. |
| 165 | uint16_t MaxMovesEliminatedPerCycle; |
| 166 | // Ture if this register file only knows how to optimize register moves from |
| 167 | // known zero registers. |
| 168 | bool AllowZeroMoveEliminationOnly; |
Andrew Scull | cdfcccc | 2018-10-05 20:58:37 +0100 | [diff] [blame] | 169 | }; |
| 170 | |
| 171 | /// Provide extra details about the machine processor. |
| 172 | /// |
| 173 | /// This is a collection of "optional" processor information that is not |
| 174 | /// normally used by the LLVM machine schedulers, but that can be consumed by |
| 175 | /// external tools like llvm-mca to improve the quality of the peformance |
| 176 | /// analysis. |
| 177 | struct MCExtraProcessorInfo { |
| 178 | // Actual size of the reorder buffer in hardware. |
| 179 | unsigned ReorderBufferSize; |
| 180 | // Number of instructions retired per cycle. |
| 181 | unsigned MaxRetirePerCycle; |
| 182 | const MCRegisterFileDesc *RegisterFiles; |
| 183 | unsigned NumRegisterFiles; |
| 184 | const MCRegisterCostEntry *RegisterCostTable; |
| 185 | unsigned NumRegisterCostEntries; |
Andrew Walbran | 16937d0 | 2019-10-22 13:54:20 +0100 | [diff] [blame] | 186 | unsigned LoadQueueID; |
| 187 | unsigned StoreQueueID; |
Andrew Scull | cdfcccc | 2018-10-05 20:58:37 +0100 | [diff] [blame] | 188 | }; |
| 189 | |
Andrew Scull | 5e1ddfa | 2018-08-14 10:06:54 +0100 | [diff] [blame] | 190 | /// Machine model for scheduling, bundling, and heuristics. |
| 191 | /// |
| 192 | /// The machine model directly provides basic information about the |
| 193 | /// microarchitecture to the scheduler in the form of properties. It also |
| 194 | /// optionally refers to scheduler resource tables and itinerary |
| 195 | /// tables. Scheduler resource tables model the latency and cost for each |
| 196 | /// instruction type. Itinerary tables are an independent mechanism that |
| 197 | /// provides a detailed reservation table describing each cycle of instruction |
| 198 | /// execution. Subtargets may define any or all of the above categories of data |
| 199 | /// depending on the type of CPU and selected scheduler. |
Andrew Scull | cdfcccc | 2018-10-05 20:58:37 +0100 | [diff] [blame] | 200 | /// |
| 201 | /// The machine independent properties defined here are used by the scheduler as |
| 202 | /// an abstract machine model. A real micro-architecture has a number of |
| 203 | /// buffers, queues, and stages. Declaring that a given machine-independent |
| 204 | /// abstract property corresponds to a specific physical property across all |
| 205 | /// subtargets can't be done. Nonetheless, the abstract model is |
| 206 | /// useful. Futhermore, subtargets typically extend this model with processor |
| 207 | /// specific resources to model any hardware features that can be exploited by |
Olivier Deprez | f4ef2d0 | 2021-04-20 13:36:24 +0200 | [diff] [blame^] | 208 | /// scheduling heuristics and aren't sufficiently represented in the abstract. |
Andrew Scull | cdfcccc | 2018-10-05 20:58:37 +0100 | [diff] [blame] | 209 | /// |
| 210 | /// The abstract pipeline is built around the notion of an "issue point". This |
| 211 | /// is merely a reference point for counting machine cycles. The physical |
| 212 | /// machine will have pipeline stages that delay execution. The scheduler does |
| 213 | /// not model those delays because they are irrelevant as long as they are |
| 214 | /// consistent. Inaccuracies arise when instructions have different execution |
| 215 | /// delays relative to each other, in addition to their intrinsic latency. Those |
| 216 | /// special cases can be handled by TableGen constructs such as, ReadAdvance, |
| 217 | /// which reduces latency when reading data, and ResourceCycles, which consumes |
| 218 | /// a processor resource when writing data for a number of abstract |
| 219 | /// cycles. |
| 220 | /// |
| 221 | /// TODO: One tool currently missing is the ability to add a delay to |
| 222 | /// ResourceCycles. That would be easy to add and would likely cover all cases |
| 223 | /// currently handled by the legacy itinerary tables. |
| 224 | /// |
| 225 | /// A note on out-of-order execution and, more generally, instruction |
| 226 | /// buffers. Part of the CPU pipeline is always in-order. The issue point, which |
| 227 | /// is the point of reference for counting cycles, only makes sense as an |
| 228 | /// in-order part of the pipeline. Other parts of the pipeline are sometimes |
| 229 | /// falling behind and sometimes catching up. It's only interesting to model |
| 230 | /// those other, decoupled parts of the pipeline if they may be predictably |
| 231 | /// resource constrained in a way that the scheduler can exploit. |
| 232 | /// |
| 233 | /// The LLVM machine model distinguishes between in-order constraints and |
| 234 | /// out-of-order constraints so that the target's scheduling strategy can apply |
| 235 | /// appropriate heuristics. For a well-balanced CPU pipeline, out-of-order |
| 236 | /// resources would not typically be treated as a hard scheduling |
| 237 | /// constraint. For example, in the GenericScheduler, a delay caused by limited |
| 238 | /// out-of-order resources is not directly reflected in the number of cycles |
| 239 | /// that the scheduler sees between issuing an instruction and its dependent |
| 240 | /// instructions. In other words, out-of-order resources don't directly increase |
| 241 | /// the latency between pairs of instructions. However, they can still be used |
| 242 | /// to detect potential bottlenecks across a sequence of instructions and bias |
| 243 | /// the scheduling heuristics appropriately. |
Andrew Scull | 5e1ddfa | 2018-08-14 10:06:54 +0100 | [diff] [blame] | 244 | struct MCSchedModel { |
| 245 | // IssueWidth is the maximum number of instructions that may be scheduled in |
Andrew Scull | cdfcccc | 2018-10-05 20:58:37 +0100 | [diff] [blame] | 246 | // the same per-cycle group. This is meant to be a hard in-order constraint |
| 247 | // (a.k.a. "hazard"). In the GenericScheduler strategy, no more than |
| 248 | // IssueWidth micro-ops can ever be scheduled in a particular cycle. |
| 249 | // |
| 250 | // In practice, IssueWidth is useful to model any bottleneck between the |
| 251 | // decoder (after micro-op expansion) and the out-of-order reservation |
| 252 | // stations or the decoder bandwidth itself. If the total number of |
| 253 | // reservation stations is also a bottleneck, or if any other pipeline stage |
| 254 | // has a bandwidth limitation, then that can be naturally modeled by adding an |
| 255 | // out-of-order processor resource. |
Andrew Scull | 5e1ddfa | 2018-08-14 10:06:54 +0100 | [diff] [blame] | 256 | unsigned IssueWidth; |
| 257 | static const unsigned DefaultIssueWidth = 1; |
| 258 | |
| 259 | // MicroOpBufferSize is the number of micro-ops that the processor may buffer |
| 260 | // for out-of-order execution. |
| 261 | // |
| 262 | // "0" means operations that are not ready in this cycle are not considered |
| 263 | // for scheduling (they go in the pending queue). Latency is paramount. This |
| 264 | // may be more efficient if many instructions are pending in a schedule. |
| 265 | // |
| 266 | // "1" means all instructions are considered for scheduling regardless of |
| 267 | // whether they are ready in this cycle. Latency still causes issue stalls, |
| 268 | // but we balance those stalls against other heuristics. |
| 269 | // |
| 270 | // "> 1" means the processor is out-of-order. This is a machine independent |
| 271 | // estimate of highly machine specific characteristics such as the register |
| 272 | // renaming pool and reorder buffer. |
| 273 | unsigned MicroOpBufferSize; |
| 274 | static const unsigned DefaultMicroOpBufferSize = 0; |
| 275 | |
| 276 | // LoopMicroOpBufferSize is the number of micro-ops that the processor may |
| 277 | // buffer for optimized loop execution. More generally, this represents the |
| 278 | // optimal number of micro-ops in a loop body. A loop may be partially |
| 279 | // unrolled to bring the count of micro-ops in the loop body closer to this |
| 280 | // number. |
| 281 | unsigned LoopMicroOpBufferSize; |
| 282 | static const unsigned DefaultLoopMicroOpBufferSize = 0; |
| 283 | |
| 284 | // LoadLatency is the expected latency of load instructions. |
| 285 | unsigned LoadLatency; |
| 286 | static const unsigned DefaultLoadLatency = 4; |
| 287 | |
| 288 | // HighLatency is the expected latency of "very high latency" operations. |
| 289 | // See TargetInstrInfo::isHighLatencyDef(). |
| 290 | // By default, this is set to an arbitrarily high number of cycles |
| 291 | // likely to have some impact on scheduling heuristics. |
| 292 | unsigned HighLatency; |
| 293 | static const unsigned DefaultHighLatency = 10; |
| 294 | |
| 295 | // MispredictPenalty is the typical number of extra cycles the processor |
| 296 | // takes to recover from a branch misprediction. |
| 297 | unsigned MispredictPenalty; |
| 298 | static const unsigned DefaultMispredictPenalty = 10; |
| 299 | |
| 300 | bool PostRAScheduler; // default value is false |
| 301 | |
| 302 | bool CompleteModel; |
| 303 | |
| 304 | unsigned ProcID; |
| 305 | const MCProcResourceDesc *ProcResourceTable; |
| 306 | const MCSchedClassDesc *SchedClassTable; |
| 307 | unsigned NumProcResourceKinds; |
| 308 | unsigned NumSchedClasses; |
| 309 | // Instruction itinerary tables used by InstrItineraryData. |
| 310 | friend class InstrItineraryData; |
| 311 | const InstrItinerary *InstrItineraries; |
| 312 | |
Andrew Scull | cdfcccc | 2018-10-05 20:58:37 +0100 | [diff] [blame] | 313 | const MCExtraProcessorInfo *ExtraProcessorInfo; |
| 314 | |
| 315 | bool hasExtraProcessorInfo() const { return ExtraProcessorInfo; } |
| 316 | |
Andrew Scull | 5e1ddfa | 2018-08-14 10:06:54 +0100 | [diff] [blame] | 317 | unsigned getProcessorID() const { return ProcID; } |
| 318 | |
| 319 | /// Does this machine model include instruction-level scheduling. |
| 320 | bool hasInstrSchedModel() const { return SchedClassTable; } |
| 321 | |
Andrew Scull | cdfcccc | 2018-10-05 20:58:37 +0100 | [diff] [blame] | 322 | const MCExtraProcessorInfo &getExtraProcessorInfo() const { |
| 323 | assert(hasExtraProcessorInfo() && |
| 324 | "No extra information available for this model"); |
| 325 | return *ExtraProcessorInfo; |
| 326 | } |
| 327 | |
Andrew Scull | 5e1ddfa | 2018-08-14 10:06:54 +0100 | [diff] [blame] | 328 | /// Return true if this machine model data for all instructions with a |
| 329 | /// scheduling class (itinerary class or SchedRW list). |
| 330 | bool isComplete() const { return CompleteModel; } |
| 331 | |
| 332 | /// Return true if machine supports out of order execution. |
| 333 | bool isOutOfOrder() const { return MicroOpBufferSize > 1; } |
| 334 | |
| 335 | unsigned getNumProcResourceKinds() const { |
| 336 | return NumProcResourceKinds; |
| 337 | } |
| 338 | |
| 339 | const MCProcResourceDesc *getProcResource(unsigned ProcResourceIdx) const { |
| 340 | assert(hasInstrSchedModel() && "No scheduling machine model"); |
| 341 | |
| 342 | assert(ProcResourceIdx < NumProcResourceKinds && "bad proc resource idx"); |
| 343 | return &ProcResourceTable[ProcResourceIdx]; |
| 344 | } |
| 345 | |
| 346 | const MCSchedClassDesc *getSchedClassDesc(unsigned SchedClassIdx) const { |
| 347 | assert(hasInstrSchedModel() && "No scheduling machine model"); |
| 348 | |
| 349 | assert(SchedClassIdx < NumSchedClasses && "bad scheduling class idx"); |
| 350 | return &SchedClassTable[SchedClassIdx]; |
| 351 | } |
| 352 | |
| 353 | /// Returns the latency value for the scheduling class. |
| 354 | static int computeInstrLatency(const MCSubtargetInfo &STI, |
| 355 | const MCSchedClassDesc &SCDesc); |
| 356 | |
Andrew Scull | cdfcccc | 2018-10-05 20:58:37 +0100 | [diff] [blame] | 357 | int computeInstrLatency(const MCSubtargetInfo &STI, unsigned SClass) const; |
| 358 | int computeInstrLatency(const MCSubtargetInfo &STI, const MCInstrInfo &MCII, |
| 359 | const MCInst &Inst) const; |
| 360 | |
| 361 | // Returns the reciprocal throughput information from a MCSchedClassDesc. |
| 362 | static double |
Andrew Scull | 5e1ddfa | 2018-08-14 10:06:54 +0100 | [diff] [blame] | 363 | getReciprocalThroughput(const MCSubtargetInfo &STI, |
| 364 | const MCSchedClassDesc &SCDesc); |
| 365 | |
Andrew Scull | cdfcccc | 2018-10-05 20:58:37 +0100 | [diff] [blame] | 366 | static double |
| 367 | getReciprocalThroughput(unsigned SchedClass, const InstrItineraryData &IID); |
| 368 | |
| 369 | double |
| 370 | getReciprocalThroughput(const MCSubtargetInfo &STI, const MCInstrInfo &MCII, |
| 371 | const MCInst &Inst) const; |
| 372 | |
Andrew Walbran | 16937d0 | 2019-10-22 13:54:20 +0100 | [diff] [blame] | 373 | /// Returns the maximum forwarding delay for register reads dependent on |
| 374 | /// writes of scheduling class WriteResourceIdx. |
| 375 | static unsigned getForwardingDelayCycles(ArrayRef<MCReadAdvanceEntry> Entries, |
| 376 | unsigned WriteResourceIdx = 0); |
| 377 | |
Andrew Scull | 5e1ddfa | 2018-08-14 10:06:54 +0100 | [diff] [blame] | 378 | /// Returns the default initialized model. |
| 379 | static const MCSchedModel &GetDefaultSchedModel() { return Default; } |
| 380 | static const MCSchedModel Default; |
| 381 | }; |
| 382 | |
Andrew Scull | cdfcccc | 2018-10-05 20:58:37 +0100 | [diff] [blame] | 383 | } // namespace llvm |
Andrew Scull | 5e1ddfa | 2018-08-14 10:06:54 +0100 | [diff] [blame] | 384 | |
| 385 | #endif |