Blame - linux-x64/clang/include/llvm/MC/MCSchedule.h - hafnium/prebuilts

blob: ee0e5b4df9f0edf6426ab24336ce4e3cf18de03b [file] [log] [blame]

Andrew Scull	5e1ddfa	2018-08-14 10:06:54 +0100	[diff] [blame]	1	//===-- llvm/MC/MCSchedule.h - Scheduling ------------------------ C++ --===//
				2	//
Andrew Walbran	16937d0	2019-10-22 13:54:20 +0100	[diff] [blame]	3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
				4	// See https://llvm.org/LICENSE.txt for license information.
				5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
Andrew Scull	5e1ddfa	2018-08-14 10:06:54 +0100	[diff] [blame]	6	//
				7	//===----------------------------------------------------------------------===//
				8	//
				9	// This file defines the classes used to describe a subtarget's machine model
				10	// for scheduling and other instruction cost heuristics.
				11	//
				12	//===----------------------------------------------------------------------===//
				13
				14	#ifndef LLVM_MC_MCSCHEDULE_H
				15	#define LLVM_MC_MCSCHEDULE_H
				16
				17	#include "llvm/ADT/Optional.h"
Andrew Scull	cdfcccc	2018-10-05 20:58:37 +0100	[diff] [blame]	18	#include "llvm/Config/llvm-config.h"
Andrew Scull	5e1ddfa	2018-08-14 10:06:54 +0100	[diff] [blame]	19	#include "llvm/Support/DataTypes.h"
				20	#include <cassert>
				21
				22	namespace llvm {
				23
Olivier Deprez	f4ef2d0	2021-04-20 13:36:24 +0200	[diff] [blame^]	24	template <typename T> class ArrayRef;
Andrew Scull	5e1ddfa	2018-08-14 10:06:54 +0100	[diff] [blame]	25	struct InstrItinerary;
				26	class MCSubtargetInfo;
Andrew Scull	cdfcccc	2018-10-05 20:58:37 +0100	[diff] [blame]	27	class MCInstrInfo;
				28	class MCInst;
				29	class InstrItineraryData;
Andrew Scull	5e1ddfa	2018-08-14 10:06:54 +0100	[diff] [blame]	30
				31	/// Define a kind of processor resource that will be modeled by the scheduler.
				32	struct MCProcResourceDesc {
				33	const char *Name;
				34	unsigned NumUnits; // Number of resource of this kind
				35	unsigned SuperIdx; // Index of the resources kind that contains this kind.
				36
				37	// Number of resources that may be buffered.
				38	//
				39	// Buffered resources (BufferSize != 0) may be consumed at some indeterminate
				40	// cycle after dispatch. This should be used for out-of-order cpus when
				41	// instructions that use this resource can be buffered in a reservaton
				42	// station.
				43	//
				44	// Unbuffered resources (BufferSize == 0) always consume their resource some
				45	// fixed number of cycles after dispatch. If a resource is unbuffered, then
				46	// the scheduler will avoid scheduling instructions with conflicting resources
				47	// in the same cycle. This is for in-order cpus, or the in-order portion of
				48	// an out-of-order cpus.
				49	int BufferSize;
				50
				51	// If the resource has sub-units, a pointer to the first element of an array
				52	// of `NumUnits` elements containing the ProcResourceIdx of the sub units.
				53	// nullptr if the resource does not have sub-units.
				54	const unsigned *SubUnitsIdxBegin;
				55
				56	bool operator==(const MCProcResourceDesc &Other) const {
				57	return NumUnits == Other.NumUnits && SuperIdx == Other.SuperIdx
				58	&& BufferSize == Other.BufferSize;
				59	}
				60	};
				61
				62	/// Identify one of the processor resource kinds consumed by a particular
				63	/// scheduling class for the specified number of cycles.
				64	struct MCWriteProcResEntry {
				65	uint16_t ProcResourceIdx;
				66	uint16_t Cycles;
				67
				68	bool operator==(const MCWriteProcResEntry &Other) const {
				69	return ProcResourceIdx == Other.ProcResourceIdx && Cycles == Other.Cycles;
				70	}
				71	};
				72
				73	/// Specify the latency in cpu cycles for a particular scheduling class and def
				74	/// index. -1 indicates an invalid latency. Heuristics would typically consider
				75	/// an instruction with invalid latency to have infinite latency. Also identify
				76	/// the WriteResources of this def. When the operand expands to a sequence of
				77	/// writes, this ID is the last write in the sequence.
				78	struct MCWriteLatencyEntry {
				79	int16_t Cycles;
				80	uint16_t WriteResourceID;
				81
				82	bool operator==(const MCWriteLatencyEntry &Other) const {
				83	return Cycles == Other.Cycles && WriteResourceID == Other.WriteResourceID;
				84	}
				85	};
				86
				87	/// Specify the number of cycles allowed after instruction issue before a
				88	/// particular use operand reads its registers. This effectively reduces the
				89	/// write's latency. Here we allow negative cycles for corner cases where
				90	/// latency increases. This rule only applies when the entry's WriteResource
				91	/// matches the write's WriteResource.
				92	///
				93	/// MCReadAdvanceEntries are sorted first by operand index (UseIdx), then by
				94	/// WriteResourceIdx.
				95	struct MCReadAdvanceEntry {
				96	unsigned UseIdx;
				97	unsigned WriteResourceID;
				98	int Cycles;
				99
				100	bool operator==(const MCReadAdvanceEntry &Other) const {
				101	return UseIdx == Other.UseIdx && WriteResourceID == Other.WriteResourceID
				102	&& Cycles == Other.Cycles;
				103	}
				104	};
				105
				106	/// Summarize the scheduling resources required for an instruction of a
				107	/// particular scheduling class.
				108	///
				109	/// Defined as an aggregate struct for creating tables with initializer lists.
				110	struct MCSchedClassDesc {
				111	static const unsigned short InvalidNumMicroOps = (1U << 14) - 1;
				112	static const unsigned short VariantNumMicroOps = InvalidNumMicroOps - 1;
				113
				114	#if !defined(NDEBUG) \|\| defined(LLVM_ENABLE_DUMP)
				115	const char* Name;
				116	#endif
				117	uint16_t NumMicroOps : 14;
				118	bool BeginGroup : 1;
				119	bool EndGroup : 1;
				120	uint16_t WriteProcResIdx; // First index into WriteProcResTable.
				121	uint16_t NumWriteProcResEntries;
				122	uint16_t WriteLatencyIdx; // First index into WriteLatencyTable.
				123	uint16_t NumWriteLatencyEntries;
				124	uint16_t ReadAdvanceIdx; // First index into ReadAdvanceTable.
				125	uint16_t NumReadAdvanceEntries;
				126
				127	bool isValid() const {
				128	return NumMicroOps != InvalidNumMicroOps;
				129	}
				130	bool isVariant() const {
				131	return NumMicroOps == VariantNumMicroOps;
				132	}
				133	};
				134
Andrew Scull	cdfcccc	2018-10-05 20:58:37 +0100	[diff] [blame]	135	/// Specify the cost of a register definition in terms of number of physical
				136	/// register allocated at register renaming stage. For example, AMD Jaguar.
				137	/// natively supports 128-bit data types, and operations on 256-bit registers
				138	/// (i.e. YMM registers) are internally split into two COPs (complex operations)
				139	/// and each COP updates a physical register. Basically, on Jaguar, a YMM
				140	/// register write effectively consumes two physical registers. That means,
				141	/// the cost of a YMM write in the BtVer2 model is 2.
				142	struct MCRegisterCostEntry {
				143	unsigned RegisterClassID;
				144	unsigned Cost;
Andrew Walbran	16937d0	2019-10-22 13:54:20 +0100	[diff] [blame]	145	bool AllowMoveElimination;
Andrew Scull	cdfcccc	2018-10-05 20:58:37 +0100	[diff] [blame]	146	};
				147
				148	/// A register file descriptor.
				149	///
				150	/// This struct allows to describe processor register files. In particular, it
				151	/// helps describing the size of the register file, as well as the cost of
				152	/// allocating a register file at register renaming stage.
				153	/// FIXME: this struct can be extended to provide information about the number
				154	/// of read/write ports to the register file. A value of zero for field
				155	/// 'NumPhysRegs' means: this register file has an unbounded number of physical
				156	/// registers.
				157	struct MCRegisterFileDesc {
				158	const char *Name;
				159	uint16_t NumPhysRegs;
				160	uint16_t NumRegisterCostEntries;
				161	// Index of the first cost entry in MCExtraProcessorInfo::RegisterCostTable.
				162	uint16_t RegisterCostEntryIdx;
Andrew Walbran	16937d0	2019-10-22 13:54:20 +0100	[diff] [blame]	163	// A value of zero means: there is no limit in the number of moves that can be
				164	// eliminated every cycle.
				165	uint16_t MaxMovesEliminatedPerCycle;
				166	// Ture if this register file only knows how to optimize register moves from
				167	// known zero registers.
				168	bool AllowZeroMoveEliminationOnly;
Andrew Scull	cdfcccc	2018-10-05 20:58:37 +0100	[diff] [blame]	169	};
				170
				171	/// Provide extra details about the machine processor.
				172	///
				173	/// This is a collection of "optional" processor information that is not
				174	/// normally used by the LLVM machine schedulers, but that can be consumed by
				175	/// external tools like llvm-mca to improve the quality of the peformance
				176	/// analysis.
				177	struct MCExtraProcessorInfo {
				178	// Actual size of the reorder buffer in hardware.
				179	unsigned ReorderBufferSize;
				180	// Number of instructions retired per cycle.
				181	unsigned MaxRetirePerCycle;
				182	const MCRegisterFileDesc *RegisterFiles;
				183	unsigned NumRegisterFiles;
				184	const MCRegisterCostEntry *RegisterCostTable;
				185	unsigned NumRegisterCostEntries;
Andrew Walbran	16937d0	2019-10-22 13:54:20 +0100	[diff] [blame]	186	unsigned LoadQueueID;
				187	unsigned StoreQueueID;
Andrew Scull	cdfcccc	2018-10-05 20:58:37 +0100	[diff] [blame]	188	};
				189
Andrew Scull	5e1ddfa	2018-08-14 10:06:54 +0100	[diff] [blame]	190	/// Machine model for scheduling, bundling, and heuristics.
				191	///
				192	/// The machine model directly provides basic information about the
				193	/// microarchitecture to the scheduler in the form of properties. It also
				194	/// optionally refers to scheduler resource tables and itinerary
				195	/// tables. Scheduler resource tables model the latency and cost for each
				196	/// instruction type. Itinerary tables are an independent mechanism that
				197	/// provides a detailed reservation table describing each cycle of instruction
				198	/// execution. Subtargets may define any or all of the above categories of data
				199	/// depending on the type of CPU and selected scheduler.
Andrew Scull	cdfcccc	2018-10-05 20:58:37 +0100	[diff] [blame]	200	///
				201	/// The machine independent properties defined here are used by the scheduler as
				202	/// an abstract machine model. A real micro-architecture has a number of
				203	/// buffers, queues, and stages. Declaring that a given machine-independent
				204	/// abstract property corresponds to a specific physical property across all
				205	/// subtargets can't be done. Nonetheless, the abstract model is
				206	/// useful. Futhermore, subtargets typically extend this model with processor
				207	/// specific resources to model any hardware features that can be exploited by
Olivier Deprez	f4ef2d0	2021-04-20 13:36:24 +0200	[diff] [blame^]	208	/// scheduling heuristics and aren't sufficiently represented in the abstract.
Andrew Scull	cdfcccc	2018-10-05 20:58:37 +0100	[diff] [blame]	209	///
				210	/// The abstract pipeline is built around the notion of an "issue point". This
				211	/// is merely a reference point for counting machine cycles. The physical
				212	/// machine will have pipeline stages that delay execution. The scheduler does
				213	/// not model those delays because they are irrelevant as long as they are
				214	/// consistent. Inaccuracies arise when instructions have different execution
				215	/// delays relative to each other, in addition to their intrinsic latency. Those
				216	/// special cases can be handled by TableGen constructs such as, ReadAdvance,
				217	/// which reduces latency when reading data, and ResourceCycles, which consumes
				218	/// a processor resource when writing data for a number of abstract
				219	/// cycles.
				220	///
				221	/// TODO: One tool currently missing is the ability to add a delay to
				222	/// ResourceCycles. That would be easy to add and would likely cover all cases
				223	/// currently handled by the legacy itinerary tables.
				224	///
				225	/// A note on out-of-order execution and, more generally, instruction
				226	/// buffers. Part of the CPU pipeline is always in-order. The issue point, which
				227	/// is the point of reference for counting cycles, only makes sense as an
				228	/// in-order part of the pipeline. Other parts of the pipeline are sometimes
				229	/// falling behind and sometimes catching up. It's only interesting to model
				230	/// those other, decoupled parts of the pipeline if they may be predictably
				231	/// resource constrained in a way that the scheduler can exploit.
				232	///
				233	/// The LLVM machine model distinguishes between in-order constraints and
				234	/// out-of-order constraints so that the target's scheduling strategy can apply
				235	/// appropriate heuristics. For a well-balanced CPU pipeline, out-of-order
				236	/// resources would not typically be treated as a hard scheduling
				237	/// constraint. For example, in the GenericScheduler, a delay caused by limited
				238	/// out-of-order resources is not directly reflected in the number of cycles
				239	/// that the scheduler sees between issuing an instruction and its dependent
				240	/// instructions. In other words, out-of-order resources don't directly increase
				241	/// the latency between pairs of instructions. However, they can still be used
				242	/// to detect potential bottlenecks across a sequence of instructions and bias
				243	/// the scheduling heuristics appropriately.
Andrew Scull	5e1ddfa	2018-08-14 10:06:54 +0100	[diff] [blame]	244	struct MCSchedModel {
				245	// IssueWidth is the maximum number of instructions that may be scheduled in
Andrew Scull	cdfcccc	2018-10-05 20:58:37 +0100	[diff] [blame]	246	// the same per-cycle group. This is meant to be a hard in-order constraint
				247	// (a.k.a. "hazard"). In the GenericScheduler strategy, no more than
				248	// IssueWidth micro-ops can ever be scheduled in a particular cycle.
				249	//
				250	// In practice, IssueWidth is useful to model any bottleneck between the
				251	// decoder (after micro-op expansion) and the out-of-order reservation
				252	// stations or the decoder bandwidth itself. If the total number of
				253	// reservation stations is also a bottleneck, or if any other pipeline stage
				254	// has a bandwidth limitation, then that can be naturally modeled by adding an
				255	// out-of-order processor resource.
Andrew Scull	5e1ddfa	2018-08-14 10:06:54 +0100	[diff] [blame]	256	unsigned IssueWidth;
				257	static const unsigned DefaultIssueWidth = 1;
				258
				259	// MicroOpBufferSize is the number of micro-ops that the processor may buffer
				260	// for out-of-order execution.
				261	//
				262	// "0" means operations that are not ready in this cycle are not considered
				263	// for scheduling (they go in the pending queue). Latency is paramount. This
				264	// may be more efficient if many instructions are pending in a schedule.
				265	//
				266	// "1" means all instructions are considered for scheduling regardless of
				267	// whether they are ready in this cycle. Latency still causes issue stalls,
				268	// but we balance those stalls against other heuristics.
				269	//
				270	// "> 1" means the processor is out-of-order. This is a machine independent
				271	// estimate of highly machine specific characteristics such as the register
				272	// renaming pool and reorder buffer.
				273	unsigned MicroOpBufferSize;
				274	static const unsigned DefaultMicroOpBufferSize = 0;
				275
				276	// LoopMicroOpBufferSize is the number of micro-ops that the processor may
				277	// buffer for optimized loop execution. More generally, this represents the
				278	// optimal number of micro-ops in a loop body. A loop may be partially
				279	// unrolled to bring the count of micro-ops in the loop body closer to this
				280	// number.
				281	unsigned LoopMicroOpBufferSize;
				282	static const unsigned DefaultLoopMicroOpBufferSize = 0;
				283
				284	// LoadLatency is the expected latency of load instructions.
				285	unsigned LoadLatency;
				286	static const unsigned DefaultLoadLatency = 4;
				287
				288	// HighLatency is the expected latency of "very high latency" operations.
				289	// See TargetInstrInfo::isHighLatencyDef().
				290	// By default, this is set to an arbitrarily high number of cycles
				291	// likely to have some impact on scheduling heuristics.
				292	unsigned HighLatency;
				293	static const unsigned DefaultHighLatency = 10;
				294
				295	// MispredictPenalty is the typical number of extra cycles the processor
				296	// takes to recover from a branch misprediction.
				297	unsigned MispredictPenalty;
				298	static const unsigned DefaultMispredictPenalty = 10;
				299
				300	bool PostRAScheduler; // default value is false
				301
				302	bool CompleteModel;
				303
				304	unsigned ProcID;
				305	const MCProcResourceDesc *ProcResourceTable;
				306	const MCSchedClassDesc *SchedClassTable;
				307	unsigned NumProcResourceKinds;
				308	unsigned NumSchedClasses;
				309	// Instruction itinerary tables used by InstrItineraryData.
				310	friend class InstrItineraryData;
				311	const InstrItinerary *InstrItineraries;
				312
Andrew Scull	cdfcccc	2018-10-05 20:58:37 +0100	[diff] [blame]	313	const MCExtraProcessorInfo *ExtraProcessorInfo;
				314
				315	bool hasExtraProcessorInfo() const { return ExtraProcessorInfo; }
				316
Andrew Scull	5e1ddfa	2018-08-14 10:06:54 +0100	[diff] [blame]	317	unsigned getProcessorID() const { return ProcID; }
				318
				319	/// Does this machine model include instruction-level scheduling.
				320	bool hasInstrSchedModel() const { return SchedClassTable; }
				321
Andrew Scull	cdfcccc	2018-10-05 20:58:37 +0100	[diff] [blame]	322	const MCExtraProcessorInfo &getExtraProcessorInfo() const {
				323	assert(hasExtraProcessorInfo() &&
				324	"No extra information available for this model");
				325	return *ExtraProcessorInfo;
				326	}
				327
Andrew Scull	5e1ddfa	2018-08-14 10:06:54 +0100	[diff] [blame]	328	/// Return true if this machine model data for all instructions with a
				329	/// scheduling class (itinerary class or SchedRW list).
				330	bool isComplete() const { return CompleteModel; }
				331
				332	/// Return true if machine supports out of order execution.
				333	bool isOutOfOrder() const { return MicroOpBufferSize > 1; }
				334
				335	unsigned getNumProcResourceKinds() const {
				336	return NumProcResourceKinds;
				337	}
				338
				339	const MCProcResourceDesc *getProcResource(unsigned ProcResourceIdx) const {
				340	assert(hasInstrSchedModel() && "No scheduling machine model");
				341
				342	assert(ProcResourceIdx < NumProcResourceKinds && "bad proc resource idx");
				343	return &ProcResourceTable[ProcResourceIdx];
				344	}
				345
				346	const MCSchedClassDesc *getSchedClassDesc(unsigned SchedClassIdx) const {
				347	assert(hasInstrSchedModel() && "No scheduling machine model");
				348
				349	assert(SchedClassIdx < NumSchedClasses && "bad scheduling class idx");
				350	return &SchedClassTable[SchedClassIdx];
				351	}
				352
				353	/// Returns the latency value for the scheduling class.
				354	static int computeInstrLatency(const MCSubtargetInfo &STI,
				355	const MCSchedClassDesc &SCDesc);
				356
Andrew Scull	cdfcccc	2018-10-05 20:58:37 +0100	[diff] [blame]	357	int computeInstrLatency(const MCSubtargetInfo &STI, unsigned SClass) const;
				358	int computeInstrLatency(const MCSubtargetInfo &STI, const MCInstrInfo &MCII,
				359	const MCInst &Inst) const;
				360
				361	// Returns the reciprocal throughput information from a MCSchedClassDesc.
				362	static double
Andrew Scull	5e1ddfa	2018-08-14 10:06:54 +0100	[diff] [blame]	363	getReciprocalThroughput(const MCSubtargetInfo &STI,
				364	const MCSchedClassDesc &SCDesc);
				365
Andrew Scull	cdfcccc	2018-10-05 20:58:37 +0100	[diff] [blame]	366	static double
				367	getReciprocalThroughput(unsigned SchedClass, const InstrItineraryData &IID);
				368
				369	double
				370	getReciprocalThroughput(const MCSubtargetInfo &STI, const MCInstrInfo &MCII,
				371	const MCInst &Inst) const;
				372
Andrew Walbran	16937d0	2019-10-22 13:54:20 +0100	[diff] [blame]	373	/// Returns the maximum forwarding delay for register reads dependent on
				374	/// writes of scheduling class WriteResourceIdx.
				375	static unsigned getForwardingDelayCycles(ArrayRef<MCReadAdvanceEntry> Entries,
				376	unsigned WriteResourceIdx = 0);
				377
Andrew Scull	5e1ddfa	2018-08-14 10:06:54 +0100	[diff] [blame]	378	/// Returns the default initialized model.
				379	static const MCSchedModel &GetDefaultSchedModel() { return Default; }
				380	static const MCSchedModel Default;
				381	};
				382
Andrew Scull	cdfcccc	2018-10-05 20:58:37 +0100	[diff] [blame]	383	} // namespace llvm
Andrew Scull	5e1ddfa	2018-08-14 10:06:54 +0100	[diff] [blame]	384
				385	#endif