//===-- llvm/MC/MCSchedule.h - Scheduling -----------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the classes used to describe a subtarget's machine model
// for scheduling and other instruction cost heuristics.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_MC_MCSCHEDULE_H
#define LLVM_MC_MCSCHEDULE_H

#include "llvm/ADT/Optional.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/Support/DataTypes.h"
#include <cassert>

namespace llvm {

struct InstrItinerary;
class MCSubtargetInfo;
class MCInstrInfo;
class MCInst;
class InstrItineraryData;

/// Define a kind of processor resource that will be modeled by the scheduler.
struct MCProcResourceDesc {
  const char *Name;
  unsigned NumUnits; // Number of resources of this kind.
  unsigned SuperIdx; // Index of the resource kind that contains this kind.

  // Number of resources that may be buffered.
  //
  // Buffered resources (BufferSize != 0) may be consumed at some indeterminate
  // cycle after dispatch. This should be used for out-of-order cpus when
  // instructions that use this resource can be buffered in a reservation
  // station.
  //
  // Unbuffered resources (BufferSize == 0) always consume their resource some
  // fixed number of cycles after dispatch. If a resource is unbuffered, then
  // the scheduler will avoid scheduling instructions with conflicting resources
  // in the same cycle. This is for in-order cpus, or the in-order portion of
  // an out-of-order cpu.
  int BufferSize;

  // If the resource has sub-units, a pointer to the first element of an array
  // of `NumUnits` elements containing the ProcResourceIdx of the sub units.
  // nullptr if the resource does not have sub-units.
  const unsigned *SubUnitsIdxBegin;

  bool operator==(const MCProcResourceDesc &Other) const {
    return NumUnits == Other.NumUnits && SuperIdx == Other.SuperIdx
      && BufferSize == Other.BufferSize;
  }
};
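
// As an illustration of how these fields fit together, here is a minimal
// sketch of a resource table. The names, unit counts, buffer sizes, and
// indices below are invented for the example and do not describe any real
// subtarget; in practice such tables are generated by TableGen.
//
//   static const unsigned MyFPUUnits[] = {2, 3}; // Indices of the FPU pipes.
//   static const MCProcResourceDesc MyProcResources[] = {
//     {"InvalidUnit", 0, 0, 0, nullptr},                    // Index 0 unused.
//     {"MyALU", 1, /*SuperIdx=*/0, /*BufferSize=*/20, nullptr},  // Buffered.
//     {"MyFPUPipe0", 1, /*SuperIdx=*/4, /*BufferSize=*/0, nullptr}, // In-order.
//     {"MyFPUPipe1", 1, /*SuperIdx=*/4, /*BufferSize=*/0, nullptr},
//     {"MyFPU", 2, /*SuperIdx=*/0, /*BufferSize=*/0, MyFPUUnits}, // Group of 2.
//   };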

/// Identify one of the processor resource kinds consumed by a particular
/// scheduling class for the specified number of cycles.
struct MCWriteProcResEntry {
  uint16_t ProcResourceIdx;
  uint16_t Cycles;

  bool operator==(const MCWriteProcResEntry &Other) const {
    return ProcResourceIdx == Other.ProcResourceIdx && Cycles == Other.Cycles;
  }
};

/// Specify the latency in cpu cycles for a particular scheduling class and def
/// index. -1 indicates an invalid latency. Heuristics would typically consider
/// an instruction with invalid latency to have infinite latency. Also identify
/// the WriteResources of this def. When the operand expands to a sequence of
/// writes, this ID is the last write in the sequence.
struct MCWriteLatencyEntry {
  int16_t Cycles;
  uint16_t WriteResourceID;

  bool operator==(const MCWriteLatencyEntry &Other) const {
    return Cycles == Other.Cycles && WriteResourceID == Other.WriteResourceID;
  }
};

/// Specify the number of cycles allowed after instruction issue before a
/// particular use operand reads its registers. This effectively reduces the
/// write's latency. Here we allow negative cycles for corner cases where
/// latency increases. This rule only applies when the entry's WriteResource
/// matches the write's WriteResource.
///
/// MCReadAdvanceEntries are sorted first by operand index (UseIdx), then by
/// WriteResourceIdx.
struct MCReadAdvanceEntry {
  unsigned UseIdx;
  unsigned WriteResourceID;
  int Cycles;

  bool operator==(const MCReadAdvanceEntry &Other) const {
    return UseIdx == Other.UseIdx && WriteResourceID == Other.WriteResourceID
      && Cycles == Other.Cycles;
  }
};
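
// A short worked example of how MCWriteLatencyEntry and MCReadAdvanceEntry
// combine (the numbers are illustrative, not taken from any real model): if a
// def has a write latency of 5 cycles and the consuming operand has a matching
// read-advance of 2 cycles, the scheduler sees an effective latency of 3
// cycles between the two instructions.
//
//   int16_t WriteCycles = 5;       // MCWriteLatencyEntry::Cycles
//   int ReadAdvance = 2;           // MCReadAdvanceEntry::Cycles
//   int Effective = WriteCycles - ReadAdvance; // == 3
//   // A negative ReadAdvance would instead increase the effective latency.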

/// Summarize the scheduling resources required for an instruction of a
/// particular scheduling class.
///
/// Defined as an aggregate struct for creating tables with initializer lists.
struct MCSchedClassDesc {
  static const unsigned short InvalidNumMicroOps = (1U << 14) - 1;
  static const unsigned short VariantNumMicroOps = InvalidNumMicroOps - 1;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  const char* Name;
#endif
  uint16_t NumMicroOps : 14;
  bool BeginGroup : 1;
  bool EndGroup : 1;
  uint16_t WriteProcResIdx; // First index into WriteProcResTable.
  uint16_t NumWriteProcResEntries;
  uint16_t WriteLatencyIdx; // First index into WriteLatencyTable.
  uint16_t NumWriteLatencyEntries;
  uint16_t ReadAdvanceIdx; // First index into ReadAdvanceTable.
  uint16_t NumReadAdvanceEntries;

  bool isValid() const {
    return NumMicroOps != InvalidNumMicroOps;
  }
  bool isVariant() const {
    return NumMicroOps == VariantNumMicroOps;
  }
};
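
// A minimal sketch of how a scheduling class is typically queried. It assumes
// the accessors declared in llvm/MC/MCSubtargetInfo.h (getSchedModel,
// getWriteProcResBegin/End); "SchedClassID" stands for a class index taken
// from the instruction's MCInstrDesc, and error handling is omitted:
//
//   const MCSchedModel &SM = STI.getSchedModel();
//   const MCSchedClassDesc &SC = *SM.getSchedClassDesc(SchedClassID);
//   if (SC.isValid() && !SC.isVariant())
//     for (const MCWriteProcResEntry &WPR :
//          make_range(STI.getWriteProcResBegin(&SC),
//                     STI.getWriteProcResEnd(&SC)))
//       dbgs() << SM.getProcResource(WPR.ProcResourceIdx)->Name << " used for "
//              << WPR.Cycles << " cycles\n";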

/// Specify the cost of a register definition in terms of the number of
/// physical registers allocated at register renaming stage. For example, AMD
/// Jaguar natively supports 128-bit data types; operations on 256-bit
/// registers (i.e. YMM registers) are internally split into two COPs (complex
/// operations), and each COP updates a physical register. In effect, a YMM
/// register write on Jaguar consumes two physical registers, so the cost of a
/// YMM write in the BtVer2 model is 2.
struct MCRegisterCostEntry {
  unsigned RegisterClassID;
  unsigned Cost;
};

/// A register file descriptor.
///
/// This struct describes a processor register file. In particular, it helps
/// describe the size of the register file, as well as the cost of allocating
/// registers from it at register renaming stage.
/// FIXME: this struct can be extended to provide information about the number
/// of read/write ports to the register file. A value of zero for field
/// 'NumPhysRegs' means: this register file has an unbounded number of physical
/// registers.
struct MCRegisterFileDesc {
  const char *Name;
  uint16_t NumPhysRegs;
  uint16_t NumRegisterCostEntries;
  // Index of the first cost entry in MCExtraProcessorInfo::RegisterCostTable.
  uint16_t RegisterCostEntryIdx;
};

/// Provide extra details about the machine processor.
///
/// This is a collection of "optional" processor information that is not
/// normally used by the LLVM machine schedulers, but that can be consumed by
/// external tools like llvm-mca to improve the quality of the performance
/// analysis.
struct MCExtraProcessorInfo {
  // Actual size of the reorder buffer in hardware.
  unsigned ReorderBufferSize;
  // Number of instructions retired per cycle.
  unsigned MaxRetirePerCycle;
  const MCRegisterFileDesc *RegisterFiles;
  unsigned NumRegisterFiles;
  const MCRegisterCostEntry *RegisterCostTable;
  unsigned NumRegisterCostEntries;

  struct PfmCountersInfo {
    // An optional name of a performance counter that can be used to measure
    // cycles.
    const char *CycleCounter;

    // An optional name of a performance counter that can be used to measure
    // uops.
    const char *UopsCounter;

    // For each MCProcResourceDesc defined by the processor, an optional list of
    // names of performance counters that can be used to measure the resource
    // utilization.
    const char **IssueCounters;
  };
  PfmCountersInfo PfmCounters;
};
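
// To show how the register-file fields above tie together, here is a minimal
// sketch with invented values (no real subtarget is being described): one
// register file with 64 physical registers and a single cost entry saying
// that definitions in register class 2 consume two physical registers each.
//
//   static const MCRegisterCostEntry MyRegisterCosts[] = {
//     {/*RegisterClassID=*/2, /*Cost=*/2},
//   };
//   static const MCRegisterFileDesc MyRegisterFiles[] = {
//     {"", 0, 0, 0}, // Invalid/placeholder entry at index 0.
//     {"MyFpPRF", /*NumPhysRegs=*/64, /*NumRegisterCostEntries=*/1,
//      /*RegisterCostEntryIdx=*/0},
//   };
//   // In MCExtraProcessorInfo: RegisterFiles = MyRegisterFiles,
//   // NumRegisterFiles = 2, RegisterCostTable = MyRegisterCosts,
//   // NumRegisterCostEntries = 1.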

/// Machine model for scheduling, bundling, and heuristics.
///
/// The machine model directly provides basic information about the
/// microarchitecture to the scheduler in the form of properties. It also
/// optionally refers to scheduler resource tables and itinerary
/// tables. Scheduler resource tables model the latency and cost for each
/// instruction type. Itinerary tables are an independent mechanism that
/// provides a detailed reservation table describing each cycle of instruction
/// execution. Subtargets may define any or all of the above categories of data
/// depending on the type of CPU and selected scheduler.
///
/// The machine independent properties defined here are used by the scheduler
/// as an abstract machine model. A real micro-architecture has a number of
/// buffers, queues, and stages. Declaring that a given machine-independent
/// abstract property corresponds to a specific physical property across all
/// subtargets can't be done. Nonetheless, the abstract model is useful.
/// Furthermore, subtargets typically extend this model with processor specific
/// resources to model any hardware features that can be exploited by
/// scheduling heuristics and aren't sufficiently represented in the abstract
/// model.
///
/// The abstract pipeline is built around the notion of an "issue point". This
/// is merely a reference point for counting machine cycles. The physical
/// machine will have pipeline stages that delay execution. The scheduler does
/// not model those delays because they are irrelevant as long as they are
/// consistent. Inaccuracies arise when instructions have different execution
/// delays relative to each other, in addition to their intrinsic latency.
/// Those special cases can be handled by TableGen constructs such as
/// ReadAdvance, which reduces latency when reading data, and ResourceCycles,
/// which consumes a processor resource when writing data for a number of
/// abstract cycles.
///
/// TODO: One tool currently missing is the ability to add a delay to
/// ResourceCycles. That would be easy to add and would likely cover all cases
/// currently handled by the legacy itinerary tables.
///
/// A note on out-of-order execution and, more generally, instruction
/// buffers. Part of the CPU pipeline is always in-order. The issue point, which
/// is the point of reference for counting cycles, only makes sense as an
/// in-order part of the pipeline. Other parts of the pipeline are sometimes
/// falling behind and sometimes catching up. It's only interesting to model
/// those other, decoupled parts of the pipeline if they may be predictably
/// resource constrained in a way that the scheduler can exploit.
///
/// The LLVM machine model distinguishes between in-order constraints and
/// out-of-order constraints so that the target's scheduling strategy can apply
/// appropriate heuristics. For a well-balanced CPU pipeline, out-of-order
/// resources would not typically be treated as a hard scheduling
/// constraint. For example, in the GenericScheduler, a delay caused by limited
/// out-of-order resources is not directly reflected in the number of cycles
/// that the scheduler sees between issuing an instruction and its dependent
/// instructions. In other words, out-of-order resources don't directly increase
/// the latency between pairs of instructions. However, they can still be used
/// to detect potential bottlenecks across a sequence of instructions and bias
/// the scheduling heuristics appropriately.
struct MCSchedModel {
  // IssueWidth is the maximum number of instructions that may be scheduled in
  // the same per-cycle group. This is meant to be a hard in-order constraint
  // (a.k.a. "hazard"). In the GenericScheduler strategy, no more than
  // IssueWidth micro-ops can ever be scheduled in a particular cycle.
  //
  // In practice, IssueWidth is useful to model any bottleneck between the
  // decoder (after micro-op expansion) and the out-of-order reservation
  // stations or the decoder bandwidth itself. If the total number of
  // reservation stations is also a bottleneck, or if any other pipeline stage
  // has a bandwidth limitation, then that can be naturally modeled by adding an
  // out-of-order processor resource.
  unsigned IssueWidth;
  static const unsigned DefaultIssueWidth = 1;

  // MicroOpBufferSize is the number of micro-ops that the processor may buffer
  // for out-of-order execution.
  //
  // "0" means operations that are not ready in this cycle are not considered
  // for scheduling (they go in the pending queue). Latency is paramount. This
  // may be more efficient if many instructions are pending in a schedule.
  //
  // "1" means all instructions are considered for scheduling regardless of
  // whether they are ready in this cycle. Latency still causes issue stalls,
  // but we balance those stalls against other heuristics.
  //
  // "> 1" means the processor is out-of-order. This is a machine independent
  // estimate of highly machine specific characteristics such as the register
  // renaming pool and reorder buffer.
  unsigned MicroOpBufferSize;
  static const unsigned DefaultMicroOpBufferSize = 0;

  // LoopMicroOpBufferSize is the number of micro-ops that the processor may
  // buffer for optimized loop execution. More generally, this represents the
  // optimal number of micro-ops in a loop body. A loop may be partially
  // unrolled to bring the count of micro-ops in the loop body closer to this
  // number.
  unsigned LoopMicroOpBufferSize;
  static const unsigned DefaultLoopMicroOpBufferSize = 0;

  // LoadLatency is the expected latency of load instructions.
  unsigned LoadLatency;
  static const unsigned DefaultLoadLatency = 4;

  // HighLatency is the expected latency of "very high latency" operations.
  // See TargetInstrInfo::isHighLatencyDef().
  // By default, this is set to an arbitrarily high number of cycles
  // likely to have some impact on scheduling heuristics.
  unsigned HighLatency;
  static const unsigned DefaultHighLatency = 10;

  // MispredictPenalty is the typical number of extra cycles the processor
  // takes to recover from a branch misprediction.
  unsigned MispredictPenalty;
  static const unsigned DefaultMispredictPenalty = 10;

  bool PostRAScheduler; // default value is false

  bool CompleteModel;

  unsigned ProcID;
  const MCProcResourceDesc *ProcResourceTable;
  const MCSchedClassDesc *SchedClassTable;
  unsigned NumProcResourceKinds;
  unsigned NumSchedClasses;
  // Instruction itinerary tables used by InstrItineraryData.
  friend class InstrItineraryData;
  const InstrItinerary *InstrItineraries;

  const MCExtraProcessorInfo *ExtraProcessorInfo;

  bool hasExtraProcessorInfo() const { return ExtraProcessorInfo; }

  unsigned getProcessorID() const { return ProcID; }

  /// Does this machine model include instruction-level scheduling?
  bool hasInstrSchedModel() const { return SchedClassTable; }

  const MCExtraProcessorInfo &getExtraProcessorInfo() const {
    assert(hasExtraProcessorInfo() &&
           "No extra information available for this model");
    return *ExtraProcessorInfo;
  }

  /// Return true if this machine model includes data for all instructions
  /// with a scheduling class (itinerary class or SchedRW list).
  bool isComplete() const { return CompleteModel; }

  /// Return true if the machine supports out-of-order execution.
  bool isOutOfOrder() const { return MicroOpBufferSize > 1; }

  unsigned getNumProcResourceKinds() const {
    return NumProcResourceKinds;
  }

  const MCProcResourceDesc *getProcResource(unsigned ProcResourceIdx) const {
    assert(hasInstrSchedModel() && "No scheduling machine model");

    assert(ProcResourceIdx < NumProcResourceKinds && "bad proc resource idx");
    return &ProcResourceTable[ProcResourceIdx];
  }

  const MCSchedClassDesc *getSchedClassDesc(unsigned SchedClassIdx) const {
    assert(hasInstrSchedModel() && "No scheduling machine model");

    assert(SchedClassIdx < NumSchedClasses && "bad scheduling class idx");
    return &SchedClassTable[SchedClassIdx];
  }

  /// Returns the latency value for the scheduling class.
  static int computeInstrLatency(const MCSubtargetInfo &STI,
                                 const MCSchedClassDesc &SCDesc);

  int computeInstrLatency(const MCSubtargetInfo &STI, unsigned SClass) const;
  int computeInstrLatency(const MCSubtargetInfo &STI, const MCInstrInfo &MCII,
                          const MCInst &Inst) const;

  // Returns the reciprocal throughput information from an MCSchedClassDesc.
  static double
  getReciprocalThroughput(const MCSubtargetInfo &STI,
                          const MCSchedClassDesc &SCDesc);

  static double
  getReciprocalThroughput(unsigned SchedClass, const InstrItineraryData &IID);

  double
  getReciprocalThroughput(const MCSubtargetInfo &STI, const MCInstrInfo &MCII,
                          const MCInst &Inst) const;

  /// Returns the default initialized model.
  static const MCSchedModel &GetDefaultSchedModel() { return Default; }
  static const MCSchedModel Default;
};
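
// A minimal sketch of using the static helpers above to estimate the cost of
// an instruction's scheduling class. It assumes "STI" is an initialized
// MCSubtargetInfo and "SchedClassID" was taken from the instruction's
// MCInstrDesc; variant classes would first have to be resolved to a concrete
// class:
//
//   const MCSchedModel &SM = STI.getSchedModel();
//   const MCSchedClassDesc &SC = *SM.getSchedClassDesc(SchedClassID);
//   if (SC.isValid() && !SC.isVariant()) {
//     int Latency = MCSchedModel::computeInstrLatency(STI, SC);
//     double RThroughput = MCSchedModel::getReciprocalThroughput(STI, SC);
//     (void)Latency; (void)RThroughput;
//   }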

} // namespace llvm

#endif