Blame - linux-x64/clang/include/llvm/MC/MCSchedule.h - hafnium/prebuilts

blob: f2f1dfb369189c566b7fca7dbe5dd759f49ac84a [file] [log] [blame]

Andrew Scull	5e1ddfa	2018-08-14 10:06:54 +0100	[diff] [blame]	1	//===-- llvm/MC/MCSchedule.h - Scheduling ------------------------ C++ --===//
				2	//
				3	// The LLVM Compiler Infrastructure
				4	//
				5	// This file is distributed under the University of Illinois Open Source
				6	// License. See LICENSE.TXT for details.
				7	//
				8	//===----------------------------------------------------------------------===//
				9	//
				10	// This file defines the classes used to describe a subtarget's machine model
				11	// for scheduling and other instruction cost heuristics.
				12	//
				13	//===----------------------------------------------------------------------===//
				14
				15	#ifndef LLVM_MC_MCSCHEDULE_H
				16	#define LLVM_MC_MCSCHEDULE_H
				17
				18	#include "llvm/ADT/Optional.h"
Andrew Scull	cdfcccc	2018-10-05 20:58:37 +0100	[diff] [blame^]	19	#include "llvm/Config/llvm-config.h"
Andrew Scull	5e1ddfa	2018-08-14 10:06:54 +0100	[diff] [blame]	20	#include "llvm/Support/DataTypes.h"
				21	#include <cassert>
				22
				23	namespace llvm {
				24
				// Forward declarations of types that are only used by pointer or reference
				// in this header.
				struct InstrItinerary;
				class MCSubtargetInfo;
				class MCInstrInfo;
				class MCInst;
				class InstrItineraryData;
Andrew Scull	5e1ddfa	2018-08-14 10:06:54 +0100	[diff] [blame]	30
				/// Define a kind of processor resource that will be modeled by the scheduler.
				struct MCProcResourceDesc {
				  const char *Name;
				  unsigned NumUnits; // Number of resources of this kind.
				  unsigned SuperIdx; // Index of the resource kind that contains this kind.

				  // Number of resources that may be buffered.
				  //
				  // Buffered resources (BufferSize != 0) may be consumed at some indeterminate
				  // cycle after dispatch. This should be used for out-of-order cpus when
				  // instructions that use this resource can be buffered in a reservation
				  // station.
				  //
				  // Unbuffered resources (BufferSize == 0) always consume their resource some
				  // fixed number of cycles after dispatch. If a resource is unbuffered, then
				  // the scheduler will avoid scheduling instructions with conflicting resources
				  // in the same cycle. This is for in-order cpus, or the in-order portion of
				  // an out-of-order cpu.
				  int BufferSize;

				  // If the resource has sub-units, a pointer to the first element of an array
				  // of `NumUnits` elements containing the ProcResourceIdx of the sub units.
				  // nullptr if the resource does not have sub-units.
				  const unsigned *SubUnitsIdxBegin;

				  /// Compare two descriptors. Only NumUnits, SuperIdx and BufferSize take part
				  /// in the comparison; Name and SubUnitsIdxBegin are not compared.
				  bool operator==(const MCProcResourceDesc &Other) const {
				    return NumUnits == Other.NumUnits && SuperIdx == Other.SuperIdx &&
				           BufferSize == Other.BufferSize;
				  }
				};
				61
				/// Identify one of the processor resource kinds consumed by a particular
				/// scheduling class for the specified number of cycles.
				struct MCWriteProcResEntry {
				  uint16_t ProcResourceIdx;
				  uint16_t Cycles;

				  /// Two entries are equal when both the resource index and the cycle count
				  /// match.
				  bool operator==(const MCWriteProcResEntry &Other) const {
				    return ProcResourceIdx == Other.ProcResourceIdx && Cycles == Other.Cycles;
				  }
				};
				72
				/// Specify the latency in cpu cycles for a particular scheduling class and def
				/// index. -1 indicates an invalid latency. Heuristics would typically consider
				/// an instruction with invalid latency to have infinite latency. Also identify
				/// the WriteResources of this def. When the operand expands to a sequence of
				/// writes, this ID is the last write in the sequence.
				struct MCWriteLatencyEntry {
				  int16_t Cycles;
				  uint16_t WriteResourceID;

				  /// Two entries are equal when both the latency and the write resource ID
				  /// match.
				  bool operator==(const MCWriteLatencyEntry &Other) const {
				    return Cycles == Other.Cycles && WriteResourceID == Other.WriteResourceID;
				  }
				};
				86
				/// Specify the number of cycles allowed after instruction issue before a
				/// particular use operand reads its registers. This effectively reduces the
				/// write's latency. Here we allow negative cycles for corner cases where
				/// latency increases. This rule only applies when the entry's WriteResource
				/// matches the write's WriteResource.
				///
				/// MCReadAdvanceEntries are sorted first by operand index (UseIdx), then by
				/// WriteResourceID.
				struct MCReadAdvanceEntry {
				  unsigned UseIdx;
				  unsigned WriteResourceID;
				  int Cycles;

				  /// Two entries are equal when all three fields match.
				  bool operator==(const MCReadAdvanceEntry &Other) const {
				    return UseIdx == Other.UseIdx &&
				           WriteResourceID == Other.WriteResourceID && Cycles == Other.Cycles;
				  }
				};
				105
				/// Summarize the scheduling resources required for an instruction of a
				/// particular scheduling class.
				///
				/// Defined as an aggregate struct for creating tables with initializer lists.
				struct MCSchedClassDesc {
				  // Sentinel values for NumMicroOps. InvalidNumMicroOps is the largest value
				  // representable in the 14-bit NumMicroOps field; VariantNumMicroOps is one
				  // below it.
				  static const unsigned short InvalidNumMicroOps = (1U << 14) - 1;
				  static const unsigned short VariantNumMicroOps = InvalidNumMicroOps - 1;

				// The Name field only exists in builds with assertions or dumping enabled.
				#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
				  const char *Name;
				#endif
				  uint16_t NumMicroOps : 14;
				  bool BeginGroup : 1;
				  bool EndGroup : 1;
				  uint16_t WriteProcResIdx; // First index into WriteProcResTable.
				  uint16_t NumWriteProcResEntries;
				  uint16_t WriteLatencyIdx; // First index into WriteLatencyTable.
				  uint16_t NumWriteLatencyEntries;
				  uint16_t ReadAdvanceIdx; // First index into ReadAdvanceTable.
				  uint16_t NumReadAdvanceEntries;

				  /// Return true unless NumMicroOps holds the InvalidNumMicroOps sentinel.
				  bool isValid() const {
				    return NumMicroOps != InvalidNumMicroOps;
				  }

				  /// Return true if NumMicroOps holds the VariantNumMicroOps sentinel.
				  bool isVariant() const {
				    return NumMicroOps == VariantNumMicroOps;
				  }
				};
				134
/// Specify the cost of a register definition in terms of number of physical
/// registers allocated at register renaming stage. For example, AMD Jaguar
/// natively supports 128-bit data types, and operations on 256-bit registers
/// (i.e. YMM registers) are internally split into two COPs (complex operations)
/// and each COP updates a physical register. Basically, on Jaguar, a YMM
/// register write effectively consumes two physical registers. That means,
/// the cost of a YMM write in the BtVer2 model is 2.
struct MCRegisterCostEntry {
  unsigned RegisterClassID;
  unsigned Cost;
};
				146
				/// A register file descriptor.
				///
				/// This struct describes a processor register file. In particular, it
				/// describes the size of the register file, as well as the cost of
				/// allocating a register file at register renaming stage.
				/// FIXME: this struct can be extended to provide information about the number
				/// of read/write ports to the register file. A value of zero for field
				/// 'NumPhysRegs' means: this register file has an unbounded number of physical
				/// registers.
				struct MCRegisterFileDesc {
				  const char *Name;
				  uint16_t NumPhysRegs;
				  uint16_t NumRegisterCostEntries;
				  // Index of the first cost entry in MCExtraProcessorInfo::RegisterCostTable.
				  uint16_t RegisterCostEntryIdx;
				};
				163
				164	/// Provide extra details about the machine processor.
				165	///
				166	/// This is a collection of "optional" processor information that is not
				167	/// normally used by the LLVM machine schedulers, but that can be consumed by
				168	/// external tools like llvm-mca to improve the quality of the peformance
				169	/// analysis.
				170	struct MCExtraProcessorInfo {
				171	// Actual size of the reorder buffer in hardware.
				172	unsigned ReorderBufferSize;
				173	// Number of instructions retired per cycle.
				174	unsigned MaxRetirePerCycle;
				175	const MCRegisterFileDesc *RegisterFiles;
				176	unsigned NumRegisterFiles;
				177	const MCRegisterCostEntry *RegisterCostTable;
				178	unsigned NumRegisterCostEntries;
				179
				180	struct PfmCountersInfo {
				181	// An optional name of a performance counter that can be used to measure
				182	// cycles.
				183	const char *CycleCounter;
				184
				185	// For each MCProcResourceDesc defined by the processor, an optional list of
				186	// names of performance counters that can be used to measure the resource
				187	// utilization.
				188	const char **IssueCounters;
				189	};
				190	PfmCountersInfo PfmCounters;
				191	};
				192
Andrew Scull	5e1ddfa	2018-08-14 10:06:54 +0100	[diff] [blame]	193	/// Machine model for scheduling, bundling, and heuristics.
				194	///
				195	/// The machine model directly provides basic information about the
				196	/// microarchitecture to the scheduler in the form of properties. It also
				197	/// optionally refers to scheduler resource tables and itinerary
				198	/// tables. Scheduler resource tables model the latency and cost for each
				199	/// instruction type. Itinerary tables are an independent mechanism that
				200	/// provides a detailed reservation table describing each cycle of instruction
				201	/// execution. Subtargets may define any or all of the above categories of data
				202	/// depending on the type of CPU and selected scheduler.
Andrew Scull	cdfcccc	2018-10-05 20:58:37 +0100	[diff] [blame^]	203	///
				204	/// The machine independent properties defined here are used by the scheduler as
				205	/// an abstract machine model. A real micro-architecture has a number of
				206	/// buffers, queues, and stages. Declaring that a given machine-independent
				207	/// abstract property corresponds to a specific physical property across all
				208	/// subtargets can't be done. Nonetheless, the abstract model is
				209	/// useful. Futhermore, subtargets typically extend this model with processor
				210	/// specific resources to model any hardware features that can be exploited by
				211	/// sceduling heuristics and aren't sufficiently represented in the abstract.
				212	///
				213	/// The abstract pipeline is built around the notion of an "issue point". This
				214	/// is merely a reference point for counting machine cycles. The physical
				215	/// machine will have pipeline stages that delay execution. The scheduler does
				216	/// not model those delays because they are irrelevant as long as they are
				217	/// consistent. Inaccuracies arise when instructions have different execution
				218	/// delays relative to each other, in addition to their intrinsic latency. Those
				219	/// special cases can be handled by TableGen constructs such as, ReadAdvance,
				220	/// which reduces latency when reading data, and ResourceCycles, which consumes
				221	/// a processor resource when writing data for a number of abstract
				222	/// cycles.
				223	///
				224	/// TODO: One tool currently missing is the ability to add a delay to
				225	/// ResourceCycles. That would be easy to add and would likely cover all cases
				226	/// currently handled by the legacy itinerary tables.
				227	///
				228	/// A note on out-of-order execution and, more generally, instruction
				229	/// buffers. Part of the CPU pipeline is always in-order. The issue point, which
				230	/// is the point of reference for counting cycles, only makes sense as an
				231	/// in-order part of the pipeline. Other parts of the pipeline are sometimes
				232	/// falling behind and sometimes catching up. It's only interesting to model
				233	/// those other, decoupled parts of the pipeline if they may be predictably
				234	/// resource constrained in a way that the scheduler can exploit.
				235	///
				236	/// The LLVM machine model distinguishes between in-order constraints and
				237	/// out-of-order constraints so that the target's scheduling strategy can apply
				238	/// appropriate heuristics. For a well-balanced CPU pipeline, out-of-order
				239	/// resources would not typically be treated as a hard scheduling
				240	/// constraint. For example, in the GenericScheduler, a delay caused by limited
				241	/// out-of-order resources is not directly reflected in the number of cycles
				242	/// that the scheduler sees between issuing an instruction and its dependent
				243	/// instructions. In other words, out-of-order resources don't directly increase
				244	/// the latency between pairs of instructions. However, they can still be used
				245	/// to detect potential bottlenecks across a sequence of instructions and bias
				246	/// the scheduling heuristics appropriately.
Andrew Scull	5e1ddfa	2018-08-14 10:06:54 +0100	[diff] [blame]	247	struct MCSchedModel {
				248	// IssueWidth is the maximum number of instructions that may be scheduled in
Andrew Scull	cdfcccc	2018-10-05 20:58:37 +0100	[diff] [blame^]	249	// the same per-cycle group. This is meant to be a hard in-order constraint
				250	// (a.k.a. "hazard"). In the GenericScheduler strategy, no more than
				251	// IssueWidth micro-ops can ever be scheduled in a particular cycle.
				252	//
				253	// In practice, IssueWidth is useful to model any bottleneck between the
				254	// decoder (after micro-op expansion) and the out-of-order reservation
				255	// stations or the decoder bandwidth itself. If the total number of
				256	// reservation stations is also a bottleneck, or if any other pipeline stage
				257	// has a bandwidth limitation, then that can be naturally modeled by adding an
				258	// out-of-order processor resource.
Andrew Scull	5e1ddfa	2018-08-14 10:06:54 +0100	[diff] [blame]	259	unsigned IssueWidth;
				260	static const unsigned DefaultIssueWidth = 1;
				261
				262	// MicroOpBufferSize is the number of micro-ops that the processor may buffer
				263	// for out-of-order execution.
				264	//
				265	// "0" means operations that are not ready in this cycle are not considered
				266	// for scheduling (they go in the pending queue). Latency is paramount. This
				267	// may be more efficient if many instructions are pending in a schedule.
				268	//
				269	// "1" means all instructions are considered for scheduling regardless of
				270	// whether they are ready in this cycle. Latency still causes issue stalls,
				271	// but we balance those stalls against other heuristics.
				272	//
				273	// "> 1" means the processor is out-of-order. This is a machine independent
				274	// estimate of highly machine specific characteristics such as the register
				275	// renaming pool and reorder buffer.
				276	unsigned MicroOpBufferSize;
				277	static const unsigned DefaultMicroOpBufferSize = 0;
				278
				279	// LoopMicroOpBufferSize is the number of micro-ops that the processor may
				280	// buffer for optimized loop execution. More generally, this represents the
				281	// optimal number of micro-ops in a loop body. A loop may be partially
				282	// unrolled to bring the count of micro-ops in the loop body closer to this
				283	// number.
				284	unsigned LoopMicroOpBufferSize;
				285	static const unsigned DefaultLoopMicroOpBufferSize = 0;
				286
				287	// LoadLatency is the expected latency of load instructions.
				288	unsigned LoadLatency;
				289	static const unsigned DefaultLoadLatency = 4;
				290
				291	// HighLatency is the expected latency of "very high latency" operations.
				292	// See TargetInstrInfo::isHighLatencyDef().
				293	// By default, this is set to an arbitrarily high number of cycles
				294	// likely to have some impact on scheduling heuristics.
				295	unsigned HighLatency;
				296	static const unsigned DefaultHighLatency = 10;
				297
				298	// MispredictPenalty is the typical number of extra cycles the processor
				299	// takes to recover from a branch misprediction.
				300	unsigned MispredictPenalty;
				301	static const unsigned DefaultMispredictPenalty = 10;
				302
				303	bool PostRAScheduler; // default value is false
				304
				305	bool CompleteModel;
				306
				307	unsigned ProcID;
				308	const MCProcResourceDesc *ProcResourceTable;
				309	const MCSchedClassDesc *SchedClassTable;
				310	unsigned NumProcResourceKinds;
				311	unsigned NumSchedClasses;
				312	// Instruction itinerary tables used by InstrItineraryData.
				313	friend class InstrItineraryData;
				314	const InstrItinerary *InstrItineraries;
				315
Andrew Scull	cdfcccc	2018-10-05 20:58:37 +0100	[diff] [blame^]	316	const MCExtraProcessorInfo *ExtraProcessorInfo;
				317
				318	bool hasExtraProcessorInfo() const { return ExtraProcessorInfo; }
				319
Andrew Scull	5e1ddfa	2018-08-14 10:06:54 +0100	[diff] [blame]	320	unsigned getProcessorID() const { return ProcID; }
				321
				322	/// Does this machine model include instruction-level scheduling.
				323	bool hasInstrSchedModel() const { return SchedClassTable; }
				324
Andrew Scull	cdfcccc	2018-10-05 20:58:37 +0100	[diff] [blame^]	325	const MCExtraProcessorInfo &getExtraProcessorInfo() const {
				326	assert(hasExtraProcessorInfo() &&
				327	"No extra information available for this model");
				328	return *ExtraProcessorInfo;
				329	}
				330
Andrew Scull	5e1ddfa	2018-08-14 10:06:54 +0100	[diff] [blame]	331	/// Return true if this machine model data for all instructions with a
				332	/// scheduling class (itinerary class or SchedRW list).
				333	bool isComplete() const { return CompleteModel; }
				334
				335	/// Return true if machine supports out of order execution.
				336	bool isOutOfOrder() const { return MicroOpBufferSize > 1; }
				337
				338	unsigned getNumProcResourceKinds() const {
				339	return NumProcResourceKinds;
				340	}
				341
				342	const MCProcResourceDesc *getProcResource(unsigned ProcResourceIdx) const {
				343	assert(hasInstrSchedModel() && "No scheduling machine model");
				344
				345	assert(ProcResourceIdx < NumProcResourceKinds && "bad proc resource idx");
				346	return &ProcResourceTable[ProcResourceIdx];
				347	}
				348
				349	const MCSchedClassDesc *getSchedClassDesc(unsigned SchedClassIdx) const {
				350	assert(hasInstrSchedModel() && "No scheduling machine model");
				351
				352	assert(SchedClassIdx < NumSchedClasses && "bad scheduling class idx");
				353	return &SchedClassTable[SchedClassIdx];
				354	}
				355
				356	/// Returns the latency value for the scheduling class.
				357	static int computeInstrLatency(const MCSubtargetInfo &STI,
				358	const MCSchedClassDesc &SCDesc);
				359
Andrew Scull	cdfcccc	2018-10-05 20:58:37 +0100	[diff] [blame^]	360	int computeInstrLatency(const MCSubtargetInfo &STI, unsigned SClass) const;
				361	int computeInstrLatency(const MCSubtargetInfo &STI, const MCInstrInfo &MCII,
				362	const MCInst &Inst) const;
				363
				364	// Returns the reciprocal throughput information from a MCSchedClassDesc.
				365	static double
Andrew Scull	5e1ddfa	2018-08-14 10:06:54 +0100	[diff] [blame]	366	getReciprocalThroughput(const MCSubtargetInfo &STI,
				367	const MCSchedClassDesc &SCDesc);
				368
Andrew Scull	cdfcccc	2018-10-05 20:58:37 +0100	[diff] [blame^]	369	static double
				370	getReciprocalThroughput(unsigned SchedClass, const InstrItineraryData &IID);
				371
				372	double
				373	getReciprocalThroughput(const MCSubtargetInfo &STI, const MCInstrInfo &MCII,
				374	const MCInst &Inst) const;
				375
Andrew Scull	5e1ddfa	2018-08-14 10:06:54 +0100	[diff] [blame]	376	/// Returns the default initialized model.
				377	static const MCSchedModel &GetDefaultSchedModel() { return Default; }
				378	static const MCSchedModel Default;
				379	};
				380
Andrew Scull	cdfcccc	2018-10-05 20:58:37 +0100	[diff] [blame^]	381	} // namespace llvm
Andrew Scull	5e1ddfa	2018-08-14 10:06:54 +0100	[diff] [blame]	382
				383	#endif