Blame - linux-x64/clang/include/llvm/CodeGen/ModuloSchedule.h - hafnium/prebuilts

blob: 1aa23208cfb9643b9e46a27287c41c4fb6e6c488 [file] [log] [blame]

Olivier Deprez	f4ef2d0	2021-04-20 13:36:24 +0200	[diff] [blame]	1	//===- ModuloSchedule.h - Software pipeline schedule expansion ------------===//
				2	//
				3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
				4	// See https://llvm.org/LICENSE.txt for license information.
				5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
				6	//
				7	//===----------------------------------------------------------------------===//
				8	//
				9	// Software pipelining (SWP) is an instruction scheduling technique for loops
				10	// that overlaps loop iterations and exploits ILP via compiler transformations.
				11	//
				12	// There are multiple methods for analyzing a loop and creating a schedule.
				13	// An example algorithm is Swing Modulo Scheduling (implemented by the
				14	// MachinePipeliner). The details of how a schedule is arrived at are irrelevant
				15	// for the task of actually rewriting a loop to adhere to the schedule, which
				16	// is what this file does.
				17	//
				18	// A schedule is, for every instruction in a block, a Cycle and a Stage. Note
				19	// that we only support single-block loops, so "block" and "loop" can be used
				20	// interchangably.
				21	//
				22	// The Cycle of an instruction defines a partial order of the instructions in
				23	// the remapped loop. Instructions within a cycle must not consume the output
				24	// of any instruction in the same cycle. Cycle information is assumed to have
				25	// been calculated such that the processor will execute instructions in
				26	// lock-step (for example in a VLIW ISA).
				27	//
				28	// The Stage of an instruction defines the mapping between logical loop
				29	// iterations and pipelined loop iterations. An example (unrolled) pipeline
				30	// may look something like:
				31	//
				32	// I0[0] Execute instruction I0 of iteration 0
				33	// I1[0], I0[1] Execute I0 of iteration 1 and I1 of iteration 1
				34	// I1[1], I0[2]
				35	// I1[2], I0[3]
				36	//
				37	// In the schedule for this unrolled sequence we would say that I0 was scheduled
				38	// in stage 0 and I1 in stage 1:
				39	//
				40	// loop:
				41	// [stage 0] x = I0
				42	// [stage 1] I1 x (from stage 0)
				43	//
				44	// And to actually generate valid code we must insert a phi:
				45	//
				46	// loop:
				47	// x' = phi(x)
				48	// x = I0
				49	// I1 x'
				50	//
				51	// This is a simple example; the rules for how to generate correct code given
				52	// an arbitrary schedule containing loop-carried values are complex.
				53	//
				54	// Note that these examples only mention the steady-state kernel of the
				55	// generated loop; prologs and epilogs must be generated also that prime and
				56	// flush the pipeline. Doing so is nontrivial.
				57	//
				58	//===----------------------------------------------------------------------===//
				59
				60	#ifndef LLVM_LIB_CODEGEN_MODULOSCHEDULE_H
				61	#define LLVM_LIB_CODEGEN_MODULOSCHEDULE_H
				62
				63	#include "llvm/CodeGen/MachineFunction.h"
				64	#include "llvm/CodeGen/MachineLoopInfo.h"
				65	#include "llvm/CodeGen/MachineLoopUtils.h"
				66	#include "llvm/CodeGen/TargetInstrInfo.h"
				67	#include "llvm/CodeGen/TargetSubtargetInfo.h"
				68	#include <deque>
				69	#include <vector>
				70
				71	namespace llvm {
				72	class MachineBasicBlock;
				73	class MachineInstr;
				74	class LiveIntervals;
				75
				76	/// Represents a schedule for a single-block loop. For every instruction we
				77	/// maintain a Cycle and Stage.
				78	class ModuloSchedule {
				79	private:
				80	/// The block containing the loop instructions.
				81	MachineLoop *Loop;
				82
				83	/// The instructions to be generated, in total order. Cycle provides a partial
				84	/// order; the total order within cycles has been decided by the schedule
				85	/// producer.
				86	std::vector<MachineInstr *> ScheduledInstrs;
				87
				88	/// The cycle for each instruction.
				89	DenseMap<MachineInstr *, int> Cycle;
				90
				91	/// The stage for each instruction.
				92	DenseMap<MachineInstr *, int> Stage;
				93
				94	/// The number of stages in this schedule (Max(Stage) + 1).
				95	int NumStages;
				96
				97	public:
				98	/// Create a new ModuloSchedule.
				99	/// \arg ScheduledInstrs The new loop instructions, in total resequenced
				100	/// order.
				101	/// \arg Cycle Cycle index for all instructions in ScheduledInstrs. Cycle does
				102	/// not need to start at zero. ScheduledInstrs must be partially ordered by
				103	/// Cycle.
				104	/// \arg Stage Stage index for all instructions in ScheduleInstrs.
				105	ModuloSchedule(MachineFunction &MF, MachineLoop *Loop,
				106	std::vector<MachineInstr *> ScheduledInstrs,
				107	DenseMap<MachineInstr *, int> Cycle,
				108	DenseMap<MachineInstr *, int> Stage)
				109	: Loop(Loop), ScheduledInstrs(ScheduledInstrs), Cycle(std::move(Cycle)),
				110	Stage(std::move(Stage)) {
				111	NumStages = 0;
				112	for (auto &KV : this->Stage)
				113	NumStages = std::max(NumStages, KV.second);
				114	++NumStages;
				115	}
				116
				117	/// Return the single-block loop being scheduled.
				118	MachineLoop *getLoop() const { return Loop; }
				119
				120	/// Return the number of stages contained in this schedule, which is the
				121	/// largest stage index + 1.
				122	int getNumStages() const { return NumStages; }
				123
				124	/// Return the first cycle in the schedule, which is the cycle index of the
				125	/// first instruction.
				126	int getFirstCycle() { return Cycle[ScheduledInstrs.front()]; }
				127
				128	/// Return the final cycle in the schedule, which is the cycle index of the
				129	/// last instruction.
				130	int getFinalCycle() { return Cycle[ScheduledInstrs.back()]; }
				131
				132	/// Return the stage that MI is scheduled in, or -1.
				133	int getStage(MachineInstr *MI) {
				134	auto I = Stage.find(MI);
				135	return I == Stage.end() ? -1 : I->second;
				136	}
				137
				138	/// Return the cycle that MI is scheduled at, or -1.
				139	int getCycle(MachineInstr *MI) {
				140	auto I = Cycle.find(MI);
				141	return I == Cycle.end() ? -1 : I->second;
				142	}
				143
				144	/// Set the stage of a newly created instruction.
				145	void setStage(MachineInstr *MI, int MIStage) {
				146	assert(Stage.count(MI) == 0);
				147	Stage[MI] = MIStage;
				148	}
				149
				150	/// Return the rescheduled instructions in order.
				151	ArrayRef<MachineInstr *> getInstructions() { return ScheduledInstrs; }
				152
				153	void dump() { print(dbgs()); }
				154	void print(raw_ostream &OS);
				155	};
				156
				157	/// The ModuloScheduleExpander takes a ModuloSchedule and expands it in-place,
				158	/// rewriting the old loop and inserting prologs and epilogs as required.
				159	class ModuloScheduleExpander {
				160	public:
				161	using InstrChangesTy = DenseMap<MachineInstr *, std::pair<unsigned, int64_t>>;
				162
				163	private:
				164	using ValueMapTy = DenseMap<unsigned, unsigned>;
				165	using MBBVectorTy = SmallVectorImpl<MachineBasicBlock *>;
				166	using InstrMapTy = DenseMap<MachineInstr , MachineInstr >;
				167
				168	ModuloSchedule &Schedule;
				169	MachineFunction &MF;
				170	const TargetSubtargetInfo &ST;
				171	MachineRegisterInfo &MRI;
				172	const TargetInstrInfo *TII;
				173	LiveIntervals &LIS;
				174
				175	MachineBasicBlock *BB;
				176	MachineBasicBlock *Preheader;
				177	MachineBasicBlock *NewKernel = nullptr;
				178	std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> LoopInfo;
				179
				180	/// Map for each register and the max difference between its uses and def.
				181	/// The first element in the pair is the max difference in stages. The
				182	/// second is true if the register defines a Phi value and loop value is
				183	/// scheduled before the Phi.
				184	std::map<unsigned, std::pair<unsigned, bool>> RegToStageDiff;
				185
				186	/// Instructions to change when emitting the final schedule.
				187	InstrChangesTy InstrChanges;
				188
				189	void generatePipelinedLoop();
				190	void generateProlog(unsigned LastStage, MachineBasicBlock *KernelBB,
				191	ValueMapTy *VRMap, MBBVectorTy &PrologBBs);
				192	void generateEpilog(unsigned LastStage, MachineBasicBlock *KernelBB,
				193	ValueMapTy *VRMap, MBBVectorTy &EpilogBBs,
				194	MBBVectorTy &PrologBBs);
				195	void generateExistingPhis(MachineBasicBlock NewBB, MachineBasicBlock BB1,
				196	MachineBasicBlock BB2, MachineBasicBlock KernelBB,
				197	ValueMapTy *VRMap, InstrMapTy &InstrMap,
				198	unsigned LastStageNum, unsigned CurStageNum,
				199	bool IsLast);
				200	void generatePhis(MachineBasicBlock NewBB, MachineBasicBlock BB1,
				201	MachineBasicBlock BB2, MachineBasicBlock KernelBB,
				202	ValueMapTy *VRMap, InstrMapTy &InstrMap,
				203	unsigned LastStageNum, unsigned CurStageNum, bool IsLast);
				204	void removeDeadInstructions(MachineBasicBlock *KernelBB,
				205	MBBVectorTy &EpilogBBs);
				206	void splitLifetimes(MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs);
				207	void addBranches(MachineBasicBlock &PreheaderBB, MBBVectorTy &PrologBBs,
				208	MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs,
				209	ValueMapTy *VRMap);
				210	bool computeDelta(MachineInstr &MI, unsigned &Delta);
				211	void updateMemOperands(MachineInstr &NewMI, MachineInstr &OldMI,
				212	unsigned Num);
				213	MachineInstr cloneInstr(MachineInstr OldMI, unsigned CurStageNum,
				214	unsigned InstStageNum);
				215	MachineInstr cloneAndChangeInstr(MachineInstr OldMI, unsigned CurStageNum,
				216	unsigned InstStageNum);
				217	void updateInstruction(MachineInstr *NewMI, bool LastDef,
				218	unsigned CurStageNum, unsigned InstrStageNum,
				219	ValueMapTy *VRMap);
				220	MachineInstr *findDefInLoop(unsigned Reg);
				221	unsigned getPrevMapVal(unsigned StageNum, unsigned PhiStage, unsigned LoopVal,
				222	unsigned LoopStage, ValueMapTy *VRMap,
				223	MachineBasicBlock *BB);
				224	void rewritePhiValues(MachineBasicBlock *NewBB, unsigned StageNum,
				225	ValueMapTy *VRMap, InstrMapTy &InstrMap);
				226	void rewriteScheduledInstr(MachineBasicBlock *BB, InstrMapTy &InstrMap,
				227	unsigned CurStageNum, unsigned PhiNum,
				228	MachineInstr *Phi, unsigned OldReg,
				229	unsigned NewReg, unsigned PrevReg = 0);
				230	bool isLoopCarried(MachineInstr &Phi);
				231
				232	/// Return the max. number of stages/iterations that can occur between a
				233	/// register definition and its uses.
				234	unsigned getStagesForReg(int Reg, unsigned CurStage) {
				235	std::pair<unsigned, bool> Stages = RegToStageDiff[Reg];
				236	if ((int)CurStage > Schedule.getNumStages() - 1 && Stages.first == 0 &&
				237	Stages.second)
				238	return 1;
				239	return Stages.first;
				240	}
				241
				242	/// The number of stages for a Phi is a little different than other
				243	/// instructions. The minimum value computed in RegToStageDiff is 1
				244	/// because we assume the Phi is needed for at least 1 iteration.
				245	/// This is not the case if the loop value is scheduled prior to the
				246	/// Phi in the same stage. This function returns the number of stages
				247	/// or iterations needed between the Phi definition and any uses.
				248	unsigned getStagesForPhi(int Reg) {
				249	std::pair<unsigned, bool> Stages = RegToStageDiff[Reg];
				250	if (Stages.second)
				251	return Stages.first;
				252	return Stages.first - 1;
				253	}
				254
				255	public:
				256	/// Create a new ModuloScheduleExpander.
				257	/// \arg InstrChanges Modifications to make to instructions with memory
				258	/// operands.
				259	/// FIXME: InstrChanges is opaque and is an implementation detail of an
				260	/// optimization in MachinePipeliner that crosses abstraction boundaries.
				261	ModuloScheduleExpander(MachineFunction &MF, ModuloSchedule &S,
				262	LiveIntervals &LIS, InstrChangesTy InstrChanges)
				263	: Schedule(S), MF(MF), ST(MF.getSubtarget()), MRI(MF.getRegInfo()),
				264	TII(ST.getInstrInfo()), LIS(LIS),
				265	InstrChanges(std::move(InstrChanges)) {}
				266
				267	/// Performs the actual expansion.
				268	void expand();
				269	/// Performs final cleanup after expansion.
				270	void cleanup();
				271
				272	/// Returns the newly rewritten kernel block, or nullptr if this was
				273	/// optimized away.
				274	MachineBasicBlock *getRewrittenKernel() { return NewKernel; }
				275	};
				276
				277	/// A reimplementation of ModuloScheduleExpander. It works by generating a
				278	/// standalone kernel loop and peeling out the prologs and epilogs.
				279	class PeelingModuloScheduleExpander {
				280	public:
				281	PeelingModuloScheduleExpander(MachineFunction &MF, ModuloSchedule &S,
				282	LiveIntervals *LIS)
				283	: Schedule(S), MF(MF), ST(MF.getSubtarget()), MRI(MF.getRegInfo()),
				284	TII(ST.getInstrInfo()), LIS(LIS) {}
				285
				286	void expand();
				287
				288	/// Runs ModuloScheduleExpander and treats it as a golden input to validate
				289	/// aspects of the code generated by PeelingModuloScheduleExpander.
				290	void validateAgainstModuloScheduleExpander();
				291
				292	protected:
				293	ModuloSchedule &Schedule;
				294	MachineFunction &MF;
				295	const TargetSubtargetInfo &ST;
				296	MachineRegisterInfo &MRI;
				297	const TargetInstrInfo *TII;
				298	LiveIntervals *LIS;
				299
				300	/// The original loop block that gets rewritten in-place.
				301	MachineBasicBlock *BB;
				302	/// The original loop preheader.
				303	MachineBasicBlock *Preheader;
				304	/// All prolog and epilog blocks.
				305	SmallVector<MachineBasicBlock *, 4> Prologs, Epilogs;
				306	/// For every block, the stages that are produced.
				307	DenseMap<MachineBasicBlock *, BitVector> LiveStages;
				308	/// For every block, the stages that are available. A stage can be available
				309	/// but not produced (in the epilog) or produced but not available (in the
				310	/// prolog).
				311	DenseMap<MachineBasicBlock *, BitVector> AvailableStages;
				312	/// When peeling the epilogue keep track of the distance between the phi
				313	/// nodes and the kernel.
				314	DenseMap<MachineInstr *, unsigned> PhiNodeLoopIteration;
				315
				316	/// CanonicalMIs and BlockMIs form a bidirectional map between any of the
				317	/// loop kernel clones.
				318	DenseMap<MachineInstr , MachineInstr > CanonicalMIs;
				319	DenseMap<std::pair<MachineBasicBlock , MachineInstr >, MachineInstr *>
				320	BlockMIs;
				321
				322	/// State passed from peelKernel to peelPrologAndEpilogs().
				323	std::deque<MachineBasicBlock *> PeeledFront, PeeledBack;
				324	/// Illegal phis that need to be deleted once we re-link stages.
				325	SmallVector<MachineInstr *, 4> IllegalPhisToDelete;
				326
				327	/// Converts BB from the original loop body to the rewritten, pipelined
				328	/// steady-state.
				329	void rewriteKernel();
				330
				331	/// Peels one iteration of the rewritten kernel (BB) in the specified
				332	/// direction.
				333	MachineBasicBlock *peelKernel(LoopPeelDirection LPD);
				334	// Delete instructions whose stage is less than MinStage in the given basic
				335	// block.
				336	void filterInstructions(MachineBasicBlock *MB, int MinStage);
				337	// Move instructions of the given stage from sourceBB to DestBB. Remap the phi
				338	// instructions to keep a valid IR.
				339	void moveStageBetweenBlocks(MachineBasicBlock *DestBB,
				340	MachineBasicBlock *SourceBB, unsigned Stage);
				341	/// Peel the kernel forwards and backwards to produce prologs and epilogs,
				342	/// and stitch them together.
				343	void peelPrologAndEpilogs();
				344	/// All prolog and epilog blocks are clones of the kernel, so any produced
				345	/// register in one block has an corollary in all other blocks.
				346	Register getEquivalentRegisterIn(Register Reg, MachineBasicBlock *BB);
				347	/// Change all users of MI, if MI is predicated out
				348	/// (LiveStages[MI->getParent()] == false).
				349	void rewriteUsesOf(MachineInstr *MI);
				350	/// Insert branches between prologs, kernel and epilogs.
				351	void fixupBranches();
				352	/// Create a poor-man's LCSSA by cloning only the PHIs from the kernel block
				353	/// to a block dominated by all prologs and epilogs. This allows us to treat
				354	/// the loop exiting block as any other kernel clone.
				355	MachineBasicBlock *CreateLCSSAExitingBlock();
				356	/// Helper to get the stage of an instruction in the schedule.
				357	unsigned getStage(MachineInstr *MI) {
				358	if (CanonicalMIs.count(MI))
				359	MI = CanonicalMIs[MI];
				360	return Schedule.getStage(MI);
				361	}
				362	/// Helper function to find the right canonical register for a phi instruction
				363	/// coming from a peeled out prologue.
				364	Register getPhiCanonicalReg(MachineInstr* CanonicalPhi, MachineInstr* Phi);
				365	/// Target loop info before kernel peeling.
				366	std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> LoopInfo;
				367	};
				368
				369	/// Expander that simply annotates each scheduled instruction with a post-instr
				370	/// symbol that can be consumed by the ModuloScheduleTest pass.
				371	///
				372	/// The post-instr symbol is a way of annotating an instruction that can be
				373	/// roundtripped in MIR. The syntax is:
				374	/// MYINST %0, post-instr-symbol <mcsymbol Stage-1_Cycle-5>
				375	class ModuloScheduleTestAnnotater {
				376	MachineFunction &MF;
				377	ModuloSchedule &S;
				378
				379	public:
				380	ModuloScheduleTestAnnotater(MachineFunction &MF, ModuloSchedule &S)
				381	: MF(MF), S(S) {}
				382
				383	/// Performs the annotation.
				384	void annotate();
				385	};
				386
				387	} // end namespace llvm
				388
				389	#endif // LLVM_LIB_CODEGEN_MODULOSCHEDULE_H