blob: 1aa23208cfb9643b9e46a27287c41c4fb6e6c488 [file] [log] [blame]
Olivier Deprezf4ef2d02021-04-20 13:36:24 +02001//===- ModuloSchedule.h - Software pipeline schedule expansion ------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// Software pipelining (SWP) is an instruction scheduling technique for loops
10// that overlaps loop iterations and exploits ILP via compiler transformations.
11//
12// There are multiple methods for analyzing a loop and creating a schedule.
13// An example algorithm is Swing Modulo Scheduling (implemented by the
14// MachinePipeliner). The details of how a schedule is arrived at are irrelevant
15// for the task of actually rewriting a loop to adhere to the schedule, which
16// is what this file does.
17//
18// A schedule is, for every instruction in a block, a Cycle and a Stage. Note
19// that we only support single-block loops, so "block" and "loop" can be used
20// interchangably.
21//
22// The Cycle of an instruction defines a partial order of the instructions in
23// the remapped loop. Instructions within a cycle must not consume the output
24// of any instruction in the same cycle. Cycle information is assumed to have
25// been calculated such that the processor will execute instructions in
26// lock-step (for example in a VLIW ISA).
27//
28// The Stage of an instruction defines the mapping between logical loop
29// iterations and pipelined loop iterations. An example (unrolled) pipeline
30// may look something like:
31//
32// I0[0] Execute instruction I0 of iteration 0
33// I1[0], I0[1] Execute I0 of iteration 1 and I1 of iteration 1
34// I1[1], I0[2]
35// I1[2], I0[3]
36//
37// In the schedule for this unrolled sequence we would say that I0 was scheduled
38// in stage 0 and I1 in stage 1:
39//
40// loop:
41// [stage 0] x = I0
42// [stage 1] I1 x (from stage 0)
43//
44// And to actually generate valid code we must insert a phi:
45//
46// loop:
47// x' = phi(x)
48// x = I0
49// I1 x'
50//
51// This is a simple example; the rules for how to generate correct code given
52// an arbitrary schedule containing loop-carried values are complex.
53//
54// Note that these examples only mention the steady-state kernel of the
55// generated loop; prologs and epilogs must be generated also that prime and
56// flush the pipeline. Doing so is nontrivial.
57//
58//===----------------------------------------------------------------------===//
59
60#ifndef LLVM_LIB_CODEGEN_MODULOSCHEDULE_H
61#define LLVM_LIB_CODEGEN_MODULOSCHEDULE_H
62
63#include "llvm/CodeGen/MachineFunction.h"
64#include "llvm/CodeGen/MachineLoopInfo.h"
65#include "llvm/CodeGen/MachineLoopUtils.h"
66#include "llvm/CodeGen/TargetInstrInfo.h"
67#include "llvm/CodeGen/TargetSubtargetInfo.h"
68#include <deque>
69#include <vector>
70
71namespace llvm {
72class MachineBasicBlock;
73class MachineInstr;
74class LiveIntervals;
75
76/// Represents a schedule for a single-block loop. For every instruction we
77/// maintain a Cycle and Stage.
78class ModuloSchedule {
79private:
80 /// The block containing the loop instructions.
81 MachineLoop *Loop;
82
83 /// The instructions to be generated, in total order. Cycle provides a partial
84 /// order; the total order within cycles has been decided by the schedule
85 /// producer.
86 std::vector<MachineInstr *> ScheduledInstrs;
87
88 /// The cycle for each instruction.
89 DenseMap<MachineInstr *, int> Cycle;
90
91 /// The stage for each instruction.
92 DenseMap<MachineInstr *, int> Stage;
93
94 /// The number of stages in this schedule (Max(Stage) + 1).
95 int NumStages;
96
97public:
98 /// Create a new ModuloSchedule.
99 /// \arg ScheduledInstrs The new loop instructions, in total resequenced
100 /// order.
101 /// \arg Cycle Cycle index for all instructions in ScheduledInstrs. Cycle does
102 /// not need to start at zero. ScheduledInstrs must be partially ordered by
103 /// Cycle.
104 /// \arg Stage Stage index for all instructions in ScheduleInstrs.
105 ModuloSchedule(MachineFunction &MF, MachineLoop *Loop,
106 std::vector<MachineInstr *> ScheduledInstrs,
107 DenseMap<MachineInstr *, int> Cycle,
108 DenseMap<MachineInstr *, int> Stage)
109 : Loop(Loop), ScheduledInstrs(ScheduledInstrs), Cycle(std::move(Cycle)),
110 Stage(std::move(Stage)) {
111 NumStages = 0;
112 for (auto &KV : this->Stage)
113 NumStages = std::max(NumStages, KV.second);
114 ++NumStages;
115 }
116
117 /// Return the single-block loop being scheduled.
118 MachineLoop *getLoop() const { return Loop; }
119
120 /// Return the number of stages contained in this schedule, which is the
121 /// largest stage index + 1.
122 int getNumStages() const { return NumStages; }
123
124 /// Return the first cycle in the schedule, which is the cycle index of the
125 /// first instruction.
126 int getFirstCycle() { return Cycle[ScheduledInstrs.front()]; }
127
128 /// Return the final cycle in the schedule, which is the cycle index of the
129 /// last instruction.
130 int getFinalCycle() { return Cycle[ScheduledInstrs.back()]; }
131
132 /// Return the stage that MI is scheduled in, or -1.
133 int getStage(MachineInstr *MI) {
134 auto I = Stage.find(MI);
135 return I == Stage.end() ? -1 : I->second;
136 }
137
138 /// Return the cycle that MI is scheduled at, or -1.
139 int getCycle(MachineInstr *MI) {
140 auto I = Cycle.find(MI);
141 return I == Cycle.end() ? -1 : I->second;
142 }
143
144 /// Set the stage of a newly created instruction.
145 void setStage(MachineInstr *MI, int MIStage) {
146 assert(Stage.count(MI) == 0);
147 Stage[MI] = MIStage;
148 }
149
150 /// Return the rescheduled instructions in order.
151 ArrayRef<MachineInstr *> getInstructions() { return ScheduledInstrs; }
152
153 void dump() { print(dbgs()); }
154 void print(raw_ostream &OS);
155};
156
157/// The ModuloScheduleExpander takes a ModuloSchedule and expands it in-place,
158/// rewriting the old loop and inserting prologs and epilogs as required.
159class ModuloScheduleExpander {
160public:
161 using InstrChangesTy = DenseMap<MachineInstr *, std::pair<unsigned, int64_t>>;
162
163private:
164 using ValueMapTy = DenseMap<unsigned, unsigned>;
165 using MBBVectorTy = SmallVectorImpl<MachineBasicBlock *>;
166 using InstrMapTy = DenseMap<MachineInstr *, MachineInstr *>;
167
168 ModuloSchedule &Schedule;
169 MachineFunction &MF;
170 const TargetSubtargetInfo &ST;
171 MachineRegisterInfo &MRI;
172 const TargetInstrInfo *TII;
173 LiveIntervals &LIS;
174
175 MachineBasicBlock *BB;
176 MachineBasicBlock *Preheader;
177 MachineBasicBlock *NewKernel = nullptr;
178 std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> LoopInfo;
179
180 /// Map for each register and the max difference between its uses and def.
181 /// The first element in the pair is the max difference in stages. The
182 /// second is true if the register defines a Phi value and loop value is
183 /// scheduled before the Phi.
184 std::map<unsigned, std::pair<unsigned, bool>> RegToStageDiff;
185
186 /// Instructions to change when emitting the final schedule.
187 InstrChangesTy InstrChanges;
188
189 void generatePipelinedLoop();
190 void generateProlog(unsigned LastStage, MachineBasicBlock *KernelBB,
191 ValueMapTy *VRMap, MBBVectorTy &PrologBBs);
192 void generateEpilog(unsigned LastStage, MachineBasicBlock *KernelBB,
193 ValueMapTy *VRMap, MBBVectorTy &EpilogBBs,
194 MBBVectorTy &PrologBBs);
195 void generateExistingPhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1,
196 MachineBasicBlock *BB2, MachineBasicBlock *KernelBB,
197 ValueMapTy *VRMap, InstrMapTy &InstrMap,
198 unsigned LastStageNum, unsigned CurStageNum,
199 bool IsLast);
200 void generatePhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1,
201 MachineBasicBlock *BB2, MachineBasicBlock *KernelBB,
202 ValueMapTy *VRMap, InstrMapTy &InstrMap,
203 unsigned LastStageNum, unsigned CurStageNum, bool IsLast);
204 void removeDeadInstructions(MachineBasicBlock *KernelBB,
205 MBBVectorTy &EpilogBBs);
206 void splitLifetimes(MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs);
207 void addBranches(MachineBasicBlock &PreheaderBB, MBBVectorTy &PrologBBs,
208 MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs,
209 ValueMapTy *VRMap);
210 bool computeDelta(MachineInstr &MI, unsigned &Delta);
211 void updateMemOperands(MachineInstr &NewMI, MachineInstr &OldMI,
212 unsigned Num);
213 MachineInstr *cloneInstr(MachineInstr *OldMI, unsigned CurStageNum,
214 unsigned InstStageNum);
215 MachineInstr *cloneAndChangeInstr(MachineInstr *OldMI, unsigned CurStageNum,
216 unsigned InstStageNum);
217 void updateInstruction(MachineInstr *NewMI, bool LastDef,
218 unsigned CurStageNum, unsigned InstrStageNum,
219 ValueMapTy *VRMap);
220 MachineInstr *findDefInLoop(unsigned Reg);
221 unsigned getPrevMapVal(unsigned StageNum, unsigned PhiStage, unsigned LoopVal,
222 unsigned LoopStage, ValueMapTy *VRMap,
223 MachineBasicBlock *BB);
224 void rewritePhiValues(MachineBasicBlock *NewBB, unsigned StageNum,
225 ValueMapTy *VRMap, InstrMapTy &InstrMap);
226 void rewriteScheduledInstr(MachineBasicBlock *BB, InstrMapTy &InstrMap,
227 unsigned CurStageNum, unsigned PhiNum,
228 MachineInstr *Phi, unsigned OldReg,
229 unsigned NewReg, unsigned PrevReg = 0);
230 bool isLoopCarried(MachineInstr &Phi);
231
232 /// Return the max. number of stages/iterations that can occur between a
233 /// register definition and its uses.
234 unsigned getStagesForReg(int Reg, unsigned CurStage) {
235 std::pair<unsigned, bool> Stages = RegToStageDiff[Reg];
236 if ((int)CurStage > Schedule.getNumStages() - 1 && Stages.first == 0 &&
237 Stages.second)
238 return 1;
239 return Stages.first;
240 }
241
242 /// The number of stages for a Phi is a little different than other
243 /// instructions. The minimum value computed in RegToStageDiff is 1
244 /// because we assume the Phi is needed for at least 1 iteration.
245 /// This is not the case if the loop value is scheduled prior to the
246 /// Phi in the same stage. This function returns the number of stages
247 /// or iterations needed between the Phi definition and any uses.
248 unsigned getStagesForPhi(int Reg) {
249 std::pair<unsigned, bool> Stages = RegToStageDiff[Reg];
250 if (Stages.second)
251 return Stages.first;
252 return Stages.first - 1;
253 }
254
255public:
256 /// Create a new ModuloScheduleExpander.
257 /// \arg InstrChanges Modifications to make to instructions with memory
258 /// operands.
259 /// FIXME: InstrChanges is opaque and is an implementation detail of an
260 /// optimization in MachinePipeliner that crosses abstraction boundaries.
261 ModuloScheduleExpander(MachineFunction &MF, ModuloSchedule &S,
262 LiveIntervals &LIS, InstrChangesTy InstrChanges)
263 : Schedule(S), MF(MF), ST(MF.getSubtarget()), MRI(MF.getRegInfo()),
264 TII(ST.getInstrInfo()), LIS(LIS),
265 InstrChanges(std::move(InstrChanges)) {}
266
267 /// Performs the actual expansion.
268 void expand();
269 /// Performs final cleanup after expansion.
270 void cleanup();
271
272 /// Returns the newly rewritten kernel block, or nullptr if this was
273 /// optimized away.
274 MachineBasicBlock *getRewrittenKernel() { return NewKernel; }
275};
276
277/// A reimplementation of ModuloScheduleExpander. It works by generating a
278/// standalone kernel loop and peeling out the prologs and epilogs.
279class PeelingModuloScheduleExpander {
280public:
281 PeelingModuloScheduleExpander(MachineFunction &MF, ModuloSchedule &S,
282 LiveIntervals *LIS)
283 : Schedule(S), MF(MF), ST(MF.getSubtarget()), MRI(MF.getRegInfo()),
284 TII(ST.getInstrInfo()), LIS(LIS) {}
285
286 void expand();
287
288 /// Runs ModuloScheduleExpander and treats it as a golden input to validate
289 /// aspects of the code generated by PeelingModuloScheduleExpander.
290 void validateAgainstModuloScheduleExpander();
291
292protected:
293 ModuloSchedule &Schedule;
294 MachineFunction &MF;
295 const TargetSubtargetInfo &ST;
296 MachineRegisterInfo &MRI;
297 const TargetInstrInfo *TII;
298 LiveIntervals *LIS;
299
300 /// The original loop block that gets rewritten in-place.
301 MachineBasicBlock *BB;
302 /// The original loop preheader.
303 MachineBasicBlock *Preheader;
304 /// All prolog and epilog blocks.
305 SmallVector<MachineBasicBlock *, 4> Prologs, Epilogs;
306 /// For every block, the stages that are produced.
307 DenseMap<MachineBasicBlock *, BitVector> LiveStages;
308 /// For every block, the stages that are available. A stage can be available
309 /// but not produced (in the epilog) or produced but not available (in the
310 /// prolog).
311 DenseMap<MachineBasicBlock *, BitVector> AvailableStages;
312 /// When peeling the epilogue keep track of the distance between the phi
313 /// nodes and the kernel.
314 DenseMap<MachineInstr *, unsigned> PhiNodeLoopIteration;
315
316 /// CanonicalMIs and BlockMIs form a bidirectional map between any of the
317 /// loop kernel clones.
318 DenseMap<MachineInstr *, MachineInstr *> CanonicalMIs;
319 DenseMap<std::pair<MachineBasicBlock *, MachineInstr *>, MachineInstr *>
320 BlockMIs;
321
322 /// State passed from peelKernel to peelPrologAndEpilogs().
323 std::deque<MachineBasicBlock *> PeeledFront, PeeledBack;
324 /// Illegal phis that need to be deleted once we re-link stages.
325 SmallVector<MachineInstr *, 4> IllegalPhisToDelete;
326
327 /// Converts BB from the original loop body to the rewritten, pipelined
328 /// steady-state.
329 void rewriteKernel();
330
331 /// Peels one iteration of the rewritten kernel (BB) in the specified
332 /// direction.
333 MachineBasicBlock *peelKernel(LoopPeelDirection LPD);
334 // Delete instructions whose stage is less than MinStage in the given basic
335 // block.
336 void filterInstructions(MachineBasicBlock *MB, int MinStage);
337 // Move instructions of the given stage from sourceBB to DestBB. Remap the phi
338 // instructions to keep a valid IR.
339 void moveStageBetweenBlocks(MachineBasicBlock *DestBB,
340 MachineBasicBlock *SourceBB, unsigned Stage);
341 /// Peel the kernel forwards and backwards to produce prologs and epilogs,
342 /// and stitch them together.
343 void peelPrologAndEpilogs();
344 /// All prolog and epilog blocks are clones of the kernel, so any produced
345 /// register in one block has an corollary in all other blocks.
346 Register getEquivalentRegisterIn(Register Reg, MachineBasicBlock *BB);
347 /// Change all users of MI, if MI is predicated out
348 /// (LiveStages[MI->getParent()] == false).
349 void rewriteUsesOf(MachineInstr *MI);
350 /// Insert branches between prologs, kernel and epilogs.
351 void fixupBranches();
352 /// Create a poor-man's LCSSA by cloning only the PHIs from the kernel block
353 /// to a block dominated by all prologs and epilogs. This allows us to treat
354 /// the loop exiting block as any other kernel clone.
355 MachineBasicBlock *CreateLCSSAExitingBlock();
356 /// Helper to get the stage of an instruction in the schedule.
357 unsigned getStage(MachineInstr *MI) {
358 if (CanonicalMIs.count(MI))
359 MI = CanonicalMIs[MI];
360 return Schedule.getStage(MI);
361 }
362 /// Helper function to find the right canonical register for a phi instruction
363 /// coming from a peeled out prologue.
364 Register getPhiCanonicalReg(MachineInstr* CanonicalPhi, MachineInstr* Phi);
365 /// Target loop info before kernel peeling.
366 std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> LoopInfo;
367};
368
369/// Expander that simply annotates each scheduled instruction with a post-instr
370/// symbol that can be consumed by the ModuloScheduleTest pass.
371///
372/// The post-instr symbol is a way of annotating an instruction that can be
373/// roundtripped in MIR. The syntax is:
374/// MYINST %0, post-instr-symbol <mcsymbol Stage-1_Cycle-5>
375class ModuloScheduleTestAnnotater {
376 MachineFunction &MF;
377 ModuloSchedule &S;
378
379public:
380 ModuloScheduleTestAnnotater(MachineFunction &MF, ModuloSchedule &S)
381 : MF(MF), S(S) {}
382
383 /// Performs the annotation.
384 void annotate();
385};
386
387} // end namespace llvm
388
389#endif // LLVM_LIB_CODEGEN_MODULOSCHEDULE_H