Blame - linux-x64/clang/include/llvm/Analysis/BlockFrequencyInfoImpl.h - hafnium/prebuilts

blob: 40c40b80bc89f952066ffa36d86d4bd0f770ac65 [file] [log] [blame]

Andrew Scull	5e1ddfa	2018-08-14 10:06:54 +0100	[diff] [blame^]	1	//==- BlockFrequencyInfoImpl.h - Block Frequency Implementation --*- C++ -*-==//
				2	//
				3	// The LLVM Compiler Infrastructure
				4	//
				5	// This file is distributed under the University of Illinois Open Source
				6	// License. See LICENSE.TXT for details.
				7	//
				8	//===----------------------------------------------------------------------===//
				9	//
				10	// Shared implementation of BlockFrequency for IR and Machine Instructions.
				11	// See the documentation below for BlockFrequencyInfoImpl for details.
				12	//
				13	//===----------------------------------------------------------------------===//
				14
				15	#ifndef LLVM_ANALYSIS_BLOCKFREQUENCYINFOIMPL_H
				16	#define LLVM_ANALYSIS_BLOCKFREQUENCYINFOIMPL_H
				17
				18	#include "llvm/ADT/DenseMap.h"
				19	#include "llvm/ADT/DenseSet.h"
				20	#include "llvm/ADT/GraphTraits.h"
				21	#include "llvm/ADT/Optional.h"
				22	#include "llvm/ADT/PostOrderIterator.h"
				23	#include "llvm/ADT/SmallVector.h"
				24	#include "llvm/ADT/SparseBitVector.h"
				25	#include "llvm/ADT/Twine.h"
				26	#include "llvm/ADT/iterator_range.h"
				27	#include "llvm/IR/BasicBlock.h"
				28	#include "llvm/Support/BlockFrequency.h"
				29	#include "llvm/Support/BranchProbability.h"
				30	#include "llvm/Support/DOTGraphTraits.h"
				31	#include "llvm/Support/Debug.h"
				32	#include "llvm/Support/ErrorHandling.h"
				33	#include "llvm/Support/Format.h"
				34	#include "llvm/Support/ScaledNumber.h"
				35	#include "llvm/Support/raw_ostream.h"
				36	#include <algorithm>
				37	#include <cassert>
				38	#include <cstddef>
				39	#include <cstdint>
				40	#include <deque>
				41	#include <iterator>
				42	#include <limits>
				43	#include <list>
				44	#include <string>
				45	#include <utility>
				46	#include <vector>
				47
				48	#define DEBUG_TYPE "block-freq"
				49
				50	namespace llvm {
				51
				52	class BranchProbabilityInfo;
				53	class Function;
				54	class Loop;
				55	class LoopInfo;
				56	class MachineBasicBlock;
				57	class MachineBranchProbabilityInfo;
				58	class MachineFunction;
				59	class MachineLoop;
				60	class MachineLoopInfo;
				61
				62	namespace bfi_detail {
				63
				64	struct IrreducibleGraph;
				65
				66	// This is part of a workaround for a GCC 4.7 crash on lambdas.
				67	template <class BT> struct BlockEdgesAdder;
				68
				69	/// \brief Mass of a block.
				70	///
				71	/// This class implements a sort of fixed-point fraction always between 0.0 and
				72	/// 1.0. getMass() == std::numeric_limits<uint64_t>::max() indicates a value of
				73	/// 1.0.
				74	///
				75	/// Masses can be added and subtracted. Simple saturation arithmetic is used,
				76	/// so arithmetic operations never overflow or underflow.
				77	///
				78	/// Masses can be multiplied. Multiplication treats full mass as 1.0 and uses
				79	/// an inexpensive floating-point algorithm that's off-by-one (almost, but not
				80	/// quite, maximum precision).
				81	///
				82	/// Masses can be scaled by \a BranchProbability at maximum precision.
				83	class BlockMass {
				84	uint64_t Mass = 0;
				85
				86	public:
				87	BlockMass() = default;
				88	explicit BlockMass(uint64_t Mass) : Mass(Mass) {}
				89
				90	static BlockMass getEmpty() { return BlockMass(); }
				91
				92	static BlockMass getFull() {
				93	return BlockMass(std::numeric_limits<uint64_t>::max());
				94	}
				95
				96	uint64_t getMass() const { return Mass; }
				97
				98	bool isFull() const { return Mass == std::numeric_limits<uint64_t>::max(); }
				99	bool isEmpty() const { return !Mass; }
				100
				101	bool operator!() const { return isEmpty(); }
				102
				103	/// \brief Add another mass.
				104	///
				105	/// Adds another mass, saturating at \a isFull() rather than overflowing.
				106	BlockMass &operator+=(BlockMass X) {
				107	uint64_t Sum = Mass + X.Mass;
				108	Mass = Sum < Mass ? std::numeric_limits<uint64_t>::max() : Sum;
				109	return *this;
				110	}
				111
				112	/// \brief Subtract another mass.
				113	///
				114	/// Subtracts another mass, saturating at \a isEmpty() rather than
				115	/// undeflowing.
				116	BlockMass &operator-=(BlockMass X) {
				117	uint64_t Diff = Mass - X.Mass;
				118	Mass = Diff > Mass ? 0 : Diff;
				119	return *this;
				120	}
				121
				122	BlockMass &operator*=(BranchProbability P) {
				123	Mass = P.scale(Mass);
				124	return *this;
				125	}
				126
				127	bool operator==(BlockMass X) const { return Mass == X.Mass; }
				128	bool operator!=(BlockMass X) const { return Mass != X.Mass; }
				129	bool operator<=(BlockMass X) const { return Mass <= X.Mass; }
				130	bool operator>=(BlockMass X) const { return Mass >= X.Mass; }
				131	bool operator<(BlockMass X) const { return Mass < X.Mass; }
				132	bool operator>(BlockMass X) const { return Mass > X.Mass; }
				133
				134	/// \brief Convert to scaled number.
				135	///
				136	/// Convert to \a ScaledNumber. \a isFull() gives 1.0, while \a isEmpty()
				137	/// gives slightly above 0.0.
				138	ScaledNumber<uint64_t> toScaled() const;
				139
				140	void dump() const;
				141	raw_ostream &print(raw_ostream &OS) const;
				142	};
				143
				144	inline BlockMass operator+(BlockMass L, BlockMass R) {
				145	return BlockMass(L) += R;
				146	}
				147	inline BlockMass operator-(BlockMass L, BlockMass R) {
				148	return BlockMass(L) -= R;
				149	}
				150	inline BlockMass operator*(BlockMass L, BranchProbability R) {
				151	return BlockMass(L) *= R;
				152	}
				153	inline BlockMass operator*(BranchProbability L, BlockMass R) {
				154	return BlockMass(R) *= L;
				155	}
				156
				157	inline raw_ostream &operator<<(raw_ostream &OS, BlockMass X) {
				158	return X.print(OS);
				159	}
				160
				161	} // end namespace bfi_detail
				162
				163	template <> struct isPodLike<bfi_detail::BlockMass> {
				164	static const bool value = true;
				165	};
				166
				167	/// \brief Base class for BlockFrequencyInfoImpl
				168	///
				169	/// BlockFrequencyInfoImplBase has supporting data structures and some
				170	/// algorithms for BlockFrequencyInfoImpl. Only algorithms that depend on
				171	/// the block type (or that call such algorithms) are skipped here.
				172	///
				173	/// Nevertheless, the majority of the overall algorithm documentation lives with
				174	/// BlockFrequencyInfoImpl. See there for details.
				175	class BlockFrequencyInfoImplBase {
				176	public:
				177	using Scaled64 = ScaledNumber<uint64_t>;
				178	using BlockMass = bfi_detail::BlockMass;
				179
				/// \brief Representative of a block.
				///
				/// This is a simple wrapper around an index into the reverse-post-order
				/// traversal of the blocks.
				///
				/// Unlike a block pointer, its order has meaning (location in the
				/// topological sort) and its class is the same regardless of block type.
				struct BlockNode {
				  using IndexType = uint32_t;

				  /// Index of the block. A default-constructed node holds the largest
				  /// uint32_t, which \a isValid() rejects.
				  IndexType Index = std::numeric_limits<uint32_t>::max();

				  BlockNode() = default;
				  BlockNode(IndexType Index) : Index(Index) {}

				  /// Largest index that counts as valid: one below the default sentinel.
				  static size_t getMaxIndex() {
				    return std::numeric_limits<uint32_t>::max() - 1;
				  }

				  /// True unless this node still holds the default sentinel index.
				  bool isValid() const { return !(Index > getMaxIndex()); }

				  // Nodes compare by index.
				  bool operator<(const BlockNode &X) const { return Index < X.Index; }
				  bool operator>(const BlockNode &X) const { return Index > X.Index; }
				  bool operator<=(const BlockNode &X) const { return Index <= X.Index; }
				  bool operator>=(const BlockNode &X) const { return Index >= X.Index; }
				  bool operator==(const BlockNode &X) const { return Index == X.Index; }
				  bool operator!=(const BlockNode &X) const { return Index != X.Index; }
				};
				208
				209	/// \brief Stats about a block itself.
				210	struct FrequencyData {
				211	Scaled64 Scaled;
				212	uint64_t Integer;
				213	};
				214
				215	/// \brief Data about a loop.
				216	///
				217	/// Contains the data necessary to represent a loop as a pseudo-node once it's
				218	/// packaged.
				219	struct LoopData {
				220	using ExitMap = SmallVector<std::pair<BlockNode, BlockMass>, 4>;
				221	using NodeList = SmallVector<BlockNode, 4>;
				222	using HeaderMassList = SmallVector<BlockMass, 1>;
				223
				224	LoopData *Parent; ///< The parent loop.
				225	bool IsPackaged = false; ///< Whether this has been packaged.
				226	uint32_t NumHeaders = 1; ///< Number of headers.
				227	ExitMap Exits; ///< Successor edges (and weights).
				228	NodeList Nodes; ///< Header and the members of the loop.
				229	HeaderMassList BackedgeMass; ///< Mass returned to each loop header.
				230	BlockMass Mass;
				231	Scaled64 Scale;
				232
				233	LoopData(LoopData *Parent, const BlockNode &Header)
				234	: Parent(Parent), Nodes(1, Header), BackedgeMass(1) {}
				235
				236	template <class It1, class It2>
				237	LoopData(LoopData *Parent, It1 FirstHeader, It1 LastHeader, It2 FirstOther,
				238	It2 LastOther)
				239	: Parent(Parent), Nodes(FirstHeader, LastHeader) {
				240	NumHeaders = Nodes.size();
				241	Nodes.insert(Nodes.end(), FirstOther, LastOther);
				242	BackedgeMass.resize(NumHeaders);
				243	}
				244
				245	bool isHeader(const BlockNode &Node) const {
				246	if (isIrreducible())
				247	return std::binary_search(Nodes.begin(), Nodes.begin() + NumHeaders,
				248	Node);
				249	return Node == Nodes[0];
				250	}
				251
				252	BlockNode getHeader() const { return Nodes[0]; }
				253	bool isIrreducible() const { return NumHeaders > 1; }
				254
				255	HeaderMassList::difference_type getHeaderIndex(const BlockNode &B) {
				256	assert(isHeader(B) && "this is only valid on loop header blocks");
				257	if (isIrreducible())
				258	return std::lower_bound(Nodes.begin(), Nodes.begin() + NumHeaders, B) -
				259	Nodes.begin();
				260	return 0;
				261	}
				262
				263	NodeList::const_iterator members_begin() const {
				264	return Nodes.begin() + NumHeaders;
				265	}
				266
				267	NodeList::const_iterator members_end() const { return Nodes.end(); }
				268	iterator_range<NodeList::const_iterator> members() const {
				269	return make_range(members_begin(), members_end());
				270	}
				271	};
				272
				273	/// \brief Index of loop information.
				274	struct WorkingData {
				275	BlockNode Node; ///< This node.
				276	LoopData *Loop = nullptr; ///< The loop this block is inside.
				277	BlockMass Mass; ///< Mass distribution from the entry block.
				278
				279	WorkingData(const BlockNode &Node) : Node(Node) {}
				280
				281	bool isLoopHeader() const { return Loop && Loop->isHeader(Node); }
				282
				283	bool isDoubleLoopHeader() const {
				284	return isLoopHeader() && Loop->Parent && Loop->Parent->isIrreducible() &&
				285	Loop->Parent->isHeader(Node);
				286	}
				287
				288	LoopData *getContainingLoop() const {
				289	if (!isLoopHeader())
				290	return Loop;
				291	if (!isDoubleLoopHeader())
				292	return Loop->Parent;
				293	return Loop->Parent->Parent;
				294	}
				295
				296	/// \brief Resolve a node to its representative.
				297	///
				298	/// Get the node currently representing Node, which could be a containing
				299	/// loop.
				300	///
				301	/// This function should only be called when distributing mass. As long as
				302	/// there are no irreducible edges to Node, then it will have complexity
				303	/// O(1) in this context.
				304	///
				305	/// In general, the complexity is O(L), where L is the number of loop
				306	/// headers Node has been packaged into. Since this method is called in
				307	/// the context of distributing mass, L will be the number of loop headers
				308	/// an early exit edge jumps out of.
				309	BlockNode getResolvedNode() const {
				310	auto L = getPackagedLoop();
				311	return L ? L->getHeader() : Node;
				312	}
				313
				314	LoopData *getPackagedLoop() const {
				315	if (!Loop \|\| !Loop->IsPackaged)
				316	return nullptr;
				317	auto L = Loop;
				318	while (L->Parent && L->Parent->IsPackaged)
				319	L = L->Parent;
				320	return L;
				321	}
				322
				323	/// \brief Get the appropriate mass for a node.
				324	///
				325	/// Get appropriate mass for Node. If Node is a loop-header (whose loop
				326	/// has been packaged), returns the mass of its pseudo-node. If it's a
				327	/// node inside a packaged loop, it returns the loop's mass.
				328	BlockMass &getMass() {
				329	if (!isAPackage())
				330	return Mass;
				331	if (!isADoublePackage())
				332	return Loop->Mass;
				333	return Loop->Parent->Mass;
				334	}
				335
				336	/// \brief Has ContainingLoop been packaged up?
				337	bool isPackaged() const { return getResolvedNode() != Node; }
				338
				339	/// \brief Has Loop been packaged up?
				340	bool isAPackage() const { return isLoopHeader() && Loop->IsPackaged; }
				341
				342	/// \brief Has Loop been packaged up twice?
				343	bool isADoublePackage() const {
				344	return isDoubleLoopHeader() && Loop->Parent->IsPackaged;
				345	}
				346	};
				347
				348	/// \brief Unscaled probability weight.
				349	///
				350	/// Probability weight for an edge in the graph (including the
				351	/// successor/target node).
				352	///
				353	/// All edges in the original function are 32-bit. However, exit edges from
				354	/// loop packages are taken from 64-bit exit masses, so we need 64-bits of
				355	/// space in general.
				356	///
				357	/// In addition to the raw weight amount, Weight stores the type of the edge
				358	/// in the current context (i.e., the context of the loop being processed).
				359	/// Is this a local edge within the loop, an exit from the loop, or a
				360	/// backedge to the loop header?
				361	struct Weight {
				362	enum DistType { Local, Exit, Backedge };
				363	DistType Type = Local;
				364	BlockNode TargetNode;
				365	uint64_t Amount = 0;
				366
				367	Weight() = default;
				368	Weight(DistType Type, BlockNode TargetNode, uint64_t Amount)
				369	: Type(Type), TargetNode(TargetNode), Amount(Amount) {}
				370	};
				371
				372	/// \brief Distribution of unscaled probability weight.
				373	///
				374	/// Distribution of unscaled probability weight to a set of successors.
				375	///
				376	/// This class collates the successor edge weights for later processing.
				377	///
				378	/// \a DidOverflow indicates whether \a Total did overflow while adding to
				379	/// the distribution. It should never overflow twice.
				380	struct Distribution {
				381	using WeightList = SmallVector<Weight, 4>;
				382
				383	WeightList Weights; ///< Individual successor weights.
				384	uint64_t Total = 0; ///< Sum of all weights.
				385	bool DidOverflow = false; ///< Whether \a Total did overflow.
				386
				387	Distribution() = default;
				388
				389	void addLocal(const BlockNode &Node, uint64_t Amount) {
				390	add(Node, Amount, Weight::Local);
				391	}
				392
				393	void addExit(const BlockNode &Node, uint64_t Amount) {
				394	add(Node, Amount, Weight::Exit);
				395	}
				396
				397	void addBackedge(const BlockNode &Node, uint64_t Amount) {
				398	add(Node, Amount, Weight::Backedge);
				399	}
				400
				401	/// \brief Normalize the distribution.
				402	///
				403	/// Combines multiple edges to the same \a Weight::TargetNode and scales
				404	/// down so that \a Total fits into 32-bits.
				405	///
				406	/// This is linear in the size of \a Weights. For the vast majority of
				407	/// cases, adjacent edge weights are combined by sorting WeightList and
				408	/// combining adjacent weights. However, for very large edge lists an
				409	/// auxiliary hash table is used.
				410	void normalize();
				411
				412	private:
				413	void add(const BlockNode &Node, uint64_t Amount, Weight::DistType Type);
				414	};
				415
				416	/// \brief Data about each block. This is used downstream.
				417	std::vector<FrequencyData> Freqs;
				418
				419	/// \brief Whether each block is an irreducible loop header.
				420	/// This is used downstream.
				421	SparseBitVector<> IsIrrLoopHeader;
				422
				423	/// \brief Loop data: see initializeLoops().
				424	std::vector<WorkingData> Working;
				425
				426	/// \brief Indexed information about loops.
				427	std::list<LoopData> Loops;
				428
				429	/// \brief Virtual destructor.
				430	///
				431	/// Need a virtual destructor to mask the compiler warning about
				432	/// getBlockName().
				433	virtual ~BlockFrequencyInfoImplBase() = default;
				434
				435	/// \brief Add all edges out of a packaged loop to the distribution.
				436	///
				437	/// Adds all edges from LocalLoopHead to Dist. Calls addToDist() to add each
				438	/// successor edge.
				439	///
				440	/// \return \c true unless there's an irreducible backedge.
				441	bool addLoopSuccessorsToDist(const LoopData *OuterLoop, LoopData &Loop,
				442	Distribution &Dist);
				443
				444	/// \brief Add an edge to the distribution.
				445	///
				446	/// Adds an edge to Succ to Dist. If \c LoopHead.isValid(), then whether the
				447	/// edge is local/exit/backedge is in the context of LoopHead. Otherwise,
				448	/// every edge should be a local edge (since all the loops are packaged up).
				449	///
				450	/// \return \c true unless aborted due to an irreducible backedge.
				451	bool addToDist(Distribution &Dist, const LoopData *OuterLoop,
				452	const BlockNode &Pred, const BlockNode &Succ, uint64_t Weight);
				453
				454	LoopData &getLoopPackage(const BlockNode &Head) {
				455	assert(Head.Index < Working.size());
				456	assert(Working[Head.Index].isLoopHeader());
				457	return *Working[Head.Index].Loop;
				458	}
				459
				460	/// \brief Analyze irreducible SCCs.
				461	///
				462	/// Separate irreducible SCCs from \c G, which is an explicit graph of \c
				463	/// OuterLoop (or the top-level function, if \c OuterLoop is \c nullptr).
				464	/// Insert them into \a Loops before \c Insert.
				465	///
				466	/// \return the \c LoopData nodes representing the irreducible SCCs.
				467	iterator_range<std::list<LoopData>::iterator>
				468	analyzeIrreducible(const bfi_detail::IrreducibleGraph &G, LoopData *OuterLoop,
				469	std::list<LoopData>::iterator Insert);
				470
				471	/// \brief Update a loop after packaging irreducible SCCs inside of it.
				472	///
				473	/// Update \c OuterLoop. Before finding irreducible control flow, it was
				474	/// partway through \a computeMassInLoop(), so \a LoopData::Exits and \a
				475	/// LoopData::BackedgeMass need to be reset. Also, nodes that were packaged
				476	/// up need to be removed from \a OuterLoop::Nodes.
				477	void updateLoopWithIrreducible(LoopData &OuterLoop);
				478
				479	/// \brief Distribute mass according to a distribution.
				480	///
				481	/// Distributes the mass in Source according to Dist. If LoopHead.isValid(),
				482	/// backedges and exits are stored in its entry in Loops.
				483	///
				484	/// Mass is distributed in parallel from two copies of the source mass.
				485	void distributeMass(const BlockNode &Source, LoopData *OuterLoop,
				486	Distribution &Dist);
				487
				488	/// \brief Compute the loop scale for a loop.
				489	void computeLoopScale(LoopData &Loop);
				490
				491	/// Adjust the mass of all headers in an irreducible loop.
				492	///
				493	/// Initially, irreducible loops are assumed to distribute their mass
				494	/// equally among its headers. This can lead to wrong frequency estimates
				495	/// since some headers may be executed more frequently than others.
				496	///
				497	/// This adjusts header mass distribution so it matches the weights of
				498	/// the backedges going into each of the loop headers.
				499	void adjustLoopHeaderMass(LoopData &Loop);
				500
				501	void distributeIrrLoopHeaderMass(Distribution &Dist);
				502
				503	/// \brief Package up a loop.
				504	void packageLoop(LoopData &Loop);
				505
				506	/// \brief Unwrap loops.
				507	void unwrapLoops();
				508
				509	/// \brief Finalize frequency metrics.
				510	///
				511	/// Calculates final frequencies and cleans up no-longer-needed data
				512	/// structures.
				513	void finalizeMetrics();
				514
				515	/// \brief Clear all memory.
				516	void clear();
				517
				518	virtual std::string getBlockName(const BlockNode &Node) const;
				519	std::string getLoopName(const LoopData &Loop) const;
				520
				521	virtual raw_ostream &print(raw_ostream &OS) const { return OS; }
				522	void dump() const { print(dbgs()); }
				523
				524	Scaled64 getFloatingBlockFreq(const BlockNode &Node) const;
				525
				526	BlockFrequency getBlockFreq(const BlockNode &Node) const;
				527	Optional<uint64_t> getBlockProfileCount(const Function &F,
				528	const BlockNode &Node) const;
				529	Optional<uint64_t> getProfileCountFromFreq(const Function &F,
				530	uint64_t Freq) const;
				531	bool isIrrLoopHeader(const BlockNode &Node);
				532
				533	void setBlockFreq(const BlockNode &Node, uint64_t Freq);
				534
				535	raw_ostream &printBlockFreq(raw_ostream &OS, const BlockNode &Node) const;
				536	raw_ostream &printBlockFreq(raw_ostream &OS,
				537	const BlockFrequency &Freq) const;
				538
				539	uint64_t getEntryFreq() const {
				540	assert(!Freqs.empty());
				541	return Freqs[0].Integer;
				542	}
				543	};
				544
				545	namespace bfi_detail {
				546
				547	template <class BlockT> struct TypeMap {};
				548	template <> struct TypeMap<BasicBlock> {
				549	using BlockT = BasicBlock;
				550	using FunctionT = Function;
				551	using BranchProbabilityInfoT = BranchProbabilityInfo;
				552	using LoopT = Loop;
				553	using LoopInfoT = LoopInfo;
				554	};
				555	template <> struct TypeMap<MachineBasicBlock> {
				556	using BlockT = MachineBasicBlock;
				557	using FunctionT = MachineFunction;
				558	using BranchProbabilityInfoT = MachineBranchProbabilityInfo;
				559	using LoopT = MachineLoop;
				560	using LoopInfoT = MachineLoopInfo;
				561	};
				562
				/// \brief Get the name of a MachineBasicBlock.
				///
				/// Get the name of a MachineBasicBlock. It's templated so that including from
				/// CodeGen is unnecessary (that would be a layering issue).
				///
				/// This is used mainly for debug output. The name is similar to
				/// MachineBasicBlock::getFullName(), but skips the name of the function.
				///
				/// The result is "BB<number>", followed by "[<name>]" when the block has an
				/// associated basic block.
				template <class BlockT> std::string getBlockName(const BlockT *BB) {
				  assert(BB && "Unexpected nullptr");
				  auto MachineName = "BB" + Twine(BB->getNumber());
				  if (!BB->getBasicBlock())
				    return MachineName.str();
				  return (MachineName + "[" + BB->getName() + "]").str();
				}
				577	/// \brief Get the name of a BasicBlock.
				578	template <> inline std::string getBlockName(const BasicBlock *BB) {
				579	assert(BB && "Unexpected nullptr");
				580	return BB->getName().str();
				581	}
				582
				583	/// \brief Graph of irreducible control flow.
				584	///
				585	/// This graph is used for determining the SCCs in a loop (or top-level
				586	/// function) that has irreducible control flow.
				587	///
				588	/// During the block frequency algorithm, the local graphs are defined in a
				589	/// light-weight way, deferring to the \a BasicBlock or \a MachineBasicBlock
				590	/// graphs for most edges, but getting others from \a LoopData::ExitMap. The
				591	/// latter only has successor information.
				592	///
				593	/// \a IrreducibleGraph makes this graph explicit. It's in a form that can use
				594	/// \a GraphTraits (so that \a analyzeIrreducible() can use \a scc_iterator),
				595	/// and it explicitly lists predecessors and successors. The initialization
				596	/// that relies on \c MachineBasicBlock is defined in the header.
				597	struct IrreducibleGraph {
				598	using BFIBase = BlockFrequencyInfoImplBase;
				599
				600	BFIBase &BFI;
				601
				602	using BlockNode = BFIBase::BlockNode;
				603	struct IrrNode {
				604	BlockNode Node;
				605	unsigned NumIn = 0;
				606	std::deque<const IrrNode *> Edges;
				607
				608	IrrNode(const BlockNode &Node) : Node(Node) {}
				609
				610	using iterator = std::deque<const IrrNode *>::const_iterator;
				611
				612	iterator pred_begin() const { return Edges.begin(); }
				613	iterator succ_begin() const { return Edges.begin() + NumIn; }
				614	iterator pred_end() const { return succ_begin(); }
				615	iterator succ_end() const { return Edges.end(); }
				616	};
				617	BlockNode Start;
				618	const IrrNode *StartIrr = nullptr;
				619	std::vector<IrrNode> Nodes;
				620	SmallDenseMap<uint32_t, IrrNode *, 4> Lookup;
				621
				622	/// \brief Construct an explicit graph containing irreducible control flow.
				623	///
				624	/// Construct an explicit graph of the control flow in \c OuterLoop (or the
				625	/// top-level function, if \c OuterLoop is \c nullptr). Uses \c
				626	/// addBlockEdges to add block successors that have not been packaged into
				627	/// loops.
				628	///
				629	/// \a BlockFrequencyInfoImpl::computeIrreducibleMass() is the only expected
				630	/// user of this.
				631	template <class BlockEdgesAdder>
				632	IrreducibleGraph(BFIBase &BFI, const BFIBase::LoopData *OuterLoop,
				633	BlockEdgesAdder addBlockEdges) : BFI(BFI) {
				634	initialize(OuterLoop, addBlockEdges);
				635	}
				636
				637	template <class BlockEdgesAdder>
				638	void initialize(const BFIBase::LoopData *OuterLoop,
				639	BlockEdgesAdder addBlockEdges);
				640	void addNodesInLoop(const BFIBase::LoopData &OuterLoop);
				641	void addNodesInFunction();
				642
				643	void addNode(const BlockNode &Node) {
				644	Nodes.emplace_back(Node);
				645	BFI.Working[Node.Index].getMass() = BlockMass::getEmpty();
				646	}
				647
				648	void indexNodes();
				649	template <class BlockEdgesAdder>
				650	void addEdges(const BlockNode &Node, const BFIBase::LoopData *OuterLoop,
				651	BlockEdgesAdder addBlockEdges);
				652	void addEdge(IrrNode &Irr, const BlockNode &Succ,
				653	const BFIBase::LoopData *OuterLoop);
				654	};
				655
				656	template <class BlockEdgesAdder>
				657	void IrreducibleGraph::initialize(const BFIBase::LoopData *OuterLoop,
				658	BlockEdgesAdder addBlockEdges) {
				659	if (OuterLoop) {
				660	addNodesInLoop(*OuterLoop);
				661	for (auto N : OuterLoop->Nodes)
				662	addEdges(N, OuterLoop, addBlockEdges);
				663	} else {
				664	addNodesInFunction();
				665	for (uint32_t Index = 0; Index < BFI.Working.size(); ++Index)
				666	addEdges(Index, OuterLoop, addBlockEdges);
				667	}
				668	StartIrr = Lookup[Start.Index];
				669	}
				670
				671	template <class BlockEdgesAdder>
				672	void IrreducibleGraph::addEdges(const BlockNode &Node,
				673	const BFIBase::LoopData *OuterLoop,
				674	BlockEdgesAdder addBlockEdges) {
				675	auto L = Lookup.find(Node.Index);
				676	if (L == Lookup.end())
				677	return;
				678	IrrNode &Irr = *L->second;
				679	const auto &Working = BFI.Working[Node.Index];
				680
				681	if (Working.isAPackage())
				682	for (const auto &I : Working.Loop->Exits)
				683	addEdge(Irr, I.first, OuterLoop);
				684	else
				685	addBlockEdges(*this, Irr, OuterLoop);
				686	}
				687
				688	} // end namespace bfi_detail
				689
				690	/// \brief Shared implementation for block frequency analysis.
				691	///
				692	/// This is a shared implementation of BlockFrequencyInfo and
				693	/// MachineBlockFrequencyInfo, and calculates the relative frequencies of
				694	/// blocks.
				695	///
				696	/// LoopInfo defines a loop as a "non-trivial" SCC dominated by a single block,
				697	/// which is called the header. A given loop, L, can have sub-loops, which are
				698	/// loops within the subgraph of L that exclude its header. (A "trivial" SCC
				699	/// consists of a single block that does not have a self-edge.)
				700	///
				701	/// In addition to loops, this algorithm has limited support for irreducible
				702	/// SCCs, which are SCCs with multiple entry blocks. Irreducible SCCs are
				703	/// discovered on the fly, and modelled as loops with multiple headers.
				704	///
				705	/// The headers of irreducible sub-SCCs consist of its entry blocks and all
				706	/// nodes that are targets of a backedge within it (excluding backedges within
				707	/// true sub-loops). Block frequency calculations act as if a block is
				708	/// inserted that intercepts all the edges to the headers. All backedges and
				709	/// entries point to this block. Its successors are the headers, which split
				710	/// the frequency evenly.
				711	///
				712	/// This algorithm leverages BlockMass and ScaledNumber to maintain precision,
				713	/// separates mass distribution from loop scaling, and dithers to eliminate
				714	/// probability mass loss.
				715	///
				716	/// The implementation is split between BlockFrequencyInfoImpl, which knows the
				717	/// type of graph being modelled (BasicBlock vs. MachineBasicBlock), and
				718	/// BlockFrequencyInfoImplBase, which doesn't. The base class uses \a
				719	/// BlockNode, a wrapper around a uint32_t. BlockNode is numbered from 0 in
				720	/// reverse-post order. This gives two advantages: it's easy to compare the
				721	/// relative ordering of two nodes, and maps keyed on BlockT can be represented
				722	/// by vectors.
				723	///
				724	/// This algorithm is O(V+E), unless there is irreducible control flow, in
				725	/// which case it's O(V*E) in the worst case.
				726	///
				727	/// These are the main stages:
				728	///
				729	/// 0. Reverse post-order traversal (\a initializeRPOT()).
				730	///
				731	/// Run a single post-order traversal and save it (in reverse) in RPOT.
				732	/// All other stages make use of this ordering. Save a lookup from BlockT
				733	/// to BlockNode (the index into RPOT) in Nodes.
				734	///
				735	/// 1. Loop initialization (\a initializeLoops()).
				736	///
				737	/// Translate LoopInfo/MachineLoopInfo into a form suitable for the rest of
				738	/// the algorithm. In particular, store the immediate members of each loop
				739	/// in reverse post-order.
				740	///
				741	/// 2. Calculate mass and scale in loops (\a computeMassInLoops()).
				742	///
				743	/// For each loop (bottom-up), distribute mass through the DAG resulting
				744	/// from ignoring backedges and treating sub-loops as a single pseudo-node.
				745	/// Track the backedge mass distributed to the loop header, and use it to
				746	/// calculate the loop scale (number of loop iterations). Immediate
				747	/// members that represent sub-loops will already have been visited and
				748	/// packaged into a pseudo-node.
				749	///
				750	/// Distributing mass in a loop is a reverse-post-order traversal through
				751	/// the loop. Start by assigning full mass to the Loop header. For each
				752	/// node in the loop:
				753	///
				754	/// - Fetch and categorize the weight distribution for its successors.
				755	/// If this is a packaged-subloop, the weight distribution is stored
				756	/// in \a LoopData::Exits. Otherwise, fetch it from
				757	/// BranchProbabilityInfo.
				758	///
				759	/// - Each successor is categorized as \a Weight::Local, a local edge
				760	/// within the current loop, \a Weight::Backedge, a backedge to the
				761	/// loop header, or \a Weight::Exit, any successor outside the loop.
				762	/// The weight, the successor, and its category are stored in \a
				763	/// Distribution. There can be multiple edges to each successor.
				764	///
				765	/// - If there's a backedge to a non-header, there's an irreducible SCC.
				766	/// The usual flow is temporarily aborted. \a
				767	/// computeIrreducibleMass() finds the irreducible SCCs within the
				768	/// loop, packages them up, and restarts the flow.
				769	///
				770	/// - Normalize the distribution: scale weights down so that their sum
				771	/// fits in 32 bits, and coalesce multiple edges to the same node.
				772	///
				773	/// - Distribute the mass accordingly, dithering to minimize mass loss,
				774	/// as described in \a distributeMass().
				775	///
				776	/// In the case of irreducible loops, instead of a single loop header,
				777	/// there will be several. The computation of backedge masses is similar
				778	/// but instead of having a single backedge mass, there will be one
				779	/// backedge per loop header. In these cases, each backedge will carry
				780	/// a mass proportional to the edge weights along the corresponding
				781	/// path.
				782	///
				783	/// At the end of propagation, the full mass assigned to the loop will be
				784	/// distributed among the loop headers proportionally according to the
				785	/// mass flowing through their backedges.
				786	///
				787	/// Finally, calculate the loop scale from the accumulated backedge mass.
				788	///
				789	/// 3. Distribute mass in the function (\a computeMassInFunction()).
				790	///
				791	/// Finally, distribute mass through the DAG resulting from packaging all
				792	/// loops in the function. This uses the same algorithm as distributing
				793	/// mass in a loop, except that there are no exit or backedge edges.
				794	///
				795	/// 4. Unpackage loops (\a unwrapLoops()).
				796	///
				797	/// Initialize each block's frequency to a floating point representation of
				798	/// its mass.
				799	///
				800	/// Visit loops top-down, scaling the frequencies of its immediate members
				801	/// by the loop's pseudo-node's frequency.
				802	///
				803	/// 5. Convert frequencies to a 64-bit range (\a finalizeMetrics()).
				804	///
				805	/// Using the min and max frequencies as a guide, translate floating point
				806	/// frequencies to an appropriate range in uint64_t.
				807	///
				808	/// It has some known flaws.
				809	///
				810	/// - The model of irreducible control flow is a rough approximation.
				811	///
				812	/// Modelling irreducible control flow exactly involves setting up and
				813	/// solving a group of infinite geometric series. Such precision is
				814	/// unlikely to be worthwhile, since most of our algorithms give up on
				815	/// irreducible control flow anyway.
				816	///
				817	/// Nevertheless, we might find that we need to get closer. Here's a sort
				818	/// of TODO list for the model with diminishing returns, to be completed as
				819	/// necessary.
				820	///
				821	/// - The headers for the \a LoopData representing an irreducible SCC
				822	/// include non-entry blocks. When these extra blocks exist, they
				823	/// indicate a self-contained irreducible sub-SCC. We could treat them
				824	/// as sub-loops, rather than arbitrarily shoving the problematic
				825	/// blocks into the headers of the main irreducible SCC.
				826	///
				827	/// - Entry frequencies are assumed to be evenly split between the
				828	/// headers of a given irreducible SCC, which is the only option if we
				829	/// need to compute mass in the SCC before its parent loop. Instead,
				830	/// we could partially compute mass in the parent loop, and stop when
				831	/// we get to the SCC. Here, we have the correct ratio of entry
				832	/// masses, which we can use to adjust their relative frequencies.
				833	/// Compute mass in the SCC, and then continue propagation in the
				834	/// parent.
				835	///
				836	/// - We can propagate mass iteratively through the SCC, for some fixed
				837	/// number of iterations. Each iteration starts by assigning the entry
				838	/// blocks their backedge mass from the prior iteration. The final
				839	/// mass for each block (and each exit, and the total backedge mass
				840	/// used for computing loop scale) is the sum of all iterations.
				841	/// (Running this until fixed point would "solve" the geometric
				842	/// series by simulation.)
				843	template <class BT> class BlockFrequencyInfoImpl : BlockFrequencyInfoImplBase {
				844	// This is part of a workaround for a GCC 4.7 crash on lambdas.
				845	friend struct bfi_detail::BlockEdgesAdder<BT>;
				846
				847	using BlockT = typename bfi_detail::TypeMap<BT>::BlockT;
				848	using FunctionT = typename bfi_detail::TypeMap<BT>::FunctionT;
				849	using BranchProbabilityInfoT =
				850	typename bfi_detail::TypeMap<BT>::BranchProbabilityInfoT;
				851	using LoopT = typename bfi_detail::TypeMap<BT>::LoopT;
				852	using LoopInfoT = typename bfi_detail::TypeMap<BT>::LoopInfoT;
				853	using Successor = GraphTraits<const BlockT *>;
				854	using Predecessor = GraphTraits<Inverse<const BlockT *>>;
				855
				856	const BranchProbabilityInfoT *BPI = nullptr;
				857	const LoopInfoT *LI = nullptr;
				858	const FunctionT *F = nullptr;
				859
				860	// All blocks in reverse postorder.
				861	std::vector<const BlockT *> RPOT;
				862	DenseMap<const BlockT *, BlockNode> Nodes;
				863
				864	using rpot_iterator = typename std::vector<const BlockT *>::const_iterator;
				865
				866	rpot_iterator rpot_begin() const { return RPOT.begin(); }
				867	rpot_iterator rpot_end() const { return RPOT.end(); }
				868
				869	size_t getIndex(const rpot_iterator &I) const { return I - rpot_begin(); }
				870
				871	BlockNode getNode(const rpot_iterator &I) const {
				872	return BlockNode(getIndex(I));
				873	}
				874	BlockNode getNode(const BlockT *BB) const { return Nodes.lookup(BB); }
				875
				876	const BlockT *getBlock(const BlockNode &Node) const {
				877	assert(Node.Index < RPOT.size());
				878	return RPOT[Node.Index];
				879	}
				880
				881	/// \brief Run (and save) a post-order traversal.
				882	///
				883	/// Saves a reverse post-order traversal of all the nodes in \a F.
				884	void initializeRPOT();
				885
				886	/// \brief Initialize loop data.
				887	///
				888	/// Build up \a Loops using \a LoopInfo. \a LoopInfo gives us a mapping from
				889	/// each block to the deepest loop it's in, but we need the inverse. For each
				890	/// loop, we store in reverse post-order its "immediate" members, defined as
				891	/// the header, the headers of immediate sub-loops, and all other blocks in
				892	/// the loop that are not in sub-loops.
				893	void initializeLoops();
				894
				895	/// \brief Propagate to a block's successors.
				896	///
				897	/// In the context of distributing mass through \c OuterLoop, divide the mass
				898	/// currently assigned to \c Node between its successors.
				899	///
				900	/// \return \c true unless there's an irreducible backedge.
				901	bool propagateMassToSuccessors(LoopData *OuterLoop, const BlockNode &Node);
				902
				903	/// \brief Compute mass in a particular loop.
				904	///
				905	/// Assign mass to \c Loop's header, and then for each block in \c Loop in
				906	/// reverse post-order, distribute mass to its successors. Only visits nodes
				907	/// that have not been packaged into sub-loops.
				908	///
				909	/// \pre \a computeMassInLoop() has been called for each subloop of \c Loop.
				910	/// \return \c true unless there's an irreducible backedge.
				911	bool computeMassInLoop(LoopData &Loop);
				912
				913	/// \brief Try to compute mass in the top-level function.
				914	///
				915	/// Assign mass to the entry block, and then for each block in reverse
				916	/// post-order, distribute mass to its successors. Skips nodes that have
				917	/// been packaged into loops.
				918	///
				919	/// \pre \a computeMassInLoops() has been called.
				920	/// \return \c true unless there's an irreducible backedge.
				921	bool tryToComputeMassInFunction();
				922
				923	/// \brief Compute mass in (and package up) irreducible SCCs.
				924	///
				925	/// Find the irreducible SCCs in \c OuterLoop, add them to \a Loops (in front
				926	/// of \c Insert), and call \a computeMassInLoop() on each of them.
				927	///
				928	/// If \c OuterLoop is \c nullptr, it refers to the top-level function.
				929	///
				930	/// \pre \a computeMassInLoop() has been called for each subloop of \c
				931	/// OuterLoop.
				932	/// \pre \c Insert points at the last loop successfully processed by \a
				933	/// computeMassInLoop().
				934	/// \pre \c OuterLoop has irreducible SCCs.
				935	void computeIrreducibleMass(LoopData *OuterLoop,
				936	std::list<LoopData>::iterator Insert);
				937
				938	/// \brief Compute mass in all loops.
				939	///
				940	/// For each loop bottom-up, call \a computeMassInLoop().
				941	///
				942	/// \a computeMassInLoop() aborts (and returns \c false) on loops that
				943	/// contain a irreducible sub-SCCs. Use \a computeIrreducibleMass() and then
				944	/// re-enter \a computeMassInLoop().
				945	///
				946	/// \post \a computeMassInLoop() has returned \c true for every loop.
				947	void computeMassInLoops();
				948
				949	/// \brief Compute mass in the top-level function.
				950	///
				951	/// Uses \a tryToComputeMassInFunction() and \a computeIrreducibleMass() to
				952	/// compute mass in the top-level function.
				953	///
				954	/// \post \a tryToComputeMassInFunction() has returned \c true.
				955	void computeMassInFunction();
				956
				957	std::string getBlockName(const BlockNode &Node) const override {
				958	return bfi_detail::getBlockName(getBlock(Node));
				959	}
				960
				961	public:
				962	BlockFrequencyInfoImpl() = default;
				963
				964	const FunctionT *getFunction() const { return F; }
				965
				966	void calculate(const FunctionT &F, const BranchProbabilityInfoT &BPI,
				967	const LoopInfoT &LI);
				968
				969	using BlockFrequencyInfoImplBase::getEntryFreq;
				970
				971	BlockFrequency getBlockFreq(const BlockT *BB) const {
				972	return BlockFrequencyInfoImplBase::getBlockFreq(getNode(BB));
				973	}
				974
				975	Optional<uint64_t> getBlockProfileCount(const Function &F,
				976	const BlockT *BB) const {
				977	return BlockFrequencyInfoImplBase::getBlockProfileCount(F, getNode(BB));
				978	}
				979
				980	Optional<uint64_t> getProfileCountFromFreq(const Function &F,
				981	uint64_t Freq) const {
				982	return BlockFrequencyInfoImplBase::getProfileCountFromFreq(F, Freq);
				983	}
				984
				985	bool isIrrLoopHeader(const BlockT *BB) {
				986	return BlockFrequencyInfoImplBase::isIrrLoopHeader(getNode(BB));
				987	}
				988
				989	void setBlockFreq(const BlockT *BB, uint64_t Freq);
				990
				991	Scaled64 getFloatingBlockFreq(const BlockT *BB) const {
				992	return BlockFrequencyInfoImplBase::getFloatingBlockFreq(getNode(BB));
				993	}
				994
				995	const BranchProbabilityInfoT &getBPI() const { return *BPI; }
				996
				997	/// \brief Print the frequencies for the current function.
				998	///
				999	/// Prints the frequencies for the blocks in the current function.
				1000	///
				1001	/// Blocks are printed in the natural iteration order of the function, rather
				1002	/// than reverse post-order. This provides two advantages: writing -analyze
				1003	/// tests is easier (since blocks come out in source order), and even
				1004	/// unreachable blocks are printed.
				1005	///
				1006	/// \a BlockFrequencyInfoImplBase::print() only knows reverse post-order, so
				1007	/// we need to override it here.
				1008	raw_ostream &print(raw_ostream &OS) const override;
				1009
				1010	using BlockFrequencyInfoImplBase::dump;
				1011	using BlockFrequencyInfoImplBase::printBlockFreq;
				1012
				1013	raw_ostream &printBlockFreq(raw_ostream &OS, const BlockT *BB) const {
				1014	return BlockFrequencyInfoImplBase::printBlockFreq(OS, getNode(BB));
				1015	}
				1016	};
				1017
				1018	template <class BT>
				1019	void BlockFrequencyInfoImpl<BT>::calculate(const FunctionT &F,
				1020	const BranchProbabilityInfoT &BPI,
				1021	const LoopInfoT &LI) {
				1022	// Save the parameters.
				1023	this->BPI = &BPI;
				1024	this->LI = &LI;
				1025	this->F = &F;
				1026
				1027	// Clean up left-over data structures.
				1028	BlockFrequencyInfoImplBase::clear();
				1029	RPOT.clear();
				1030	Nodes.clear();
				1031
				1032	// Initialize.
				1033	DEBUG(dbgs() << "\nblock-frequency: " << F.getName() << "\n================="
				1034	<< std::string(F.getName().size(), '=') << "\n");
				1035	initializeRPOT();
				1036	initializeLoops();
				1037
				1038	// Visit loops in post-order to find the local mass distribution, and then do
				1039	// the full function.
				1040	computeMassInLoops();
				1041	computeMassInFunction();
				1042	unwrapLoops();
				1043	finalizeMetrics();
				1044	}
				1045
				1046	template <class BT>
				1047	void BlockFrequencyInfoImpl<BT>::setBlockFreq(const BlockT *BB, uint64_t Freq) {
				1048	if (Nodes.count(BB))
				1049	BlockFrequencyInfoImplBase::setBlockFreq(getNode(BB), Freq);
				1050	else {
				1051	// If BB is a newly added block after BFI is done, we need to create a new
				1052	// BlockNode for it assigned with a new index. The index can be determined
				1053	// by the size of Freqs.
				1054	BlockNode NewNode(Freqs.size());
				1055	Nodes[BB] = NewNode;
				1056	Freqs.emplace_back();
				1057	BlockFrequencyInfoImplBase::setBlockFreq(NewNode, Freq);
				1058	}
				1059	}
				1060
				1061	template <class BT> void BlockFrequencyInfoImpl<BT>::initializeRPOT() {
				1062	const BlockT *Entry = &F->front();
				1063	RPOT.reserve(F->size());
				1064	std::copy(po_begin(Entry), po_end(Entry), std::back_inserter(RPOT));
				1065	std::reverse(RPOT.begin(), RPOT.end());
				1066
				1067	assert(RPOT.size() - 1 <= BlockNode::getMaxIndex() &&
				1068	"More nodes in function than Block Frequency Info supports");
				1069
				1070	DEBUG(dbgs() << "reverse-post-order-traversal\n");
				1071	for (rpot_iterator I = rpot_begin(), E = rpot_end(); I != E; ++I) {
				1072	BlockNode Node = getNode(I);
				1073	DEBUG(dbgs() << " - " << getIndex(I) << ": " << getBlockName(Node) << "\n");
				1074	Nodes[*I] = Node;
				1075	}
				1076
				1077	Working.reserve(RPOT.size());
				1078	for (size_t Index = 0; Index < RPOT.size(); ++Index)
				1079	Working.emplace_back(Index);
				1080	Freqs.resize(RPOT.size());
				1081	}
				1082
				1083	template <class BT> void BlockFrequencyInfoImpl<BT>::initializeLoops() {
				1084	DEBUG(dbgs() << "loop-detection\n");
				1085	if (LI->empty())
				1086	return;
				1087
				1088	// Visit loops top down and assign them an index.
				1089	std::deque<std::pair<const LoopT , LoopData >> Q;
				1090	for (const LoopT L : LI)
				1091	Q.emplace_back(L, nullptr);
				1092	while (!Q.empty()) {
				1093	const LoopT *Loop = Q.front().first;
				1094	LoopData *Parent = Q.front().second;
				1095	Q.pop_front();
				1096
				1097	BlockNode Header = getNode(Loop->getHeader());
				1098	assert(Header.isValid());
				1099
				1100	Loops.emplace_back(Parent, Header);
				1101	Working[Header.Index].Loop = &Loops.back();
				1102	DEBUG(dbgs() << " - loop = " << getBlockName(Header) << "\n");
				1103
				1104	for (const LoopT L : Loop)
				1105	Q.emplace_back(L, &Loops.back());
				1106	}
				1107
				1108	// Visit nodes in reverse post-order and add them to their deepest containing
				1109	// loop.
				1110	for (size_t Index = 0; Index < RPOT.size(); ++Index) {
				1111	// Loop headers have already been mostly mapped.
				1112	if (Working[Index].isLoopHeader()) {
				1113	LoopData *ContainingLoop = Working[Index].getContainingLoop();
				1114	if (ContainingLoop)
				1115	ContainingLoop->Nodes.push_back(Index);
				1116	continue;
				1117	}
				1118
				1119	const LoopT *Loop = LI->getLoopFor(RPOT[Index]);
				1120	if (!Loop)
				1121	continue;
				1122
				1123	// Add this node to its containing loop's member list.
				1124	BlockNode Header = getNode(Loop->getHeader());
				1125	assert(Header.isValid());
				1126	const auto &HeaderData = Working[Header.Index];
				1127	assert(HeaderData.isLoopHeader());
				1128
				1129	Working[Index].Loop = HeaderData.Loop;
				1130	HeaderData.Loop->Nodes.push_back(Index);
				1131	DEBUG(dbgs() << " - loop = " << getBlockName(Header)
				1132	<< ": member = " << getBlockName(Index) << "\n");
				1133	}
				1134	}
				1135
				1136	template <class BT> void BlockFrequencyInfoImpl<BT>::computeMassInLoops() {
				1137	// Visit loops with the deepest first, and the top-level loops last.
				1138	for (auto L = Loops.rbegin(), E = Loops.rend(); L != E; ++L) {
				1139	if (computeMassInLoop(*L))
				1140	continue;
				1141	auto Next = std::next(L);
				1142	computeIrreducibleMass(&*L, L.base());
				1143	L = std::prev(Next);
				1144	if (computeMassInLoop(*L))
				1145	continue;
				1146	llvm_unreachable("unhandled irreducible control flow");
				1147	}
				1148	}
				1149
				1150	template <class BT>
				1151	bool BlockFrequencyInfoImpl<BT>::computeMassInLoop(LoopData &Loop) {
				1152	// Compute mass in loop.
				1153	DEBUG(dbgs() << "compute-mass-in-loop: " << getLoopName(Loop) << "\n");
				1154
				1155	if (Loop.isIrreducible()) {
				1156	DEBUG(dbgs() << "isIrreducible = true\n");
				1157	Distribution Dist;
				1158	unsigned NumHeadersWithWeight = 0;
				1159	Optional<uint64_t> MinHeaderWeight;
				1160	DenseSet<uint32_t> HeadersWithoutWeight;
				1161	HeadersWithoutWeight.reserve(Loop.NumHeaders);
				1162	for (uint32_t H = 0; H < Loop.NumHeaders; ++H) {
				1163	auto &HeaderNode = Loop.Nodes[H];
				1164	const BlockT *Block = getBlock(HeaderNode);
				1165	IsIrrLoopHeader.set(Loop.Nodes[H].Index);
				1166	Optional<uint64_t> HeaderWeight = Block->getIrrLoopHeaderWeight();
				1167	if (!HeaderWeight) {
				1168	DEBUG(dbgs() << "Missing irr loop header metadata on "
				1169	<< getBlockName(HeaderNode) << "\n");
				1170	HeadersWithoutWeight.insert(H);
				1171	continue;
				1172	}
				1173	DEBUG(dbgs() << getBlockName(HeaderNode)
				1174	<< " has irr loop header weight " << HeaderWeight.getValue()
				1175	<< "\n");
				1176	NumHeadersWithWeight++;
				1177	uint64_t HeaderWeightValue = HeaderWeight.getValue();
				1178	if (!MinHeaderWeight \|\| HeaderWeightValue < MinHeaderWeight)
				1179	MinHeaderWeight = HeaderWeightValue;
				1180	if (HeaderWeightValue) {
				1181	Dist.addLocal(HeaderNode, HeaderWeightValue);
				1182	}
				1183	}
				1184	// As a heuristic, if some headers don't have a weight, give them the
				1185	// minimium weight seen (not to disrupt the existing trends too much by
				1186	// using a weight that's in the general range of the other headers' weights,
				1187	// and the minimum seems to perform better than the average.)
				1188	// FIXME: better update in the passes that drop the header weight.
				1189	// If no headers have a weight, give them even weight (use weight 1).
				1190	if (!MinHeaderWeight)
				1191	MinHeaderWeight = 1;
				1192	for (uint32_t H : HeadersWithoutWeight) {
				1193	auto &HeaderNode = Loop.Nodes[H];
				1194	assert(!getBlock(HeaderNode)->getIrrLoopHeaderWeight() &&
				1195	"Shouldn't have a weight metadata");
				1196	uint64_t MinWeight = MinHeaderWeight.getValue();
				1197	DEBUG(dbgs() << "Giving weight " << MinWeight
				1198	<< " to " << getBlockName(HeaderNode) << "\n");
				1199	if (MinWeight)
				1200	Dist.addLocal(HeaderNode, MinWeight);
				1201	}
				1202	distributeIrrLoopHeaderMass(Dist);
				1203	for (const BlockNode &M : Loop.Nodes)
				1204	if (!propagateMassToSuccessors(&Loop, M))
				1205	llvm_unreachable("unhandled irreducible control flow");
				1206	if (NumHeadersWithWeight == 0)
				1207	// No headers have a metadata. Adjust header mass.
				1208	adjustLoopHeaderMass(Loop);
				1209	} else {
				1210	Working[Loop.getHeader().Index].getMass() = BlockMass::getFull();
				1211	if (!propagateMassToSuccessors(&Loop, Loop.getHeader()))
				1212	llvm_unreachable("irreducible control flow to loop header!?");
				1213	for (const BlockNode &M : Loop.members())
				1214	if (!propagateMassToSuccessors(&Loop, M))
				1215	// Irreducible backedge.
				1216	return false;
				1217	}
				1218
				1219	computeLoopScale(Loop);
				1220	packageLoop(Loop);
				1221	return true;
				1222	}
				1223
				1224	template <class BT>
				1225	bool BlockFrequencyInfoImpl<BT>::tryToComputeMassInFunction() {
				1226	// Compute mass in function.
				1227	DEBUG(dbgs() << "compute-mass-in-function\n");
				1228	assert(!Working.empty() && "no blocks in function");
				1229	assert(!Working[0].isLoopHeader() && "entry block is a loop header");
				1230
				1231	Working[0].getMass() = BlockMass::getFull();
				1232	for (rpot_iterator I = rpot_begin(), IE = rpot_end(); I != IE; ++I) {
				1233	// Check for nodes that have been packaged.
				1234	BlockNode Node = getNode(I);
				1235	if (Working[Node.Index].isPackaged())
				1236	continue;
				1237
				1238	if (!propagateMassToSuccessors(nullptr, Node))
				1239	return false;
				1240	}
				1241	return true;
				1242	}
				1243
				1244	template <class BT> void BlockFrequencyInfoImpl<BT>::computeMassInFunction() {
				1245	if (tryToComputeMassInFunction())
				1246	return;
				1247	computeIrreducibleMass(nullptr, Loops.begin());
				1248	if (tryToComputeMassInFunction())
				1249	return;
				1250	llvm_unreachable("unhandled irreducible control flow");
				1251	}
				1252
				1253	/// \note This should be a lambda, but that crashes GCC 4.7.
				1254	namespace bfi_detail {
				1255
				1256	template <class BT> struct BlockEdgesAdder {
				1257	using BlockT = BT;
				1258	using LoopData = BlockFrequencyInfoImplBase::LoopData;
				1259	using Successor = GraphTraits<const BlockT *>;
				1260
				1261	const BlockFrequencyInfoImpl<BT> &BFI;
				1262
				1263	explicit BlockEdgesAdder(const BlockFrequencyInfoImpl<BT> &BFI)
				1264	: BFI(BFI) {}
				1265
				1266	void operator()(IrreducibleGraph &G, IrreducibleGraph::IrrNode &Irr,
				1267	const LoopData *OuterLoop) {
				1268	const BlockT *BB = BFI.RPOT[Irr.Node.Index];
				1269	for (const auto Succ : children<const BlockT *>(BB))
				1270	G.addEdge(Irr, BFI.getNode(Succ), OuterLoop);
				1271	}
				1272	};
				1273
				1274	} // end namespace bfi_detail
				1275
				1276	template <class BT>
				1277	void BlockFrequencyInfoImpl<BT>::computeIrreducibleMass(
				1278	LoopData *OuterLoop, std::list<LoopData>::iterator Insert) {
				1279	DEBUG(dbgs() << "analyze-irreducible-in-";
				1280	if (OuterLoop) dbgs() << "loop: " << getLoopName(*OuterLoop) << "\n";
				1281	else dbgs() << "function\n");
				1282
				1283	using namespace bfi_detail;
				1284
				1285	// Ideally, addBlockEdges() would be declared here as a lambda, but that
				1286	// crashes GCC 4.7.
				1287	BlockEdgesAdder<BT> addBlockEdges(*this);
				1288	IrreducibleGraph G(*this, OuterLoop, addBlockEdges);
				1289
				1290	for (auto &L : analyzeIrreducible(G, OuterLoop, Insert))
				1291	computeMassInLoop(L);
				1292
				1293	if (!OuterLoop)
				1294	return;
				1295	updateLoopWithIrreducible(*OuterLoop);
				1296	}
				1297
				1298	// A helper function that converts a branch probability into weight.
				1299	inline uint32_t getWeightFromBranchProb(const BranchProbability Prob) {
				1300	return Prob.getNumerator();
				1301	}
				1302
				1303	template <class BT>
				1304	bool
				1305	BlockFrequencyInfoImpl<BT>::propagateMassToSuccessors(LoopData *OuterLoop,
				1306	const BlockNode &Node) {
				1307	DEBUG(dbgs() << " - node: " << getBlockName(Node) << "\n");
				1308	// Calculate probability for successors.
				1309	Distribution Dist;
				1310	if (auto *Loop = Working[Node.Index].getPackagedLoop()) {
				1311	assert(Loop != OuterLoop && "Cannot propagate mass in a packaged loop");
				1312	if (!addLoopSuccessorsToDist(OuterLoop, *Loop, Dist))
				1313	// Irreducible backedge.
				1314	return false;
				1315	} else {
				1316	const BlockT *BB = getBlock(Node);
				1317	for (auto SI = GraphTraits<const BlockT *>::child_begin(BB),
				1318	SE = GraphTraits<const BlockT *>::child_end(BB);
				1319	SI != SE; ++SI)
				1320	if (!addToDist(
				1321	Dist, OuterLoop, Node, getNode(*SI),
				1322	getWeightFromBranchProb(BPI->getEdgeProbability(BB, SI))))
				1323	// Irreducible backedge.
				1324	return false;
				1325	}
				1326
				1327	// Distribute mass to successors, saving exit and backedge data in the
				1328	// loop header.
				1329	distributeMass(Node, OuterLoop, Dist);
				1330	return true;
				1331	}
				1332
				1333	template <class BT>
				1334	raw_ostream &BlockFrequencyInfoImpl<BT>::print(raw_ostream &OS) const {
				1335	if (!F)
				1336	return OS;
				1337	OS << "block-frequency-info: " << F->getName() << "\n";
				1338	for (const BlockT &BB : *F) {
				1339	OS << " - " << bfi_detail::getBlockName(&BB) << ": float = ";
				1340	getFloatingBlockFreq(&BB).print(OS, 5)
				1341	<< ", int = " << getBlockFreq(&BB).getFrequency();
				1342	if (Optional<uint64_t> ProfileCount =
				1343	BlockFrequencyInfoImplBase::getBlockProfileCount(
				1344	F->getFunction(), getNode(&BB)))
				1345	OS << ", count = " << ProfileCount.getValue();
				1346	if (Optional<uint64_t> IrrLoopHeaderWeight =
				1347	BB.getIrrLoopHeaderWeight())
				1348	OS << ", irr_loop_header_weight = " << IrrLoopHeaderWeight.getValue();
				1349	OS << "\n";
				1350	}
				1351
				1352	// Add an extra newline for readability.
				1353	OS << "\n";
				1354	return OS;
				1355	}
				1356
				1357	// Graph trait base class for block frequency information graph
				1358	// viewer.
				1359
				1360	enum GVDAGType { GVDT_None, GVDT_Fraction, GVDT_Integer, GVDT_Count };
				1361
				1362	template <class BlockFrequencyInfoT, class BranchProbabilityInfoT>
				1363	struct BFIDOTGraphTraitsBase : public DefaultDOTGraphTraits {
				1364	using GTraits = GraphTraits<BlockFrequencyInfoT *>;
				1365	using NodeRef = typename GTraits::NodeRef;
				1366	using EdgeIter = typename GTraits::ChildIteratorType;
				1367	using NodeIter = typename GTraits::nodes_iterator;
				1368
				1369	uint64_t MaxFrequency = 0;
				1370
				1371	explicit BFIDOTGraphTraitsBase(bool isSimple = false)
				1372	: DefaultDOTGraphTraits(isSimple) {}
				1373
				1374	static std::string getGraphName(const BlockFrequencyInfoT *G) {
				1375	return G->getFunction()->getName();
				1376	}
				1377
				1378	std::string getNodeAttributes(NodeRef Node, const BlockFrequencyInfoT *Graph,
				1379	unsigned HotPercentThreshold = 0) {
				1380	std::string Result;
				1381	if (!HotPercentThreshold)
				1382	return Result;
				1383
				1384	// Compute MaxFrequency on the fly:
				1385	if (!MaxFrequency) {
				1386	for (NodeIter I = GTraits::nodes_begin(Graph),
				1387	E = GTraits::nodes_end(Graph);
				1388	I != E; ++I) {
				1389	NodeRef N = *I;
				1390	MaxFrequency =
				1391	std::max(MaxFrequency, Graph->getBlockFreq(N).getFrequency());
				1392	}
				1393	}
				1394	BlockFrequency Freq = Graph->getBlockFreq(Node);
				1395	BlockFrequency HotFreq =
				1396	(BlockFrequency(MaxFrequency) *
				1397	BranchProbability::getBranchProbability(HotPercentThreshold, 100));
				1398
				1399	if (Freq < HotFreq)
				1400	return Result;
				1401
				1402	raw_string_ostream OS(Result);
				1403	OS << "color=\"red\"";
				1404	OS.flush();
				1405	return Result;
				1406	}
				1407
				1408	std::string getNodeLabel(NodeRef Node, const BlockFrequencyInfoT *Graph,
				1409	GVDAGType GType, int layout_order = -1) {
				1410	std::string Result;
				1411	raw_string_ostream OS(Result);
				1412
				1413	if (layout_order != -1)
				1414	OS << Node->getName() << "[" << layout_order << "] : ";
				1415	else
				1416	OS << Node->getName() << " : ";
				1417	switch (GType) {
				1418	case GVDT_Fraction:
				1419	Graph->printBlockFreq(OS, Node);
				1420	break;
				1421	case GVDT_Integer:
				1422	OS << Graph->getBlockFreq(Node).getFrequency();
				1423	break;
				1424	case GVDT_Count: {
				1425	auto Count = Graph->getBlockProfileCount(Node);
				1426	if (Count)
				1427	OS << Count.getValue();
				1428	else
				1429	OS << "Unknown";
				1430	break;
				1431	}
				1432	case GVDT_None:
				1433	llvm_unreachable("If we are not supposed to render a graph we should "
				1434	"never reach this point.");
				1435	}
				1436	return Result;
				1437	}
				1438
				1439	std::string getEdgeAttributes(NodeRef Node, EdgeIter EI,
				1440	const BlockFrequencyInfoT *BFI,
				1441	const BranchProbabilityInfoT *BPI,
				1442	unsigned HotPercentThreshold = 0) {
				1443	std::string Str;
				1444	if (!BPI)
				1445	return Str;
				1446
				1447	BranchProbability BP = BPI->getEdgeProbability(Node, EI);
				1448	uint32_t N = BP.getNumerator();
				1449	uint32_t D = BP.getDenominator();
				1450	double Percent = 100.0 * N / D;
				1451	raw_string_ostream OS(Str);
				1452	OS << format("label=\"%.1f%%\"", Percent);
				1453
				1454	if (HotPercentThreshold) {
				1455	BlockFrequency EFreq = BFI->getBlockFreq(Node) * BP;
				1456	BlockFrequency HotFreq = BlockFrequency(MaxFrequency) *
				1457	BranchProbability(HotPercentThreshold, 100);
				1458
				1459	if (EFreq >= HotFreq) {
				1460	OS << ",color=\"red\"";
				1461	}
				1462	}
				1463
				1464	OS.flush();
				1465	return Str;
				1466	}
				1467	};
				1468
				1469	} // end namespace llvm
				1470
				1471	#undef DEBUG_TYPE
				1472
				1473	#endif // LLVM_ANALYSIS_BLOCKFREQUENCYINFOIMPL_H