Update prebuilt Clang to r416183b from Android.

https://android.googlesource.com/platform/prebuilts/clang/host/
linux-x86/+/06a71ddac05c22edb2d10b590e1769b3f8619bef

clang 12.0.5 (based on r416183b) from build 7284624.

Change-Id: I277a316abcf47307562d8b748b84870f31a72866
Signed-off-by: Olivier Deprez <olivier.deprez@arm.com>
diff --git a/linux-x64/clang/include/llvm/MCA/CodeEmitter.h b/linux-x64/clang/include/llvm/MCA/CodeEmitter.h
new file mode 100644
index 0000000..edbadcc
--- /dev/null
+++ b/linux-x64/clang/include/llvm/MCA/CodeEmitter.h
@@ -0,0 +1,69 @@
+//===--------------------- CodeEmitter.h ------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// A utility class used to compute instruction encodings. It buffers encodings
+/// for later usage. It exposes a simple API to compute and get the encodings as
+/// StringRef.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_MCA_CODEEMITTER_H
+#define LLVM_MCA_CODEEMITTER_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <string>
+
+namespace llvm {
+namespace mca {
+
+/// A utility class used to compute instruction encodings for a code region.
+///
+/// It provides a simple API to compute and return instruction encodings as
+/// strings. Encodings are cached internally for later usage.
+class CodeEmitter {
+  const MCSubtargetInfo &STI;
+  const MCAsmBackend &MAB;
+  const MCCodeEmitter &MCE;
+
+  SmallString<256> Code;
+  raw_svector_ostream VecOS;
+  ArrayRef<MCInst> Sequence;
+
+  // An EncodingInfo pair stores <base, length> information.  Base (i.e. first)
+  // is an index to the `Code`. Length (i.e. second) is the encoding size.
+  using EncodingInfo = std::pair<unsigned, unsigned>;
+
+  // A cache of encodings.
+  SmallVector<EncodingInfo, 16> Encodings;
+
+  EncodingInfo getOrCreateEncodingInfo(unsigned MCID);
+
+public:
+  CodeEmitter(const MCSubtargetInfo &ST, const MCAsmBackend &AB,
+              const MCCodeEmitter &CE, ArrayRef<MCInst> S)
+      : STI(ST), MAB(AB), MCE(CE), VecOS(Code), Sequence(S),
+        Encodings(S.size()) {}
+
+  StringRef getEncoding(unsigned MCID) {
+    EncodingInfo EI = getOrCreateEncodingInfo(MCID);
+    return StringRef(&Code[EI.first], EI.second);
+  }
+};
+
+} // namespace mca
+} // namespace llvm
+
+#endif // LLVM_MCA_CODEEMITTER_H
diff --git a/linux-x64/clang/include/llvm/MCA/Context.h b/linux-x64/clang/include/llvm/MCA/Context.h
index 503d780..af3cb8e 100644
--- a/linux-x64/clang/include/llvm/MCA/Context.h
+++ b/linux-x64/clang/include/llvm/MCA/Context.h
@@ -20,7 +20,6 @@
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MCA/HardwareUnits/HardwareUnit.h"
-#include "llvm/MCA/InstrBuilder.h"
 #include "llvm/MCA/Pipeline.h"
 #include "llvm/MCA/SourceMgr.h"
 #include <memory>
@@ -58,6 +57,9 @@
   Context(const Context &C) = delete;
   Context &operator=(const Context &C) = delete;
 
+  const MCRegisterInfo &getMCRegisterInfo() const { return MRI; }
+  const MCSubtargetInfo &getMCSubtargetInfo() const { return STI; }
+
   void addHardwareUnit(std::unique_ptr<HardwareUnit> H) {
     Hardware.push_back(std::move(H));
   }
@@ -65,7 +67,6 @@
   /// Construct a basic pipeline for simulating an out-of-order pipeline.
   /// This pipeline consists of Fetch, Dispatch, Execute, and Retire stages.
   std::unique_ptr<Pipeline> createDefaultPipeline(const PipelineOptions &Opts,
-                                                  InstrBuilder &IB,
                                                   SourceMgr &SrcMgr);
 };
 
diff --git a/linux-x64/clang/include/llvm/MCA/HardwareUnits/LSUnit.h b/linux-x64/clang/include/llvm/MCA/HardwareUnits/LSUnit.h
index ae9a49c..2f9b4ba 100644
--- a/linux-x64/clang/include/llvm/MCA/HardwareUnits/LSUnit.h
+++ b/linux-x64/clang/include/llvm/MCA/HardwareUnits/LSUnit.h
@@ -24,8 +24,6 @@
 namespace llvm {
 namespace mca {
 
-class Scheduler;
-
 /// A node of a memory dependency graph. A MemoryGroup describes a set of
 /// instructions with same memory dependencies.
 ///
@@ -42,7 +40,10 @@
   unsigned NumInstructions;
   unsigned NumExecuting;
   unsigned NumExecuted;
-  SmallVector<MemoryGroup *, 4> Succ;
+  // Successors that are in a order dependency with this group.
+  SmallVector<MemoryGroup *, 4> OrderSucc;
+  // Successors that are in a data dependency with this group.
+  SmallVector<MemoryGroup *, 4> DataSucc;
 
   CriticalDependency CriticalPredecessor;
   InstRef CriticalMemoryInstruction;
@@ -57,8 +58,9 @@
         NumExecuted(0), CriticalPredecessor(), CriticalMemoryInstruction() {}
   MemoryGroup(MemoryGroup &&) = default;
 
-  ArrayRef<MemoryGroup *> getSuccessors() const { return Succ; }
-  unsigned getNumSuccessors() const { return Succ.size(); }
+  size_t getNumSuccessors() const {
+    return OrderSucc.size() + DataSucc.size();
+  }
   unsigned getNumPredecessors() const { return NumPredecessors; }
   unsigned getNumExecutingPredecessors() const {
     return NumExecutingPredecessors;
@@ -77,12 +79,22 @@
     return CriticalPredecessor;
   }
 
-  void addSuccessor(MemoryGroup *Group) {
+  void addSuccessor(MemoryGroup *Group, bool IsDataDependent) {
+    // Do not need to add a dependency if there is no data
+    // dependency and all instructions from this group have been
+    // issued already.
+    if (!IsDataDependent && isExecuting())
+      return;
+
     Group->NumPredecessors++;
     assert(!isExecuted() && "Should have been removed!");
     if (isExecuting())
-      Group->onGroupIssued(CriticalMemoryInstruction);
-    Succ.emplace_back(Group);
+      Group->onGroupIssued(CriticalMemoryInstruction, IsDataDependent);
+
+    if (IsDataDependent)
+      DataSucc.emplace_back(Group);
+    else
+      OrderSucc.emplace_back(Group);
   }
 
   bool isWaiting() const {
@@ -100,10 +112,13 @@
   }
   bool isExecuted() const { return NumInstructions == NumExecuted; }
 
-  void onGroupIssued(const InstRef &IR) {
+  void onGroupIssued(const InstRef &IR, bool ShouldUpdateCriticalDep) {
     assert(!isReady() && "Unexpected group-start event!");
     NumExecutingPredecessors++;
 
+    if (!ShouldUpdateCriticalDep)
+      return;
+
     unsigned Cycles = IR.getInstruction()->getCyclesLeft();
     if (CriticalPredecessor.Cycles < Cycles) {
       CriticalPredecessor.IID = IR.getSourceIndex();
@@ -135,8 +150,14 @@
       return;
 
     // Notify successors that this group started execution.
-    for (MemoryGroup *MG : Succ)
-      MG->onGroupIssued(CriticalMemoryInstruction);
+    for (MemoryGroup *MG : OrderSucc) {
+      MG->onGroupIssued(CriticalMemoryInstruction, false);
+      // Release the order dependency with this group.
+      MG->onGroupExecuted();
+    }
+
+    for (MemoryGroup *MG : DataSucc)
+      MG->onGroupIssued(CriticalMemoryInstruction, true);
   }
 
   void onInstructionExecuted() {
@@ -147,8 +168,8 @@
     if (!isExecuted())
       return;
 
-    // Notify successors that this group has finished execution.
-    for (MemoryGroup *MG : Succ)
+    // Notify data dependent successors that this group has finished execution.
+    for (MemoryGroup *MG : DataSucc)
       MG->onGroupExecuted();
   }
 
@@ -209,8 +230,10 @@
 
   unsigned getUsedLQEntries() const { return UsedLQEntries; }
   unsigned getUsedSQEntries() const { return UsedSQEntries; }
-  unsigned assignLQSlot() { return UsedLQEntries++; }
-  unsigned assignSQSlot() { return UsedSQEntries++; }
+  void acquireLQSlot() { ++UsedLQEntries; }
+  void acquireSQSlot() { ++UsedSQEntries; }
+  void releaseLQSlot() { --UsedLQEntries; }
+  void releaseSQSlot() { --UsedSQEntries; }
 
   bool assumeNoAlias() const { return NoAlias; }
 
@@ -285,13 +308,18 @@
 
   unsigned createMemoryGroup() {
     Groups.insert(
-        std::make_pair(NextGroupID, llvm::make_unique<MemoryGroup>()));
+        std::make_pair(NextGroupID, std::make_unique<MemoryGroup>()));
     return NextGroupID++;
   }
 
-  // Instruction executed event handlers.
   virtual void onInstructionExecuted(const InstRef &IR);
 
+  // Loads are tracked by the LDQ (load queue) from dispatch until completion.
+  // Stores are tracked by the STQ (store queue) from dispatch until commitment.
+  // By default we conservatively assume that the LDQ receives a load at
+  // dispatch. Loads leave the LDQ at retirement stage.
+  virtual void onInstructionRetired(const InstRef &IR);
+
   virtual void onInstructionIssued(const InstRef &IR) {
     unsigned GroupID = IR.getInstruction()->getLSUTokenID();
     Groups[GroupID]->onInstructionIssued(IR);
@@ -407,6 +435,7 @@
   unsigned CurrentLoadGroupID;
   unsigned CurrentLoadBarrierGroupID;
   unsigned CurrentStoreGroupID;
+  unsigned CurrentStoreBarrierGroupID;
 
 public:
   LSUnit(const MCSchedModel &SM)
@@ -415,7 +444,8 @@
       : LSUnit(SM, LQ, SQ, /* NoAlias */ false) {}
   LSUnit(const MCSchedModel &SM, unsigned LQ, unsigned SQ, bool AssumeNoAlias)
       : LSUnitBase(SM, LQ, SQ, AssumeNoAlias), CurrentLoadGroupID(0),
-        CurrentLoadBarrierGroupID(0), CurrentStoreGroupID(0) {}
+        CurrentLoadBarrierGroupID(0), CurrentStoreGroupID(0),
+        CurrentStoreBarrierGroupID(0) {}
 
   /// Returns LSU_AVAILABLE if there are enough load/store queue entries to
   /// accomodate instruction IR.
@@ -436,9 +466,6 @@
   /// 6. A store has to wait until an older store barrier is fully executed.
   unsigned dispatch(const InstRef &IR) override;
 
-  // FIXME: For simplicity, we optimistically assume a similar behavior for
-  // store instructions. In practice, store operations don't tend to leave the
-  // store queue until they reach the 'Retired' stage (See PR39830).
   void onInstructionExecuted(const InstRef &IR) override;
 };
 
diff --git a/linux-x64/clang/include/llvm/MCA/HardwareUnits/RegisterFile.h b/linux-x64/clang/include/llvm/MCA/HardwareUnits/RegisterFile.h
index 3650632..e8ca348 100644
--- a/linux-x64/clang/include/llvm/MCA/HardwareUnits/RegisterFile.h
+++ b/linux-x64/clang/include/llvm/MCA/HardwareUnits/RegisterFile.h
@@ -22,7 +22,6 @@
 #include "llvm/MC/MCSchedule.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MCA/HardwareUnits/HardwareUnit.h"
-#include "llvm/Support/Error.h"
 
 namespace llvm {
 namespace mca {
@@ -84,7 +83,7 @@
   // the target name).
   //
   // Users can limit the number of physical registers that are available in
-  // regsiter file #0 specifying command line flag `-register-file-size=<uint>`.
+  // register file #0 specifying command line flag `-register-file-size=<uint>`.
   SmallVector<RegisterMappingTracker, 4> RegisterFiles;
 
   // This type is used to propagate information about the owner of a register,
@@ -220,7 +219,7 @@
   //
   // Current implementation can simulate up to 32 register files (including the
   // special register file at index #0).
-  unsigned isAvailable(ArrayRef<unsigned> Regs) const;
+  unsigned isAvailable(ArrayRef<MCPhysReg> Regs) const;
 
   // Returns the number of PRFs implemented by this processor.
   unsigned getNumRegisterFiles() const { return RegisterFiles.size(); }
diff --git a/linux-x64/clang/include/llvm/MCA/HardwareUnits/ResourceManager.h b/linux-x64/clang/include/llvm/MCA/HardwareUnits/ResourceManager.h
index 2f91185..b6d4e34 100644
--- a/linux-x64/clang/include/llvm/MCA/HardwareUnits/ResourceManager.h
+++ b/linux-x64/clang/include/llvm/MCA/HardwareUnits/ResourceManager.h
@@ -15,7 +15,6 @@
 #ifndef LLVM_MCA_RESOURCE_MANAGER_H
 #define LLVM_MCA_RESOURCE_MANAGER_H
 
-#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/MC/MCSchedule.h"
@@ -33,8 +32,7 @@
 /// with a buffer size of -1 is always available if it is not reserved.
 ///
 /// Values of type ResourceStateEvent are returned by method
-/// ResourceState::isBufferAvailable(), which is used to query the internal
-/// state of a resource.
+/// ResourceManager::canBeDispatched()
 ///
 /// The naming convention for resource state events is:
 ///  * Event names start with prefix RS_
@@ -263,16 +261,26 @@
   /// Returns RS_BUFFER_UNAVAILABLE if there are no available slots.
   ResourceStateEvent isBufferAvailable() const;
 
-  /// Reserve a slot in the buffer.
-  void reserveBuffer() {
-    if (AvailableSlots)
-      AvailableSlots--;
+  /// Reserve a buffer slot.
+  ///
+  /// Returns true if the buffer is not full.
+  /// It always returns true if BufferSize is set to zero.
+  bool reserveBuffer() {
+    if (BufferSize <= 0)
+      return true;
+
+    --AvailableSlots;
+    assert(AvailableSlots <= static_cast<unsigned>(BufferSize));
+    return AvailableSlots;
   }
 
-  /// Release a slot in the buffer.
+  /// Releases a slot in the buffer.
   void releaseBuffer() {
-    if (BufferSize > 0)
-      AvailableSlots++;
+    // Ignore dispatch hazards or invalid buffer sizes.
+    if (BufferSize <= 0)
+      return;
+
+    ++AvailableSlots;
     assert(AvailableSlots <= static_cast<unsigned>(BufferSize));
   }
 
@@ -351,9 +359,16 @@
   // Set of processor resource units that are available during this cycle.
   uint64_t AvailableProcResUnits;
 
-  // Set of processor resource groups that are currently reserved.
+  // Set of processor resources that are currently reserved.
   uint64_t ReservedResourceGroups;
 
+  // Set of unavailable scheduler buffer resources. This is used internally to
+  // speedup `canBeDispatched()` queries.
+  uint64_t AvailableBuffers;
+
+  // Set of dispatch hazard buffer resources that are currently unavailable.
+  uint64_t ReservedBuffers;
+
   // Returns the actual resource unit that will be used.
   ResourceRef selectPipe(uint64_t ResourceID);
 
@@ -382,17 +397,20 @@
 
   // Returns RS_BUFFER_AVAILABLE if buffered resources are not reserved, and if
   // there are enough available slots in the buffers.
-  ResourceStateEvent canBeDispatched(ArrayRef<uint64_t> Buffers) const;
+  ResourceStateEvent canBeDispatched(uint64_t ConsumedBuffers) const;
 
   // Return the processor resource identifier associated to this Mask.
   unsigned resolveResourceMask(uint64_t Mask) const;
 
-  // Consume a slot in every buffered resource from array 'Buffers'. Resource
-  // units that are dispatch hazards (i.e. BufferSize=0) are marked as reserved.
-  void reserveBuffers(ArrayRef<uint64_t> Buffers);
+  // Acquires a slot from every buffered resource in mask `ConsumedBuffers`.
+  // Units that are dispatch hazards (i.e. BufferSize=0) are marked as reserved.
+  void reserveBuffers(uint64_t ConsumedBuffers);
 
-  // Release buffer entries previously allocated by method reserveBuffers.
-  void releaseBuffers(ArrayRef<uint64_t> Buffers);
+  // Releases a slot from every buffered resource in mask `ConsumedBuffers`.
+  // ConsumedBuffers is a bitmask of previously acquired buffers (using method
+  // `reserveBuffers`). Units that are dispatch hazards (i.e. BufferSize=0) are
+  // not automatically unreserved by this method.
+  void releaseBuffers(uint64_t ConsumedBuffers);
 
   // Reserve a processor resource. A reserved resource is not available for
   // instruction issue until it is released.
diff --git a/linux-x64/clang/include/llvm/MCA/HardwareUnits/RetireControlUnit.h b/linux-x64/clang/include/llvm/MCA/HardwareUnits/RetireControlUnit.h
index 0629014..acbd454 100644
--- a/linux-x64/clang/include/llvm/MCA/HardwareUnits/RetireControlUnit.h
+++ b/linux-x64/clang/include/llvm/MCA/HardwareUnits/RetireControlUnit.h
@@ -57,34 +57,43 @@
 private:
   unsigned NextAvailableSlotIdx;
   unsigned CurrentInstructionSlotIdx;
-  unsigned AvailableSlots;
+  unsigned NumROBEntries;
+  unsigned AvailableEntries;
   unsigned MaxRetirePerCycle; // 0 means no limit.
   std::vector<RUToken> Queue;
 
-public:
-  RetireControlUnit(const MCSchedModel &SM);
-
-  bool isEmpty() const { return AvailableSlots == Queue.size(); }
-  bool isAvailable(unsigned Quantity = 1) const {
+  unsigned normalizeQuantity(unsigned Quantity) const {
     // Some instructions may declare a number of uOps which exceeds the size
     // of the reorder buffer. To avoid problems, cap the amount of slots to
     // the size of the reorder buffer.
-    Quantity = std::min(Quantity, static_cast<unsigned>(Queue.size()));
+    Quantity = std::min(Quantity, NumROBEntries);
 
     // Further normalize the number of micro opcodes for instructions that
     // declare zero opcodes. This should match the behavior of method
     // reserveSlot().
-    Quantity = std::max(Quantity, 1U);
-    return AvailableSlots >= Quantity;
+    return std::max(Quantity, 1U);
+  }
+
+  unsigned computeNextSlotIdx() const;
+
+public:
+  RetireControlUnit(const MCSchedModel &SM);
+
+  bool isEmpty() const { return AvailableEntries == NumROBEntries; }
+
+  bool isAvailable(unsigned Quantity = 1) const {
+    return AvailableEntries >= normalizeQuantity(Quantity);
   }
 
   unsigned getMaxRetirePerCycle() const { return MaxRetirePerCycle; }
 
-  // Reserves a number of slots, and returns a new token.
-  unsigned reserveSlot(const InstRef &IS, unsigned NumMicroOps);
+  // Reserves a number of slots, and returns a new token reference.
+  unsigned dispatch(const InstRef &IS);
 
   // Return the current token from the RCU's circular token queue.
-  const RUToken &peekCurrentToken() const;
+  const RUToken &getCurrentToken() const;
+
+  const RUToken &peekNextToken() const;
 
   // Advance the pointer to the next token in the circular token queue.
   void consumeCurrentToken();
diff --git a/linux-x64/clang/include/llvm/MCA/HardwareUnits/Scheduler.h b/linux-x64/clang/include/llvm/MCA/HardwareUnits/Scheduler.h
index 27beb84..0293364 100644
--- a/linux-x64/clang/include/llvm/MCA/HardwareUnits/Scheduler.h
+++ b/linux-x64/clang/include/llvm/MCA/HardwareUnits/Scheduler.h
@@ -68,7 +68,7 @@
 /// instructions from the dispatch stage, until the write-back stage.
 ///
 class Scheduler : public HardwareUnit {
-  LSUnit &LSU;
+  LSUnitBase &LSU;
 
   // Instruction selection strategy for this Scheduler.
   std::unique_ptr<SchedulerStrategy> Strategy;
@@ -154,15 +154,15 @@
   bool promoteToPendingSet(SmallVectorImpl<InstRef> &Pending);
 
 public:
-  Scheduler(const MCSchedModel &Model, LSUnit &Lsu)
+  Scheduler(const MCSchedModel &Model, LSUnitBase &Lsu)
       : Scheduler(Model, Lsu, nullptr) {}
 
-  Scheduler(const MCSchedModel &Model, LSUnit &Lsu,
+  Scheduler(const MCSchedModel &Model, LSUnitBase &Lsu,
             std::unique_ptr<SchedulerStrategy> SelectStrategy)
-      : Scheduler(make_unique<ResourceManager>(Model), Lsu,
+      : Scheduler(std::make_unique<ResourceManager>(Model), Lsu,
                   std::move(SelectStrategy)) {}
 
-  Scheduler(std::unique_ptr<ResourceManager> RM, LSUnit &Lsu,
+  Scheduler(std::unique_ptr<ResourceManager> RM, LSUnitBase &Lsu,
             std::unique_ptr<SchedulerStrategy> SelectStrategy)
       : LSU(Lsu), Resources(std::move(RM)), BusyResourceUnits(0),
         NumDispatchedToThePendingSet(0), HadTokenStall(false) {
@@ -228,6 +228,9 @@
                   SmallVectorImpl<InstRef> &Ready);
 
   /// Convert a resource mask into a valid llvm processor resource identifier.
+  ///
+  /// Only the most significant bit of the Mask is used by this method to
+  /// identify the processor resource.
   unsigned getResourceID(uint64_t Mask) const {
     return Resources->resolveResourceMask(Mask);
   }
@@ -264,9 +267,9 @@
   // This routine performs a sanity check.  This routine should only be called
   // when we know that 'IR' is not in the scheduler's instruction queues.
   void sanityCheck(const InstRef &IR) const {
-    assert(find(WaitSet, IR) == WaitSet.end() && "Already in the wait set!");
-    assert(find(ReadySet, IR) == ReadySet.end() && "Already in the ready set!");
-    assert(find(IssuedSet, IR) == IssuedSet.end() && "Already executing!");
+    assert(!is_contained(WaitSet, IR) && "Already in the wait set!");
+    assert(!is_contained(ReadySet, IR) && "Already in the ready set!");
+    assert(!is_contained(IssuedSet, IR) && "Already executing!");
   }
 #endif // !NDEBUG
 };
diff --git a/linux-x64/clang/include/llvm/MCA/Instruction.h b/linux-x64/clang/include/llvm/MCA/Instruction.h
index d4d3f22..c97cb46 100644
--- a/linux-x64/clang/include/llvm/MCA/Instruction.h
+++ b/linux-x64/clang/include/llvm/MCA/Instruction.h
@@ -18,6 +18,7 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/MC/MCRegister.h" // definition of MCPhysReg.
 #include "llvm/Support/MathExtras.h"
 
 #ifndef NDEBUG
@@ -42,7 +43,7 @@
   unsigned Latency;
   // This field is set to a value different than zero only if this
   // is an implicit definition.
-  unsigned RegisterID;
+  MCPhysReg RegisterID;
   // Instruction itineraries would set this field to the SchedClass ID.
   // Otherwise, it defaults to the WriteResourceID from the MCWriteLatencyEntry
   // element associated to this write.
@@ -70,7 +71,7 @@
   // uses always come first in the sequence of uses.
   unsigned UseIndex;
   // This field is only set if this is an implicit read.
-  unsigned RegisterID;
+  MCPhysReg RegisterID;
   // Scheduling Class Index. It is used to query the scheduling model for the
   // MCSchedClassDesc object.
   unsigned SchedClassID;
@@ -85,7 +86,7 @@
 /// Field RegID is set to the invalid register for memory dependencies.
 struct CriticalDependency {
   unsigned IID;
-  unsigned RegID;
+  MCPhysReg RegID;
   unsigned Cycles;
 };
 
@@ -106,7 +107,7 @@
   // to speedup queries on the register file.
   // For implicit writes, this field always matches the value of
   // field RegisterID from WD.
-  unsigned RegisterID;
+  MCPhysReg RegisterID;
 
   // Physical register file that serves register RegisterID.
   unsigned PRFID;
@@ -146,7 +147,7 @@
   SmallVector<std::pair<ReadState *, int>, 4> Users;
 
 public:
-  WriteState(const WriteDescriptor &Desc, unsigned RegID,
+  WriteState(const WriteDescriptor &Desc, MCPhysReg RegID,
              bool clearsSuperRegs = false, bool writesZero = false)
       : WD(&Desc), CyclesLeft(UNKNOWN_CYCLES), RegisterID(RegID), PRFID(0),
         ClearsSuperRegs(clearsSuperRegs), WritesZero(writesZero),
@@ -158,7 +159,7 @@
 
   int getCyclesLeft() const { return CyclesLeft; }
   unsigned getWriteResourceID() const { return WD->SClassOrWriteResourceID; }
-  unsigned getRegisterID() const { return RegisterID; }
+  MCPhysReg getRegisterID() const { return RegisterID; }
   unsigned getRegisterFileID() const { return PRFID; }
   unsigned getLatency() const { return WD->Latency; }
   unsigned getDependentWriteCyclesLeft() const {
@@ -200,7 +201,7 @@
   }
 
   void setDependentWrite(const WriteState *Other) { DependentWrite = Other; }
-  void writeStartEvent(unsigned IID, unsigned RegID, unsigned Cycles);
+  void writeStartEvent(unsigned IID, MCPhysReg RegID, unsigned Cycles);
   void setWriteZero() { WritesZero = true; }
   void setEliminated() {
     assert(Users.empty() && "Write is in an inconsistent state.");
@@ -226,7 +227,7 @@
 class ReadState {
   const ReadDescriptor *RD;
   // Physical register identified associated to this read.
-  unsigned RegisterID;
+  MCPhysReg RegisterID;
   // Physical register file that serves register RegisterID.
   unsigned PRFID;
   // Number of writes that contribute to the definition of RegisterID.
@@ -253,14 +254,14 @@
   bool IndependentFromDef;
 
 public:
-  ReadState(const ReadDescriptor &Desc, unsigned RegID)
+  ReadState(const ReadDescriptor &Desc, MCPhysReg RegID)
       : RD(&Desc), RegisterID(RegID), PRFID(0), DependentWrites(0),
         CyclesLeft(UNKNOWN_CYCLES), TotalCycles(0), CRD(), IsReady(true),
         IsZero(false), IndependentFromDef(false) {}
 
   const ReadDescriptor &getDescriptor() const { return *RD; }
   unsigned getSchedClass() const { return RD->SchedClassID; }
-  unsigned getRegisterID() const { return RegisterID; }
+  MCPhysReg getRegisterID() const { return RegisterID; }
   unsigned getRegisterFileID() const { return PRFID; }
   const CriticalDependency &getCriticalRegDep() const { return CRD; }
 
@@ -272,7 +273,7 @@
   void setIndependentFromDef() { IndependentFromDef = true; }
 
   void cycleEvent();
-  void writeStartEvent(unsigned IID, unsigned RegID, unsigned Cycles);
+  void writeStartEvent(unsigned IID, MCPhysReg RegID, unsigned Cycles);
   void setDependentWrites(unsigned Writes) {
     DependentWrites = Writes;
     IsReady = !Writes;
@@ -352,11 +353,14 @@
   // reports the number of "consumed cycles".
   SmallVector<std::pair<uint64_t, ResourceUsage>, 4> Resources;
 
-  // A list of buffered resources consumed by this instruction.
-  SmallVector<uint64_t, 4> Buffers;
+  // A bitmask of used hardware buffers.
+  uint64_t UsedBuffers;
 
-  unsigned UsedProcResUnits;
-  unsigned UsedProcResGroups;
+  // A bitmask of used processor resource units.
+  uint64_t UsedProcResUnits;
+
+  // A bitmask of used processor resource groups.
+  uint64_t UsedProcResGroups;
 
   unsigned MaxLatency;
   // Number of MicroOps for this instruction.
@@ -414,6 +418,7 @@
   const InstrDesc &getDesc() const { return Desc; }
 
   unsigned getLatency() const { return Desc.MaxLatency; }
+  unsigned getNumMicroOps() const { return Desc.NumMicroOps; }
 
   bool hasDependentUsers() const {
     return any_of(Defs,
@@ -463,6 +468,12 @@
   // operation.
   unsigned LSUTokenID;
 
+  // A resource mask which identifies buffered resources consumed by this
+  // instruction at dispatch stage. In the absence of macro-fusion, this value
+  // should always match the value of field `UsedBuffers` from the instruction
+  // descriptor (see field InstrBase::Desc).
+  uint64_t UsedBuffers;
+
   // Critical register dependency.
   CriticalDependency CriticalRegDep;
 
@@ -480,12 +491,18 @@
 public:
   Instruction(const InstrDesc &D)
       : InstructionBase(D), Stage(IS_INVALID), CyclesLeft(UNKNOWN_CYCLES),
-        RCUTokenID(0), LSUTokenID(0), CriticalRegDep(), CriticalMemDep(),
-        CriticalResourceMask(0), IsEliminated(false) {}
+        RCUTokenID(0), LSUTokenID(0), UsedBuffers(D.UsedBuffers),
+        CriticalRegDep(), CriticalMemDep(), CriticalResourceMask(0),
+        IsEliminated(false) {}
 
   unsigned getRCUTokenID() const { return RCUTokenID; }
   unsigned getLSUTokenID() const { return LSUTokenID; }
   void setLSUTokenID(unsigned LSUTok) { LSUTokenID = LSUTok; }
+
+  uint64_t getUsedBuffers() const { return UsedBuffers; }
+  void setUsedBuffers(uint64_t Mask) { UsedBuffers = Mask; }
+  void clearUsedBuffers() { UsedBuffers = 0ULL; }
+
   int getCyclesLeft() const { return CyclesLeft; }
 
   // Transition to the dispatch stage, and assign a RCUToken to this
diff --git a/linux-x64/clang/include/llvm/MCA/Pipeline.h b/linux-x64/clang/include/llvm/MCA/Pipeline.h
index 935033f..0ac988c 100644
--- a/linux-x64/clang/include/llvm/MCA/Pipeline.h
+++ b/linux-x64/clang/include/llvm/MCA/Pipeline.h
@@ -15,8 +15,6 @@
 #ifndef LLVM_MCA_PIPELINE_H
 #define LLVM_MCA_PIPELINE_H
 
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/MCA/HardwareUnits/Scheduler.h"
 #include "llvm/MCA/Stages/Stage.h"
 #include "llvm/Support/Error.h"
 
diff --git a/linux-x64/clang/include/llvm/MCA/SourceMgr.h b/linux-x64/clang/include/llvm/MCA/SourceMgr.h
index dbe31db..e844171 100644
--- a/linux-x64/clang/include/llvm/MCA/SourceMgr.h
+++ b/linux-x64/clang/include/llvm/MCA/SourceMgr.h
@@ -16,12 +16,13 @@
 #define LLVM_MCA_SOURCEMGR_H
 
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/MCA/Instruction.h"
 
 namespace llvm {
 namespace mca {
 
-class Instruction;
-
+// MSVC >= 19.15, < 19.20 need to see the definition of class Instruction to
+// prevent compiler error C2139 about intrinsic type trait '__is_assignable'.
 typedef std::pair<unsigned, const Instruction &> SourceRef;
 
 class SourceMgr {
diff --git a/linux-x64/clang/include/llvm/MCA/Stages/DispatchStage.h b/linux-x64/clang/include/llvm/MCA/Stages/DispatchStage.h
index d80eded..597f731 100644
--- a/linux-x64/clang/include/llvm/MCA/Stages/DispatchStage.h
+++ b/linux-x64/clang/include/llvm/MCA/Stages/DispatchStage.h
@@ -20,7 +20,6 @@
 
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/MCA/HWEventListener.h"
 #include "llvm/MCA/HardwareUnits/RegisterFile.h"
 #include "llvm/MCA/HardwareUnits/RetireControlUnit.h"
 #include "llvm/MCA/Instruction.h"
diff --git a/linux-x64/clang/include/llvm/MCA/Stages/RetireStage.h b/linux-x64/clang/include/llvm/MCA/Stages/RetireStage.h
index 08c216a..f471368 100644
--- a/linux-x64/clang/include/llvm/MCA/Stages/RetireStage.h
+++ b/linux-x64/clang/include/llvm/MCA/Stages/RetireStage.h
@@ -16,6 +16,7 @@
 #ifndef LLVM_MCA_RETIRE_STAGE_H
 #define LLVM_MCA_RETIRE_STAGE_H
 
+#include "llvm/MCA/HardwareUnits/LSUnit.h"
 #include "llvm/MCA/HardwareUnits/RegisterFile.h"
 #include "llvm/MCA/HardwareUnits/RetireControlUnit.h"
 #include "llvm/MCA/Stages/Stage.h"
@@ -27,13 +28,14 @@
   // Owner will go away when we move listeners/eventing to the stages.
   RetireControlUnit &RCU;
   RegisterFile &PRF;
+  LSUnitBase &LSU;
 
   RetireStage(const RetireStage &Other) = delete;
   RetireStage &operator=(const RetireStage &Other) = delete;
 
 public:
-  RetireStage(RetireControlUnit &R, RegisterFile &F)
-      : Stage(), RCU(R), PRF(F) {}
+  RetireStage(RetireControlUnit &R, RegisterFile &F, LSUnitBase &LS)
+      : Stage(), RCU(R), PRF(F), LSU(LS) {}
 
   bool hasWorkToComplete() const override { return !RCU.isEmpty(); }
   Error cycleStart() override;