From 215e6f962d8d6cdec210aea3231474c20d4ac64f Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Fri, 6 Mar 2020 23:35:25 -0800 Subject: [PATCH 01/40] Added notes on how to possibly start. --- include/opt-sched/Scheduler/data_dep.h | 1 + .../opt-sched/Scheduler/sched_basic_data.h | 21 +++++++++++++++ lib/Scheduler/bb_spill.cpp | 8 ++++++ lib/Scheduler/data_dep.cpp | 6 +++-- lib/Scheduler/enumerator.cpp | 12 +++++++++ lib/Scheduler/sched_basic_data.cpp | 4 +++ lib/Wrapper/OptSchedDDGWrapperBasic.cpp | 26 +++++++++++++++++++ lib/Wrapper/OptimizingScheduler.cpp | 1 + 8 files changed, 77 insertions(+), 2 deletions(-) diff --git a/include/opt-sched/Scheduler/data_dep.h b/include/opt-sched/Scheduler/data_dep.h index 48dda038..d0885fd0 100644 --- a/include/opt-sched/Scheduler/data_dep.h +++ b/include/opt-sched/Scheduler/data_dep.h @@ -386,6 +386,7 @@ class DataDepGraph : public llvm::opt_sched::OptSchedDDGWrapperBase, SchedInstruction *CreateNode_(InstCount instNum, char const *const instName, InstType instType, char const *const opCode, + /* bool InstrMayLoad, bool InstrMayStore,*/ int nodeID, InstCount fileSchedOrder, InstCount fileSchedCycle, InstCount fileLB, InstCount fileUB, int blkNum); diff --git a/include/opt-sched/Scheduler/sched_basic_data.h b/include/opt-sched/Scheduler/sched_basic_data.h index 52306e82..6e3ed08b 100644 --- a/include/opt-sched/Scheduler/sched_basic_data.h +++ b/include/opt-sched/Scheduler/sched_basic_data.h @@ -425,6 +425,18 @@ class SchedInstruction : public GraphNode { InstType GetCrtclPathFrmRoot() { return crtclPathFrmRoot_; } + /// Return true if this instruction could possibly read memory + /// Copied from https://llvm.org/doxygen/classllvm_1_1MachineInstr.html + // bool mayLoad() { return MayLoad; } + + /// Return true if this instruction could possibly modify memory. 
+ /// Copied from https://llvm.org/doxygen/classllvm_1_1MachineInstr.html + // bool mayStore() { return MayStore; } + + /// Set MayCluster to true if clustering memory operations was found + /// to be possible. + // void setMayCluster () { MayCluster = true; } + friend class SchedRange; protected: @@ -432,6 +444,15 @@ class SchedInstruction : public GraphNode { string name_; // The mnemonic of this instruction, e.g. "add" or "jmp". string opCode_; + /// Indicate if this instruction may be a load operation + // bool MayLoad; + /// Indicate if this instruction may be a store operation + // bool MayStore; + /// Data structure to store a possible clustering with other isntructions. + /// This data structure should have a fast lookup operation. + // dataStructure PossibleClustures; + /// This value should be set to true if clustering may be possible. + // bool MayCluster; // A numberical ID for this instruction. int nodeID_; // The type of this instruction. diff --git a/lib/Scheduler/bb_spill.cpp b/lib/Scheduler/bb_spill.cpp index 736a38ad..815dc277 100644 --- a/lib/Scheduler/bb_spill.cpp +++ b/lib/Scheduler/bb_spill.cpp @@ -431,6 +431,14 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, int liveRegs; InstCount newSpillCost; + // Possibly keep track of the current memory clustering size here + // and in UpdateSpillInfoForUnSchdul_() + // if inst->mayCluster() then + // if current instruction is already part of a cluster then + // increment cluster size by 1 + // else if not in a cluster then + // start clustering by initializing cluster values + defCnt = inst->GetDefs(defs); useCnt = inst->GetUses(uses); diff --git a/lib/Scheduler/data_dep.cpp b/lib/Scheduler/data_dep.cpp index 5974f496..65d2f0b8 100644 --- a/lib/Scheduler/data_dep.cpp +++ b/lib/Scheduler/data_dep.cpp @@ -824,8 +824,10 @@ FUNC_RESULT DataDepGraph::SkipGraph(SpecsBuffer *buf, bool &endOfFileReached) { SchedInstruction *DataDepGraph::CreateNode_( InstCount instNum, char const *const 
instName, InstType instType, - char const *const opCode, int nodeID, InstCount fileSchedOrder, - InstCount fileSchedCycle, InstCount fileLB, InstCount fileUB, int blkNum) { + char const *const opCode, + /* bool InstrMayLoad, bool InstrMayStore,*/ int nodeID, + InstCount fileSchedOrder, InstCount fileSchedCycle, InstCount fileLB, + InstCount fileUB, int blkNum) { SchedInstruction *newInstPtr; newInstPtr = new SchedInstruction(instNum, instName, instType, opCode, diff --git a/lib/Scheduler/enumerator.cpp b/lib/Scheduler/enumerator.cpp index d9c4e3b1..e94f2170 100644 --- a/lib/Scheduler/enumerator.cpp +++ b/lib/Scheduler/enumerator.cpp @@ -982,6 +982,18 @@ bool Enumerator::FindNxtFsblBrnch_(EnumTreeNode *&newNode) { if (crntBrnchNum == 0 && SchedForRPOnly_) crntNode_->SetFoundInstWithUse(IsUseInRdyLst_()); + // Note: This is just a thought, we might not need this here. + // Check if clustering is possible. + // We want to only do memory clustering in the second pass for now. + // if (crntBrnchNum == 0 && EnableMemClustering && SecondPass) + // // TODO: Implement these functions/attributes + // // and implement cost. Also keep track of current + // // cluster size since we do not want to exceed 15 + // // memory operations in a cluster (This and the cost + // // is probably done somewhere else and not here). 
+ // ClusteringPossible = crntNode_->CheckForClustering(); + // crntNode_->SetClusteringPossible(ClusteringPossible); + for (i = crntBrnchNum; i < brnchCnt && crntNode_->IsFeasible(); i++) { #ifdef IS_DEBUG_FLOW Logger::Info("Probing branch %d out of %d", i, brnchCnt); diff --git a/lib/Scheduler/sched_basic_data.cpp b/lib/Scheduler/sched_basic_data.cpp index ef552365..bdec48cb 100644 --- a/lib/Scheduler/sched_basic_data.cpp +++ b/lib/Scheduler/sched_basic_data.cpp @@ -6,6 +6,7 @@ using namespace llvm::opt_sched; SchedInstruction::SchedInstruction(InstCount num, const string &name, InstType instType, const string &opCode, + /* bool InstrMayLoad, bool InstrMayStore,*/ InstCount maxInstCnt, int nodeID, InstCount fileSchedOrder, InstCount fileSchedCycle, InstCount fileLB, @@ -15,6 +16,9 @@ SchedInstruction::SchedInstruction(InstCount num, const string &name, name_ = name; opCode_ = opCode; instType_ = instType; + // MayLoad = InstrMayLoad; + // MayStore = InstrMayStore; + // MayCluster = false; frwrdLwrBound_ = INVALID_VALUE; bkwrdLwrBound_ = INVALID_VALUE; diff --git a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp index ba6985cf..b7d3444f 100644 --- a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp +++ b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp @@ -376,6 +376,8 @@ inline void OptSchedDDGWrapperBasic::setupRoot() { int RootNum = DAG->SUnits.size(); root_ = CreateNode_(RootNum, "artificial", MM->GetInstTypeByName("artificial"), "__optsched_entry", + // mayLoad = false; + // mayStore = false; RootNum, // nodeID RootNum, // fileSchedOrder RootNum, // fileSchedCycle @@ -394,6 +396,8 @@ inline void OptSchedDDGWrapperBasic::setupLeaf() { int LeafNum = DAG->SUnits.size() + 1; CreateNode_(LeafNum, "artificial", MM->GetInstTypeByName("artificial"), "__optsched_exit", + // mayLoad = false; + // mayStore = false; LeafNum, // nodeID LeafNum, // fileSchedOrder LeafNum, // fileSchedCycle @@ -467,6 +471,8 @@ void OptSchedDDGWrapperBasic::convertSUnit(const 
SUnit &SU) { } CreateNode_(SU.NodeNum, InstName.c_str(), InstType, InstName.c_str(), + // MI->mayLoad() + // MI->mayStore() SU.NodeNum, // nodeID SU.NodeNum, // fileSchedOrder SU.NodeNum, // fileSchedCycle @@ -500,6 +506,26 @@ void OptSchedDDGWrapperBasic::countBoundaryLiveness( } } + +/// Iterate through SUnits and find all possible clustering then transfer +/// the information over to the SchedInstruction class as a bitvector. +/// Partially copied from https://github.com/llvm/llvm-project/blob/master/llvm/lib/CodeGen/MachineScheduler.cpp#L1615 +// void findPossibleClusters() { +// Copy how LLVM handles clustering except instead of actually +// modifying the DAG, we can possibly set MayCluster to true. +// Then add the nodes that can be clustered together into a +// data structure. + +// for (auto &SU : DAG->SUnits) { +// if ((IsLoad && !SU.getInstr()->mayLoad()) || +// (!IsLoad && !SU.getInstr()->mayStore())) +// continue; +// ... +// ... +// } +// ... +// } + LLVMRegTypeFilter::LLVMRegTypeFilter( const MachineModel *MM, const llvm::TargetRegisterInfo *TRI, const std::vector &RegionPressure, float RegFilterFactor) diff --git a/lib/Wrapper/OptimizingScheduler.cpp b/lib/Wrapper/OptimizingScheduler.cpp index 5d0416c5..be70dfa2 100644 --- a/lib/Wrapper/OptimizingScheduler.cpp +++ b/lib/Wrapper/OptimizingScheduler.cpp @@ -380,6 +380,7 @@ void ScheduleDAGOptSched::schedule() { OST->createDDGWrapper(C, this, MM.get(), LatencyPrecision, RegionName); DDG->convertSUnits(); DDG->convertRegFiles(); + // DDG->findPossibleClusters(); auto *BDDG = static_cast(DDG.get()); addGraphTransformations(BDDG); From 449456b09f718c04107fd21b65d526646c24ff07 Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Mon, 9 Mar 2020 18:21:50 -0700 Subject: [PATCH 02/40] Idea on how to implement checking if an instruction is part of a cluster and potential issues. 
--- lib/Scheduler/bb_spill.cpp | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/lib/Scheduler/bb_spill.cpp b/lib/Scheduler/bb_spill.cpp index 815dc277..527ac0cd 100644 --- a/lib/Scheduler/bb_spill.cpp +++ b/lib/Scheduler/bb_spill.cpp @@ -434,10 +434,29 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, // Possibly keep track of the current memory clustering size here // and in UpdateSpillInfoForUnSchdul_() // if inst->mayCluster() then - // if current instruction is already part of a cluster then + + // // Can use bit operations to check if it is part of an active clustering + // // Possible implementation: if (curClusterBitVector[inst->GetNum]) + // if curInst is part of an active cluster then // increment cluster size by 1 // else if not in a cluster then // start clustering by initializing cluster values + // // Possibly use bit operations to activate part of cluster + // // Ex: + // // Instr 0, 3, 4 can be clustered and there are 5 total instructions + // // curClusterBitVector Bitvector: 11001 + // + // Potential Issues: + // 1. How to implement this when un-scheduling? Need to keep track if new instruction disable a cluster + // so that when we backtrack, we can re-activate the cluster. + // 2. Keeping track of the average clustering size when we aren't done scheduling. + // Cost function that was discussed during the meeting on Friday: + // (15 - averageClusteringSize) * ClusteringWeight + // We want to minimize this cost but there is an issue in the following example + // Ex: Partial schedule was able to cluster a block of 15. averageClusteringSize : 15, CostFnc: (15-15)*Weight = 0 + // Any cluster block below size 15 will decrease the average cluster size and increase the cost. + // This makes our B&B enumerator actually favor not doing clustering. 
+ defCnt = inst->GetDefs(defs); useCnt = inst->GetUses(uses); From a6376ab44178f8f04456a9f37c36007ca5d33843 Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Tue, 10 Mar 2020 19:33:36 -0700 Subject: [PATCH 03/40] Added LLVM's method to check if we should cluster MemOps --- .../Scheduler/OptSchedDDGWrapperBase.h | 2 + lib/Wrapper/AMDGPU/OptSchedDDGWrapperGCN.cpp | 2 +- lib/Wrapper/OptSchedDDGWrapperBasic.cpp | 100 +++++++++++++++--- lib/Wrapper/OptSchedDDGWrapperBasic.h | 46 ++++++++ lib/Wrapper/OptimizingScheduler.cpp | 3 +- 5 files changed, 139 insertions(+), 14 deletions(-) diff --git a/include/opt-sched/Scheduler/OptSchedDDGWrapperBase.h b/include/opt-sched/Scheduler/OptSchedDDGWrapperBase.h index 8eb1499d..4db4673c 100644 --- a/include/opt-sched/Scheduler/OptSchedDDGWrapperBase.h +++ b/include/opt-sched/Scheduler/OptSchedDDGWrapperBase.h @@ -17,6 +17,8 @@ class OptSchedDDGWrapperBase { virtual void convertSUnits() = 0; virtual void convertRegFiles() = 0; + + virtual void findPossibleClusters() = 0; }; } // namespace opt_sched diff --git a/lib/Wrapper/AMDGPU/OptSchedDDGWrapperGCN.cpp b/lib/Wrapper/AMDGPU/OptSchedDDGWrapperGCN.cpp index 57aa0713..0aaf5bc4 100644 --- a/lib/Wrapper/AMDGPU/OptSchedDDGWrapperGCN.cpp +++ b/lib/Wrapper/AMDGPU/OptSchedDDGWrapperGCN.cpp @@ -182,7 +182,7 @@ void OptSchedDDGWrapperGCN::convertRegFiles() { } LLVM_DEBUG(DAG->dumpLLVMRegisters()); - LLVM_DEBUG(dumpOptSchedRegisters()); + //LLVM_DEBUG(dumpOptSchedRegisters()); } void OptSchedDDGWrapperGCN::addSubRegDefs(SchedInstruction *Instr, unsigned Reg, diff --git a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp index b7d3444f..a26b254a 100644 --- a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp +++ b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp @@ -9,6 +9,8 @@ #include "opt-sched/Scheduler/logger.h" #include "opt-sched/Scheduler/register.h" #include "opt-sched/Scheduler/sched_basic_data.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" #include 
"llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" @@ -20,12 +22,14 @@ #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Function.h" #include "llvm/Target/TargetMachine.h" +#include #include #include #include #include #include #include +#include #include #define DEBUG_TYPE "optsched-ddg-wrapper" @@ -205,7 +209,7 @@ void OptSchedDDGWrapperBasic::addDefsAndUses() { } LLVM_DEBUG(DAG->dumpLLVMRegisters()); - LLVM_DEBUG(dumpOptSchedRegisters()); + //LLVM_DEBUG(dumpOptSchedRegisters()); } void OptSchedDDGWrapperBasic::addUse(unsigned RegUnit, InstCount Index) { @@ -506,25 +510,97 @@ void OptSchedDDGWrapperBasic::countBoundaryLiveness( } } +// Partially copied from +// https://github.com/RadeonOpenCompute/llvm/blob/roc-ocl-2.4.0/lib/CodeGen/MachineScheduler.cpp#L1554 +void OptSchedDDGWrapperBasic::clusterNeighboringMemOps_( + ArrayRef MemOps) { + SmallVector MemOpRecords; + dbgs() << "Processing possible clusters\n"; + for (const SUnit *SU : MemOps) { + dbgs() << " " << SU->NodeNum << " is in the chain.\n"; + MachineOperand *BaseOp; + int64_t Offset; + if (DAG->TII->getMemOperandWithOffset(*SU->getInstr(), BaseOp, Offset, DAG->TRI)) + MemOpRecords.push_back(MemOpInfo(SU, BaseOp, Offset)); + } + + if (MemOpRecords.size() < 2) { + dbgs() << " Unable to cluster memop cluster of 1.\n"; + return; + } + + llvm::sort(MemOpRecords); + unsigned ClusterLength = 1; + for (unsigned Idx = 0, End = MemOpRecords.size(); Idx < (End - 1); ++Idx) { + const SUnit *SUa = MemOpRecords[Idx].SU; + const SUnit *SUb = MemOpRecords[Idx + 1].SU; + dbgs() << " Checking possible clustering of (" << SUa->NodeNum << ") and (" << SUb->NodeNum << ")\n"; + if (DAG->TII->shouldClusterMemOps(*MemOpRecords[Idx].BaseOp, + *MemOpRecords[Idx + 1].BaseOp, + ClusterLength)) { + dbgs() << " Cluster possible at SU(" << SUa->NodeNum << ")- SU(" << SUb->NodeNum << ")\n"; + ++ClusterLength; + } else + ClusterLength = 1; + } +} /// Iterate through 
SUnits and find all possible clustering then transfer /// the information over to the SchedInstruction class as a bitvector. -/// Partially copied from https://github.com/llvm/llvm-project/blob/master/llvm/lib/CodeGen/MachineScheduler.cpp#L1615 -// void findPossibleClusters() { +/// Partially copied from https://github.com/RadeonOpenCompute/llvm/blob/roc-ocl-2.4.0/lib/CodeGen/MachineScheduler.cpp#L1595 +void OptSchedDDGWrapperBasic::findPossibleClusters() { // Copy how LLVM handles clustering except instead of actually // modifying the DAG, we can possibly set MayCluster to true. // Then add the nodes that can be clustered together into a // data structure. -// for (auto &SU : DAG->SUnits) { -// if ((IsLoad && !SU.getInstr()->mayLoad()) || -// (!IsLoad && !SU.getInstr()->mayStore())) -// continue; -// ... -// ... -// } -// ... -// } + // Experiment with clustering loads first + bool IsLoad = true; + + dbgs() << "Looking for load clusters\n"; + DenseMap StoreChainIDs; + // Map each store chain to a set of dependent MemOps. + SmallVector, 32> StoreChainDependents; + for (const SUnit &SU : DAG->SUnits) { + if ((IsLoad && !SU.getInstr()->mayLoad()) || + (!IsLoad && !SU.getInstr()->mayStore())) + continue; + auto MI = SU.getInstr(); + dbgs() << " Instruction (" << SU.NodeNum << ") " << DAG->TII->getName(MI->getOpcode()) << " may load.\n"; + + unsigned ChainPredID = DAG->SUnits.size(); + for (const SDep &Pred : SU.Preds) { + if (Pred.isCtrl()) { + auto PredMI = Pred.getSUnit()->getInstr(); + dbgs() << " Breaking chain at (" << Pred.getSUnit()->NodeNum << ") " << DAG->TII->getName(PredMI->getOpcode()) << '\n'; + ChainPredID = Pred.getSUnit()->NodeNum; + break; + } + } + // Check if this chain-like pred has been seen + // before. ChainPredID==MaxNodeID at the top of the schedule. 
+ unsigned NumChains = StoreChainDependents.size(); + dbgs() << " ChainPredID " << ChainPredID << ", NumChains " << NumChains << '\n'; + std::pair::iterator, bool> Result = + StoreChainIDs.insert(std::make_pair(ChainPredID, NumChains)); + if (Result.second) + StoreChainDependents.resize(NumChains + 1); + dbgs() << " Pushing (" << SU.NodeNum << ") on the chain.\n"; + StoreChainDependents[Result.first->second].push_back(&SU); + dbgs() << " inPrinting size of SCD: " << StoreChainDependents.size() << '\n'; + } + + + dbgs() << " outPrinting size of SCD: " << StoreChainDependents.size() << '\n'; + // Iterate over the store chains. + for (auto &SCD : StoreChainDependents) { + dbgs() << " Printing the list before clustering: "; + for (auto SU1 : SCD) + dbgs() << SU1->NodeNum << " "; + dbgs() << '\n'; + clusterNeighboringMemOps_(SCD); + } +} LLVMRegTypeFilter::LLVMRegTypeFilter( const MachineModel *MM, const llvm::TargetRegisterInfo *TRI, diff --git a/lib/Wrapper/OptSchedDDGWrapperBasic.h b/lib/Wrapper/OptSchedDDGWrapperBasic.h index 88631511..9970fab9 100644 --- a/lib/Wrapper/OptSchedDDGWrapperBasic.h +++ b/lib/Wrapper/OptSchedDDGWrapperBasic.h @@ -14,6 +14,7 @@ #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include #include #include @@ -49,6 +50,7 @@ class OptSchedDDGWrapperBasic : public DataDepGraph { void convertSUnits() override; void convertRegFiles() override; + void findPossibleClusters() override; protected: // A convenience machMdl_ pointer casted to OptSchedMachineModel*. @@ -133,6 +135,9 @@ class OptSchedDDGWrapperBasic : public DataDepGraph { // Find liveness info generated by the region boundary. void discoverBoundaryLiveness(const llvm::MachineInstr *MI); + void clusterNeighboringMemOps_( + ArrayRef MemOps); + // Holds a register live range, mapping a producer to a set of consumers. 
struct LiveRange { // The node which defines the register tracked by this live range. @@ -140,6 +145,47 @@ class OptSchedDDGWrapperBasic : public DataDepGraph { // The nodes which use the register tracked by this live range. std::vector consumers; }; + +// Copied from +// https://github.com/RadeonOpenCompute/llvm/blob/roc-ocl-2.4.0/lib/CodeGen/MachineScheduler.cpp#L1467 + struct MemOpInfo { + const SUnit *SU; + MachineOperand *BaseOp; + int64_t Offset; + + MemOpInfo(const SUnit *su, MachineOperand *Op, int64_t ofs) + : SU(su), BaseOp(Op), Offset(ofs) {} + + bool operator<(const MemOpInfo &RHS) const { + if (BaseOp->getType() != RHS.BaseOp->getType()) + return BaseOp->getType() < RHS.BaseOp->getType(); + + if (BaseOp->isReg()) + return std::make_tuple(BaseOp->getReg(), Offset, SU->NodeNum) < + std::make_tuple(RHS.BaseOp->getReg(), RHS.Offset, + RHS.SU->NodeNum); + if (BaseOp->isFI()) { + const MachineFunction &MF = + *BaseOp->getParent()->getParent()->getParent(); + const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering(); + bool StackGrowsDown = TFI.getStackGrowthDirection() == + TargetFrameLowering::StackGrowsDown; + // Can't use tuple comparison here since we might need to use a + // different order when the stack grows down. + if (BaseOp->getIndex() != RHS.BaseOp->getIndex()) + return StackGrowsDown ? BaseOp->getIndex() > RHS.BaseOp->getIndex() + : BaseOp->getIndex() < RHS.BaseOp->getIndex(); + + if (Offset != RHS.Offset) + return StackGrowsDown ? Offset > RHS.Offset : Offset < RHS.Offset; + + return SU->NodeNum < RHS.SU->NodeNum; + } + + llvm_unreachable("MemOpClusterMutation only supports register or frame " + "index bases."); + } + }; }; // Exclude certain registers from being visible to the scheduler. 
Use LLVM's diff --git a/lib/Wrapper/OptimizingScheduler.cpp b/lib/Wrapper/OptimizingScheduler.cpp index be70dfa2..c9877c62 100644 --- a/lib/Wrapper/OptimizingScheduler.cpp +++ b/lib/Wrapper/OptimizingScheduler.cpp @@ -380,7 +380,8 @@ void ScheduleDAGOptSched::schedule() { OST->createDDGWrapper(C, this, MM.get(), LatencyPrecision, RegionName); DDG->convertSUnits(); DDG->convertRegFiles(); - // DDG->findPossibleClusters(); + dbgs() << "Printing possible clusters\n"; + DDG->findPossibleClusters(); auto *BDDG = static_cast(DDG.get()); addGraphTransformations(BDDG); From 186a1f3c3b041da856bbaba4ad067f9e4a920f3d Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Thu, 12 Mar 2020 19:13:13 -0700 Subject: [PATCH 04/40] Idea for implementation (WIP) --- include/opt-sched/Scheduler/bb_spill.h | 33 +++++ .../opt-sched/Scheduler/sched_basic_data.h | 10 +- lib/Scheduler/bb_spill.cpp | 123 ++++++++++++++---- lib/Scheduler/sched_basic_data.cpp | 11 +- lib/Wrapper/OptSchedDDGWrapperBasic.cpp | 19 ++- 5 files changed, 167 insertions(+), 29 deletions(-) diff --git a/include/opt-sched/Scheduler/bb_spill.h b/include/opt-sched/Scheduler/bb_spill.h index 27e3cbed..d857189e 100644 --- a/include/opt-sched/Scheduler/bb_spill.h +++ b/include/opt-sched/Scheduler/bb_spill.h @@ -9,11 +9,13 @@ Last Update: Apr. 
2011 #ifndef OPTSCHED_SPILL_BB_SPILL_H #define OPTSCHED_SPILL_BB_SPILL_H +#include "opt-sched/Scheduler/bit_vector.h" #include "opt-sched/Scheduler/OptSchedTarget.h" #include "opt-sched/Scheduler/defines.h" #include "opt-sched/Scheduler/sched_region.h" #include "llvm/ADT/SmallVector.h" #include +#include #include #include @@ -33,6 +35,37 @@ class BBWithSpill : public SchedRegion { InstCount crntSpillCost_; InstCount optmlSpillCost_; + /// May not need this variable + bool CurrentlyClustering; + + /// Current cluster size + unsigned int CurrentClusterSize; + + /// Bitvector containing active bits for instructions that can be clustered + /// together + std::shared_ptr CurrentClusterVector; + + /// Experimental variables and values for cost adjustment + int ClusteringWeight; + int ClusterInitialCost; + + // Data struct to contain information about the previous clusters + struct PastClusters { + std::shared_ptr ClusterVector; + int ClusterSize; + int InstNum; // Instruction number that ended this cluster + + // Constructor + PastClusters(std::shared_ptr Cluster, int size, int num) + : ClusterVector(Cluster), ClusterSize(size), InstNum(num) {} + }; + + /// Vector containing the (n-1) past clusters + llvm::SmallVector> PastClustersList; + + /// Pointer to the latest past cluster + std::unique_ptr LastCluster; + // The target machine const OptSchedTarget *OST; diff --git a/include/opt-sched/Scheduler/sched_basic_data.h b/include/opt-sched/Scheduler/sched_basic_data.h index 6e3ed08b..01ff8882 100644 --- a/include/opt-sched/Scheduler/sched_basic_data.h +++ b/include/opt-sched/Scheduler/sched_basic_data.h @@ -16,6 +16,7 @@ Last Update: Sept. 2013 #include "opt-sched/Scheduler/hash_table.h" #include "opt-sched/Scheduler/machine_model.h" #include +#include namespace llvm { namespace opt_sched { @@ -435,8 +436,9 @@ class SchedInstruction : public GraphNode { /// Set MayCluster to true if clustering memory operations was found /// to be possible. 
- // void setMayCluster () { MayCluster = true; } - + void SetMayCluster(std::shared_ptr PossibleClustersVector); + bool GetMayCluster() { return MayCluster; } + auto GetClusterVector(); friend class SchedRange; protected: @@ -450,9 +452,9 @@ class SchedInstruction : public GraphNode { // bool MayStore; /// Data structure to store a possible clustering with other isntructions. /// This data structure should have a fast lookup operation. - // dataStructure PossibleClustures; + std::shared_ptr PossibleClusturesBitVector; /// This value should be set to true if clustering may be possible. - // bool MayCluster; + bool MayCluster; // A numberical ID for this instruction. int nodeID_; // The type of this instruction. diff --git a/lib/Scheduler/bb_spill.cpp b/lib/Scheduler/bb_spill.cpp index 527ac0cd..8986ed4b 100644 --- a/lib/Scheduler/bb_spill.cpp +++ b/lib/Scheduler/bb_spill.cpp @@ -69,6 +69,13 @@ BBWithSpill::BBWithSpill(const OptSchedTarget *OST_, DataDepGraph *dataDepGraph, schduldEntryInstCnt_ = 0; schduldExitInstCnt_ = 0; schduldInstCnt_ = 0; + + CurrentClusterSize = 0; + CurrentClusterVector = nullptr; + ClusteringWeight = 10000; + ClusterInitialCost = 10000000; + PastClustersList.clear(); + LastCluster = nullptr; } /****************************************************************************/ @@ -431,33 +438,63 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, int liveRegs; InstCount newSpillCost; + // Scheduling cases for clustering project: + // 1.) Cluster -> Cluster + // Simple case, just increment 1 from cluster size + // 2.) Cluster -> Non-Cluster + // ?? End clustering + // 3.) 
Non-Cluster -> Cluster + // Simple case, initialize clustering + // Possibly keep track of the current memory clustering size here // and in UpdateSpillInfoForUnSchdul_() - // if inst->mayCluster() then - - // // Can use bit operations to check if it is part of an active clustering - // // Possible implementation: if (curClusterBitVector[inst->GetNum]) - // if curInst is part of an active cluster then - // increment cluster size by 1 - // else if not in a cluster then - // start clustering by initializing cluster values - // // Possibly use bit operations to activate part of cluster - // // Ex: - // // Instr 0, 3, 4 can be clustered and there are 5 total instructions - // // curClusterBitVector Bitvector: 11001 - // - // Potential Issues: - // 1. How to implement this when un-scheduling? Need to keep track if new instruction disable a cluster - // so that when we backtrack, we can re-activate the cluster. - // 2. Keeping track of the average clustering size when we aren't done scheduling. + if (isSecondPass) { + if (inst->GetMayCluster()) { + if (CurrentClusterSize > 0 && CurrentClusterVector->GetBit(inst->GetNum()) { + // Case 1: Currently clustering and this current instruction is part of + // the cluster + CurrentClusterSize++; + if (CurrentClusterSize > 1) + // Only decrement the cost if we cluster at least 2 operations + // together (EXPERIMENTAL FOR NOW) + ClusterInitialCost -= ClusteringWeight; + } else { + // Case 3: Not currently clustering. Initialize clustering + // Sidenote: What if we go from current cluster to a different cluster? + CurrentClusterVector.reset(); // Clear cluster vector + CurrentClusterVector = inst->GetClusterVector(); // Set active cluster + CurrentClusterSize = 1; // Current size is 1 + } + } else if (CurrentClusterSize > 1) { + // Case 2: Exiting out of an active cluster + // Save the cluster to restore when backtracking. 
+ if (LastCluster) { + // List of previous clusters + PastClustersList.push_back(std::move(LastCluster)); + + // Current previous cluster + LastCluster = llvm::make_unique( + CurrentClusterVector, CurrentClusterSize, inst->GetNum()); + } else + LastCluster = llvm::make_unique( + CurrentClusterVector, CurrentClusterSize, inst->GetNum()); + CurrentClusterVector.reset(); // Reset active cluster + CurrentClusterSize = 0; // Set cluster size to 0 + } + } + // Potential Issues: + // 1. Keeping track of the average clustering size when we aren't done + // scheduling. // Cost function that was discussed during the meeting on Friday: // (15 - averageClusteringSize) * ClusteringWeight - // We want to minimize this cost but there is an issue in the following example - // Ex: Partial schedule was able to cluster a block of 15. averageClusteringSize : 15, CostFnc: (15-15)*Weight = 0 - // Any cluster block below size 15 will decrease the average cluster size and increase the cost. - // This makes our B&B enumerator actually favor not doing clustering. + // We want to minimize this cost but there is an issue in the following + // example + // Ex: Partial schedule was able to cluster a block of 15. + // averageClusteringSize : 15, CostFnc: (15-15)*Weight = 0 + // Any cluster block below size 15 will decrease the average + // cluster size and increase the cost. This makes our B&B + // enumerator actually favor not doing clustering. - defCnt = inst->GetDefs(defs); useCnt = inst->GetUses(uses); @@ -649,6 +686,48 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst) { inst->GetNum()); #endif + // Backtracking cases for clustering project: + // 1.) Cluster <- Cluster + // Simple case, just decrement 1 from cluster size + // 2.) Cluster <- Non-Cluster + // Have to restore state of Cluster and ?? + // Can/should we use a stack to restore state? + // 3.) 
Non-Cluster <- Cluster + // Simple case, just decrement 1 from cluster size + // If cluster size == 0, delete CurrentClusterVector + if (isSecondPass) { + if (inst->GetMayCluster()) { + // Case 1 + if (CurrentClusterSize > 0 && CurrentClusterVector->GetBit(inst->GetNum()) { + // Currently clustering and this current instruction is part of the + // cluter + if (CurrentClusterSize > 1) + ClusterInitialCost += ClusteringWeight; // Re-add the cost + CurrentClusterSize--; + } else { + // Case 3 + CurrentClusterSize--; + if (CurrentClusterSize == 0) + CurrentClusterVector.reset(); + } + } else if (LastCluster) { + if (LastCluster->InstNum == inst->GetNum()) { + // Case 2: If there was a previous cluster and + // this instruction ended the cluster then restore the previous + // cluster's state + CurrentClusterSize = LastCluster->ClusterSize; + CurrentClusterVector = LastCluster->ClusterVector; + LastCluster.reset(); // Release current cluster pointer + + // Get previous cluster from vector list + if (!PastClustersList.empty()) { + LastCluster = std::move(PastClustersList.back()); + PastClustersList.pop_back(); + } + } + } + + defCnt = inst->GetDefs(defs); useCnt = inst->GetUses(uses); diff --git a/lib/Scheduler/sched_basic_data.cpp b/lib/Scheduler/sched_basic_data.cpp index bdec48cb..2ff63c33 100644 --- a/lib/Scheduler/sched_basic_data.cpp +++ b/lib/Scheduler/sched_basic_data.cpp @@ -18,7 +18,7 @@ SchedInstruction::SchedInstruction(InstCount num, const string &name, instType_ = instType; // MayLoad = InstrMayLoad; // MayStore = InstrMayStore; - // MayCluster = false; + MayCluster = false; frwrdLwrBound_ = INVALID_VALUE; bkwrdLwrBound_ = INVALID_VALUE; @@ -742,6 +742,15 @@ int16_t SchedInstruction::CmputLastUseCnt() { return lastUseCnt_; } +void SchedInstruction::SetMayCluster(std::shared_ptr PossibleClustersVector) { + if (PossibleClustersVector->GetOneCnt > 0) { + PossibleClusturesBitVector = PossibleClustersVector; + MayCluster = true; + } +} + +auto 
SchedInstruction::GetClusterVector() { return PossibleClusturesBitVector; } + /****************************************************************************** * SchedRange * ******************************************************************************/ diff --git a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp index a26b254a..cd4f394f 100644 --- a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp +++ b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp @@ -5,6 +5,7 @@ //===----------------------------------------------------------------------===// #include "OptSchedDDGWrapperBasic.h" +#include "opt-sched/Scheduler/bit_vector.h" #include "opt-sched/Scheduler/config.h" #include "opt-sched/Scheduler/logger.h" #include "opt-sched/Scheduler/register.h" @@ -22,7 +23,6 @@ #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Function.h" #include "llvm/Target/TargetMachine.h" -#include #include #include #include @@ -516,6 +516,7 @@ void OptSchedDDGWrapperBasic::clusterNeighboringMemOps_( ArrayRef MemOps) { SmallVector MemOpRecords; dbgs() << "Processing possible clusters\n"; + for (const SUnit *SU : MemOps) { dbgs() << " " << SU->NodeNum << " is in the chain.\n"; MachineOperand *BaseOp; @@ -529,6 +530,8 @@ void OptSchedDDGWrapperBasic::clusterNeighboringMemOps_( return; } + auto ClusterVector = llvm::make_unique(DAG->SUnits.size()); + llvm::sort(MemOpRecords); unsigned ClusterLength = 1; for (unsigned Idx = 0, End = MemOpRecords.size(); Idx < (End - 1); ++Idx) { @@ -538,11 +541,23 @@ void OptSchedDDGWrapperBasic::clusterNeighboringMemOps_( if (DAG->TII->shouldClusterMemOps(*MemOpRecords[Idx].BaseOp, *MemOpRecords[Idx + 1].BaseOp, ClusterLength)) { - dbgs() << " Cluster possible at SU(" << SUa->NodeNum << ")- SU(" << SUb->NodeNum << ")\n"; + dbgs() << " Cluster possible at SU(" << SUa->NodeNum << ")- SU(" << SUb->NodeNum << ")\n"; ++ClusterLength; + ClusterVector->SetBit(SUa->NodeNum); + ClusterVector->SetBit(SUb->NodeNum); } else ClusterLength = 1; } 
+ dbgs () << "Printing bit vector: "; + for (int i = ClusterVector->GetSize() - 1; i >= 0; i--) { + if (ClusterVector->GetBit(i)) + dbgs() << "1"; + else + dbgs() << "0"; + } + dbgs() << '\n'; + insts_[SUa->NodeNum]->SetMayCluster(ClusterVector); + insts_[SUb->NodeNum]->SetMayCluster(ClusterVector); } /// Iterate through SUnits and find all possible clustering then transfer From c4b097344aeffdc818f7a465deb61c7798bb0bc6 Mon Sep 17 00:00:00 2001 From: vang Date: Thu, 12 Mar 2020 20:12:45 -0700 Subject: [PATCH 05/40] Fixed some compilation issues --- include/opt-sched/Scheduler/bb_spill.h | 2 +- include/opt-sched/Scheduler/sched_basic_data.h | 2 +- lib/Scheduler/bb_spill.cpp | 6 +++--- lib/Scheduler/sched_basic_data.cpp | 4 ++-- lib/Wrapper/OptSchedDDGWrapperBasic.cpp | 4 ++-- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/include/opt-sched/Scheduler/bb_spill.h b/include/opt-sched/Scheduler/bb_spill.h index d857189e..ee2325da 100644 --- a/include/opt-sched/Scheduler/bb_spill.h +++ b/include/opt-sched/Scheduler/bb_spill.h @@ -61,7 +61,7 @@ class BBWithSpill : public SchedRegion { }; /// Vector containing the (n-1) past clusters - llvm::SmallVector> PastClustersList; + llvm::SmallVector, 0> PastClustersList; /// Pointer to the latest past cluster std::unique_ptr LastCluster; diff --git a/include/opt-sched/Scheduler/sched_basic_data.h b/include/opt-sched/Scheduler/sched_basic_data.h index 01ff8882..2eededb2 100644 --- a/include/opt-sched/Scheduler/sched_basic_data.h +++ b/include/opt-sched/Scheduler/sched_basic_data.h @@ -438,7 +438,7 @@ class SchedInstruction : public GraphNode { /// to be possible. 
void SetMayCluster(std::shared_ptr PossibleClustersVector); bool GetMayCluster() { return MayCluster; } - auto GetClusterVector(); + std::shared_ptr GetClusterVector(); friend class SchedRange; protected: diff --git a/lib/Scheduler/bb_spill.cpp b/lib/Scheduler/bb_spill.cpp index 8986ed4b..ccaad4dc 100644 --- a/lib/Scheduler/bb_spill.cpp +++ b/lib/Scheduler/bb_spill.cpp @@ -450,7 +450,7 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, // and in UpdateSpillInfoForUnSchdul_() if (isSecondPass) { if (inst->GetMayCluster()) { - if (CurrentClusterSize > 0 && CurrentClusterVector->GetBit(inst->GetNum()) { + if (CurrentClusterSize > 0 && CurrentClusterVector->GetBit(inst->GetNum())) { // Case 1: Currently clustering and this current instruction is part of // the cluster CurrentClusterSize++; @@ -698,7 +698,7 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst) { if (isSecondPass) { if (inst->GetMayCluster()) { // Case 1 - if (CurrentClusterSize > 0 && CurrentClusterVector->GetBit(inst->GetNum()) { + if (CurrentClusterSize > 0 && CurrentClusterVector->GetBit(inst->GetNum())) { // Currently clustering and this current instruction is part of the // cluter if (CurrentClusterSize > 1) @@ -725,7 +725,7 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst) { PastClustersList.pop_back(); } } - } + }} defCnt = inst->GetDefs(defs); diff --git a/lib/Scheduler/sched_basic_data.cpp b/lib/Scheduler/sched_basic_data.cpp index 2ff63c33..f74b605f 100644 --- a/lib/Scheduler/sched_basic_data.cpp +++ b/lib/Scheduler/sched_basic_data.cpp @@ -743,13 +743,13 @@ int16_t SchedInstruction::CmputLastUseCnt() { } void SchedInstruction::SetMayCluster(std::shared_ptr PossibleClustersVector) { - if (PossibleClustersVector->GetOneCnt > 0) { + if (PossibleClustersVector->GetOneCnt() > 0) { PossibleClusturesBitVector = PossibleClustersVector; MayCluster = true; } } -auto SchedInstruction::GetClusterVector() { return PossibleClusturesBitVector; } 
+std::shared_ptr SchedInstruction::GetClusterVector() { return PossibleClusturesBitVector; } /****************************************************************************** * SchedRange * diff --git a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp index cd4f394f..c2e38723 100644 --- a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp +++ b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp @@ -545,6 +545,8 @@ void OptSchedDDGWrapperBasic::clusterNeighboringMemOps_( ++ClusterLength; ClusterVector->SetBit(SUa->NodeNum); ClusterVector->SetBit(SUb->NodeNum); + insts_[SUa->NodeNum]->SetMayCluster(ClusterVector); + insts_[SUb->NodeNum]->SetMayCluster(ClusterVector); } else ClusterLength = 1; } @@ -556,8 +558,6 @@ void OptSchedDDGWrapperBasic::clusterNeighboringMemOps_( dbgs() << "0"; } dbgs() << '\n'; - insts_[SUa->NodeNum]->SetMayCluster(ClusterVector); - insts_[SUb->NodeNum]->SetMayCluster(ClusterVector); } /// Iterate through SUnits and find all possible clustering then transfer From 75b02f4eb9f00d768efccea3af6f5d1944370701 Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Fri, 13 Mar 2020 08:42:28 -0700 Subject: [PATCH 06/40] Fixed some compiler bugs, and added experimental cost. 
--- include/opt-sched/Scheduler/bb_spill.h | 3 --- lib/Scheduler/bb_spill.cpp | 15 +++++++++++---- lib/Scheduler/enumerator.cpp | 2 +- lib/Scheduler/sched_region.cpp | 8 +++++++- lib/Wrapper/OptSchedDDGWrapperBasic.cpp | 2 +- 5 files changed, 20 insertions(+), 10 deletions(-) diff --git a/include/opt-sched/Scheduler/bb_spill.h b/include/opt-sched/Scheduler/bb_spill.h index ee2325da..84f6282e 100644 --- a/include/opt-sched/Scheduler/bb_spill.h +++ b/include/opt-sched/Scheduler/bb_spill.h @@ -35,9 +35,6 @@ class BBWithSpill : public SchedRegion { InstCount crntSpillCost_; InstCount optmlSpillCost_; - /// May not need this variable - bool CurrentlyClustering; - /// Current cluster size unsigned int CurrentClusterSize; diff --git a/lib/Scheduler/bb_spill.cpp b/lib/Scheduler/bb_spill.cpp index ccaad4dc..5f9696cb 100644 --- a/lib/Scheduler/bb_spill.cpp +++ b/lib/Scheduler/bb_spill.cpp @@ -72,8 +72,8 @@ BBWithSpill::BBWithSpill(const OptSchedTarget *OST_, DataDepGraph *dataDepGraph, CurrentClusterSize = 0; CurrentClusterVector = nullptr; - ClusteringWeight = 10000; - ClusterInitialCost = 10000000; + ClusteringWeight = 1000; + ClusterInitialCost = 1000000; PastClustersList.clear(); LastCluster = nullptr; } @@ -376,6 +376,9 @@ InstCount BBWithSpill::CmputNormCost_(InstSchedule *sched, cost -= costLwrBound_; execCost -= costLwrBound_; + if (isSecondPass) + cost += ClusterInitialCost; + sched->SetCost(cost); sched->SetExecCost(execCost); return cost; @@ -454,10 +457,12 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, // Case 1: Currently clustering and this current instruction is part of // the cluster CurrentClusterSize++; - if (CurrentClusterSize > 1) + if (CurrentClusterSize > 2) { // Only decrement the cost if we cluster at least 2 operations // together (EXPERIMENTAL FOR NOW) ClusterInitialCost -= ClusteringWeight; + Logger::Info("More than 2 instructions clustered together!"); + } } else { // Case 3: Not currently clustering. 
Initialize clustering // Sidenote: What if we go from current cluster to a different cluster? @@ -701,8 +706,10 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst) { if (CurrentClusterSize > 0 && CurrentClusterVector->GetBit(inst->GetNum())) { // Currently clustering and this current instruction is part of the // cluter - if (CurrentClusterSize > 1) + if (CurrentClusterSize > 2) { ClusterInitialCost += ClusteringWeight; // Re-add the cost + Logger::Info("More than 2 instructions clustered together. Undoing!!"); + } CurrentClusterSize--; } else { // Case 3 diff --git a/lib/Scheduler/enumerator.cpp b/lib/Scheduler/enumerator.cpp index e94f2170..89d157b9 100644 --- a/lib/Scheduler/enumerator.cpp +++ b/lib/Scheduler/enumerator.cpp @@ -974,7 +974,7 @@ bool Enumerator::FindNxtFsblBrnch_(EnumTreeNode *&newNode) { #ifdef IS_DEBUG_READY_LIST Logger::Info("Ready List Size is %d", rdyInstCnt); // Warning! That will reset the instruction iterator! - // rdyLst_->Print(Logger::GetLogStream()); + rdyLst_->Print(Logger::GetLogStream()); stats::maxReadyListSize.SetMax(rdyInstCnt); #endif diff --git a/lib/Scheduler/sched_region.cpp b/lib/Scheduler/sched_region.cpp index 4353dc18..c0c8f0e4 100644 --- a/lib/Scheduler/sched_region.cpp +++ b/lib/Scheduler/sched_region.cpp @@ -600,6 +600,12 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( Logger::Info("DAG %s PEAK %d", dataDepGraph_->GetDagID(), maxSpillCost); } #endif + + if (isSecondPass) { + Logger::Info("Printing final schedule."); + bestSched->Print(Logger::GetLogStream(), "Best Sched"); + } + return rslt; } @@ -641,7 +647,7 @@ FUNC_RESULT SchedRegion::Optimize_(Milliseconds startTime, } stats::unsolvedProblemSize.Record(dataDepGraph_->GetInstCnt()); } - + return rslt; } diff --git a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp index c2e38723..cd0fd07b 100644 --- a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp +++ b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp @@ -530,7 +530,7 @@ void 
OptSchedDDGWrapperBasic::clusterNeighboringMemOps_( return; } - auto ClusterVector = llvm::make_unique(DAG->SUnits.size()); + auto ClusterVector = std::make_shared(DAG->SUnits.size()); llvm::sort(MemOpRecords); unsigned ClusterLength = 1; From 00501ae26353b8d44322f5be4170987482f489f5 Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Fri, 13 Mar 2020 09:28:20 -0700 Subject: [PATCH 07/40] Cleaned up debug statements. NFC --- lib/Scheduler/enumerator.cpp | 2 +- lib/Scheduler/sched_region.cpp | 5 ++-- lib/Wrapper/OptSchedDDGWrapperBasic.cpp | 40 ++++++++++++------------- lib/Wrapper/OptimizingScheduler.cpp | 1 - 4 files changed, 24 insertions(+), 24 deletions(-) diff --git a/lib/Scheduler/enumerator.cpp b/lib/Scheduler/enumerator.cpp index 89d157b9..e94f2170 100644 --- a/lib/Scheduler/enumerator.cpp +++ b/lib/Scheduler/enumerator.cpp @@ -974,7 +974,7 @@ bool Enumerator::FindNxtFsblBrnch_(EnumTreeNode *&newNode) { #ifdef IS_DEBUG_READY_LIST Logger::Info("Ready List Size is %d", rdyInstCnt); // Warning! That will reset the instruction iterator! 
- rdyLst_->Print(Logger::GetLogStream()); + // rdyLst_->Print(Logger::GetLogStream()); stats::maxReadyListSize.SetMax(rdyInstCnt); #endif diff --git a/lib/Scheduler/sched_region.cpp b/lib/Scheduler/sched_region.cpp index c0c8f0e4..5ff6ae94 100644 --- a/lib/Scheduler/sched_region.cpp +++ b/lib/Scheduler/sched_region.cpp @@ -601,11 +601,13 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( } #endif +#ifdef IS_DEBUG_MEMORY_CLUSTERING if (isSecondPass) { Logger::Info("Printing final schedule."); bestSched->Print(Logger::GetLogStream(), "Best Sched"); } - +#endif + return rslt; } @@ -647,7 +649,6 @@ FUNC_RESULT SchedRegion::Optimize_(Milliseconds startTime, } stats::unsolvedProblemSize.Record(dataDepGraph_->GetInstCnt()); } - return rslt; } diff --git a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp index cd0fd07b..8f607c68 100644 --- a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp +++ b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp @@ -515,10 +515,10 @@ void OptSchedDDGWrapperBasic::countBoundaryLiveness( void OptSchedDDGWrapperBasic::clusterNeighboringMemOps_( ArrayRef MemOps) { SmallVector MemOpRecords; - dbgs() << "Processing possible clusters\n"; + LLVM_DEBUG(dbgs() << "Processing possible clusters\n"); for (const SUnit *SU : MemOps) { - dbgs() << " " << SU->NodeNum << " is in the chain.\n"; + LLVM_DEBUG(dbgs() << " " << SU->NodeNum << " is in the chain.\n"); MachineOperand *BaseOp; int64_t Offset; if (DAG->TII->getMemOperandWithOffset(*SU->getInstr(), BaseOp, Offset, DAG->TRI)) @@ -526,7 +526,7 @@ void OptSchedDDGWrapperBasic::clusterNeighboringMemOps_( } if (MemOpRecords.size() < 2) { - dbgs() << " Unable to cluster memop cluster of 1.\n"; + LLVM_DEBUG(dbgs() << " Unable to cluster memop cluster of 1.\n"); return; } @@ -537,11 +537,11 @@ void OptSchedDDGWrapperBasic::clusterNeighboringMemOps_( for (unsigned Idx = 0, End = MemOpRecords.size(); Idx < (End - 1); ++Idx) { const SUnit *SUa = MemOpRecords[Idx].SU; const SUnit *SUb = 
MemOpRecords[Idx + 1].SU; - dbgs() << " Checking possible clustering of (" << SUa->NodeNum << ") and (" << SUb->NodeNum << ")\n"; + LLVM_DEBUG(dbgs() << " Checking possible clustering of (" << SUa->NodeNum << ") and (" << SUb->NodeNum << ")\n"); if (DAG->TII->shouldClusterMemOps(*MemOpRecords[Idx].BaseOp, *MemOpRecords[Idx + 1].BaseOp, ClusterLength)) { - dbgs() << " Cluster possible at SU(" << SUa->NodeNum << ")- SU(" << SUb->NodeNum << ")\n"; + LLVM_DEBUG(dbgs() << " Cluster possible at SU(" << SUa->NodeNum << ")- SU(" << SUb->NodeNum << ")\n"); ++ClusterLength; ClusterVector->SetBit(SUa->NodeNum); ClusterVector->SetBit(SUb->NodeNum); @@ -550,14 +550,16 @@ void OptSchedDDGWrapperBasic::clusterNeighboringMemOps_( } else ClusterLength = 1; } - dbgs () << "Printing bit vector: "; +#ifdef IS_DEBUG_MEMORY_CLUSTERING + LLVM_DEBUG(dbgs () << "Printing bit vector: "); for (int i = ClusterVector->GetSize() - 1; i >= 0; i--) { if (ClusterVector->GetBit(i)) - dbgs() << "1"; + LLVM_DEBUG(dbgs() << "1"); else - dbgs() << "0"; + LLVM_DEBUG(dbgs() << "0"); } - dbgs() << '\n'; + LLVM_DEBUG(dbgs() << '\n'); +#endif } /// Iterate through SUnits and find all possible clustering then transfer @@ -572,7 +574,7 @@ void OptSchedDDGWrapperBasic::findPossibleClusters() { // Experiment with clustering loads first bool IsLoad = true; - dbgs() << "Looking for load clusters\n"; + LLVM_DEBUG(dbgs() << "Looking for load clusters\n"); DenseMap StoreChainIDs; // Map each store chain to a set of dependent MemOps. 
SmallVector, 32> StoreChainDependents; @@ -581,13 +583,13 @@ void OptSchedDDGWrapperBasic::findPossibleClusters() { (!IsLoad && !SU.getInstr()->mayStore())) continue; auto MI = SU.getInstr(); - dbgs() << " Instruction (" << SU.NodeNum << ") " << DAG->TII->getName(MI->getOpcode()) << " may load.\n"; + LLVM_DEBUG(dbgs() << " Instruction (" << SU.NodeNum << ") " << DAG->TII->getName(MI->getOpcode()) << " may load.\n"); unsigned ChainPredID = DAG->SUnits.size(); for (const SDep &Pred : SU.Preds) { if (Pred.isCtrl()) { auto PredMI = Pred.getSUnit()->getInstr(); - dbgs() << " Breaking chain at (" << Pred.getSUnit()->NodeNum << ") " << DAG->TII->getName(PredMI->getOpcode()) << '\n'; + LLVM_DEBUG(dbgs() << " Breaking chain at (" << Pred.getSUnit()->NodeNum << ") " << DAG->TII->getName(PredMI->getOpcode()) << '\n'); ChainPredID = Pred.getSUnit()->NodeNum; break; } @@ -595,24 +597,22 @@ void OptSchedDDGWrapperBasic::findPossibleClusters() { // Check if this chain-like pred has been seen // before. ChainPredID==MaxNodeID at the top of the schedule. unsigned NumChains = StoreChainDependents.size(); - dbgs() << " ChainPredID " << ChainPredID << ", NumChains " << NumChains << '\n'; + LLVM_DEBUG(dbgs() << " ChainPredID " << ChainPredID << ", NumChains " << NumChains << '\n'); std::pair::iterator, bool> Result = StoreChainIDs.insert(std::make_pair(ChainPredID, NumChains)); if (Result.second) StoreChainDependents.resize(NumChains + 1); - dbgs() << " Pushing (" << SU.NodeNum << ") on the chain.\n"; StoreChainDependents[Result.first->second].push_back(&SU); - dbgs() << " inPrinting size of SCD: " << StoreChainDependents.size() << '\n'; } - - dbgs() << " outPrinting size of SCD: " << StoreChainDependents.size() << '\n'; // Iterate over the store chains. 
for (auto &SCD : StoreChainDependents) { - dbgs() << " Printing the list before clustering: "; +#ifdef IS_DEBUG_MEMORY_CLUSTERING + LLVM_DEBUG(dbgs() << " Printing the list before clustering: "); for (auto SU1 : SCD) - dbgs() << SU1->NodeNum << " "; - dbgs() << '\n'; + LLVM_DEBUG(dbgs() << SU1->NodeNum << " "); + LLVM_DEBUG(dbgs() << '\n'); +#endif clusterNeighboringMemOps_(SCD); } } diff --git a/lib/Wrapper/OptimizingScheduler.cpp b/lib/Wrapper/OptimizingScheduler.cpp index c9877c62..ca383ac6 100644 --- a/lib/Wrapper/OptimizingScheduler.cpp +++ b/lib/Wrapper/OptimizingScheduler.cpp @@ -380,7 +380,6 @@ void ScheduleDAGOptSched::schedule() { OST->createDDGWrapper(C, this, MM.get(), LatencyPrecision, RegionName); DDG->convertSUnits(); DDG->convertRegFiles(); - dbgs() << "Printing possible clusters\n"; DDG->findPossibleClusters(); auto *BDDG = static_cast(DDG.get()); From 3603da9cb7e75bc3942d427586d466a8de169732 Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Fri, 13 Mar 2020 10:47:39 -0700 Subject: [PATCH 08/40] Added clustering cost to ChkCostFsblty, and added TODOs. 
--- include/opt-sched/Scheduler/bb_spill.h | 17 ++++++--- .../opt-sched/Scheduler/sched_basic_data.h | 14 +------- lib/Scheduler/bb_spill.cpp | 35 ++++++++++++------- lib/Scheduler/enumerator.cpp | 12 ------- lib/Wrapper/OptSchedDDGWrapperBasic.cpp | 12 +++---- 5 files changed, 39 insertions(+), 51 deletions(-) diff --git a/include/opt-sched/Scheduler/bb_spill.h b/include/opt-sched/Scheduler/bb_spill.h index 84f6282e..62771561 100644 --- a/include/opt-sched/Scheduler/bb_spill.h +++ b/include/opt-sched/Scheduler/bb_spill.h @@ -42,25 +42,32 @@ class BBWithSpill : public SchedRegion { /// together std::shared_ptr CurrentClusterVector; + // TODO: Implement cost function for clustering /// Experimental variables and values for cost adjustment int ClusteringWeight; int ClusterInitialCost; - // Data struct to contain information about the previous clusters + /// Data struct to contain information about the previous clusters struct PastClusters { std::shared_ptr ClusterVector; + /// Size of the cluster when it was ended by an instruction not in the + /// cluster int ClusterSize; - int InstNum; // Instruction number that ended this cluster - // Constructor + /// Instruction number that ended this cluster + int InstNum; + + /// Constructor for this struct PastClusters(std::shared_ptr Cluster, int size, int num) : ClusterVector(Cluster), ClusterSize(size), InstNum(num) {} }; /// Vector containing the (n-1) past clusters - llvm::SmallVector, 0> PastClustersList; + llvm::SmallVector, 4> PastClustersList; - /// Pointer to the latest past cluster + /// Pointer to the last cluster. This is kept out of the vector to + /// avoid having to fetch it every time we compare the current instruction + /// number to the one that ended the cluster. 
std::unique_ptr LastCluster; // The target machine diff --git a/include/opt-sched/Scheduler/sched_basic_data.h b/include/opt-sched/Scheduler/sched_basic_data.h index 2eededb2..166dbbad 100644 --- a/include/opt-sched/Scheduler/sched_basic_data.h +++ b/include/opt-sched/Scheduler/sched_basic_data.h @@ -426,14 +426,6 @@ class SchedInstruction : public GraphNode { InstType GetCrtclPathFrmRoot() { return crtclPathFrmRoot_; } - /// Return true if this instruction could possibly read memory - /// Copied from https://llvm.org/doxygen/classllvm_1_1MachineInstr.html - // bool mayLoad() { return MayLoad; } - - /// Return true if this instruction could possibly modify memory. - /// Copied from https://llvm.org/doxygen/classllvm_1_1MachineInstr.html - // bool mayStore() { return MayStore; } - /// Set MayCluster to true if clustering memory operations was found /// to be possible. void SetMayCluster(std::shared_ptr PossibleClustersVector); @@ -446,11 +438,7 @@ class SchedInstruction : public GraphNode { string name_; // The mnemonic of this instruction, e.g. "add" or "jmp". string opCode_; - /// Indicate if this instruction may be a load operation - // bool MayLoad; - /// Indicate if this instruction may be a store operation - // bool MayStore; - /// Data structure to store a possible clustering with other isntructions. + /// Data structure to store a possible clustering with other instructions. /// This data structure should have a fast lookup operation. std::shared_ptr PossibleClusturesBitVector; /// This value should be set to true if clustering may be possible. 
diff --git a/lib/Scheduler/bb_spill.cpp b/lib/Scheduler/bb_spill.cpp index 5f9696cb..c91bf552 100644 --- a/lib/Scheduler/bb_spill.cpp +++ b/lib/Scheduler/bb_spill.cpp @@ -376,6 +376,7 @@ InstCount BBWithSpill::CmputNormCost_(InstSchedule *sched, cost -= costLwrBound_; execCost -= costLwrBound_; + // TODO: Implement cost function for clustering if (isSecondPass) cost += ClusterInitialCost; @@ -443,17 +444,19 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, // Scheduling cases for clustering project: // 1.) Cluster -> Cluster - // Simple case, just increment 1 from cluster size + // Simple case, just increment 1 from cluster size // 2.) Cluster -> Non-Cluster - // ?? End clustering + // ?? End clustering // 3.) Non-Cluster -> Cluster - // Simple case, initialize clustering + // Simple case, initialize clustering // Possibly keep track of the current memory clustering size here // and in UpdateSpillInfoForUnSchdul_() if (isSecondPass) { if (inst->GetMayCluster()) { - if (CurrentClusterSize > 0 && CurrentClusterVector->GetBit(inst->GetNum())) { + // TODO: Check for different cluster to different cluster scheduling + if (CurrentClusterSize > 0 && + CurrentClusterVector->GetBit(inst->GetNum())) { // Case 1: Currently clustering and this current instruction is part of // the cluster CurrentClusterSize++; @@ -461,15 +464,14 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, // Only decrement the cost if we cluster at least 2 operations // together (EXPERIMENTAL FOR NOW) ClusterInitialCost -= ClusteringWeight; - Logger::Info("More than 2 instructions clustered together!"); - } - } else { + //Logger::Info("More than 2 instructions clustered together!"); + } + } else { // Case 3: Not currently clustering. Initialize clustering - // Sidenote: What if we go from current cluster to a different cluster? 
- CurrentClusterVector.reset(); // Clear cluster vector + CurrentClusterVector.reset(); // Clear cluster vector CurrentClusterVector = inst->GetClusterVector(); // Set active cluster - CurrentClusterSize = 1; // Current size is 1 - } + CurrentClusterSize = 1; // Current size is 1 + } } else if (CurrentClusterSize > 1) { // Case 2: Exiting out of an active cluster // Save the cluster to restore when backtracking. @@ -701,14 +703,16 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst) { // Simple case, just decrement 1 from cluster size // If cluster size == 0, delete CurrentClusterVector if (isSecondPass) { + // TODO: Check for different cluster to different cluster + // backtracking. if (inst->GetMayCluster()) { // Case 1 if (CurrentClusterSize > 0 && CurrentClusterVector->GetBit(inst->GetNum())) { // Currently clustering and this current instruction is part of the - // cluter + // cluster if (CurrentClusterSize > 2) { ClusterInitialCost += ClusteringWeight; // Re-add the cost - Logger::Info("More than 2 instructions clustered together. Undoing!!"); + //Logger::Info("More than 2 instructions clustered together. 
Undoing!!"); } CurrentClusterSize--; } else { @@ -1027,6 +1031,7 @@ void BBWithSpill::SetupForSchdulng_() { bool BBWithSpill::ChkCostFsblty(InstCount trgtLngth, EnumTreeNode *node) { bool fsbl = true; InstCount crntCost, dynmcCostLwrBound; + if (spillCostFunc_ == SCF_SLIL) { crntCost = dynamicSlilLowerBound_ * SCW_ + trgtLngth * schedCostFactor_; } else { @@ -1035,6 +1040,10 @@ bool BBWithSpill::ChkCostFsblty(InstCount trgtLngth, EnumTreeNode *node) { crntCost -= costLwrBound_; dynmcCostLwrBound = crntCost; + // TODO: Implement cost function for clustering + if (isSecondPass) + cost += ClusterInitialCost; + // assert(cost >= 0); assert(dynmcCostLwrBound >= 0); diff --git a/lib/Scheduler/enumerator.cpp b/lib/Scheduler/enumerator.cpp index e94f2170..d9c4e3b1 100644 --- a/lib/Scheduler/enumerator.cpp +++ b/lib/Scheduler/enumerator.cpp @@ -982,18 +982,6 @@ bool Enumerator::FindNxtFsblBrnch_(EnumTreeNode *&newNode) { if (crntBrnchNum == 0 && SchedForRPOnly_) crntNode_->SetFoundInstWithUse(IsUseInRdyLst_()); - // Note: This is just a thought, we might not need this here. - // Check if clustering is possible. - // We want to only do memory clustering in the second pass for now. - // if (crntBrnchNum == 0 && EnableMemClustering && SecondPass) - // // TODO: Implement these functions/attributes - // // and implement cost. Also keep track of current - // // cluster size since we do not want to exceed 15 - // // memory operations in a cluster (This and the cost - // // is probably done somewhere else and not here). 
- // ClusteringPossible = crntNode_->CheckForClustering(); - // crntNode_->SetClusteringPossible(ClusteringPossible); - for (i = crntBrnchNum; i < brnchCnt && crntNode_->IsFeasible(); i++) { #ifdef IS_DEBUG_FLOW Logger::Info("Probing branch %d out of %d", i, brnchCnt); diff --git a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp index 8f607c68..247294e9 100644 --- a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp +++ b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp @@ -510,8 +510,8 @@ void OptSchedDDGWrapperBasic::countBoundaryLiveness( } } -// Partially copied from -// https://github.com/RadeonOpenCompute/llvm/blob/roc-ocl-2.4.0/lib/CodeGen/MachineScheduler.cpp#L1554 +/// Partially copied from +/// https://github.com/RadeonOpenCompute/llvm/blob/roc-ocl-2.4.0/lib/CodeGen/MachineScheduler.cpp#L1554 void OptSchedDDGWrapperBasic::clusterNeighboringMemOps_( ArrayRef MemOps) { SmallVector MemOpRecords; @@ -566,12 +566,8 @@ void OptSchedDDGWrapperBasic::clusterNeighboringMemOps_( /// the information over to the SchedInstruction class as a bitvector. /// Partially copied from https://github.com/RadeonOpenCompute/llvm/blob/roc-ocl-2.4.0/lib/CodeGen/MachineScheduler.cpp#L1595 void OptSchedDDGWrapperBasic::findPossibleClusters() { -// Copy how LLVM handles clustering except instead of actually -// modifying the DAG, we can possibly set MayCluster to true. -// Then add the nodes that can be clustered together into a -// data structure. - - // Experiment with clustering loads first + // TODO: Add For-loop to also do store clusters. 
Currently only does load + // clusters bool IsLoad = true; LLVM_DEBUG(dbgs() << "Looking for load clusters\n"); From 035272bc36327dc15ad688ab0625c71ffe05cb12 Mon Sep 17 00:00:00 2001 From: vang thao Date: Fri, 13 Mar 2020 16:37:16 -0700 Subject: [PATCH 09/40] Fix typo for variable and disabled terminating enumerator when we find a schedule in the ILP pass --- lib/Scheduler/bb_spill.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/lib/Scheduler/bb_spill.cpp b/lib/Scheduler/bb_spill.cpp index c91bf552..63190abe 100644 --- a/lib/Scheduler/bb_spill.cpp +++ b/lib/Scheduler/bb_spill.cpp @@ -464,7 +464,7 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, // Only decrement the cost if we cluster at least 2 operations // together (EXPERIMENTAL FOR NOW) ClusterInitialCost -= ClusteringWeight; - //Logger::Info("More than 2 instructions clustered together!"); + Logger::Info("More than 2 instructions clustered together!"); } } else { // Case 3: Not currently clustering. Initialize clustering @@ -712,7 +712,7 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst) { // cluster if (CurrentClusterSize > 2) { ClusterInitialCost += ClusteringWeight; // Re-add the cost - //Logger::Info("More than 2 instructions clustered together. Undoing!!"); + Logger::Info("More than 2 instructions clustered together. Undoing!!"); } CurrentClusterSize--; } else { @@ -930,17 +930,18 @@ FUNC_RESULT BBWithSpill::Enumerate_(Milliseconds startTime, HandlEnumrtrRslt_(rslt, trgtLngth); if (bestCost_ == 0 || rslt == RES_ERROR || - (lngthDeadline == rgnDeadline && rslt == RES_TIMEOUT) || - (rslt == RES_SUCCESS && isSecondPass)) { + (lngthDeadline == rgnDeadline && rslt == RES_TIMEOUT)) { //|| + //(rslt == RES_SUCCESS && isSecondPass)) { // If doing two pass optsched and on the second pass then terminate if a // schedule is found with the same min-RP found in first pass. 
+ /* if (rslt == RES_SUCCESS && isSecondPass) { Logger::Info("Schedule found in second pass, terminating BB loop."); if (trgtLngth < schedUprBound_) Logger::Info("Schedule found with length %d is shorter than current schedule with length %d.", trgtLngth, schedUprBound_); - } + }*/ break; } @@ -1042,7 +1043,7 @@ bool BBWithSpill::ChkCostFsblty(InstCount trgtLngth, EnumTreeNode *node) { // TODO: Implement cost function for clustering if (isSecondPass) - cost += ClusterInitialCost; + crntCost += ClusterInitialCost; // assert(cost >= 0); assert(dynmcCostLwrBound >= 0); From a2cd231990157acdce17769d6e428acdb4f439ba Mon Sep 17 00:00:00 2001 From: vang thao Date: Tue, 17 Mar 2020 13:07:34 -0700 Subject: [PATCH 10/40] Debugging statements and reset mem clustering info in InitForSchduling --- lib/Scheduler/bb_spill.cpp | 10 ++++++++-- lib/Scheduler/enumerator.cpp | 11 ++++++++++- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/lib/Scheduler/bb_spill.cpp b/lib/Scheduler/bb_spill.cpp index 63190abe..bb06fc18 100644 --- a/lib/Scheduler/bb_spill.cpp +++ b/lib/Scheduler/bb_spill.cpp @@ -327,6 +327,12 @@ InstCount BBWithSpill::CmputCostLwrBound() { void BBWithSpill::InitForSchdulng() { InitForCostCmputtn_(); + CurrentClusterSize = 0; + CurrentClusterVector.reset(); + ClusterInitialCost = 1000000; + PastClustersList.clear(); + LastCluster.reset(); + schduldEntryInstCnt_ = 0; schduldExitInstCnt_ = 0; schduldInstCnt_ = 0; @@ -464,7 +470,7 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, // Only decrement the cost if we cluster at least 2 operations // together (EXPERIMENTAL FOR NOW) ClusterInitialCost -= ClusteringWeight; - Logger::Info("More than 2 instructions clustered together!"); + Logger::Info("Currently clustering %d instructions together", CurrentClusterSize); } } else { // Case 3: Not currently clustering. 
Initialize clustering @@ -712,9 +718,9 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst) { // cluster if (CurrentClusterSize > 2) { ClusterInitialCost += ClusteringWeight; // Re-add the cost - Logger::Info("More than 2 instructions clustered together. Undoing!!"); } CurrentClusterSize--; + Logger::Info("Undoing an instruction from the cluster. Current size: %d", CurrentClusterSize); } else { // Case 3 CurrentClusterSize--; diff --git a/lib/Scheduler/enumerator.cpp b/lib/Scheduler/enumerator.cpp index d9c4e3b1..39019034 100644 --- a/lib/Scheduler/enumerator.cpp +++ b/lib/Scheduler/enumerator.cpp @@ -1065,6 +1065,7 @@ bool Enumerator::ProbeBranch_(SchedInstruction *inst, EnumTreeNode *&newNode, if (inst != NULL) if (inst->GetPreFxdCycle() != INVALID_VALUE) if (inst->GetPreFxdCycle() != crntCycleNum_) { + Logger::Info("Pruned due to prefixed cycle"); return false; } @@ -1073,6 +1074,7 @@ bool Enumerator::ProbeBranch_(SchedInstruction *inst, EnumTreeNode *&newNode, #ifdef IS_DEBUG_INFSBLTY_TESTS stats::forwardLBInfeasibilityHits++; #endif + Logger::Info("Pruned due to forward lowerbound"); return false; } @@ -1080,6 +1082,7 @@ bool Enumerator::ProbeBranch_(SchedInstruction *inst, EnumTreeNode *&newNode, #ifdef IS_DEBUG_INFSBLTY_TESTS stats::backwardLBInfeasibilityHits++; #endif + Logger::Info("Pruned due to backward lowerbound"); return false; } } @@ -1100,6 +1103,7 @@ bool Enumerator::ProbeBranch_(SchedInstruction *inst, EnumTreeNode *&newNode, stats::nodeSuperiorityInfeasibilityHits++; #endif isNodeDmntd = true; + Logger::Info("Pruned due to node superiority"); return false; } } @@ -1117,6 +1121,7 @@ bool Enumerator::ProbeBranch_(SchedInstruction *inst, EnumTreeNode *&newNode, #ifdef IS_DEBUG_INFSBLTY_TESTS stats::slotCountInfeasibilityHits++; #endif + Logger::Info("Pruned due to issue slot infeasibility"); return false; } @@ -1127,6 +1132,7 @@ bool Enumerator::ProbeBranch_(SchedInstruction *inst, EnumTreeNode *&newNode, #ifdef 
IS_DEBUG_INFSBLTY_TESTS stats::rangeTighteningInfeasibilityHits++; #endif + Logger::Info("Pruned due to range tightening infeasibility"); return false; } @@ -1144,6 +1150,7 @@ bool Enumerator::ProbeBranch_(SchedInstruction *inst, EnumTreeNode *&newNode, #ifdef IS_DEBUG_INFSBLTY_TESTS stats::historyDominationInfeasibilityHits++; #endif + Logger::Info("Pruned due to history domination"); return false; } } @@ -1158,7 +1165,7 @@ bool Enumerator::ProbeBranch_(SchedInstruction *inst, EnumTreeNode *&newNode, stats::relaxedSchedulingInfeasibilityHits++; #endif isRlxInfsbl = true; - + Logger::Info("Pruned due to relaxed schedule"); return false; } } @@ -2071,6 +2078,7 @@ bool LengthCostEnumerator::ProbeBranch_(SchedInstruction *inst, isFsbl = ChkCostFsblty_(inst, newNode); if (isFsbl == false) { + Logger::Info("Pruned due to cost infeasibility"); return false; } @@ -2088,6 +2096,7 @@ bool LengthCostEnumerator::ProbeBranch_(SchedInstruction *inst, #endif rgn_->UnschdulInst(inst, crntCycleNum_, crntSlotNum_, parent); + Logger::Info("Pruned due to history domination"); return false; } } From 760c38d5a4153f73f5d85724581524ddba90f2b0 Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Tue, 17 Mar 2020 14:09:14 -0700 Subject: [PATCH 11/40] Added setting or memory clustering in settings. Fixed clustering for cluster to cluster mem-ops. More Debug statements. --- example/optsched-cfg/sched.ini | 5 ++ include/opt-sched/Scheduler/bb_spill.h | 4 ++ lib/Scheduler/bb_spill.cpp | 94 +++++++++++++++++--------- 3 files changed, 72 insertions(+), 31 deletions(-) diff --git a/example/optsched-cfg/sched.ini b/example/optsched-cfg/sched.ini index 07e9a626..8addb5f5 100644 --- a/example/optsched-cfg/sched.ini +++ b/example/optsched-cfg/sched.ini @@ -14,6 +14,11 @@ PRINT_SPILL_COUNTS YES # NO USE_TWO_PASS NO +# Cluster memory operations together in the second pass +# YES +# NO +CLUSTER_MEMORY_OPS NO + # These 3 flags control which schedulers will be used. # Each one can be individually toggled. 
The heuristic # list scheduler or ACO must be run before the diff --git a/include/opt-sched/Scheduler/bb_spill.h b/include/opt-sched/Scheduler/bb_spill.h index 62771561..df798d43 100644 --- a/include/opt-sched/Scheduler/bb_spill.h +++ b/include/opt-sched/Scheduler/bb_spill.h @@ -42,6 +42,10 @@ class BBWithSpill : public SchedRegion { /// together std::shared_ptr CurrentClusterVector; + /// Flag to enable or disable clustering memory operations + /// in the ILP pass. + bool ClusterMemoryOperations; + // TODO: Implement cost function for clustering /// Experimental variables and values for cost adjustment int ClusteringWeight; diff --git a/lib/Scheduler/bb_spill.cpp b/lib/Scheduler/bb_spill.cpp index bb06fc18..eaf577f3 100644 --- a/lib/Scheduler/bb_spill.cpp +++ b/lib/Scheduler/bb_spill.cpp @@ -76,6 +76,8 @@ BBWithSpill::BBWithSpill(const OptSchedTarget *OST_, DataDepGraph *dataDepGraph, ClusterInitialCost = 1000000; PastClustersList.clear(); LastCluster = nullptr; + + ClusterMemoryOperations = schedIni.GetBool("CLUSTER_MEMORY_OPS"); } /****************************************************************************/ @@ -383,7 +385,7 @@ InstCount BBWithSpill::CmputNormCost_(InstSchedule *sched, execCost -= costLwrBound_; // TODO: Implement cost function for clustering - if (isSecondPass) + if (isSecondPass && ClusterMemoryOperations) cost += ClusterInitialCost; sched->SetCost(cost); @@ -458,19 +460,38 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, // Possibly keep track of the current memory clustering size here // and in UpdateSpillInfoForUnSchdul_() - if (isSecondPass) { + if (isSecondPass && ClusterMemoryOperations) { if (inst->GetMayCluster()) { - // TODO: Check for different cluster to different cluster scheduling - if (CurrentClusterSize > 0 && - CurrentClusterVector->GetBit(inst->GetNum())) { - // Case 1: Currently clustering and this current instruction is part of - // the cluster - CurrentClusterSize++; - if (CurrentClusterSize > 2) { - 
// Only decrement the cost if we cluster at least 2 operations - // together (EXPERIMENTAL FOR NOW) - ClusterInitialCost -= ClusteringWeight; - Logger::Info("Currently clustering %d instructions together", CurrentClusterSize); + // If there is a current active cluster + if (CurrentClusterSize > 0) { + // The instruction is in the current active cluster + if (CurrentClusterVector->GetBit(inst->GetNum())) { + // Case 1: Currently clustering and this current instruction is part + // of the cluster + CurrentClusterSize++; + if (CurrentClusterSize > 2) { + // Only decrement the cost if we cluster at least 2 operations + // together (EXPERIMENTAL FOR NOW) + ClusterInitialCost -= ClusteringWeight; + Logger::Info("Currently clustering %d instructions together", + CurrentClusterSize); + } + } else { + Logger::Info("Inst %d pushing cluster size %d onto the stack due to " + "cluster to cluster op", + inst->GetNum(), CurrentClusterSize); + // The instruction is in another cluster that is not currently active. + // Exit out of the currently active cluster into a new one. + if (LastCluster) { + PastClustersList.push_back(std::move(LastCluster)); + LastCluster = llvm::make_unique( + CurrentClusterVector, CurrentClusterSize, inst->GetNum()); + } else + LastCluster = llvm::make_unique( + CurrentClusterVector, CurrentClusterSize, inst->GetNum()); + CurrentClusterVector.reset(); + CurrentClusterVector = inst->GetClusterVector(); + CurrentClusterSize = 1; } } else { // Case 3: Not currently clustering. Initialize clustering @@ -479,6 +500,8 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, CurrentClusterSize = 1; // Current size is 1 } } else if (CurrentClusterSize > 1) { + Logger::Info("Inst %d pushing cluster size %d onto the stack", + inst->GetNum(), CurrentClusterSize); // Case 2: Exiting out of an active cluster // Save the cluster to restore when backtracking. 
if (LastCluster) { @@ -708,24 +731,31 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst) { // 3.) Non-Cluster <- Cluster // Simple case, just decrement 1 from cluster size // If cluster size == 0, delete CurrentClusterVector - if (isSecondPass) { + if (isSecondPass && ClusterMemoryOperations) { // TODO: Check for different cluster to different cluster // backtracking. if (inst->GetMayCluster()) { - // Case 1 - if (CurrentClusterSize > 0 && CurrentClusterVector->GetBit(inst->GetNum())) { - // Currently clustering and this current instruction is part of the - // cluster - if (CurrentClusterSize > 2) { - ClusterInitialCost += ClusteringWeight; // Re-add the cost - } - CurrentClusterSize--; - Logger::Info("Undoing an instruction from the cluster. Current size: %d", CurrentClusterSize); - } else { - // Case 3 - CurrentClusterSize--; - if (CurrentClusterSize == 0) - CurrentClusterVector.reset(); + // Case 1 and 3 + if (CurrentClusterSize > 2) { + ClusterInitialCost += ClusteringWeight; // Re-add the cost + } + CurrentClusterSize--; + Logger::Info("Undoing an instruction from the cluster. 
Current size: %d", + CurrentClusterSize); + + if (CurrentClusterSize == 0) { + CurrentClusterVector.reset(); + if (LastCluster->InstNum == inst->GetNum()) { + CurrentClusterSize = LastCluster->ClusterSize; + CurrentClusterVector = LastCluster->ClusterVector; + LastCluster.reset(); // Release current cluster pointer + + // Get previous cluster from vector list + if (!PastClustersList.empty()) { + LastCluster = std::move(PastClustersList.back()); + PastClustersList.pop_back(); + } + } } } else if (LastCluster) { if (LastCluster->InstNum == inst->GetNum()) { @@ -741,9 +771,11 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst) { LastCluster = std::move(PastClustersList.back()); PastClustersList.pop_back(); } + Logger::Info("Inst %d popping cluster size %d off the stack", + inst->GetNum(), CurrentClusterSize); + } } - }} - + } defCnt = inst->GetDefs(defs); useCnt = inst->GetUses(uses); @@ -1048,7 +1080,7 @@ bool BBWithSpill::ChkCostFsblty(InstCount trgtLngth, EnumTreeNode *node) { dynmcCostLwrBound = crntCost; // TODO: Implement cost function for clustering - if (isSecondPass) + if (isSecondPass && ClusterMemoryOperations) crntCost += ClusterInitialCost; // assert(cost >= 0); From 8b5e2cc5811129bf643e1bb029d804171f857936 Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Tue, 17 Mar 2020 14:11:11 -0700 Subject: [PATCH 12/40] Fix missing var. 
--- lib/Scheduler/bb_spill.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/Scheduler/bb_spill.cpp b/lib/Scheduler/bb_spill.cpp index eaf577f3..706eeedb 100644 --- a/lib/Scheduler/bb_spill.cpp +++ b/lib/Scheduler/bb_spill.cpp @@ -77,6 +77,7 @@ BBWithSpill::BBWithSpill(const OptSchedTarget *OST_, DataDepGraph *dataDepGraph, PastClustersList.clear(); LastCluster = nullptr; + Config &schedIni = SchedulerOptions::getInstance(); ClusterMemoryOperations = schedIni.GetBool("CLUSTER_MEMORY_OPS"); } /****************************************************************************/ From 93f01e3c167d6a610e7eabb58e866e872adbac95 Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Tue, 17 Mar 2020 14:26:14 -0700 Subject: [PATCH 13/40] Fix memory segmentation --- lib/Scheduler/bb_spill.cpp | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/lib/Scheduler/bb_spill.cpp b/lib/Scheduler/bb_spill.cpp index 706eeedb..e141bf83 100644 --- a/lib/Scheduler/bb_spill.cpp +++ b/lib/Scheduler/bb_spill.cpp @@ -744,17 +744,24 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst) { Logger::Info("Undoing an instruction from the cluster. 
Current size: %d", CurrentClusterSize); + // If there is no more member in the currently active cluster then disable + // the cluster if (CurrentClusterSize == 0) { CurrentClusterVector.reset(); - if (LastCluster->InstNum == inst->GetNum()) { - CurrentClusterSize = LastCluster->ClusterSize; - CurrentClusterVector = LastCluster->ClusterVector; - LastCluster.reset(); // Release current cluster pointer - - // Get previous cluster from vector list - if (!PastClustersList.empty()) { - LastCluster = std::move(PastClustersList.back()); - PastClustersList.pop_back(); + + // If there was a previously active cluster, check last cluster to see + // if we need to restore the state + if (LastCluster) { + if (LastCluster->InstNum == inst->GetNum()) { + CurrentClusterSize = LastCluster->ClusterSize; + CurrentClusterVector = LastCluster->ClusterVector; + LastCluster.reset(); // Release current cluster pointer + + // Get previous cluster from vector list + if (!PastClustersList.empty()) { + LastCluster = std::move(PastClustersList.back()); + PastClustersList.pop_back(); + } } } } From 111d5eb167ecdd67fdcb629e06359e06d95d6177 Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Tue, 17 Mar 2020 15:17:44 -0700 Subject: [PATCH 14/40] Use an integer instead of a vector for cluster groups. 
--- include/opt-sched/Scheduler/bb_spill.h | 11 +++---- .../opt-sched/Scheduler/sched_basic_data.h | 15 +++++---- lib/Scheduler/bb_spill.cpp | 33 +++++++++---------- lib/Scheduler/sched_basic_data.cpp | 13 +++----- lib/Wrapper/OptSchedDDGWrapperBasic.cpp | 29 ++++++++-------- lib/Wrapper/OptSchedDDGWrapperBasic.h | 3 ++ 6 files changed, 54 insertions(+), 50 deletions(-) diff --git a/include/opt-sched/Scheduler/bb_spill.h b/include/opt-sched/Scheduler/bb_spill.h index df798d43..91c13f37 100644 --- a/include/opt-sched/Scheduler/bb_spill.h +++ b/include/opt-sched/Scheduler/bb_spill.h @@ -38,9 +38,8 @@ class BBWithSpill : public SchedRegion { /// Current cluster size unsigned int CurrentClusterSize; - /// Bitvector containing active bits for instructions that can be clustered - /// together - std::shared_ptr CurrentClusterVector; + /// Current active cluster group + int ActiveClusterGroup; /// Flag to enable or disable clustering memory operations /// in the ILP pass. @@ -53,7 +52,7 @@ class BBWithSpill : public SchedRegion { /// Data struct to contain information about the previous clusters struct PastClusters { - std::shared_ptr ClusterVector; + int ClusterGroup; /// Size of the cluster when it was ended by an instruction not in the /// cluster int ClusterSize; @@ -62,8 +61,8 @@ class BBWithSpill : public SchedRegion { int InstNum; /// Constructor for this struct - PastClusters(std::shared_ptr Cluster, int size, int num) - : ClusterVector(Cluster), ClusterSize(size), InstNum(num) {} + PastClusters(int Cluster, int size, int num) + : ClusterGroup(Cluster), ClusterSize(size), InstNum(num) {} }; /// Vector containing the (n-1) past clusters diff --git a/include/opt-sched/Scheduler/sched_basic_data.h b/include/opt-sched/Scheduler/sched_basic_data.h index 166dbbad..9a6d631b 100644 --- a/include/opt-sched/Scheduler/sched_basic_data.h +++ b/include/opt-sched/Scheduler/sched_basic_data.h @@ -16,7 +16,6 @@ Last Update: Sept. 
2013 #include "opt-sched/Scheduler/hash_table.h" #include "opt-sched/Scheduler/machine_model.h" #include -#include namespace llvm { namespace opt_sched { @@ -428,9 +427,11 @@ class SchedInstruction : public GraphNode { /// Set MayCluster to true if clustering memory operations was found /// to be possible. - void SetMayCluster(std::shared_ptr PossibleClustersVector); + void SetMayCluster(int ClusteringGroup); bool GetMayCluster() { return MayCluster; } - std::shared_ptr GetClusterVector(); + int GetClusterGroup() { return ClusterGroup; } + static int GetActiveCluster() { return ActiveCluster; } + static int SetActiveCluster(int Active) { ActiveCluster = Active; } friend class SchedRange; protected: @@ -438,11 +439,13 @@ class SchedInstruction : public GraphNode { string name_; // The mnemonic of this instruction, e.g. "add" or "jmp". string opCode_; - /// Data structure to store a possible clustering with other instructions. - /// This data structure should have a fast lookup operation. - std::shared_ptr PossibleClusturesBitVector; + /// The cluster group that the current instruction is a part of. + /// Default of 0 means that it is not part of any cluster. + int ClusterGroup; /// This value should be set to true if clustering may be possible. bool MayCluster; + /// Currently active cluster. Used for ready list. + static int ActiveCluster; // A numberical ID for this instruction. int nodeID_; // The type of this instruction. 
diff --git a/lib/Scheduler/bb_spill.cpp b/lib/Scheduler/bb_spill.cpp index e141bf83..4c14eb78 100644 --- a/lib/Scheduler/bb_spill.cpp +++ b/lib/Scheduler/bb_spill.cpp @@ -71,7 +71,7 @@ BBWithSpill::BBWithSpill(const OptSchedTarget *OST_, DataDepGraph *dataDepGraph, schduldInstCnt_ = 0; CurrentClusterSize = 0; - CurrentClusterVector = nullptr; + ActiveClusterGroup = 0; ClusteringWeight = 1000; ClusterInitialCost = 1000000; PastClustersList.clear(); @@ -331,7 +331,7 @@ void BBWithSpill::InitForSchdulng() { InitForCostCmputtn_(); CurrentClusterSize = 0; - CurrentClusterVector.reset(); + ActiveClusterGroup = 0; ClusterInitialCost = 1000000; PastClustersList.clear(); LastCluster.reset(); @@ -466,7 +466,7 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, // If there is a current active cluster if (CurrentClusterSize > 0) { // The instruction is in the current active cluster - if (CurrentClusterVector->GetBit(inst->GetNum())) { + if (ActiveClusterGroup == inst->GetClusterGroup()) { // Case 1: Currently clustering and this current instruction is part // of the cluster CurrentClusterSize++; @@ -486,19 +486,18 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, if (LastCluster) { PastClustersList.push_back(std::move(LastCluster)); LastCluster = llvm::make_unique( - CurrentClusterVector, CurrentClusterSize, inst->GetNum()); + ActiveClusterGroup, CurrentClusterSize, inst->GetNum()); } else LastCluster = llvm::make_unique( - CurrentClusterVector, CurrentClusterSize, inst->GetNum()); - CurrentClusterVector.reset(); - CurrentClusterVector = inst->GetClusterVector(); + ActiveClusterGroup, CurrentClusterSize, inst->GetNum()); + + ActiveClusterGroup = inst->GetClusterGroup(); CurrentClusterSize = 1; } } else { // Case 3: Not currently clustering. 
Initialize clustering - CurrentClusterVector.reset(); // Clear cluster vector - CurrentClusterVector = inst->GetClusterVector(); // Set active cluster - CurrentClusterSize = 1; // Current size is 1 + ActiveClusterGroup = inst->GetClusterGroup(); + CurrentClusterSize = 1; } } else if (CurrentClusterSize > 1) { Logger::Info("Inst %d pushing cluster size %d onto the stack", @@ -511,11 +510,11 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, // Current previous cluster LastCluster = llvm::make_unique( - CurrentClusterVector, CurrentClusterSize, inst->GetNum()); + ActiveClusterGroup, CurrentClusterSize, inst->GetNum()); } else LastCluster = llvm::make_unique( - CurrentClusterVector, CurrentClusterSize, inst->GetNum()); - CurrentClusterVector.reset(); // Reset active cluster + ActiveClusterGroup, CurrentClusterSize, inst->GetNum()); + ActiveClusterGroup = 0; // Reset active cluster CurrentClusterSize = 0; // Set cluster size to 0 } } @@ -731,7 +730,7 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst) { // Can/should we use a stack to restore state? // 3.) Non-Cluster <- Cluster // Simple case, just decrement 1 from cluster size - // If cluster size == 0, delete CurrentClusterVector + // If cluster size == 0, set ActiveClusterGroup = 0; if (isSecondPass && ClusterMemoryOperations) { // TODO: Check for different cluster to different cluster // backtracking. 
@@ -747,14 +746,14 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst) { // If there is no more member in the currently active cluster then disable // the cluster if (CurrentClusterSize == 0) { - CurrentClusterVector.reset(); + ActiveClusterGroup = 0; // If there was a previously active cluster, check last cluster to see // if we need to restore the state if (LastCluster) { if (LastCluster->InstNum == inst->GetNum()) { CurrentClusterSize = LastCluster->ClusterSize; - CurrentClusterVector = LastCluster->ClusterVector; + ActiveClusterGroup = LastCluster->ClusterGroup; LastCluster.reset(); // Release current cluster pointer // Get previous cluster from vector list @@ -771,7 +770,7 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst) { // this instruction ended the cluster then restore the previous // cluster's state CurrentClusterSize = LastCluster->ClusterSize; - CurrentClusterVector = LastCluster->ClusterVector; + ActiveClusterGroup = LastCluster->ClusterGroup; LastCluster.reset(); // Release current cluster pointer // Get previous cluster from vector list diff --git a/lib/Scheduler/sched_basic_data.cpp b/lib/Scheduler/sched_basic_data.cpp index f74b605f..536cd99c 100644 --- a/lib/Scheduler/sched_basic_data.cpp +++ b/lib/Scheduler/sched_basic_data.cpp @@ -6,7 +6,6 @@ using namespace llvm::opt_sched; SchedInstruction::SchedInstruction(InstCount num, const string &name, InstType instType, const string &opCode, - /* bool InstrMayLoad, bool InstrMayStore,*/ InstCount maxInstCnt, int nodeID, InstCount fileSchedOrder, InstCount fileSchedCycle, InstCount fileLB, @@ -16,8 +15,8 @@ SchedInstruction::SchedInstruction(InstCount num, const string &name, name_ = name; opCode_ = opCode; instType_ = instType; - // MayLoad = InstrMayLoad; - // MayStore = InstrMayStore; + ClusterGroup = 0; + ActiveCluster = 0; MayCluster = false; frwrdLwrBound_ = INVALID_VALUE; @@ -742,15 +741,13 @@ int16_t SchedInstruction::CmputLastUseCnt() { return 
lastUseCnt_; } -void SchedInstruction::SetMayCluster(std::shared_ptr PossibleClustersVector) { - if (PossibleClustersVector->GetOneCnt() > 0) { - PossibleClusturesBitVector = PossibleClustersVector; +void SchedInstruction::SetMayCluster(int ClusteringGroup) { + if (ClusteringGroup > 0) { + ClusterGroup = ClusteringGroup; MayCluster = true; } } -std::shared_ptr SchedInstruction::GetClusterVector() { return PossibleClusturesBitVector; } - /****************************************************************************** * SchedRange * ******************************************************************************/ diff --git a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp index 247294e9..cdbd8d1a 100644 --- a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp +++ b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp @@ -75,6 +75,8 @@ OptSchedDDGWrapperBasic::OptSchedDDGWrapperBasic( if (ShouldFilterRegisterTypes) RTFilter = createLLVMRegTypeFilter(MM, DAG->TRI, DAG->getRegPressure().MaxSetPressure); + + ClusterCount = 0; } void OptSchedDDGWrapperBasic::convertSUnits() { @@ -380,8 +382,6 @@ inline void OptSchedDDGWrapperBasic::setupRoot() { int RootNum = DAG->SUnits.size(); root_ = CreateNode_(RootNum, "artificial", MM->GetInstTypeByName("artificial"), "__optsched_entry", - // mayLoad = false; - // mayStore = false; RootNum, // nodeID RootNum, // fileSchedOrder RootNum, // fileSchedCycle @@ -400,8 +400,6 @@ inline void OptSchedDDGWrapperBasic::setupLeaf() { int LeafNum = DAG->SUnits.size() + 1; CreateNode_(LeafNum, "artificial", MM->GetInstTypeByName("artificial"), "__optsched_exit", - // mayLoad = false; - // mayStore = false; LeafNum, // nodeID LeafNum, // fileSchedOrder LeafNum, // fileSchedCycle @@ -475,8 +473,6 @@ void OptSchedDDGWrapperBasic::convertSUnit(const SUnit &SU) { } CreateNode_(SU.NodeNum, InstName.c_str(), InstType, InstName.c_str(), - // MI->mayLoad() - // MI->mayStore() SU.NodeNum, // nodeID SU.NodeNum, // fileSchedOrder SU.NodeNum, // 
fileSchedCycle @@ -515,8 +511,9 @@ void OptSchedDDGWrapperBasic::countBoundaryLiveness( void OptSchedDDGWrapperBasic::clusterNeighboringMemOps_( ArrayRef MemOps) { SmallVector MemOpRecords; + bool ClusterPossible = false; + LLVM_DEBUG(dbgs() << "Processing possible clusters\n"); - for (const SUnit *SU : MemOps) { LLVM_DEBUG(dbgs() << " " << SU->NodeNum << " is in the chain.\n"); MachineOperand *BaseOp; @@ -530,8 +527,6 @@ void OptSchedDDGWrapperBasic::clusterNeighboringMemOps_( return; } - auto ClusterVector = std::make_shared(DAG->SUnits.size()); - llvm::sort(MemOpRecords); unsigned ClusterLength = 1; for (unsigned Idx = 0, End = MemOpRecords.size(); Idx < (End - 1); ++Idx) { @@ -542,11 +537,19 @@ void OptSchedDDGWrapperBasic::clusterNeighboringMemOps_( *MemOpRecords[Idx + 1].BaseOp, ClusterLength)) { LLVM_DEBUG(dbgs() << " Cluster possible at SU(" << SUa->NodeNum << ")- SU(" << SUb->NodeNum << ")\n"); + + // If clustering was possible then increase the cluster count. This only + // happens once every cluster + if (!ClusterPossible) { + ClusterPossible = true; + ClusterCount++; + } + + // Tell the instructions what cluster number they are in + insts_[SUa->NodeNum]->SetMayCluster(ClusterCount); + insts_[SUb->NodeNum]->SetMayCluster(ClusterCount); + ++ClusterLength; - ClusterVector->SetBit(SUa->NodeNum); - ClusterVector->SetBit(SUb->NodeNum); - insts_[SUa->NodeNum]->SetMayCluster(ClusterVector); - insts_[SUb->NodeNum]->SetMayCluster(ClusterVector); } else ClusterLength = 1; } diff --git a/lib/Wrapper/OptSchedDDGWrapperBasic.h b/lib/Wrapper/OptSchedDDGWrapperBasic.h index 9970fab9..76d5d7ea 100644 --- a/lib/Wrapper/OptSchedDDGWrapperBasic.h +++ b/lib/Wrapper/OptSchedDDGWrapperBasic.h @@ -146,6 +146,9 @@ class OptSchedDDGWrapperBasic : public DataDepGraph { std::vector consumers; }; + /// Count of the total clusters possible + int ClusterCount; + // Copied from // https://github.com/RadeonOpenCompute/llvm/blob/roc-ocl-2.4.0/lib/CodeGen/MachineScheduler.cpp#L1467 
struct MemOpInfo { From 298fb0fe2d063bcaa2ae99c5a015f84c6a17ca59 Mon Sep 17 00:00:00 2001 From: vang thao Date: Wed, 18 Mar 2020 00:01:26 -0700 Subject: [PATCH 15/40] Fix error with static variable. --- include/opt-sched/Scheduler/sched_basic_data.h | 2 +- lib/Scheduler/sched_basic_data.cpp | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/include/opt-sched/Scheduler/sched_basic_data.h b/include/opt-sched/Scheduler/sched_basic_data.h index 9a6d631b..3a737d8e 100644 --- a/include/opt-sched/Scheduler/sched_basic_data.h +++ b/include/opt-sched/Scheduler/sched_basic_data.h @@ -431,7 +431,7 @@ class SchedInstruction : public GraphNode { bool GetMayCluster() { return MayCluster; } int GetClusterGroup() { return ClusterGroup; } static int GetActiveCluster() { return ActiveCluster; } - static int SetActiveCluster(int Active) { ActiveCluster = Active; } + static void SetActiveCluster(int Active) { ActiveCluster = Active; } friend class SchedRange; protected: diff --git a/lib/Scheduler/sched_basic_data.cpp b/lib/Scheduler/sched_basic_data.cpp index 536cd99c..2fa5f09d 100644 --- a/lib/Scheduler/sched_basic_data.cpp +++ b/lib/Scheduler/sched_basic_data.cpp @@ -4,6 +4,8 @@ using namespace llvm::opt_sched; +int SchedInstruction::ActiveCluster = 0; + SchedInstruction::SchedInstruction(InstCount num, const string &name, InstType instType, const string &opCode, InstCount maxInstCnt, int nodeID, @@ -16,7 +18,6 @@ SchedInstruction::SchedInstruction(InstCount num, const string &name, opCode_ = opCode; instType_ = instType; ClusterGroup = 0; - ActiveCluster = 0; MayCluster = false; frwrdLwrBound_ = INVALID_VALUE; From 30c7d9cf598d3c7fe7b1b34cade690d02839f284 Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Wed, 18 Mar 2020 17:44:02 -0700 Subject: [PATCH 16/40] Added MEM heuristic priority. Not yet implemented. 
--- include/opt-sched/Scheduler/data_dep.h | 5 +++ .../opt-sched/Scheduler/sched_basic_data.h | 5 ++- lib/Scheduler/bb_spill.cpp | 42 ++++++++++++------- lib/Scheduler/data_dep.cpp | 2 + lib/Scheduler/ready_list.cpp | 20 +++++++++ lib/Wrapper/OptSchedDDGWrapperBasic.cpp | 2 + lib/Wrapper/OptimizingScheduler.cpp | 8 ++-- 7 files changed, 64 insertions(+), 20 deletions(-) diff --git a/include/opt-sched/Scheduler/data_dep.h b/include/opt-sched/Scheduler/data_dep.h index d0885fd0..803574c9 100644 --- a/include/opt-sched/Scheduler/data_dep.h +++ b/include/opt-sched/Scheduler/data_dep.h @@ -291,7 +291,12 @@ class DataDepGraph : public llvm::opt_sched::OptSchedDDGWrapperBase, RegisterFile *getRegFiles() { return RegFiles.get(); } + int getMaxClusterCount() { return MaxClusterCount; } + void setMaxClusterCount(int Max) { MaxClusterCount = Max; } + protected: + int MaxClusterCount; + // TODO(max): Get rid of this. // Number of basic blocks int32_t bscBlkCnt_; diff --git a/include/opt-sched/Scheduler/sched_basic_data.h b/include/opt-sched/Scheduler/sched_basic_data.h index 3a737d8e..cdfad226 100644 --- a/include/opt-sched/Scheduler/sched_basic_data.h +++ b/include/opt-sched/Scheduler/sched_basic_data.h @@ -51,7 +51,10 @@ enum LISTSCHED_HEURISTIC { LSH_LS = 7, // LLVM list scheduler order - LSH_LLVM = 8 + LSH_LLVM = 8, + + // Memory clustering + LSH_MEM = 9 }; #define MAX_SCHED_PRIRTS 10 diff --git a/lib/Scheduler/bb_spill.cpp b/lib/Scheduler/bb_spill.cpp index 4c14eb78..ebad0f9e 100644 --- a/lib/Scheduler/bb_spill.cpp +++ b/lib/Scheduler/bb_spill.cpp @@ -492,32 +492,43 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, ActiveClusterGroup, CurrentClusterSize, inst->GetNum()); ActiveClusterGroup = inst->GetClusterGroup(); + inst->SetActiveCluster(ActiveClusterGroup); CurrentClusterSize = 1; } } else { // Case 3: Not currently clustering. 
Initialize clustering ActiveClusterGroup = inst->GetClusterGroup(); + inst->SetActiveCluster(ActiveClusterGroup); CurrentClusterSize = 1; } - } else if (CurrentClusterSize > 1) { - Logger::Info("Inst %d pushing cluster size %d onto the stack", - inst->GetNum(), CurrentClusterSize); + } else if (CurrentClusterSize > 0) { // Case 2: Exiting out of an active cluster - // Save the cluster to restore when backtracking. - if (LastCluster) { - // List of previous clusters - PastClustersList.push_back(std::move(LastCluster)); - - // Current previous cluster - LastCluster = llvm::make_unique( - ActiveClusterGroup, CurrentClusterSize, inst->GetNum()); - } else - LastCluster = llvm::make_unique( - ActiveClusterGroup, CurrentClusterSize, inst->GetNum()); + // Only save the state if we cluster 2 or more instructions together + // already + if (CurrentClusterSize > 1) { + Logger::Info("Inst %d pushing cluster size %d onto the stack", + inst->GetNum(), CurrentClusterSize); + + // Save the cluster to restore when backtracking. + if (LastCluster) { + // Save previous current cluster in a vector + PastClustersList.push_back(std::move(LastCluster)); + + // Current cluster + LastCluster = llvm::make_unique( + ActiveClusterGroup, CurrentClusterSize, inst->GetNum()); + } else + // This is the first cluster that we are saving + LastCluster = llvm::make_unique( + ActiveClusterGroup, CurrentClusterSize, inst->GetNum()); + } + ActiveClusterGroup = 0; // Reset active cluster + inst->SetActiveCluster(0); CurrentClusterSize = 0; // Set cluster size to 0 } } + // Potential Issues: // 1. Keeping track of the average clustering size when we aren't done // scheduling. 
@@ -747,6 +758,7 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst) { // the cluster if (CurrentClusterSize == 0) { ActiveClusterGroup = 0; + inst->SetActiveCluster(0); // If there was a previously active cluster, check last cluster to see // if we need to restore the state @@ -754,6 +766,7 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst) { if (LastCluster->InstNum == inst->GetNum()) { CurrentClusterSize = LastCluster->ClusterSize; ActiveClusterGroup = LastCluster->ClusterGroup; + inst->SetActiveCluster(ActiveClusterGroup); LastCluster.reset(); // Release current cluster pointer // Get previous cluster from vector list @@ -771,6 +784,7 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst) { // cluster's state CurrentClusterSize = LastCluster->ClusterSize; ActiveClusterGroup = LastCluster->ClusterGroup; + inst->SetActiveCluster(ActiveClusterGroup); LastCluster.reset(); // Release current cluster pointer // Get previous cluster from vector list diff --git a/lib/Scheduler/data_dep.cpp b/lib/Scheduler/data_dep.cpp index 65d2f0b8..a6652c9f 100644 --- a/lib/Scheduler/data_dep.cpp +++ b/lib/Scheduler/data_dep.cpp @@ -197,6 +197,8 @@ DataDepGraph::DataDepGraph(MachineModel *machMdl, LATENCY_PRECISION ltncyPrcsn) exitInstCnt_ = 0; RegFiles = llvm::make_unique(machMdl_->GetRegTypeCnt()); + + MaxClusterCount = 0; } DataDepGraph::~DataDepGraph() { diff --git a/lib/Scheduler/ready_list.cpp b/lib/Scheduler/ready_list.cpp index 398ca5ed..553238aa 100644 --- a/lib/Scheduler/ready_list.cpp +++ b/lib/Scheduler/ready_list.cpp @@ -68,6 +68,13 @@ ReadyList::ReadyList(DataDepGraph *dataDepGraph, SchedPriorities prirts) { ltncySumBits_ = Utilities::clcltBitsNeededToHoldNum(maxLtncySum_); totKeyBits += ltncySumBits_; break; + + case LSH_MEM: + Logger::Info("MEM heuristic detected"); + break; + + default: + break; } // end switch } // end for @@ -111,6 +118,13 @@ ReadyList::ReadyList(DataDepGraph *dataDepGraph, 
SchedPriorities prirts) { AddPrirtyToKey_(maxPriority_, keySize, ltncySumBits_, maxLtncySum_, maxLtncySum_); break; + + case LSH_MEM: + Logger::Info("MEM heuristic detected"); + break; + + default: + break; } } } @@ -190,6 +204,12 @@ unsigned long ReadyList::CmputKey_(SchedInstruction *inst, bool isUpdate, AddPrirtyToKey_(key, keySize, ltncySumBits_, inst->GetLtncySum(), maxLtncySum_); break; + + case LSH_MEM: + break; + + default: + break; } } return key; diff --git a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp index cdbd8d1a..6d50fc8f 100644 --- a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp +++ b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp @@ -543,6 +543,8 @@ void OptSchedDDGWrapperBasic::clusterNeighboringMemOps_( if (!ClusterPossible) { ClusterPossible = true; ClusterCount++; + setMaxClusterCount(ClusterCount); + Logger::Info("Setting max cluster count to %d", ClusterCount); } // Tell the instructions what cluster number they are in diff --git a/lib/Wrapper/OptimizingScheduler.cpp b/lib/Wrapper/OptimizingScheduler.cpp index ca383ac6..08a12466 100644 --- a/lib/Wrapper/OptimizingScheduler.cpp +++ b/lib/Wrapper/OptimizingScheduler.cpp @@ -47,11 +47,9 @@ constexpr struct { const char* Name; LISTSCHED_HEURISTIC HID; } HeuristicNames[] = { - {"CP", LSH_CP}, {"LUC", LSH_LUC}, - {"UC", LSH_UC}, {"NID", LSH_NID}, - {"CPR", LSH_CPR}, {"ISO", LSH_ISO}, - {"SC", LSH_SC}, {"LS", LSH_LS}, - {"LLVM", LSH_LLVM} + {"CP", LSH_CP}, {"LUC", LSH_LUC}, {"UC", LSH_UC}, {"NID", LSH_NID}, + {"CPR", LSH_CPR}, {"ISO", LSH_ISO}, {"SC", LSH_SC}, {"LS", LSH_LS}, + {"LLVM", LSH_LLVM}, {"MEM", LSH_MEM} }; // Default path to the the configuration directory for opt-sched. From d71246088d33d4a583281d0cdd485e09f89a382c Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Wed, 18 Mar 2020 18:36:47 -0700 Subject: [PATCH 17/40] ALso save state for cluster of size 1. 
--- lib/Scheduler/bb_spill.cpp | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/lib/Scheduler/bb_spill.cpp b/lib/Scheduler/bb_spill.cpp index ebad0f9e..534ef298 100644 --- a/lib/Scheduler/bb_spill.cpp +++ b/lib/Scheduler/bb_spill.cpp @@ -503,25 +503,21 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, } } else if (CurrentClusterSize > 0) { // Case 2: Exiting out of an active cluster - // Only save the state if we cluster 2 or more instructions together - // already - if (CurrentClusterSize > 1) { - Logger::Info("Inst %d pushing cluster size %d onto the stack", - inst->GetNum(), CurrentClusterSize); + Logger::Info("Inst %d pushing cluster size %d onto the stack", + inst->GetNum(), CurrentClusterSize); - // Save the cluster to restore when backtracking. - if (LastCluster) { - // Save previous current cluster in a vector - PastClustersList.push_back(std::move(LastCluster)); - - // Current cluster - LastCluster = llvm::make_unique( - ActiveClusterGroup, CurrentClusterSize, inst->GetNum()); - } else - // This is the first cluster that we are saving - LastCluster = llvm::make_unique( - ActiveClusterGroup, CurrentClusterSize, inst->GetNum()); - } + // Save the cluster to restore when backtracking. + if (LastCluster) { + // Save previous current cluster in a vector + PastClustersList.push_back(std::move(LastCluster)); + + // Current cluster + LastCluster = llvm::make_unique( + ActiveClusterGroup, CurrentClusterSize, inst->GetNum()); + } else + // This is the first cluster that we are saving + LastCluster = llvm::make_unique( + ActiveClusterGroup, CurrentClusterSize, inst->GetNum()); ActiveClusterGroup = 0; // Reset active cluster inst->SetActiveCluster(0); From 91967badf67e129063621e1351b0516fb7f7219d Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Thu, 19 Mar 2020 00:50:34 -0700 Subject: [PATCH 18/40] First implementation of MEM heuristic. 
--- example/optsched-cfg/sched.ini | 3 +++ include/opt-sched/Scheduler/ready_list.h | 1 + lib/Scheduler/bb_spill.cpp | 1 + lib/Scheduler/ready_list.cpp | 9 +++++++-- lib/Wrapper/OptimizingScheduler.cpp | 3 +++ lib/Wrapper/OptimizingScheduler.h | 2 ++ 6 files changed, 17 insertions(+), 2 deletions(-) diff --git a/example/optsched-cfg/sched.ini b/example/optsched-cfg/sched.ini index 8addb5f5..8968dbdb 100644 --- a/example/optsched-cfg/sched.ini +++ b/example/optsched-cfg/sched.ini @@ -84,6 +84,9 @@ TIMEOUT_PER INSTR # Example: LUC_CP_NID HEURISTIC LUC_CP_NID +# Same as HEURISTIC except with MEM_ prefix. +SECOND_PASS_HEURISTIC MEM_LUC_CP_NID + # The heuristic used for the enumerator. If the two pass scheduling # approach is enabled, then this value will be used for the first pass. # Same valid values as HEURISTIC. diff --git a/include/opt-sched/Scheduler/ready_list.h b/include/opt-sched/Scheduler/ready_list.h index 3c7bb1a6..054b19f1 100644 --- a/include/opt-sched/Scheduler/ready_list.h +++ b/include/opt-sched/Scheduler/ready_list.h @@ -115,6 +115,7 @@ class ReadyList { int16_t ltncySumBits_; int16_t nodeID_Bits_; int16_t inptSchedOrderBits_; + int16_t ClusterBit; // Constructs the priority-list key based on the schemes listed in prirts_. 
unsigned long CmputKey_(SchedInstruction *inst, bool isUpdate, bool &changed); diff --git a/lib/Scheduler/bb_spill.cpp b/lib/Scheduler/bb_spill.cpp index 534ef298..f774ce03 100644 --- a/lib/Scheduler/bb_spill.cpp +++ b/lib/Scheduler/bb_spill.cpp @@ -330,6 +330,7 @@ InstCount BBWithSpill::CmputCostLwrBound() { void BBWithSpill::InitForSchdulng() { InitForCostCmputtn_(); + SchedInstruction::SetActiveCluster(0); CurrentClusterSize = 0; ActiveClusterGroup = 0; ClusterInitialCost = 1000000; diff --git a/lib/Scheduler/ready_list.cpp b/lib/Scheduler/ready_list.cpp index 553238aa..421e7034 100644 --- a/lib/Scheduler/ready_list.cpp +++ b/lib/Scheduler/ready_list.cpp @@ -70,7 +70,8 @@ ReadyList::ReadyList(DataDepGraph *dataDepGraph, SchedPriorities prirts) { break; case LSH_MEM: - Logger::Info("MEM heuristic detected"); + ClusterBit = Utilities::clcltBitsNeededToHoldNum(1); + totKeyBits += ClusterBit break; default: @@ -120,7 +121,7 @@ ReadyList::ReadyList(DataDepGraph *dataDepGraph, SchedPriorities prirts) { break; case LSH_MEM: - Logger::Info("MEM heuristic detected"); + AddPrirtyToKey_(maxPriority_, keySize, ClusterBit, 1, 1); break; default: @@ -206,6 +207,10 @@ unsigned long ReadyList::CmputKey_(SchedInstruction *inst, bool isUpdate, break; case LSH_MEM: + unsigned long ValueForKey = + inst->GetClusterGroup() == SchedInstruction::GetActiveCluster() ? 
1 + : 0; + AddPrirtyToKey_(key, keySize, ClusterBit, ValueForKey, 1); break; default: diff --git a/lib/Wrapper/OptimizingScheduler.cpp b/lib/Wrapper/OptimizingScheduler.cpp index 08a12466..e8edbdee 100644 --- a/lib/Wrapper/OptimizingScheduler.cpp +++ b/lib/Wrapper/OptimizingScheduler.cpp @@ -556,6 +556,8 @@ void ScheduleDAGOptSched::loadOptSchedConfig() { LowerBoundAlgorithm = parseLowerBoundAlgorithm(); HeuristicPriorities = parseHeuristic(schedIni.GetString("HEURISTIC")); EnumPriorities = parseHeuristic(schedIni.GetString("ENUM_HEURISTIC")); + SecondPassPriorities = + parseHeuristic(schedIni.GetString("SECOND_PASS_HEURISTIC")) SecondPassEnumPriorities = parseHeuristic(schedIni.GetString("SECOND_PASS_ENUM_HEURISTIC")); SCF = parseSpillCostFunc(); @@ -818,6 +820,7 @@ void ScheduleDAGOptSched::scheduleOptSchedBalanced() { // Set the heuristic for the enumerator in the second pass. EnumPriorities = SecondPassEnumPriorities; + HeuristicPriorities = SecondPassPriorities; // Force the input to the balanced scheduler to be the sequential order of the // (hopefully) good register pressure schedule. We don’t want the list diff --git a/lib/Wrapper/OptimizingScheduler.h b/lib/Wrapper/OptimizingScheduler.h index 13b92e7d..784c0681 100644 --- a/lib/Wrapper/OptimizingScheduler.h +++ b/lib/Wrapper/OptimizingScheduler.h @@ -158,6 +158,8 @@ class ScheduleDAGOptSched : public ScheduleDAGMILive { // The heuristic used for the enumerator. SchedPriorities EnumPriorities; + SchedPriorities SecondPassPriorities; + // The heuristic used for the second pass enumerator in the two-pass scheduling approach. 
SchedPriorities SecondPassEnumPriorities; From ed248f094568d30a77fe00122271a89233736075 Mon Sep 17 00:00:00 2001 From: vang thao Date: Thu, 19 Mar 2020 23:05:55 -0700 Subject: [PATCH 19/40] Print out ready list and changes to linked list (Vlad) --- include/opt-sched/Scheduler/lnkd_lst.h | 70 ++++++++++++++++++-------- lib/Scheduler/bb_spill.cpp | 3 +- lib/Scheduler/enumerator.cpp | 10 ++-- lib/Scheduler/list_sched.cpp | 2 + lib/Scheduler/ready_list.cpp | 9 ++-- lib/Wrapper/OptimizingScheduler.cpp | 2 +- 6 files changed, 63 insertions(+), 33 deletions(-) diff --git a/include/opt-sched/Scheduler/lnkd_lst.h b/include/opt-sched/Scheduler/lnkd_lst.h index ee553398..3c311f9d 100644 --- a/include/opt-sched/Scheduler/lnkd_lst.h +++ b/include/opt-sched/Scheduler/lnkd_lst.h @@ -571,43 +571,69 @@ inline T *PriorityList::GetNxtPriorityElmnt(K &key) { } } +//(Vlad) added functionality to decrease priority +//used for decreasing priority of clusterable instrs +//when leaving a cluster template void PriorityList::BoostEntry(KeyedEntry *entry, K newKey) { KeyedEntry *crnt; KeyedEntry *next = entry->GetNext(); KeyedEntry *prev = entry->GetPrev(); - assert(newKey > entry->key); assert(LinkedList::topEntry_ != NULL); - entry->key = newKey; + if (entry->key < newKey) //behave normally + { + entry->key = newKey; - // If it is already at the top, or its previous still has a larger key, - // then the entry is already in place and no boosting is needed - if (entry == LinkedList::topEntry_ || prev->key >= newKey) - return; + // If it is already at the top, or its previous still has a larger key, + // then the entry is already in place and no boosting is needed + if (entry == LinkedList::topEntry_ || prev->key >= newKey) + return; - prev = NULL; + prev = NULL; - for (crnt = entry->GetPrev(); crnt != NULL; crnt = crnt->GetPrev()) { - if (crnt->key >= newKey) { - assert(crnt != entry); - assert(crnt != entry->GetPrev()); - prev = crnt; - break; + for (crnt = entry->GetPrev(); crnt != NULL; 
crnt = crnt->GetPrev()) { + if (crnt->key >= newKey) { + assert(crnt != entry); + assert(crnt != entry->GetPrev()); + prev = crnt; + break; + } } - } - if (prev == NULL) { - next = (KeyedEntry *)LinkedList::topEntry_; - } else { - next = prev->GetNext(); - assert(next != NULL); + if (prev == NULL) { + next = (KeyedEntry *)LinkedList::topEntry_; + } else { + next = prev->GetNext(); + assert(next != NULL); + } + + assert(next != entry->GetNext()); + LinkedList::RmvEntry_(entry, false); + InsrtEntry_(entry, next); } + else //move entry down on priority list + { + entry->key = newKey; + + //if it is at the bottom or next entry still has a smaller key, + //then the entry is already in place + if (entry == LinkedList::bottomEntry_ || next->key <= newKey) + return; - assert(next != entry->GetNext()); - LinkedList::RmvEntry_(entry, false); - InsrtEntry_(entry, next); + for (crnt = entry->GetNext(); crnt != NULL; crnt = crnt->GetNext()) + { + if (crnt->key <= newKey) + { + next = crnt; + break; + } + } + + LinkedList::RmvEntry_(entry, false); + InsrtEntry_(entry, next); + } this->itrtrReset_ = true; } diff --git a/lib/Scheduler/bb_spill.cpp b/lib/Scheduler/bb_spill.cpp index f774ce03..66b569b8 100644 --- a/lib/Scheduler/bb_spill.cpp +++ b/lib/Scheduler/bb_spill.cpp @@ -525,7 +525,7 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, CurrentClusterSize = 0; // Set cluster size to 0 } } - + Logger::Info("Currently active cluster %d", ActiveClusterGroup); // Potential Issues: // 1. Keeping track of the average clustering size when we aren't done // scheduling. 
@@ -794,6 +794,7 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst) { } } } + Logger::Info("Currently active cluster %d", ActiveClusterGroup); defCnt = inst->GetDefs(defs); useCnt = inst->GetUses(uses); diff --git a/lib/Scheduler/enumerator.cpp b/lib/Scheduler/enumerator.cpp index 39019034..728df961 100644 --- a/lib/Scheduler/enumerator.cpp +++ b/lib/Scheduler/enumerator.cpp @@ -971,13 +971,13 @@ bool Enumerator::FindNxtFsblBrnch_(EnumTreeNode *&newNode) { assert(crntNode_->IsLeaf() || (brnchCnt != rdyInstCnt) ? 1 : rdyInstCnt); // brnchCnt == rdyInstCnt == 0 ? 1 : rdyInstCnt); -#ifdef IS_DEBUG_READY_LIST - Logger::Info("Ready List Size is %d", rdyInstCnt); +//#ifdef IS_DEBUG_READY_LIST +// Logger::Info("Ready List Size is %d", rdyInstCnt); // Warning! That will reset the instruction iterator! - // rdyLst_->Print(Logger::GetLogStream()); + rdyLst_->Print(Logger::GetLogStream()); - stats::maxReadyListSize.SetMax(rdyInstCnt); -#endif +// stats::maxReadyListSize.SetMax(rdyInstCnt); +//#endif if (crntBrnchNum == 0 && SchedForRPOnly_) crntNode_->SetFoundInstWithUse(IsUseInRdyLst_()); diff --git a/lib/Scheduler/list_sched.cpp b/lib/Scheduler/list_sched.cpp index 9bf96951..f737def5 100644 --- a/lib/Scheduler/list_sched.cpp +++ b/lib/Scheduler/list_sched.cpp @@ -40,6 +40,8 @@ FUNC_RESULT ListScheduler::FindSchedule(InstSchedule *sched, SchedRegion *rgn) { while (!IsSchedComplete_()) { UpdtRdyLst_(crntCycleNum_, crntSlotNum_); rdyLst_->ResetIterator(); + rdyLst_->Print(Logger::GetLogStream()); + rdyLst_->ResetIterator(); iterCnt++; rdyLstSize = rdyLst_->GetInstCnt(); diff --git a/lib/Scheduler/ready_list.cpp b/lib/Scheduler/ready_list.cpp index 421e7034..0e4bcd0b 100644 --- a/lib/Scheduler/ready_list.cpp +++ b/lib/Scheduler/ready_list.cpp @@ -71,7 +71,7 @@ ReadyList::ReadyList(DataDepGraph *dataDepGraph, SchedPriorities prirts) { case LSH_MEM: ClusterBit = Utilities::clcltBitsNeededToHoldNum(1); - totKeyBits += ClusterBit + totKeyBits += ClusterBit; 
break; default: @@ -158,6 +158,7 @@ unsigned long ReadyList::CmputKey_(SchedInstruction *inst, bool isUpdate, int16_t keySize = 0; int i; int16_t oldLastUseCnt, newLastUseCnt; + unsigned long ValueForKey; changed = true; if (isUpdate) changed = false; @@ -207,7 +208,7 @@ unsigned long ReadyList::CmputKey_(SchedInstruction *inst, bool isUpdate, break; case LSH_MEM: - unsigned long ValueForKey = + ValueForKey = inst->GetClusterGroup() == SchedInstruction::GetActiveCluster() ? 1 : 0; AddPrirtyToKey_(key, keySize, ClusterBit, ValueForKey, 1); @@ -232,9 +233,9 @@ void ReadyList::AddLatestSubLists(LinkedList *lst1, void ReadyList::Print(std::ostream &out) { out << "Ready List: "; - for (const auto *crntInst = prirtyLst_->GetFrstElmnt(); crntInst != NULL; + for (auto *crntInst = prirtyLst_->GetFrstElmnt(); crntInst != NULL; crntInst = prirtyLst_->GetNxtElmnt()) { - out << " " << crntInst->GetNum(); + out << " " << crntInst->GetNum() << "(" << crntInst->GetClusterGroup() << ")"; } out << '\n'; diff --git a/lib/Wrapper/OptimizingScheduler.cpp b/lib/Wrapper/OptimizingScheduler.cpp index e8edbdee..a984469d 100644 --- a/lib/Wrapper/OptimizingScheduler.cpp +++ b/lib/Wrapper/OptimizingScheduler.cpp @@ -557,7 +557,7 @@ void ScheduleDAGOptSched::loadOptSchedConfig() { HeuristicPriorities = parseHeuristic(schedIni.GetString("HEURISTIC")); EnumPriorities = parseHeuristic(schedIni.GetString("ENUM_HEURISTIC")); SecondPassPriorities = - parseHeuristic(schedIni.GetString("SECOND_PASS_HEURISTIC")) + parseHeuristic(schedIni.GetString("SECOND_PASS_HEURISTIC")); SecondPassEnumPriorities = parseHeuristic(schedIni.GetString("SECOND_PASS_ENUM_HEURISTIC")); SCF = parseSpillCostFunc(); From 366405768ea81f02646c3f118836d4367ffa17e0 Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Fri, 20 Mar 2020 09:28:06 -0700 Subject: [PATCH 20/40] Extract more information about each cluster to be later used in lower bound estimation. 
--- include/opt-sched/Scheduler/data_dep.h | 9 +++++++++ lib/Scheduler/data_dep.cpp | 6 ++++++ lib/Wrapper/OptSchedDDGWrapperBasic.cpp | 23 ++++++++++++++++++----- lib/Wrapper/OptSchedDDGWrapperBasic.h | 2 +- 4 files changed, 34 insertions(+), 6 deletions(-) diff --git a/include/opt-sched/Scheduler/data_dep.h b/include/opt-sched/Scheduler/data_dep.h index 803574c9..40833f1c 100644 --- a/include/opt-sched/Scheduler/data_dep.h +++ b/include/opt-sched/Scheduler/data_dep.h @@ -13,6 +13,7 @@ Last Update: Mar. 2011 #include "opt-sched/Scheduler/buffers.h" #include "opt-sched/Scheduler/defines.h" #include "opt-sched/Scheduler/sched_basic_data.h" +#include "llvm/ADT/MapVector.h" #include "llvm/ADT/SmallVector.h" #include @@ -293,9 +294,17 @@ class DataDepGraph : public llvm::opt_sched::OptSchedDDGWrapperBase, int getMaxClusterCount() { return MaxClusterCount; } void setMaxClusterCount(int Max) { MaxClusterCount = Max; } + int getMaxInstructionsInAllClusters() { return MaxInstructionsInAllClusters; } + void setMaxInstructionsInAllClusters(int Max) { + MaxInstructionsInAllClusters = Max; + } + + int getMaxInstructionsInCluster(int Cluster); protected: int MaxClusterCount; + int MaxInstructionsInAllClusters; + MapVector MaxInstructionsInEachClusters; // TODO(max): Get rid of this. 
// Number of basic blocks diff --git a/lib/Scheduler/data_dep.cpp b/lib/Scheduler/data_dep.cpp index a6652c9f..c42bcafe 100644 --- a/lib/Scheduler/data_dep.cpp +++ b/lib/Scheduler/data_dep.cpp @@ -199,6 +199,7 @@ DataDepGraph::DataDepGraph(MachineModel *machMdl, LATENCY_PRECISION ltncyPrcsn) RegFiles = llvm::make_unique(machMdl_->GetRegTypeCnt()); MaxClusterCount = 0; + MaxInstructionsInClusters = 0; } DataDepGraph::~DataDepGraph() { @@ -213,6 +214,11 @@ DataDepGraph::~DataDepGraph() { delete[] instCntPerType_; } +int DataDepGraph::getMaxInstructionsInCluster(int Cluster) { + assert(Cluster > 0); + return MaxInstructionsInEachClusters[Cluster]; +} + FUNC_RESULT DataDepGraph::SetupForSchdulng(bool cmputTrnstvClsr) { assert(wasSetupForSchduling_ == false); diff --git a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp index 6d50fc8f..98f7c9d5 100644 --- a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp +++ b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp @@ -508,10 +508,11 @@ void OptSchedDDGWrapperBasic::countBoundaryLiveness( /// Partially copied from /// https://github.com/RadeonOpenCompute/llvm/blob/roc-ocl-2.4.0/lib/CodeGen/MachineScheduler.cpp#L1554 -void OptSchedDDGWrapperBasic::clusterNeighboringMemOps_( +int OptSchedDDGWrapperBasic::clusterNeighboringMemOps_( ArrayRef MemOps) { SmallVector MemOpRecords; bool ClusterPossible = false; + int TotalInstructionsPossible = 0; LLVM_DEBUG(dbgs() << "Processing possible clusters\n"); for (const SUnit *SU : MemOps) { @@ -524,7 +525,7 @@ void OptSchedDDGWrapperBasic::clusterNeighboringMemOps_( if (MemOpRecords.size() < 2) { LLVM_DEBUG(dbgs() << " Unable to cluster memop cluster of 1.\n"); - return; + return 0; } llvm::sort(MemOpRecords); @@ -548,8 +549,15 @@ void OptSchedDDGWrapperBasic::clusterNeighboringMemOps_( } // Tell the instructions what cluster number they are in - insts_[SUa->NodeNum]->SetMayCluster(ClusterCount); - insts_[SUb->NodeNum]->SetMayCluster(ClusterCount); + if 
(insts_[SUa->NodeNum]->GetClusterGroup() == 0) { + insts_[SUa->NodeNum]->SetMayCluster(ClusterCount); + TotalInstructionsPossible++; + } + + if (insts_[SUb->NodeNum]->GetClusterGroup() == 0) { + insts_[SUb->NodeNum]->SetMayCluster(ClusterCount); + TotalInstructionsPossible++; + } ++ClusterLength; } else @@ -565,6 +573,8 @@ void OptSchedDDGWrapperBasic::clusterNeighboringMemOps_( } LLVM_DEBUG(dbgs() << '\n'); #endif + MaxInstructionsInEachClusters.insert(ClusterCount, TotalInstructionsPossible); + return TotalInstructionsPossible; } /// Iterate through SUnits and find all possible clustering then transfer @@ -574,6 +584,7 @@ void OptSchedDDGWrapperBasic::findPossibleClusters() { // TODO: Add For-loop to also do store clusters. Currently only does load // clusters bool IsLoad = true; + int TotalInstructionsPossible = 0; LLVM_DEBUG(dbgs() << "Looking for load clusters\n"); DenseMap StoreChainIDs; @@ -614,8 +625,10 @@ void OptSchedDDGWrapperBasic::findPossibleClusters() { LLVM_DEBUG(dbgs() << SU1->NodeNum << " "); LLVM_DEBUG(dbgs() << '\n'); #endif - clusterNeighboringMemOps_(SCD); + TotalInstructionsPossible += clusterNeighboringMemOps_(SCD); } + + setMaxInstructionsInClusters(TotalInstructionsPossible); } LLVMRegTypeFilter::LLVMRegTypeFilter( diff --git a/lib/Wrapper/OptSchedDDGWrapperBasic.h b/lib/Wrapper/OptSchedDDGWrapperBasic.h index 76d5d7ea..4fd3937b 100644 --- a/lib/Wrapper/OptSchedDDGWrapperBasic.h +++ b/lib/Wrapper/OptSchedDDGWrapperBasic.h @@ -135,7 +135,7 @@ class OptSchedDDGWrapperBasic : public DataDepGraph { // Find liveness info generated by the region boundary. void discoverBoundaryLiveness(const llvm::MachineInstr *MI); - void clusterNeighboringMemOps_( + int clusterNeighboringMemOps_( ArrayRef MemOps); // Holds a register live range, mapping a producer to a set of consumers. 
From b8e4ac597da12d15591955d9bc19e470cc095e96 Mon Sep 17 00:00:00 2001 From: vang thao Date: Fri, 20 Mar 2020 09:34:44 -0700 Subject: [PATCH 21/40] Error fixes --- lib/Scheduler/data_dep.cpp | 2 +- lib/Wrapper/OptSchedDDGWrapperBasic.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/Scheduler/data_dep.cpp b/lib/Scheduler/data_dep.cpp index c42bcafe..005bec55 100644 --- a/lib/Scheduler/data_dep.cpp +++ b/lib/Scheduler/data_dep.cpp @@ -199,7 +199,7 @@ DataDepGraph::DataDepGraph(MachineModel *machMdl, LATENCY_PRECISION ltncyPrcsn) RegFiles = llvm::make_unique(machMdl_->GetRegTypeCnt()); MaxClusterCount = 0; - MaxInstructionsInClusters = 0; + MaxInstructionsInAllClusters = 0; } DataDepGraph::~DataDepGraph() { diff --git a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp index 98f7c9d5..875bd5a4 100644 --- a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp +++ b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp @@ -573,7 +573,7 @@ int OptSchedDDGWrapperBasic::clusterNeighboringMemOps_( } LLVM_DEBUG(dbgs() << '\n'); #endif - MaxInstructionsInEachClusters.insert(ClusterCount, TotalInstructionsPossible); + MaxInstructionsInEachClusters.insert(std::make_pair(ClusterCount, TotalInstructionsPossible)); return TotalInstructionsPossible; } @@ -628,7 +628,7 @@ void OptSchedDDGWrapperBasic::findPossibleClusters() { TotalInstructionsPossible += clusterNeighboringMemOps_(SCD); } - setMaxInstructionsInClusters(TotalInstructionsPossible); + setMaxInstructionsInAllClusters(TotalInstructionsPossible); } LLVMRegTypeFilter::LLVMRegTypeFilter( From b519e2532a615aefe0dc2bc7b442b9db53aeb392 Mon Sep 17 00:00:00 2001 From: vang thao Date: Fri, 20 Mar 2020 13:47:02 -0700 Subject: [PATCH 22/40] First implementation of cost function --- example/optsched-cfg/sched.ini | 2 + include/opt-sched/Scheduler/bb_spill.h | 7 ++ lib/Scheduler/bb_spill.cpp | 96 ++++++++++++++++++-------- lib/Scheduler/enumerator.cpp | 30 ++++---- lib/Scheduler/list_sched.cpp | 2 - 
lib/Wrapper/OptimizingScheduler.cpp | 10 +++ 6 files changed, 100 insertions(+), 47 deletions(-) diff --git a/example/optsched-cfg/sched.ini b/example/optsched-cfg/sched.ini index 8968dbdb..d9b45132 100644 --- a/example/optsched-cfg/sched.ini +++ b/example/optsched-cfg/sched.ini @@ -19,6 +19,8 @@ USE_TWO_PASS NO # NO CLUSTER_MEMORY_OPS NO +CLUSTER_WEIGHT 1000000 + # These 3 flags control which schedulers will be used. # Each one can be individually toggled. The heuristic # list scheduler or ACO must be run before the diff --git a/include/opt-sched/Scheduler/bb_spill.h b/include/opt-sched/Scheduler/bb_spill.h index 91c13f37..9ab1381f 100644 --- a/include/opt-sched/Scheduler/bb_spill.h +++ b/include/opt-sched/Scheduler/bb_spill.h @@ -13,6 +13,7 @@ Last Update: Apr. 2011 #include "opt-sched/Scheduler/OptSchedTarget.h" #include "opt-sched/Scheduler/defines.h" #include "opt-sched/Scheduler/sched_region.h" +#include "llvm/ADT/MapVector.h" #include "llvm/ADT/SmallVector.h" #include #include @@ -38,6 +39,11 @@ class BBWithSpill : public SchedRegion { /// Current cluster size unsigned int CurrentClusterSize; + MapVector InstructionsScheduledInEachCluster; + + int MaxClusterBlocks; + int CurrentClusterBlocks; + /// Current active cluster group int ActiveClusterGroup; @@ -49,6 +55,7 @@ class BBWithSpill : public SchedRegion { /// Experimental variables and values for cost adjustment int ClusteringWeight; int ClusterInitialCost; + int TotalInstructionsInClusters; /// Data struct to contain information about the previous clusters struct PastClusters { diff --git a/lib/Scheduler/bb_spill.cpp b/lib/Scheduler/bb_spill.cpp index 66b569b8..bf9c2a8c 100644 --- a/lib/Scheduler/bb_spill.cpp +++ b/lib/Scheduler/bb_spill.cpp @@ -72,13 +72,19 @@ BBWithSpill::BBWithSpill(const OptSchedTarget *OST_, DataDepGraph *dataDepGraph, CurrentClusterSize = 0; ActiveClusterGroup = 0; - ClusteringWeight = 1000; ClusterInitialCost = 1000000; PastClustersList.clear(); LastCluster = nullptr; + 
TotalInstructionsInClusters = 0; Config &schedIni = SchedulerOptions::getInstance(); ClusterMemoryOperations = schedIni.GetBool("CLUSTER_MEMORY_OPS"); + ClusteringWeight = schedIni.GetInt("CLUSTER_WEIGHT"); + MaxClusterBlocks = dataDepGraph_->getMaxClusterCount(); + CurrentClusterBlocks = MaxClusterBlocks; + for (int begin = 1; begin <= MaxClusterBlocks; begin++) { + InstructionsScheduledInEachCluster[begin] = 0; + } } /****************************************************************************/ @@ -316,6 +322,11 @@ InstCount BBWithSpill::CmputCostLwrBound() { InstCount staticLowerBound = schedLwrBound_ * schedCostFactor_ + spillCostLwrBound * SCW_; + + if (isSecondPass && ClusterMemoryOperations) { + staticLowerBound += MaxClusterBlocks * ClusteringWeight; + } + #if defined(IS_DEBUG_STATIC_LOWER_BOUND) Logger::Info( "DAG %s spillCostLB %d scFactor %d lengthLB %d lenFactor %d staticLB %d", @@ -336,6 +347,10 @@ void BBWithSpill::InitForSchdulng() { ClusterInitialCost = 1000000; PastClustersList.clear(); LastCluster.reset(); + CurrentClusterBlocks = MaxClusterBlocks; + for (int begin = 1; begin <= MaxClusterBlocks; begin++) { + InstructionsScheduledInEachCluster[begin] = 0; + } schduldEntryInstCnt_ = 0; schduldExitInstCnt_ = 0; @@ -383,12 +398,12 @@ InstCount BBWithSpill::CmputNormCost_(InstSchedule *sched, InstCount &execCost, bool trackCnflcts) { InstCount cost = CmputCost_(sched, compMode, execCost, trackCnflcts); - cost -= costLwrBound_; - execCost -= costLwrBound_; - // TODO: Implement cost function for clustering if (isSecondPass && ClusterMemoryOperations) - cost += ClusterInitialCost; + cost += CurrentClusterBlocks * ClusteringWeight; + + cost -= costLwrBound_; + execCost -= costLwrBound_; sched->SetCost(cost); sched->SetExecCost(execCost); @@ -471,17 +486,11 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, // Case 1: Currently clustering and this current instruction is part // of the cluster CurrentClusterSize++; - if 
(CurrentClusterSize > 2) { - // Only decrement the cost if we cluster at least 2 operations - // together (EXPERIMENTAL FOR NOW) - ClusterInitialCost -= ClusteringWeight; - Logger::Info("Currently clustering %d instructions together", - CurrentClusterSize); - } + InstructionsScheduledInEachCluster[ActiveClusterGroup]++; } else { - Logger::Info("Inst %d pushing cluster size %d onto the stack due to " - "cluster to cluster op", - inst->GetNum(), CurrentClusterSize); + //Logger::Info("Inst %d pushing cluster size %d onto the stack due to " + // "cluster to cluster op", + // inst->GetNum(), CurrentClusterSize); // The instruction is in another cluster that is not currently active. // Exit out of the currently active cluster into a new one. if (LastCluster) { @@ -491,21 +500,29 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, } else LastCluster = llvm::make_unique( ActiveClusterGroup, CurrentClusterSize, inst->GetNum()); + + // If this cluster did not finish then that means there have to be an extra cluster block to finish all of the instructions + // // in the cluster + if (InstructionsScheduledInEachCluster[ActiveClusterGroup] < dataDepGraph_->getMaxInstructionsInCluster(ActiveClusterGroup)) { + CurrentClusterBlocks++; + } ActiveClusterGroup = inst->GetClusterGroup(); inst->SetActiveCluster(ActiveClusterGroup); CurrentClusterSize = 1; + InstructionsScheduledInEachCluster[ActiveClusterGroup]++; } } else { // Case 3: Not currently clustering. 
Initialize clustering ActiveClusterGroup = inst->GetClusterGroup(); inst->SetActiveCluster(ActiveClusterGroup); CurrentClusterSize = 1; + InstructionsScheduledInEachCluster[ActiveClusterGroup]++; } } else if (CurrentClusterSize > 0) { // Case 2: Exiting out of an active cluster - Logger::Info("Inst %d pushing cluster size %d onto the stack", - inst->GetNum(), CurrentClusterSize); +// Logger::Info("Inst %d pushing cluster size %d onto the stack", + // inst->GetNum(), CurrentClusterSize); // Save the cluster to restore when backtracking. if (LastCluster) { @@ -520,12 +537,23 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, LastCluster = llvm::make_unique( ActiveClusterGroup, CurrentClusterSize, inst->GetNum()); + // If InstrScheduledInEachCluster != Max + // blocks++ + + // If this cluster did not finish then that means there have to be an extra cluster block to finish all of the instructions + // in the cluster + if (InstructionsScheduledInEachCluster[ActiveClusterGroup] < dataDepGraph_->getMaxInstructionsInCluster(ActiveClusterGroup)) { + CurrentClusterBlocks++; + } + + assert(InstructionsScheduledInEachCluster[ActiveClusterGroup] <= dataDepGraph_->getMaxInstructionsInCluster(ActiveClusterGroup)); + ActiveClusterGroup = 0; // Reset active cluster inst->SetActiveCluster(0); CurrentClusterSize = 0; // Set cluster size to 0 } } - Logger::Info("Currently active cluster %d", ActiveClusterGroup); +// Logger::Info("Currently active cluster %d", ActiveClusterGroup); // Potential Issues: // 1. Keeping track of the average clustering size when we aren't done // scheduling. @@ -744,12 +772,11 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst) { // backtracking. if (inst->GetMayCluster()) { // Case 1 and 3 - if (CurrentClusterSize > 2) { - ClusterInitialCost += ClusteringWeight; // Re-add the cost - } CurrentClusterSize--; - Logger::Info("Undoing an instruction from the cluster. 
Current size: %d", - CurrentClusterSize); + InstructionsScheduledInEachCluster[ActiveClusterGroup]--; + assert(InstructionsScheduledInEachCluster[ActiveClusterGroup] >= 0); + //Logger::Info("Undoing an instruction from the cluster. Current size: %d", + // CurrentClusterSize); // If there is no more member in the currently active cluster then disable // the cluster @@ -771,6 +798,10 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst) { LastCluster = std::move(PastClustersList.back()); PastClustersList.pop_back(); } + if (InstructionsScheduledInEachCluster[ActiveClusterGroup] != dataDepGraph_->getMaxInstructionsInCluster(ActiveClusterGroup)) { + CurrentClusterBlocks--; + assert(CurrentClusterBlocks >= MaxClusterBlocks); + } } } } @@ -789,12 +820,17 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst) { LastCluster = std::move(PastClustersList.back()); PastClustersList.pop_back(); } - Logger::Info("Inst %d popping cluster size %d off the stack", - inst->GetNum(), CurrentClusterSize); + //Logger::Info("Inst %d popping cluster size %d off the stacks", + // inst->GetNum(), CurrentClusterSize); + + if (InstructionsScheduledInEachCluster[ActiveClusterGroup] != dataDepGraph_->getMaxInstructionsInCluster(ActiveClusterGroup)) { + CurrentClusterBlocks--; + assert(CurrentClusterBlocks >= MaxClusterBlocks); + } } } } - Logger::Info("Currently active cluster %d", ActiveClusterGroup); +// Logger::Info("Currently active cluster %d", ActiveClusterGroup); defCnt = inst->GetDefs(defs); useCnt = inst->GetUses(uses); @@ -1095,12 +1131,12 @@ bool BBWithSpill::ChkCostFsblty(InstCount trgtLngth, EnumTreeNode *node) { } else { crntCost = crntSpillCost_ * SCW_ + trgtLngth * schedCostFactor_; } - crntCost -= costLwrBound_; - dynmcCostLwrBound = crntCost; - // TODO: Implement cost function for clustering if (isSecondPass && ClusterMemoryOperations) - crntCost += ClusterInitialCost; + crntCost += CurrentClusterBlocks * ClusteringWeight; + + crntCost -= 
costLwrBound_; + dynmcCostLwrBound = crntCost; // assert(cost >= 0); assert(dynmcCostLwrBound >= 0); diff --git a/lib/Scheduler/enumerator.cpp b/lib/Scheduler/enumerator.cpp index 728df961..115e03b6 100644 --- a/lib/Scheduler/enumerator.cpp +++ b/lib/Scheduler/enumerator.cpp @@ -971,13 +971,13 @@ bool Enumerator::FindNxtFsblBrnch_(EnumTreeNode *&newNode) { assert(crntNode_->IsLeaf() || (brnchCnt != rdyInstCnt) ? 1 : rdyInstCnt); // brnchCnt == rdyInstCnt == 0 ? 1 : rdyInstCnt); -//#ifdef IS_DEBUG_READY_LIST -// Logger::Info("Ready List Size is %d", rdyInstCnt); +#ifdef IS_DEBUG_READY_LIST + Logger::Info("Ready List Size is %d", rdyInstCnt); // Warning! That will reset the instruction iterator! - rdyLst_->Print(Logger::GetLogStream()); + // rdyLst_->Print(Logger::GetLogStream()); -// stats::maxReadyListSize.SetMax(rdyInstCnt); -//#endif + stats::maxReadyListSize.SetMax(rdyInstCnt); +#endif if (crntBrnchNum == 0 && SchedForRPOnly_) crntNode_->SetFoundInstWithUse(IsUseInRdyLst_()); @@ -1065,7 +1065,7 @@ bool Enumerator::ProbeBranch_(SchedInstruction *inst, EnumTreeNode *&newNode, if (inst != NULL) if (inst->GetPreFxdCycle() != INVALID_VALUE) if (inst->GetPreFxdCycle() != crntCycleNum_) { - Logger::Info("Pruned due to prefixed cycle"); + //Logger::Info("Pruned due to prefixed cycle"); return false; } @@ -1074,7 +1074,7 @@ bool Enumerator::ProbeBranch_(SchedInstruction *inst, EnumTreeNode *&newNode, #ifdef IS_DEBUG_INFSBLTY_TESTS stats::forwardLBInfeasibilityHits++; #endif - Logger::Info("Pruned due to forward lowerbound"); + //Logger::Info("Pruned due to forward lowerbound"); return false; } @@ -1082,7 +1082,7 @@ bool Enumerator::ProbeBranch_(SchedInstruction *inst, EnumTreeNode *&newNode, #ifdef IS_DEBUG_INFSBLTY_TESTS stats::backwardLBInfeasibilityHits++; #endif - Logger::Info("Pruned due to backward lowerbound"); + //Logger::Info("Pruned due to backward lowerbound"); return false; } } @@ -1103,7 +1103,7 @@ bool Enumerator::ProbeBranch_(SchedInstruction *inst, 
EnumTreeNode *&newNode, stats::nodeSuperiorityInfeasibilityHits++; #endif isNodeDmntd = true; - Logger::Info("Pruned due to node superiority"); + //Logger::Info("Pruned due to node superiority"); return false; } } @@ -1121,7 +1121,7 @@ bool Enumerator::ProbeBranch_(SchedInstruction *inst, EnumTreeNode *&newNode, #ifdef IS_DEBUG_INFSBLTY_TESTS stats::slotCountInfeasibilityHits++; #endif - Logger::Info("Pruned due to issue slot infeasibility"); + //Logger::Info("Pruned due to issue slot infeasibility"); return false; } @@ -1132,7 +1132,7 @@ bool Enumerator::ProbeBranch_(SchedInstruction *inst, EnumTreeNode *&newNode, #ifdef IS_DEBUG_INFSBLTY_TESTS stats::rangeTighteningInfeasibilityHits++; #endif - Logger::Info("Pruned due to range tightening infeasibility"); + //Logger::Info("Pruned due to range tightening infeasibility"); return false; } @@ -1150,7 +1150,7 @@ bool Enumerator::ProbeBranch_(SchedInstruction *inst, EnumTreeNode *&newNode, #ifdef IS_DEBUG_INFSBLTY_TESTS stats::historyDominationInfeasibilityHits++; #endif - Logger::Info("Pruned due to history domination"); + //Logger::Info("Pruned due to history domination"); return false; } } @@ -1165,7 +1165,7 @@ bool Enumerator::ProbeBranch_(SchedInstruction *inst, EnumTreeNode *&newNode, stats::relaxedSchedulingInfeasibilityHits++; #endif isRlxInfsbl = true; - Logger::Info("Pruned due to relaxed schedule"); + //Logger::Info("Pruned due to relaxed schedule"); return false; } } @@ -2078,7 +2078,7 @@ bool LengthCostEnumerator::ProbeBranch_(SchedInstruction *inst, isFsbl = ChkCostFsblty_(inst, newNode); if (isFsbl == false) { - Logger::Info("Pruned due to cost infeasibility"); + //Logger::Info("Pruned due to cost infeasibility"); return false; } @@ -2096,7 +2096,7 @@ bool LengthCostEnumerator::ProbeBranch_(SchedInstruction *inst, #endif rgn_->UnschdulInst(inst, crntCycleNum_, crntSlotNum_, parent); - Logger::Info("Pruned due to history domination"); + //Logger::Info("Pruned due to history domination"); return false; } } 
diff --git a/lib/Scheduler/list_sched.cpp b/lib/Scheduler/list_sched.cpp index f737def5..9bf96951 100644 --- a/lib/Scheduler/list_sched.cpp +++ b/lib/Scheduler/list_sched.cpp @@ -40,8 +40,6 @@ FUNC_RESULT ListScheduler::FindSchedule(InstSchedule *sched, SchedRegion *rgn) { while (!IsSchedComplete_()) { UpdtRdyLst_(crntCycleNum_, crntSlotNum_); rdyLst_->ResetIterator(); - rdyLst_->Print(Logger::GetLogStream()); - rdyLst_->ResetIterator(); iterCnt++; rdyLstSize = rdyLst_->GetInstCnt(); diff --git a/lib/Wrapper/OptimizingScheduler.cpp b/lib/Wrapper/OptimizingScheduler.cpp index a984469d..dba961a1 100644 --- a/lib/Wrapper/OptimizingScheduler.cpp +++ b/lib/Wrapper/OptimizingScheduler.cpp @@ -379,6 +379,16 @@ void ScheduleDAGOptSched::schedule() { DDG->convertSUnits(); DDG->convertRegFiles(); DDG->findPossibleClusters(); + if (SecondPass) { + auto DDG2 = static_cast(DDG.get()); + int end = DDG2->getMaxClusterCount(); + if (end > 0) { + Logger::Info("Total clusters in region: %d", end); + for (int begin = 1; begin <= end; begin++) { + Logger::Info("Cluster %d has total instructions %d", begin, DDG2->getMaxInstructionsInCluster(begin)); + } + } + } auto *BDDG = static_cast(DDG.get()); addGraphTransformations(BDDG); From 26a89c3b686e0e051646ece0ae2fc27e2c09407b Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Fri, 20 Mar 2020 14:00:34 -0700 Subject: [PATCH 23/40] Some code cleanup. No functional changes. 
--- include/opt-sched/Scheduler/bb_spill.h | 2 -- include/opt-sched/Scheduler/data_dep.h | 1 - lib/Scheduler/bb_spill.cpp | 2 -- lib/Scheduler/data_dep.cpp | 6 ++---- lib/Scheduler/enumerator.cpp | 11 +---------- lib/Wrapper/OptSchedDDGWrapperBasic.cpp | 7 +++---- lib/Wrapper/OptSchedDDGWrapperBasic.h | 3 +-- 7 files changed, 7 insertions(+), 25 deletions(-) diff --git a/include/opt-sched/Scheduler/bb_spill.h b/include/opt-sched/Scheduler/bb_spill.h index 9ab1381f..a2d6afa5 100644 --- a/include/opt-sched/Scheduler/bb_spill.h +++ b/include/opt-sched/Scheduler/bb_spill.h @@ -9,7 +9,6 @@ Last Update: Apr. 2011 #ifndef OPTSCHED_SPILL_BB_SPILL_H #define OPTSCHED_SPILL_BB_SPILL_H -#include "opt-sched/Scheduler/bit_vector.h" #include "opt-sched/Scheduler/OptSchedTarget.h" #include "opt-sched/Scheduler/defines.h" #include "opt-sched/Scheduler/sched_region.h" @@ -54,7 +53,6 @@ class BBWithSpill : public SchedRegion { // TODO: Implement cost function for clustering /// Experimental variables and values for cost adjustment int ClusteringWeight; - int ClusterInitialCost; int TotalInstructionsInClusters; /// Data struct to contain information about the previous clusters diff --git a/include/opt-sched/Scheduler/data_dep.h b/include/opt-sched/Scheduler/data_dep.h index 40833f1c..6857ec79 100644 --- a/include/opt-sched/Scheduler/data_dep.h +++ b/include/opt-sched/Scheduler/data_dep.h @@ -400,7 +400,6 @@ class DataDepGraph : public llvm::opt_sched::OptSchedDDGWrapperBase, SchedInstruction *CreateNode_(InstCount instNum, char const *const instName, InstType instType, char const *const opCode, - /* bool InstrMayLoad, bool InstrMayStore,*/ int nodeID, InstCount fileSchedOrder, InstCount fileSchedCycle, InstCount fileLB, InstCount fileUB, int blkNum); diff --git a/lib/Scheduler/bb_spill.cpp b/lib/Scheduler/bb_spill.cpp index bf9c2a8c..6717046f 100644 --- a/lib/Scheduler/bb_spill.cpp +++ b/lib/Scheduler/bb_spill.cpp @@ -72,11 +72,9 @@ BBWithSpill::BBWithSpill(const OptSchedTarget *OST_, 
DataDepGraph *dataDepGraph, CurrentClusterSize = 0; ActiveClusterGroup = 0; - ClusterInitialCost = 1000000; PastClustersList.clear(); LastCluster = nullptr; TotalInstructionsInClusters = 0; - Config &schedIni = SchedulerOptions::getInstance(); ClusterMemoryOperations = schedIni.GetBool("CLUSTER_MEMORY_OPS"); ClusteringWeight = schedIni.GetInt("CLUSTER_WEIGHT"); diff --git a/lib/Scheduler/data_dep.cpp b/lib/Scheduler/data_dep.cpp index 005bec55..4db8ace6 100644 --- a/lib/Scheduler/data_dep.cpp +++ b/lib/Scheduler/data_dep.cpp @@ -832,10 +832,8 @@ FUNC_RESULT DataDepGraph::SkipGraph(SpecsBuffer *buf, bool &endOfFileReached) { SchedInstruction *DataDepGraph::CreateNode_( InstCount instNum, char const *const instName, InstType instType, - char const *const opCode, - /* bool InstrMayLoad, bool InstrMayStore,*/ int nodeID, - InstCount fileSchedOrder, InstCount fileSchedCycle, InstCount fileLB, - InstCount fileUB, int blkNum) { + char const *const opCode, int nodeID, InstCount fileSchedOrder, + InstCount fileSchedCycle, InstCount fileLB, InstCount fileUB, int blkNum) { SchedInstruction *newInstPtr; newInstPtr = new SchedInstruction(instNum, instName, instType, opCode, diff --git a/lib/Scheduler/enumerator.cpp b/lib/Scheduler/enumerator.cpp index 115e03b6..d9c4e3b1 100644 --- a/lib/Scheduler/enumerator.cpp +++ b/lib/Scheduler/enumerator.cpp @@ -1065,7 +1065,6 @@ bool Enumerator::ProbeBranch_(SchedInstruction *inst, EnumTreeNode *&newNode, if (inst != NULL) if (inst->GetPreFxdCycle() != INVALID_VALUE) if (inst->GetPreFxdCycle() != crntCycleNum_) { - //Logger::Info("Pruned due to prefixed cycle"); return false; } @@ -1074,7 +1073,6 @@ bool Enumerator::ProbeBranch_(SchedInstruction *inst, EnumTreeNode *&newNode, #ifdef IS_DEBUG_INFSBLTY_TESTS stats::forwardLBInfeasibilityHits++; #endif - //Logger::Info("Pruned due to forward lowerbound"); return false; } @@ -1082,7 +1080,6 @@ bool Enumerator::ProbeBranch_(SchedInstruction *inst, EnumTreeNode *&newNode, #ifdef 
IS_DEBUG_INFSBLTY_TESTS stats::backwardLBInfeasibilityHits++; #endif - //Logger::Info("Pruned due to backward lowerbound"); return false; } } @@ -1103,7 +1100,6 @@ bool Enumerator::ProbeBranch_(SchedInstruction *inst, EnumTreeNode *&newNode, stats::nodeSuperiorityInfeasibilityHits++; #endif isNodeDmntd = true; - //Logger::Info("Pruned due to node superiority"); return false; } } @@ -1121,7 +1117,6 @@ bool Enumerator::ProbeBranch_(SchedInstruction *inst, EnumTreeNode *&newNode, #ifdef IS_DEBUG_INFSBLTY_TESTS stats::slotCountInfeasibilityHits++; #endif - //Logger::Info("Pruned due to issue slot infeasibility"); return false; } @@ -1132,7 +1127,6 @@ bool Enumerator::ProbeBranch_(SchedInstruction *inst, EnumTreeNode *&newNode, #ifdef IS_DEBUG_INFSBLTY_TESTS stats::rangeTighteningInfeasibilityHits++; #endif - //Logger::Info("Pruned due to range tightening infeasibility"); return false; } @@ -1150,7 +1144,6 @@ bool Enumerator::ProbeBranch_(SchedInstruction *inst, EnumTreeNode *&newNode, #ifdef IS_DEBUG_INFSBLTY_TESTS stats::historyDominationInfeasibilityHits++; #endif - //Logger::Info("Pruned due to history domination"); return false; } } @@ -1165,7 +1158,7 @@ bool Enumerator::ProbeBranch_(SchedInstruction *inst, EnumTreeNode *&newNode, stats::relaxedSchedulingInfeasibilityHits++; #endif isRlxInfsbl = true; - //Logger::Info("Pruned due to relaxed schedule"); + return false; } } @@ -2078,7 +2071,6 @@ bool LengthCostEnumerator::ProbeBranch_(SchedInstruction *inst, isFsbl = ChkCostFsblty_(inst, newNode); if (isFsbl == false) { - //Logger::Info("Pruned due to cost infeasibility"); return false; } @@ -2096,7 +2088,6 @@ bool LengthCostEnumerator::ProbeBranch_(SchedInstruction *inst, #endif rgn_->UnschdulInst(inst, crntCycleNum_, crntSlotNum_, parent); - //Logger::Info("Pruned due to history domination"); return false; } } diff --git a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp index 875bd5a4..461cde3d 100644 --- 
a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp +++ b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp @@ -5,7 +5,6 @@ //===----------------------------------------------------------------------===// #include "OptSchedDDGWrapperBasic.h" -#include "opt-sched/Scheduler/bit_vector.h" #include "opt-sched/Scheduler/config.h" #include "opt-sched/Scheduler/logger.h" #include "opt-sched/Scheduler/register.h" @@ -508,7 +507,7 @@ void OptSchedDDGWrapperBasic::countBoundaryLiveness( /// Partially copied from /// https://github.com/RadeonOpenCompute/llvm/blob/roc-ocl-2.4.0/lib/CodeGen/MachineScheduler.cpp#L1554 -int OptSchedDDGWrapperBasic::clusterNeighboringMemOps_( +int OptSchedDDGWrapperBasic::clusterNeighboringMemOps( ArrayRef MemOps) { SmallVector MemOpRecords; bool ClusterPossible = false; @@ -578,7 +577,7 @@ int OptSchedDDGWrapperBasic::clusterNeighboringMemOps_( } /// Iterate through SUnits and find all possible clustering then transfer -/// the information over to the SchedInstruction class as a bitvector. +/// the information so that our scheduler can access it. /// Partially copied from https://github.com/RadeonOpenCompute/llvm/blob/roc-ocl-2.4.0/lib/CodeGen/MachineScheduler.cpp#L1595 void OptSchedDDGWrapperBasic::findPossibleClusters() { // TODO: Add For-loop to also do store clusters. Currently only does load @@ -625,7 +624,7 @@ void OptSchedDDGWrapperBasic::findPossibleClusters() { LLVM_DEBUG(dbgs() << SU1->NodeNum << " "); LLVM_DEBUG(dbgs() << '\n'); #endif - TotalInstructionsPossible += clusterNeighboringMemOps_(SCD); + TotalInstructionsPossible += clusterNeighboringMemOps(SCD); } setMaxInstructionsInAllClusters(TotalInstructionsPossible); diff --git a/lib/Wrapper/OptSchedDDGWrapperBasic.h b/lib/Wrapper/OptSchedDDGWrapperBasic.h index 4fd3937b..eef47684 100644 --- a/lib/Wrapper/OptSchedDDGWrapperBasic.h +++ b/lib/Wrapper/OptSchedDDGWrapperBasic.h @@ -135,8 +135,7 @@ class OptSchedDDGWrapperBasic : public DataDepGraph { // Find liveness info generated by the region boundary. 
void discoverBoundaryLiveness(const llvm::MachineInstr *MI); - int clusterNeighboringMemOps_( - ArrayRef MemOps); + int clusterNeighboringMemOps(ArrayRef MemOps); // Holds a register live range, mapping a producer to a set of consumers. struct LiveRange { From ec8e0bd4f759d28f023d16d1e87609071901c949 Mon Sep 17 00:00:00 2001 From: vang thao Date: Fri, 20 Mar 2020 14:09:08 -0700 Subject: [PATCH 24/40] Missed variable to clean up --- lib/Scheduler/bb_spill.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/Scheduler/bb_spill.cpp b/lib/Scheduler/bb_spill.cpp index 6717046f..5fa4caea 100644 --- a/lib/Scheduler/bb_spill.cpp +++ b/lib/Scheduler/bb_spill.cpp @@ -342,7 +342,6 @@ void BBWithSpill::InitForSchdulng() { SchedInstruction::SetActiveCluster(0); CurrentClusterSize = 0; ActiveClusterGroup = 0; - ClusterInitialCost = 1000000; PastClustersList.clear(); LastCluster.reset(); CurrentClusterBlocks = MaxClusterBlocks; From f467f83a7aa4e24ff6a98a4b38d24665f0d1d37c Mon Sep 17 00:00:00 2001 From: vang thao Date: Thu, 26 Mar 2020 18:16:18 -0700 Subject: [PATCH 25/40] Fix issues with enumerator not updating priorities --- include/opt-sched/Scheduler/enumerator.h | 3 ++ include/opt-sched/Scheduler/lnkd_lst.h | 29 ++++++++++--------- .../opt-sched/Scheduler/sched_basic_data.h | 6 ++++ lib/Scheduler/bb_spill.cpp | 16 ++-------- lib/Scheduler/ready_list.cpp | 23 +++++++++++---- lib/Scheduler/sched_basic_data.cpp | 8 +++++ lib/Wrapper/OptimizingScheduler.cpp | 1 + 7 files changed, 53 insertions(+), 33 deletions(-) diff --git a/include/opt-sched/Scheduler/enumerator.h b/include/opt-sched/Scheduler/enumerator.h index 85dc5b18..145f24d4 100644 --- a/include/opt-sched/Scheduler/enumerator.h +++ b/include/opt-sched/Scheduler/enumerator.h @@ -926,6 +926,9 @@ inline void Enumerator::UpdtRdyLst_(InstCount cycleNum, int slotNum) { } rdyLst_->AddLatestSubLists(lst1, lst2); + + if (prirts_.isDynmc) + rdyLst_->UpdatePriorities(); } 
/*****************************************************************************/ diff --git a/include/opt-sched/Scheduler/lnkd_lst.h b/include/opt-sched/Scheduler/lnkd_lst.h index 3c311f9d..d6b696cb 100644 --- a/include/opt-sched/Scheduler/lnkd_lst.h +++ b/include/opt-sched/Scheduler/lnkd_lst.h @@ -177,7 +177,8 @@ class PriorityList : public LinkedList { T *GetNxtPriorityElmnt(); T *GetNxtPriorityElmnt(K &key); // Copies all the data from another list. The existing list must be empty. - void CopyList(PriorityList const *const otherLst); + void CopyList(PriorityList const *const otherLst, + KeyedEntry **keyedEntries_); protected: KeyedEntry *allocKeyEntries_; @@ -572,8 +573,8 @@ inline T *PriorityList::GetNxtPriorityElmnt(K &key) { } //(Vlad) added functionality to decrease priority -//used for decreasing priority of clusterable instrs -//when leaving a cluster +// used for decreasing priority of clusterable instrs +// when leaving a cluster template void PriorityList::BoostEntry(KeyedEntry *entry, K newKey) { KeyedEntry *crnt; @@ -582,7 +583,7 @@ void PriorityList::BoostEntry(KeyedEntry *entry, K newKey) { assert(LinkedList::topEntry_ != NULL); - if (entry->key < newKey) //behave normally + if (entry->key < newKey) // behave normally { entry->key = newKey; @@ -612,22 +613,19 @@ void PriorityList::BoostEntry(KeyedEntry *entry, K newKey) { assert(next != entry->GetNext()); LinkedList::RmvEntry_(entry, false); InsrtEntry_(entry, next); - } - else //move entry down on priority list + } else // move entry down on priority list { entry->key = newKey; - //if it is at the bottom or next entry still has a smaller key, - //then the entry is already in place + // if it is at the bottom or next entry still has a smaller key, + // then the entry is already in place if (entry == LinkedList::bottomEntry_ || next->key <= newKey) return; - for (crnt = entry->GetNext(); crnt != NULL; crnt = crnt->GetNext()) - { - if (crnt->key <= newKey) - { + for (crnt = entry->GetNext(); crnt != 
NULL; crnt = crnt->GetNext()) { + if (crnt->key <= newKey) { next = crnt; - break; + break; } } @@ -639,7 +637,9 @@ void PriorityList::BoostEntry(KeyedEntry *entry, K newKey) { } template -void PriorityList::CopyList(PriorityList const *const otherLst) { +void PriorityList::CopyList( + PriorityList const *const otherLst, + KeyedEntry **keyedEntries_) { assert(LinkedList::elmntCnt_ == 0); for (KeyedEntry *entry = (KeyedEntry *)otherLst->topEntry_; @@ -648,6 +648,7 @@ void PriorityList::CopyList(PriorityList const *const otherLst) { K key = entry->key; KeyedEntry *newEntry = AllocEntry_(elmnt, key); LinkedList::AppendEntry_(newEntry); + keyedEntries_[entry->element->GetNum()] = newEntry; if (entry == otherLst->rtrvEntry_) { LinkedList::rtrvEntry_ = newEntry; diff --git a/include/opt-sched/Scheduler/sched_basic_data.h b/include/opt-sched/Scheduler/sched_basic_data.h index cdfad226..c9d2c107 100644 --- a/include/opt-sched/Scheduler/sched_basic_data.h +++ b/include/opt-sched/Scheduler/sched_basic_data.h @@ -435,6 +435,8 @@ class SchedInstruction : public GraphNode { int GetClusterGroup() { return ClusterGroup; } static int GetActiveCluster() { return ActiveCluster; } static void SetActiveCluster(int Active) { ActiveCluster = Active; } + bool getWasActive() { return WasActive; } + bool computeWasActive(); friend class SchedRange; protected: @@ -442,6 +444,10 @@ class SchedInstruction : public GraphNode { string name_; // The mnemonic of this instruction, e.g. "add" or "jmp". string opCode_; + + bool WasActive; + + /// The cluster group that the current instruction is a part of. /// Default of 0 means that it is not part of any cluster. 
int ClusterGroup; diff --git a/lib/Scheduler/bb_spill.cpp b/lib/Scheduler/bb_spill.cpp index 5fa4caea..3dfa48b3 100644 --- a/lib/Scheduler/bb_spill.cpp +++ b/lib/Scheduler/bb_spill.cpp @@ -550,19 +550,7 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, CurrentClusterSize = 0; // Set cluster size to 0 } } -// Logger::Info("Currently active cluster %d", ActiveClusterGroup); - // Potential Issues: - // 1. Keeping track of the average clustering size when we aren't done - // scheduling. - // Cost function that was discussed during the meeting on Friday: - // (15 - averageClusteringSize) * ClusteringWeight - // We want to minimize this cost but there is an issue in the following - // example - // Ex: Partial schedule was able to cluster a block of 15. - // averageClusteringSize : 15, CostFnc: (15-15)*Weight = 0 - // Any cluster block below size 15 will decrease the average - // cluster size and increase the cost. This makes our B&B - // enumerator actually favor not doing clustering. 
+ // Logger::Info("schedule, Currently active cluster %d", ActiveClusterGroup); defCnt = inst->GetDefs(defs); useCnt = inst->GetUses(uses); @@ -827,7 +815,7 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst) { } } } -// Logger::Info("Currently active cluster %d", ActiveClusterGroup); +// Logger::Info("unschedule, Currently active cluster %d", ActiveClusterGroup); defCnt = inst->GetDefs(defs); useCnt = inst->GetUses(uses); diff --git a/lib/Scheduler/ready_list.cpp b/lib/Scheduler/ready_list.cpp index 0e4bcd0b..5a1be9d6 100644 --- a/lib/Scheduler/ready_list.cpp +++ b/lib/Scheduler/ready_list.cpp @@ -149,7 +149,7 @@ void ReadyList::CopyList(ReadyList *othrLst) { assert(prirtyLst_->GetElmntCnt() == 0); assert(latestSubLst_->GetElmntCnt() == 0); assert(othrLst != NULL); - prirtyLst_->CopyList(othrLst->prirtyLst_); + prirtyLst_->CopyList(othrLst->prirtyLst_, keyedEntries_); } unsigned long ReadyList::CmputKey_(SchedInstruction *inst, bool isUpdate, @@ -159,6 +159,7 @@ unsigned long ReadyList::CmputKey_(SchedInstruction *inst, bool isUpdate, int i; int16_t oldLastUseCnt, newLastUseCnt; unsigned long ValueForKey; + bool OldWasActive, NewWasActive; changed = true; if (isUpdate) changed = false; @@ -174,9 +175,10 @@ unsigned long ReadyList::CmputKey_(SchedInstruction *inst, bool isUpdate, case LSH_LUC: oldLastUseCnt = inst->GetLastUseCnt(); newLastUseCnt = inst->CmputLastUseCnt(); - assert(!isUpdate || newLastUseCnt >= oldLastUseCnt); - if (newLastUseCnt != oldLastUseCnt) + // assert(!isUpdate || newLastUseCnt >= oldLastUseCnt); + if (newLastUseCnt != oldLastUseCnt) { changed = true; + } AddPrirtyToKey_(key, keySize, useCntBits_, newLastUseCnt, maxUseCnt_); break; @@ -208,9 +210,19 @@ unsigned long ReadyList::CmputKey_(SchedInstruction *inst, bool isUpdate, break; case LSH_MEM: - ValueForKey = - inst->GetClusterGroup() == SchedInstruction::GetActiveCluster() ? 
1 + if (inst->GetClusterGroup() == 0) + ValueForKey = 0; + else { + OldWasActive = inst->getWasActive(); + NewWasActive = inst->computeWasActive(); + + if (OldWasActive != NewWasActive) { + changed = true; + } + ValueForKey = + inst->GetClusterGroup() == SchedInstruction::GetActiveCluster() ? 1 : 0; + } AddPrirtyToKey_(key, keySize, ClusterBit, ValueForKey, 1); break; @@ -298,6 +310,7 @@ void ReadyList::AddInst(SchedInstruction *inst) { assert(changed == true); KeyedEntry *entry = prirtyLst_->InsrtElmnt(inst, key, true); + InstCount instNum = inst->GetNum(); if (prirts_.isDynmc) keyedEntries_[instNum] = entry; diff --git a/lib/Scheduler/sched_basic_data.cpp b/lib/Scheduler/sched_basic_data.cpp index 2fa5f09d..c858be34 100644 --- a/lib/Scheduler/sched_basic_data.cpp +++ b/lib/Scheduler/sched_basic_data.cpp @@ -64,6 +64,7 @@ SchedInstruction::SchedInstruction(InstCount num, const string &name, mustBeInBBEntry_ = false; mustBeInBBExit_ = false; + WasActive = false; } SchedInstruction::~SchedInstruction() { @@ -72,6 +73,13 @@ SchedInstruction::~SchedInstruction() { delete crntRange_; } +bool SchedInstruction::computeWasActive() { + if (ClusterGroup == 0) return false; + + WasActive = GetActiveCluster() == GetClusterGroup(); + return WasActive; +} + void SchedInstruction::SetupForSchdulng(InstCount instCnt, bool isCP_FromScsr, bool isCP_FromPrdcsr) { if (memAllocd_) diff --git a/lib/Wrapper/OptimizingScheduler.cpp b/lib/Wrapper/OptimizingScheduler.cpp index dba961a1..295e9002 100644 --- a/lib/Wrapper/OptimizingScheduler.cpp +++ b/lib/Wrapper/OptimizingScheduler.cpp @@ -676,6 +676,7 @@ SchedPriorities ScheduleDAGOptSched::parseHeuristic(const std::string &Str) { Priorities.vctr[Priorities.cnt++] = LSH; switch (LSH) { // Is LUC still the only dynamic heuristic? 
+ case LSH_MEM: case LSH_LUC: Priorities.isDynmc = true; break; From 7fcb9a47928937acd48dad638b90456a6e555954 Mon Sep 17 00:00:00 2001 From: vang thao Date: Sat, 4 Apr 2020 14:14:06 -0700 Subject: [PATCH 26/40] Added store clustering and debugging statements --- .../Scheduler/OptSchedDDGWrapperBase.h | 2 +- include/opt-sched/Scheduler/bb_spill.h | 5 ++ lib/Scheduler/bb_spill.cpp | 60 ++++++++++++++++++- lib/Wrapper/AMDGPU/OptSchedGCNTarget.cpp | 4 ++ lib/Wrapper/OptSchedDDGWrapperBasic.cpp | 43 ++++--------- lib/Wrapper/OptSchedDDGWrapperBasic.h | 2 +- lib/Wrapper/OptimizingScheduler.cpp | 14 ++++- 7 files changed, 93 insertions(+), 37 deletions(-) diff --git a/include/opt-sched/Scheduler/OptSchedDDGWrapperBase.h b/include/opt-sched/Scheduler/OptSchedDDGWrapperBase.h index 4db4673c..b10c9248 100644 --- a/include/opt-sched/Scheduler/OptSchedDDGWrapperBase.h +++ b/include/opt-sched/Scheduler/OptSchedDDGWrapperBase.h @@ -18,7 +18,7 @@ class OptSchedDDGWrapperBase { virtual void convertRegFiles() = 0; - virtual void findPossibleClusters() = 0; + virtual int findPossibleClusters(bool IsLoad) = 0; }; } // namespace opt_sched diff --git a/include/opt-sched/Scheduler/bb_spill.h b/include/opt-sched/Scheduler/bb_spill.h index a2d6afa5..61461710 100644 --- a/include/opt-sched/Scheduler/bb_spill.h +++ b/include/opt-sched/Scheduler/bb_spill.h @@ -14,6 +14,7 @@ Last Update: Apr. 
2011 #include "opt-sched/Scheduler/sched_region.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" #include #include #include @@ -65,6 +66,8 @@ class BBWithSpill : public SchedRegion { /// Instruction number that ended this cluster int InstNum; + std::unique_ptr> InstrList; + /// Constructor for this struct PastClusters(int Cluster, int size, int num) : ClusterGroup(Cluster), ClusterSize(size), InstNum(num) {} @@ -73,6 +76,8 @@ class BBWithSpill : public SchedRegion { /// Vector containing the (n-1) past clusters llvm::SmallVector, 4> PastClustersList; + std::unique_ptr> InstrList; + /// Pointer to the last cluster. This is kept out of the vector to /// avoid having to fetch it every time we compare the current instruction /// number to the one that ended the cluster. diff --git a/lib/Scheduler/bb_spill.cpp b/lib/Scheduler/bb_spill.cpp index 3dfa48b3..f3b49c3e 100644 --- a/lib/Scheduler/bb_spill.cpp +++ b/lib/Scheduler/bb_spill.cpp @@ -349,6 +349,8 @@ void BBWithSpill::InitForSchdulng() { InstructionsScheduledInEachCluster[begin] = 0; } + InstrList.reset(); + schduldEntryInstCnt_ = 0; schduldExitInstCnt_ = 0; schduldInstCnt_ = 0; @@ -483,7 +485,10 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, // Case 1: Currently clustering and this current instruction is part // of the cluster CurrentClusterSize++; - InstructionsScheduledInEachCluster[ActiveClusterGroup]++; + InstructionsScheduledInEachCluster[ActiveClusterGroup]++; + + InstrList->push_back(inst->GetName()); + } else { //Logger::Info("Inst %d pushing cluster size %d onto the stack due to " // "cluster to cluster op", @@ -497,7 +502,9 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, } else LastCluster = llvm::make_unique( ActiveClusterGroup, CurrentClusterSize, inst->GetNum()); - + + LastCluster->InstrList = std::move(InstrList); + // If this cluster did not finish then that means there have to be an extra cluster 
block to finish all of the instructions // // in the cluster if (InstructionsScheduledInEachCluster[ActiveClusterGroup] < dataDepGraph_->getMaxInstructionsInCluster(ActiveClusterGroup)) { @@ -508,6 +515,10 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, inst->SetActiveCluster(ActiveClusterGroup); CurrentClusterSize = 1; InstructionsScheduledInEachCluster[ActiveClusterGroup]++; + + InstrList = llvm::make_unique>(); + InstrList->push_back(inst->GetName()); + } } else { // Case 3: Not currently clustering. Initialize clustering @@ -515,6 +526,10 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, inst->SetActiveCluster(ActiveClusterGroup); CurrentClusterSize = 1; InstructionsScheduledInEachCluster[ActiveClusterGroup]++; + + InstrList = llvm::make_unique>(); + InstrList->push_back(inst->GetName()); + } } else if (CurrentClusterSize > 0) { // Case 2: Exiting out of an active cluster @@ -534,6 +549,8 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, LastCluster = llvm::make_unique( ActiveClusterGroup, CurrentClusterSize, inst->GetNum()); + LastCluster->InstrList = std::move(InstrList); + // If InstrScheduledInEachCluster != Max // blocks++ @@ -760,6 +777,9 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst) { CurrentClusterSize--; InstructionsScheduledInEachCluster[ActiveClusterGroup]--; assert(InstructionsScheduledInEachCluster[ActiveClusterGroup] >= 0); + + InstrList->pop_back(); + //Logger::Info("Undoing an instruction from the cluster. 
Current size: %d", // CurrentClusterSize); @@ -776,6 +796,9 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst) { CurrentClusterSize = LastCluster->ClusterSize; ActiveClusterGroup = LastCluster->ClusterGroup; inst->SetActiveCluster(ActiveClusterGroup); + + InstrList = std::move(LastCluster->InstrList); + LastCluster.reset(); // Release current cluster pointer // Get previous cluster from vector list @@ -798,6 +821,9 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst) { CurrentClusterSize = LastCluster->ClusterSize; ActiveClusterGroup = LastCluster->ClusterGroup; inst->SetActiveCluster(ActiveClusterGroup); + + InstrList = std::move(LastCluster->InstrList); + LastCluster.reset(); // Release current cluster pointer // Get previous cluster from vector list @@ -1080,6 +1106,36 @@ InstCount BBWithSpill::UpdtOptmlSched(InstSchedule *crntSched, bestSchedLngth_ = crntSched->GetCrntLngth(); enumBestSched_->Copy(crntSched); bestSched_ = enumBestSched_; + + if (isSecondPass && ClusterMemoryOperations) { + dbgs() << "Printing clustered instructions:\n"; + int i = 1; + for (const auto &clusters : PastClustersList) { + dbgs() << "Printing cluster " << i << ": "; + for (const auto &instr : *clusters->InstrList) { + dbgs() << instr << " "; + } + i++; + dbgs() << '\n'; + } + + if (LastCluster) { + dbgs() << "Printing cluster " << i << ": "; + for (const auto &instr : *(LastCluster->InstrList)) { + dbgs() << instr << " "; + } + i++; + dbgs() << '\n'; + } + + if (InstrList && InstrList->size() > 0) { + dbgs() << "Printing cluster " << i << ": "; + for (const auto &instr : *InstrList) { + dbgs() << instr << " "; + } + dbgs() << '\n'; + } + } } return bestCost_; diff --git a/lib/Wrapper/AMDGPU/OptSchedGCNTarget.cpp b/lib/Wrapper/AMDGPU/OptSchedGCNTarget.cpp index a250408b..798dc122 100644 --- a/lib/Wrapper/AMDGPU/OptSchedGCNTarget.cpp +++ b/lib/Wrapper/AMDGPU/OptSchedGCNTarget.cpp @@ -212,6 +212,10 @@ bool OptSchedGCNTarget::shouldKeepSchedule() 
{ dbgs() << "Reverting Scheduling because of a decrease in occupancy from " << RegionStartingOccupancy << " to " << RegionEndingOccupancy << ".\n"); + Logger::Info( + "Reverting Scheduling because of a decrease in occupancy from %d to %d.", RegionStartingOccupancy, RegionEndingOccupancy +); + return false; } diff --git a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp index 461cde3d..6dfdb4a9 100644 --- a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp +++ b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp @@ -513,9 +513,7 @@ int OptSchedDDGWrapperBasic::clusterNeighboringMemOps( bool ClusterPossible = false; int TotalInstructionsPossible = 0; - LLVM_DEBUG(dbgs() << "Processing possible clusters\n"); for (const SUnit *SU : MemOps) { - LLVM_DEBUG(dbgs() << " " << SU->NodeNum << " is in the chain.\n"); MachineOperand *BaseOp; int64_t Offset; if (DAG->TII->getMemOperandWithOffset(*SU->getInstr(), BaseOp, Offset, DAG->TRI)) @@ -523,7 +521,7 @@ int OptSchedDDGWrapperBasic::clusterNeighboringMemOps( } if (MemOpRecords.size() < 2) { - LLVM_DEBUG(dbgs() << " Unable to cluster memop cluster of 1.\n"); + dbgs() << " Unable to cluster memop cluster of 1.\n"; return 0; } @@ -532,11 +530,11 @@ int OptSchedDDGWrapperBasic::clusterNeighboringMemOps( for (unsigned Idx = 0, End = MemOpRecords.size(); Idx < (End - 1); ++Idx) { const SUnit *SUa = MemOpRecords[Idx].SU; const SUnit *SUb = MemOpRecords[Idx + 1].SU; - LLVM_DEBUG(dbgs() << " Checking possible clustering of (" << SUa->NodeNum << ") and (" << SUb->NodeNum << ")\n"); + dbgs() << " Checking possible clustering of (" << SUa->NodeNum << ") and (" << SUb->NodeNum << ")\n"; if (DAG->TII->shouldClusterMemOps(*MemOpRecords[Idx].BaseOp, *MemOpRecords[Idx + 1].BaseOp, ClusterLength)) { - LLVM_DEBUG(dbgs() << " Cluster possible at SU(" << SUa->NodeNum << ")- SU(" << SUb->NodeNum << ")\n"); + dbgs() << " Cluster possible at SU(" << SUa->NodeNum << ")- SU(" << SUb->NodeNum << ")\n"; // If clustering was possible then 
increase the cluster count. This only // happens once every cluster @@ -544,7 +542,7 @@ int OptSchedDDGWrapperBasic::clusterNeighboringMemOps( ClusterPossible = true; ClusterCount++; setMaxClusterCount(ClusterCount); - Logger::Info("Setting max cluster count to %d", ClusterCount); + dbgs() << " Setting total cluster count to " << ClusterCount << "\n"; } // Tell the instructions what cluster number they are in @@ -562,16 +560,6 @@ int OptSchedDDGWrapperBasic::clusterNeighboringMemOps( } else ClusterLength = 1; } -#ifdef IS_DEBUG_MEMORY_CLUSTERING - LLVM_DEBUG(dbgs () << "Printing bit vector: "); - for (int i = ClusterVector->GetSize() - 1; i >= 0; i--) { - if (ClusterVector->GetBit(i)) - LLVM_DEBUG(dbgs() << "1"); - else - LLVM_DEBUG(dbgs() << "0"); - } - LLVM_DEBUG(dbgs() << '\n'); -#endif MaxInstructionsInEachClusters.insert(std::make_pair(ClusterCount, TotalInstructionsPossible)); return TotalInstructionsPossible; } @@ -579,13 +567,10 @@ int OptSchedDDGWrapperBasic::clusterNeighboringMemOps( /// Iterate through SUnits and find all possible clustering then transfer /// the information so that our scheduler can access it. /// Partially copied from https://github.com/RadeonOpenCompute/llvm/blob/roc-ocl-2.4.0/lib/CodeGen/MachineScheduler.cpp#L1595 -void OptSchedDDGWrapperBasic::findPossibleClusters() { +int OptSchedDDGWrapperBasic::findPossibleClusters(bool IsLoad) { // TODO: Add For-loop to also do store clusters. Currently only does load // clusters - bool IsLoad = true; int TotalInstructionsPossible = 0; - - LLVM_DEBUG(dbgs() << "Looking for load clusters\n"); DenseMap StoreChainIDs; // Map each store chain to a set of dependent MemOps. 
SmallVector, 32> StoreChainDependents; @@ -594,13 +579,12 @@ void OptSchedDDGWrapperBasic::findPossibleClusters() { (!IsLoad && !SU.getInstr()->mayStore())) continue; auto MI = SU.getInstr(); - LLVM_DEBUG(dbgs() << " Instruction (" << SU.NodeNum << ") " << DAG->TII->getName(MI->getOpcode()) << " may load.\n"); + + dbgs() << "Instruction (" << SU.NodeNum << ") " << DAG->TII->getName(MI->getOpcode()) << " may " << (IsLoad ? "load" : "store") << "\n"; unsigned ChainPredID = DAG->SUnits.size(); for (const SDep &Pred : SU.Preds) { if (Pred.isCtrl()) { - auto PredMI = Pred.getSUnit()->getInstr(); - LLVM_DEBUG(dbgs() << " Breaking chain at (" << Pred.getSUnit()->NodeNum << ") " << DAG->TII->getName(PredMI->getOpcode()) << '\n'); ChainPredID = Pred.getSUnit()->NodeNum; break; } @@ -608,7 +592,6 @@ void OptSchedDDGWrapperBasic::findPossibleClusters() { // Check if this chain-like pred has been seen // before. ChainPredID==MaxNodeID at the top of the schedule. unsigned NumChains = StoreChainDependents.size(); - LLVM_DEBUG(dbgs() << " ChainPredID " << ChainPredID << ", NumChains " << NumChains << '\n'); std::pair::iterator, bool> Result = StoreChainIDs.insert(std::make_pair(ChainPredID, NumChains)); if (Result.second) @@ -618,16 +601,14 @@ void OptSchedDDGWrapperBasic::findPossibleClusters() { // Iterate over the store chains. 
for (auto &SCD : StoreChainDependents) { -#ifdef IS_DEBUG_MEMORY_CLUSTERING - LLVM_DEBUG(dbgs() << " Printing the list before clustering: "); + dbgs() << "Printing the Node ID of the current chain: "; for (auto SU1 : SCD) - LLVM_DEBUG(dbgs() << SU1->NodeNum << " "); - LLVM_DEBUG(dbgs() << '\n'); -#endif + dbgs() << SU1->NodeNum << " "; + dbgs() << '\n'; TotalInstructionsPossible += clusterNeighboringMemOps(SCD); } - - setMaxInstructionsInAllClusters(TotalInstructionsPossible); + return TotalInstructionsPossible; +// setMaxInstructionsInAllClusters(TotalInstructionsPossible); } LLVMRegTypeFilter::LLVMRegTypeFilter( diff --git a/lib/Wrapper/OptSchedDDGWrapperBasic.h b/lib/Wrapper/OptSchedDDGWrapperBasic.h index eef47684..373ddc52 100644 --- a/lib/Wrapper/OptSchedDDGWrapperBasic.h +++ b/lib/Wrapper/OptSchedDDGWrapperBasic.h @@ -50,7 +50,7 @@ class OptSchedDDGWrapperBasic : public DataDepGraph { void convertSUnits() override; void convertRegFiles() override; - void findPossibleClusters() override; + int findPossibleClusters(bool IsLoad) override; protected: // A convenience machMdl_ pointer casted to OptSchedMachineModel*. 
diff --git a/lib/Wrapper/OptimizingScheduler.cpp b/lib/Wrapper/OptimizingScheduler.cpp index 295e9002..ee75b2e3 100644 --- a/lib/Wrapper/OptimizingScheduler.cpp +++ b/lib/Wrapper/OptimizingScheduler.cpp @@ -378,14 +378,24 @@ void ScheduleDAGOptSched::schedule() { OST->createDDGWrapper(C, this, MM.get(), LatencyPrecision, RegionName); DDG->convertSUnits(); DDG->convertRegFiles(); - DDG->findPossibleClusters(); if (SecondPass) { + dbgs() << "Finding load clusters.\n"; + int TotalLoadsInstructionsClusterable = DDG->findPossibleClusters(true); + if (TotalLoadsInstructionsClusterable == 0) + dbgs() << " No load clustering possible\n"; + dbgs() << "Finding store clusters.\n"; + int TotalStoreInstructionsClusterable = DDG->findPossibleClusters(false); + if (TotalStoreInstructionsClusterable == 0) + dbgs() << " No store clustering possible\n"; + auto DDG2 = static_cast(DDG.get()); + Logger::Info("Total clusterable instructions: %d loads, %d stores", TotalLoadsInstructionsClusterable, TotalStoreInstructionsClusterable); + DDG2->setMaxInstructionsInAllClusters(TotalLoadsInstructionsClusterable + TotalStoreInstructionsClusterable); int end = DDG2->getMaxClusterCount(); if (end > 0) { Logger::Info("Total clusters in region: %d", end); for (int begin = 1; begin <= end; begin++) { - Logger::Info("Cluster %d has total instructions %d", begin, DDG2->getMaxInstructionsInCluster(begin)); + Logger::Info(" Cluster %d has total instructions %d", begin, DDG2->getMaxInstructionsInCluster(begin)); } } } From cccccc31805f8422cfdbc57659093ec36960e838 Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Wed, 8 Apr 2020 19:11:43 -0700 Subject: [PATCH 27/40] Fix segmentation fault due to copying ready list when a dynamic heuristic is not used. 
--- include/opt-sched/Scheduler/lnkd_lst.h | 5 +++-- lib/Scheduler/ready_list.cpp | 16 ++++++++++------ 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/include/opt-sched/Scheduler/lnkd_lst.h b/include/opt-sched/Scheduler/lnkd_lst.h index d6b696cb..bcb770d2 100644 --- a/include/opt-sched/Scheduler/lnkd_lst.h +++ b/include/opt-sched/Scheduler/lnkd_lst.h @@ -178,7 +178,7 @@ class PriorityList : public LinkedList { T *GetNxtPriorityElmnt(K &key); // Copies all the data from another list. The existing list must be empty. void CopyList(PriorityList const *const otherLst, - KeyedEntry **keyedEntries_); + KeyedEntry **keyedEntries_ = nullptr); protected: KeyedEntry *allocKeyEntries_; @@ -648,7 +648,8 @@ void PriorityList::CopyList( K key = entry->key; KeyedEntry *newEntry = AllocEntry_(elmnt, key); LinkedList::AppendEntry_(newEntry); - keyedEntries_[entry->element->GetNum()] = newEntry; + if (keyedEntries_) + keyedEntries_[entry->element->GetNum()] = newEntry; if (entry == otherLst->rtrvEntry_) { LinkedList::rtrvEntry_ = newEntry; diff --git a/lib/Scheduler/ready_list.cpp b/lib/Scheduler/ready_list.cpp index 5a1be9d6..ad377e84 100644 --- a/lib/Scheduler/ready_list.cpp +++ b/lib/Scheduler/ready_list.cpp @@ -8,13 +8,18 @@ using namespace llvm::opt_sched; ReadyList::ReadyList(DataDepGraph *dataDepGraph, SchedPriorities prirts) { prirts_ = prirts; prirtyLst_ = NULL; - keyedEntries_ = NULL; int i; uint16_t totKeyBits = 0; useCntBits_ = crtclPathBits_ = scsrCntBits_ = ltncySumBits_ = nodeID_Bits_ = inptSchedOrderBits_ = 0; + if (prirts_.isDynmc) + keyedEntries_ = new KeyedEntry + *[dataDepGraph->GetInstCnt()]; + else + keyedEntries_ = nullptr; + // Calculate the number of bits needed to hold the maximum value of each // priority scheme for (i = 0; i < prirts.cnt; i++) { @@ -27,8 +32,6 @@ ReadyList::ReadyList(DataDepGraph *dataDepGraph, SchedPriorities prirts) { break; case LSH_LUC: - keyedEntries_ = new KeyedEntry - *[dataDepGraph->GetInstCnt()]; for (int j = 0; j 
< dataDepGraph->GetInstCnt(); j++) { keyedEntries_[j] = NULL; } @@ -211,7 +214,7 @@ unsigned long ReadyList::CmputKey_(SchedInstruction *inst, bool isUpdate, case LSH_MEM: if (inst->GetClusterGroup() == 0) - ValueForKey = 0; + ValueForKey = 0; else { OldWasActive = inst->getWasActive(); NewWasActive = inst->computeWasActive(); @@ -221,7 +224,7 @@ unsigned long ReadyList::CmputKey_(SchedInstruction *inst, bool isUpdate, } ValueForKey = inst->GetClusterGroup() == SchedInstruction::GetActiveCluster() ? 1 - : 0; + : 0; } AddPrirtyToKey_(key, keySize, ClusterBit, ValueForKey, 1); break; @@ -247,7 +250,8 @@ void ReadyList::Print(std::ostream &out) { out << "Ready List: "; for (auto *crntInst = prirtyLst_->GetFrstElmnt(); crntInst != NULL; crntInst = prirtyLst_->GetNxtElmnt()) { - out << " " << crntInst->GetNum() << "(" << crntInst->GetClusterGroup() << ")"; + out << " " << crntInst->GetNum() << "(" << crntInst->GetClusterGroup() + << ")"; } out << '\n'; From b4f55af8b4da528923405d93f06fd33d40ad60db Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Wed, 22 Apr 2020 20:52:25 -0700 Subject: [PATCH 28/40] Updated comments for easier review. 
--- example/optsched-cfg/sched.ini | 12 +- include/opt-sched/Scheduler/bb_spill.h | 32 ++- include/opt-sched/Scheduler/data_dep.h | 20 +- include/opt-sched/Scheduler/enumerator.h | 6 +- .../opt-sched/Scheduler/sched_basic_data.h | 5 +- lib/Scheduler/bb_spill.cpp | 221 ++++++++++-------- lib/Scheduler/data_dep.cpp | 6 +- lib/Scheduler/ready_list.cpp | 21 +- lib/Scheduler/sched_basic_data.cpp | 1 + lib/Scheduler/sched_region.cpp | 7 - lib/Wrapper/AMDGPU/OptSchedDDGWrapperGCN.cpp | 2 +- lib/Wrapper/OptSchedDDGWrapperBasic.cpp | 44 ++-- lib/Wrapper/OptimizingScheduler.cpp | 30 ++- 13 files changed, 228 insertions(+), 179 deletions(-) diff --git a/example/optsched-cfg/sched.ini b/example/optsched-cfg/sched.ini index d9b45132..11370b31 100644 --- a/example/optsched-cfg/sched.ini +++ b/example/optsched-cfg/sched.ini @@ -14,12 +14,14 @@ PRINT_SPILL_COUNTS YES # NO USE_TWO_PASS NO -# Cluster memory operations together in the second pass +# Allow enumerator to try to cluster memory operations together in the second pass. # YES # NO CLUSTER_MEMORY_OPS NO -CLUSTER_WEIGHT 1000000 +# The weight for clustering. This factor determines the importance of +# trying to find clusters when enumerating. +CLUSTER_WEIGHT 1000 # These 3 flags control which schedulers will be used. # Each one can be individually toggled. The heuristic @@ -86,16 +88,14 @@ TIMEOUT_PER INSTR # Example: LUC_CP_NID HEURISTIC LUC_CP_NID -# Same as HEURISTIC except with MEM_ prefix. -SECOND_PASS_HEURISTIC MEM_LUC_CP_NID - # The heuristic used for the enumerator. If the two pass scheduling # approach is enabled, then this value will be used for the first pass. # Same valid values as HEURISTIC. ENUM_HEURISTIC LUC_CP_NID # The heuuristic used for the enumerator in the second pass in the two-pass scheduling approach. -# Same valid values as HEURISTIC. +# Same valid values as HEURISTIC with an additional heuristic: +# Cluster: Favor instructions that are part of an active memory clustering group. 
SECOND_PASS_ENUM_HEURISTIC LUC_CP_NID # The spill cost function to be used. Valid values are: diff --git a/include/opt-sched/Scheduler/bb_spill.h b/include/opt-sched/Scheduler/bb_spill.h index 61461710..5a5b1ced 100644 --- a/include/opt-sched/Scheduler/bb_spill.h +++ b/include/opt-sched/Scheduler/bb_spill.h @@ -41,45 +41,53 @@ class BBWithSpill : public SchedRegion { MapVector InstructionsScheduledInEachCluster; - int MaxClusterBlocks; + /// The minimum amount of cluster blocks possible. + int MinClusterBlocks; + + /// The minimum amount of cluster blocks + the optimistic expected cluster + /// blocks remaining. int CurrentClusterBlocks; - /// Current active cluster group + /// Current active cluster group. int ActiveClusterGroup; - /// Flag to enable or disable clustering memory operations - /// in the ILP pass. + /// Flag to enable or disable clustering memory operations in the ILP pass. + /// Reads from the sched.ini file then set the flag accordingly. bool ClusterMemoryOperations; - // TODO: Implement cost function for clustering - /// Experimental variables and values for cost adjustment + /// The weight for memory ops clustering. int ClusteringWeight; - int TotalInstructionsInClusters; /// Data struct to contain information about the previous clusters struct PastClusters { + /// The cluster group int ClusterGroup; /// Size of the cluster when it was ended by an instruction not in the /// cluster int ClusterSize; - /// Instruction number that ended this cluster + /// Instruction number that ended this cluster. Used to check if we should + /// restore the cluster state when backtracking. int InstNum; + /// Contains the actual names of the instructions in the cluster. Only used + /// for printing and debugging purposes. 
std::unique_ptr> InstrList; /// Constructor for this struct - PastClusters(int Cluster, int size, int num) - : ClusterGroup(Cluster), ClusterSize(size), InstNum(num) {} + PastClusters(int Cluster, int Size, int Instructions) + : ClusterGroup(Cluster), ClusterSize(Size), InstNum(Instructions) {} }; /// Vector containing the (n-1) past clusters llvm::SmallVector, 4> PastClustersList; + /// Contains the actual names of the instructions in the current cluster. + /// Only used for printing and debugging purposes. std::unique_ptr> InstrList; - /// Pointer to the last cluster. This is kept out of the vector to - /// avoid having to fetch it every time we compare the current instruction + /// Pointer to the last cluster. This is kept out of the vector to avoid + /// having to fetch it every time we compare the current instruction /// number to the one that ended the cluster. std::unique_ptr LastCluster; diff --git a/include/opt-sched/Scheduler/data_dep.h b/include/opt-sched/Scheduler/data_dep.h index 6857ec79..c450f454 100644 --- a/include/opt-sched/Scheduler/data_dep.h +++ b/include/opt-sched/Scheduler/data_dep.h @@ -292,18 +292,20 @@ class DataDepGraph : public llvm::opt_sched::OptSchedDDGWrapperBase, RegisterFile *getRegFiles() { return RegFiles.get(); } - int getMaxClusterCount() { return MaxClusterCount; } - void setMaxClusterCount(int Max) { MaxClusterCount = Max; } - int getMaxInstructionsInAllClusters() { return MaxInstructionsInAllClusters; } - void setMaxInstructionsInAllClusters(int Max) { - MaxInstructionsInAllClusters = Max; + // Memory clustering helper functions + int getMinClusterCount() { return MinClusterCount; } + void setMinClusterCount(int Max) { MinClusterCount = Max; } + int getTotalInstructionsInAllClusters() { return TotalInstructionsInAllClusters; } + void setTotalInstructionsInAllClusters(int Max) { + TotalInstructionsInAllClusters = Max; } - - int getMaxInstructionsInCluster(int Cluster); + int getTotalInstructionsInCluster(int Cluster); 
protected: - int MaxClusterCount; - int MaxInstructionsInAllClusters; + int MinClusterCount; + int TotalInstructionsInAllClusters; + /// Map the cluster block to the total number of instructions found in the + /// block MapVector MaxInstructionsInEachClusters; // TODO(max): Get rid of this. diff --git a/include/opt-sched/Scheduler/enumerator.h b/include/opt-sched/Scheduler/enumerator.h index 145f24d4..534ec741 100644 --- a/include/opt-sched/Scheduler/enumerator.h +++ b/include/opt-sched/Scheduler/enumerator.h @@ -917,6 +917,9 @@ inline void Enumerator::UpdtRdyLst_(InstCount cycleNum, int slotNum) { LinkedList *lst1 = NULL; LinkedList *lst2 = frstRdyLstPerCycle_[cycleNum]; + if (prirts_.isDynmc) + rdyLst_->UpdatePriorities(); + if (slotNum == 0 && prevCycleNum >= 0) { // If at the begining of a new cycle other than the very first cycle, then // we also have to include the instructions that might have become ready in @@ -926,9 +929,6 @@ inline void Enumerator::UpdtRdyLst_(InstCount cycleNum, int slotNum) { } rdyLst_->AddLatestSubLists(lst1, lst2); - - if (prirts_.isDynmc) - rdyLst_->UpdatePriorities(); } /*****************************************************************************/ diff --git a/include/opt-sched/Scheduler/sched_basic_data.h b/include/opt-sched/Scheduler/sched_basic_data.h index c9d2c107..35868786 100644 --- a/include/opt-sched/Scheduler/sched_basic_data.h +++ b/include/opt-sched/Scheduler/sched_basic_data.h @@ -53,8 +53,9 @@ enum LISTSCHED_HEURISTIC { // LLVM list scheduler order LSH_LLVM = 8, - // Memory clustering - LSH_MEM = 9 + // Dynamic memory clustering heuristic, favor instructions that are part of + // an active cluster + LSH_CLUSTER = 9 }; #define MAX_SCHED_PRIRTS 10 diff --git a/lib/Scheduler/bb_spill.cpp b/lib/Scheduler/bb_spill.cpp index f3b49c3e..3632d648 100644 --- a/lib/Scheduler/bb_spill.cpp +++ b/lib/Scheduler/bb_spill.cpp @@ -70,17 +70,17 @@ BBWithSpill::BBWithSpill(const OptSchedTarget *OST_, DataDepGraph *dataDepGraph, 
schduldExitInstCnt_ = 0; schduldInstCnt_ = 0; + // Memory clustering variables initialization CurrentClusterSize = 0; ActiveClusterGroup = 0; PastClustersList.clear(); LastCluster = nullptr; - TotalInstructionsInClusters = 0; Config &schedIni = SchedulerOptions::getInstance(); ClusterMemoryOperations = schedIni.GetBool("CLUSTER_MEMORY_OPS"); ClusteringWeight = schedIni.GetInt("CLUSTER_WEIGHT"); - MaxClusterBlocks = dataDepGraph_->getMaxClusterCount(); - CurrentClusterBlocks = MaxClusterBlocks; - for (int begin = 1; begin <= MaxClusterBlocks; begin++) { + MinClusterBlocks = dataDepGraph_->getMinClusterCount(); + CurrentClusterBlocks = MinClusterBlocks; + for (int begin = 1; begin <= MinClusterBlocks; begin++) { InstructionsScheduledInEachCluster[begin] = 0; } } @@ -320,9 +320,9 @@ InstCount BBWithSpill::CmputCostLwrBound() { InstCount staticLowerBound = schedLwrBound_ * schedCostFactor_ + spillCostLwrBound * SCW_; - + // Add the minimum of the possible clusters to the lower bound if (isSecondPass && ClusterMemoryOperations) { - staticLowerBound += MaxClusterBlocks * ClusteringWeight; + staticLowerBound += MinClusterBlocks * ClusteringWeight; } #if defined(IS_DEBUG_STATIC_LOWER_BOUND) @@ -339,16 +339,6 @@ InstCount BBWithSpill::CmputCostLwrBound() { void BBWithSpill::InitForSchdulng() { InitForCostCmputtn_(); - SchedInstruction::SetActiveCluster(0); - CurrentClusterSize = 0; - ActiveClusterGroup = 0; - PastClustersList.clear(); - LastCluster.reset(); - CurrentClusterBlocks = MaxClusterBlocks; - for (int begin = 1; begin <= MaxClusterBlocks; begin++) { - InstructionsScheduledInEachCluster[begin] = 0; - } - InstrList.reset(); schduldEntryInstCnt_ = 0; @@ -358,6 +348,19 @@ void BBWithSpill::InitForSchdulng() { /*****************************************************************************/ void BBWithSpill::InitForCostCmputtn_() { + // Init/Reset memory clustering values if it is enabled + if (isSecondPass && ClusterMemoryOperations) { + 
SchedInstruction::SetActiveCluster(0); + CurrentClusterSize = 0; + ActiveClusterGroup = 0; + PastClustersList.clear(); + LastCluster.reset(); + CurrentClusterBlocks = MinClusterBlocks; + for (int begin = 1; begin <= MinClusterBlocks; begin++) { + InstructionsScheduledInEachCluster[begin] = 0; + } +} + int i; crntCycleNum_ = 0; @@ -397,10 +400,6 @@ InstCount BBWithSpill::CmputNormCost_(InstSchedule *sched, InstCount &execCost, bool trackCnflcts) { InstCount cost = CmputCost_(sched, compMode, execCost, trackCnflcts); - // TODO: Implement cost function for clustering - if (isSecondPass && ClusterMemoryOperations) - cost += CurrentClusterBlocks * ClusteringWeight; - cost -= costLwrBound_; execCost -= costLwrBound_; @@ -425,6 +424,9 @@ InstCount BBWithSpill::CmputCost_(InstSchedule *sched, COST_COMP_MODE compMode, InstCount cost = sched->GetCrntLngth() * schedCostFactor_; execCost = cost; cost += crntSpillCost_ * SCW_; + // Add the current clustering cost + if (isSecondPass && ClusterMemoryOperations) + cost += CurrentClusterBlocks * ClusteringWeight; sched->SetSpillCosts(spillCosts_); sched->SetPeakRegPressures(peakRegPressures_); sched->SetSpillCost(crntSpillCost_); @@ -467,58 +469,66 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, InstCount newSpillCost; // Scheduling cases for clustering project: - // 1.) Cluster -> Cluster - // Simple case, just increment 1 from cluster size - // 2.) Cluster -> Non-Cluster - // ?? End clustering + // 1.) Same Cluster -> Same Cluster + // 2.) Cluster -> Different Cluster // 3.) Non-Cluster -> Cluster - // Simple case, initialize clustering - + // 4.) 
Cluster -> Non-Cluster + // Possibly keep track of the current memory clustering size here // and in UpdateSpillInfoForUnSchdul_() if (isSecondPass && ClusterMemoryOperations) { + // Check if the current instruction is part of a cluster if (inst->GetMayCluster()) { - // If there is a current active cluster + // Check if there is a current active cluster if (CurrentClusterSize > 0) { - // The instruction is in the current active cluster + // Check if the instruction is in the same cluster group as the active + // cluster if (ActiveClusterGroup == inst->GetClusterGroup()) { - // Case 1: Currently clustering and this current instruction is part - // of the cluster + // Case 1: Simple case where the current instruction is part of an + // already active cluster. CurrentClusterSize++; InstructionsScheduledInEachCluster[ActiveClusterGroup]++; - InstrList->push_back(inst->GetName()); + InstrList->push_back(inst->GetName()); } else { - //Logger::Info("Inst %d pushing cluster size %d onto the stack due to " - // "cluster to cluster op", - // inst->GetNum(), CurrentClusterSize); - // The instruction is in another cluster that is not currently active. - // Exit out of the currently active cluster into a new one. + // Case 2: Else the instruction is part of different cluster that + // is not currently active. Store information of the old cluster + // group and start clustering for the new cluster. if (LastCluster) { + // Save previous clusters in a vector except the last cluster + // that we just exited out of. PastClustersList.push_back(std::move(LastCluster)); + + // Last cluster that we just exited out of, used for fast accessing + // to its contents LastCluster = llvm::make_unique( ActiveClusterGroup, CurrentClusterSize, inst->GetNum()); } else + // This is the first cluster block that we exited out of. 
LastCluster = llvm::make_unique( ActiveClusterGroup, CurrentClusterSize, inst->GetNum()); LastCluster->InstrList = std::move(InstrList); - // If this cluster did not finish then that means there have to be an extra cluster block to finish all of the instructions - // // in the cluster - if (InstructionsScheduledInEachCluster[ActiveClusterGroup] < dataDepGraph_->getMaxInstructionsInCluster(ActiveClusterGroup)) { - CurrentClusterBlocks++; - } + // If the old cluster did not finish clustering all possible + // instructions in its cluster then that means there have to be an + // extra cluster block to finish all of the instructions in the + // cluster. + if (InstructionsScheduledInEachCluster[ActiveClusterGroup] < + dataDepGraph_->getTotalInstructionsInCluster( + ActiveClusterGroup)) { + CurrentClusterBlocks++; + } + // Finish setting up the new cluster ActiveClusterGroup = inst->GetClusterGroup(); inst->SetActiveCluster(ActiveClusterGroup); CurrentClusterSize = 1; InstructionsScheduledInEachCluster[ActiveClusterGroup]++; - - InstrList = llvm::make_unique>(); - InstrList->push_back(inst->GetName()); - + InstrList = llvm::make_unique< + llvm::SmallVector>(); + InstrList->push_back(inst->GetName()); } } else { // Case 3: Not currently clustering. Initialize clustering @@ -526,48 +536,46 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, inst->SetActiveCluster(ActiveClusterGroup); CurrentClusterSize = 1; InstructionsScheduledInEachCluster[ActiveClusterGroup]++; - - InstrList = llvm::make_unique>(); - InstrList->push_back(inst->GetName()); - + InstrList = llvm::make_unique>(); + InstrList->push_back(inst->GetName()); } } else if (CurrentClusterSize > 0) { - // Case 2: Exiting out of an active cluster -// Logger::Info("Inst %d pushing cluster size %d onto the stack", - // inst->GetNum(), CurrentClusterSize); - + // Case 4: Exiting out of an active cluster // Save the cluster to restore when backtracking. 
if (LastCluster) { - // Save previous current cluster in a vector + // Save previous clusters in a vector except the last cluster + // that we just exited out of. PastClustersList.push_back(std::move(LastCluster)); - // Current cluster + // Last cluster that we just exited out of, used for fast accessing + // to its contents. LastCluster = llvm::make_unique( ActiveClusterGroup, CurrentClusterSize, inst->GetNum()); } else - // This is the first cluster that we are saving + // This is the first cluster block that we exited out of. LastCluster = llvm::make_unique( ActiveClusterGroup, CurrentClusterSize, inst->GetNum()); LastCluster->InstrList = std::move(InstrList); - // If InstrScheduledInEachCluster != Max - // blocks++ - - // If this cluster did not finish then that means there have to be an extra cluster block to finish all of the instructions - // in the cluster - if (InstructionsScheduledInEachCluster[ActiveClusterGroup] < dataDepGraph_->getMaxInstructionsInCluster(ActiveClusterGroup)) { + // If this cluster did not finish then that means there have to be an + // extra cluster block to finish all of the instructions in the cluster. 
+ if (InstructionsScheduledInEachCluster[ActiveClusterGroup] < + dataDepGraph_->getTotalInstructionsInCluster(ActiveClusterGroup)) { CurrentClusterBlocks++; } - assert(InstructionsScheduledInEachCluster[ActiveClusterGroup] <= dataDepGraph_->getMaxInstructionsInCluster(ActiveClusterGroup)); + // Assert that the total instructions accounted for doesn't exceed the + // expected total instructions in the cluster + assert(InstructionsScheduledInEachCluster[ActiveClusterGroup] <= + dataDepGraph_->getTotalInstructionsInCluster(ActiveClusterGroup)); - ActiveClusterGroup = 0; // Reset active cluster + // Reset active cluster + ActiveClusterGroup = 0; inst->SetActiveCluster(0); - CurrentClusterSize = 0; // Set cluster size to 0 + CurrentClusterSize = 0; } } - // Logger::Info("schedule, Currently active cluster %d", ActiveClusterGroup); defCnt = inst->GetDefs(defs); useCnt = inst->GetUses(uses); @@ -761,43 +769,39 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst) { #endif // Backtracking cases for clustering project: - // 1.) Cluster <- Cluster - // Simple case, just decrement 1 from cluster size - // 2.) Cluster <- Non-Cluster - // Have to restore state of Cluster and ?? - // Can/should we use a stack to restore state? - // 3.) Non-Cluster <- Cluster - // Simple case, just decrement 1 from cluster size - // If cluster size == 0, set ActiveClusterGroup = 0; + // 1.) Same Cluster <- Same Cluster + // 2.) Non-Cluster <- Cluster + // 3.) Different Cluster <- Cluster + // 4.) Cluster <- Non-cluster if (isSecondPass && ClusterMemoryOperations) { - // TODO: Check for different cluster to different cluster - // backtracking. 
+ // If the instruction we are backtracking from is part of a cluster if (inst->GetMayCluster()) { - // Case 1 and 3 + // Case 1, 2, and 3 + // Reduce the cluster size CurrentClusterSize--; + // Decrement instructions scheduled in this cluster InstructionsScheduledInEachCluster[ActiveClusterGroup]--; assert(InstructionsScheduledInEachCluster[ActiveClusterGroup] >= 0); + // Remove instruction's name from the list InstrList->pop_back(); - //Logger::Info("Undoing an instruction from the cluster. Current size: %d", - // CurrentClusterSize); - - // If there is no more member in the currently active cluster then disable - // the cluster + // Case 2: If there are no more instructions in the currently active + // cluster then it indicates that we backtracked out of a cluster. if (CurrentClusterSize == 0) { + // Set active cluster to none. ActiveClusterGroup = 0; inst->SetActiveCluster(0); - // If there was a previously active cluster, check last cluster to see - // if we need to restore the state + // Case 3: Check If this instruction ended another cluster if (LastCluster) { + // If so, then we need to restore the state of the previous cluster if (LastCluster->InstNum == inst->GetNum()) { CurrentClusterSize = LastCluster->ClusterSize; ActiveClusterGroup = LastCluster->ClusterGroup; inst->SetActiveCluster(ActiveClusterGroup); - InstrList = std::move(LastCluster->InstrList); + InstrList = std::move(LastCluster->InstrList); LastCluster.reset(); // Release current cluster pointer @@ -806,42 +810,52 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst) { LastCluster = std::move(PastClustersList.back()); PastClustersList.pop_back(); } - if (InstructionsScheduledInEachCluster[ActiveClusterGroup] != dataDepGraph_->getMaxInstructionsInCluster(ActiveClusterGroup)) { - CurrentClusterBlocks--; - assert(CurrentClusterBlocks >= MaxClusterBlocks); - } + + // If we backtracked into another cluster that has not yet + // scheduled all of its instructions in the cluster, 
then undo our + // remaining cluster block estimate. There is a possibility that it + // is able to cluster all of the instructions in its cluster block + // and does not need an extra block. + if (InstructionsScheduledInEachCluster[ActiveClusterGroup] != + dataDepGraph_->getTotalInstructionsInCluster( + ActiveClusterGroup)) { + CurrentClusterBlocks--; + assert(CurrentClusterBlocks >= MinClusterBlocks); + } } } } } else if (LastCluster) { if (LastCluster->InstNum == inst->GetNum()) { - // Case 2: If there was a previous cluster and - // this instruction ended the cluster then restore the previous - // cluster's state + // Case 4: If there was a previous cluster and this instruction + // ended the cluster then restore the previous cluster's state CurrentClusterSize = LastCluster->ClusterSize; ActiveClusterGroup = LastCluster->ClusterGroup; inst->SetActiveCluster(ActiveClusterGroup); - InstrList = std::move(LastCluster->InstrList); + InstrList = std::move(LastCluster->InstrList); - LastCluster.reset(); // Release current cluster pointer + LastCluster.reset(); // Get previous cluster from vector list if (!PastClustersList.empty()) { LastCluster = std::move(PastClustersList.back()); PastClustersList.pop_back(); } - //Logger::Info("Inst %d popping cluster size %d off the stacks", - // inst->GetNum(), CurrentClusterSize); - - if (InstructionsScheduledInEachCluster[ActiveClusterGroup] != dataDepGraph_->getMaxInstructionsInCluster(ActiveClusterGroup)) { - CurrentClusterBlocks--; - assert(CurrentClusterBlocks >= MaxClusterBlocks); - } + + // If we backtracked into another cluster that has not yet + // scheduled all of its instructions in the cluster, then undo our + // remaining cluster block estimate. There is a possibility that it is + // able to cluster all of the instructions in its cluster block and + // does not need an extra block. 
+ if (InstructionsScheduledInEachCluster[ActiveClusterGroup] != + dataDepGraph_->getTotalInstructionsInCluster(ActiveClusterGroup)) { + CurrentClusterBlocks--; + assert(CurrentClusterBlocks >= MinClusterBlocks); + } } } } -// Logger::Info("unschedule, Currently active cluster %d", ActiveClusterGroup); defCnt = inst->GetDefs(defs); useCnt = inst->GetUses(uses); @@ -1107,6 +1121,7 @@ InstCount BBWithSpill::UpdtOptmlSched(InstSchedule *crntSched, enumBestSched_->Copy(crntSched); bestSched_ = enumBestSched_; + // Print the instructions in the clusters after finding a schedule. if (isSecondPass && ClusterMemoryOperations) { dbgs() << "Printing clustered instructions:\n"; int i = 1; @@ -1172,7 +1187,7 @@ bool BBWithSpill::ChkCostFsblty(InstCount trgtLngth, EnumTreeNode *node) { } else { crntCost = crntSpillCost_ * SCW_ + trgtLngth * schedCostFactor_; } - // TODO: Implement cost function for clustering + // Add the cost of clustering if (isSecondPass && ClusterMemoryOperations) crntCost += CurrentClusterBlocks * ClusteringWeight; diff --git a/lib/Scheduler/data_dep.cpp b/lib/Scheduler/data_dep.cpp index 4db8ace6..24100b05 100644 --- a/lib/Scheduler/data_dep.cpp +++ b/lib/Scheduler/data_dep.cpp @@ -198,8 +198,8 @@ DataDepGraph::DataDepGraph(MachineModel *machMdl, LATENCY_PRECISION ltncyPrcsn) RegFiles = llvm::make_unique(machMdl_->GetRegTypeCnt()); - MaxClusterCount = 0; - MaxInstructionsInAllClusters = 0; + MinClusterCount = 0; + TotalInstructionsInAllClusters = 0; } DataDepGraph::~DataDepGraph() { @@ -214,7 +214,7 @@ DataDepGraph::~DataDepGraph() { delete[] instCntPerType_; } -int DataDepGraph::getMaxInstructionsInCluster(int Cluster) { +int DataDepGraph::getTotalInstructionsInCluster(int Cluster) { assert(Cluster > 0); return MaxInstructionsInEachClusters[Cluster]; } diff --git a/lib/Scheduler/ready_list.cpp b/lib/Scheduler/ready_list.cpp index ad377e84..65620e03 100644 --- a/lib/Scheduler/ready_list.cpp +++ b/lib/Scheduler/ready_list.cpp @@ -14,9 +14,13 @@ 
ReadyList::ReadyList(DataDepGraph *dataDepGraph, SchedPriorities prirts) { useCntBits_ = crtclPathBits_ = scsrCntBits_ = ltncySumBits_ = nodeID_Bits_ = inptSchedOrderBits_ = 0; - if (prirts_.isDynmc) + if (prirts_.isDynmc) { keyedEntries_ = new KeyedEntry *[dataDepGraph->GetInstCnt()]; + for (int j = 0; j < dataDepGraph->GetInstCnt(); j++) { + keyedEntries_[j] = nullptr; + } + } else keyedEntries_ = nullptr; @@ -32,9 +36,6 @@ ReadyList::ReadyList(DataDepGraph *dataDepGraph, SchedPriorities prirts) { break; case LSH_LUC: - for (int j = 0; j < dataDepGraph->GetInstCnt(); j++) { - keyedEntries_[j] = NULL; - } maxUseCnt_ = dataDepGraph->GetMaxUseCnt(); useCntBits_ = Utilities::clcltBitsNeededToHoldNum(maxUseCnt_); totKeyBits += useCntBits_; @@ -72,7 +73,10 @@ ReadyList::ReadyList(DataDepGraph *dataDepGraph, SchedPriorities prirts) { totKeyBits += ltncySumBits_; break; - case LSH_MEM: + case LSH_CLUSTER: + // Bits needed: 1 + // 0: Not part of an active cluster + // 1: Part of an active cluster ClusterBit = Utilities::clcltBitsNeededToHoldNum(1); totKeyBits += ClusterBit; break; @@ -123,7 +127,7 @@ ReadyList::ReadyList(DataDepGraph *dataDepGraph, SchedPriorities prirts) { maxLtncySum_); break; - case LSH_MEM: + case LSH_CLUSTER: AddPrirtyToKey_(maxPriority_, keySize, ClusterBit, 1, 1); break; @@ -212,7 +216,8 @@ unsigned long ReadyList::CmputKey_(SchedInstruction *inst, bool isUpdate, maxLtncySum_); break; - case LSH_MEM: + case LSH_CLUSTER: + // Partially copied how LUC is calculated to be updated. if (inst->GetClusterGroup() == 0) ValueForKey = 0; else { @@ -224,7 +229,7 @@ unsigned long ReadyList::CmputKey_(SchedInstruction *inst, bool isUpdate, } ValueForKey = inst->GetClusterGroup() == SchedInstruction::GetActiveCluster() ? 
1 - : 0; + : 0; } AddPrirtyToKey_(key, keySize, ClusterBit, ValueForKey, 1); break; diff --git a/lib/Scheduler/sched_basic_data.cpp b/lib/Scheduler/sched_basic_data.cpp index c858be34..c59f7e7c 100644 --- a/lib/Scheduler/sched_basic_data.cpp +++ b/lib/Scheduler/sched_basic_data.cpp @@ -4,6 +4,7 @@ using namespace llvm::opt_sched; +// Initially set the active clustering to 0 for none. int SchedInstruction::ActiveCluster = 0; SchedInstruction::SchedInstruction(InstCount num, const string &name, diff --git a/lib/Scheduler/sched_region.cpp b/lib/Scheduler/sched_region.cpp index 5ff6ae94..0954cbb9 100644 --- a/lib/Scheduler/sched_region.cpp +++ b/lib/Scheduler/sched_region.cpp @@ -601,13 +601,6 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( } #endif -#ifdef IS_DEBUG_MEMORY_CLUSTERING - if (isSecondPass) { - Logger::Info("Printing final schedule."); - bestSched->Print(Logger::GetLogStream(), "Best Sched"); - } -#endif - return rslt; } diff --git a/lib/Wrapper/AMDGPU/OptSchedDDGWrapperGCN.cpp b/lib/Wrapper/AMDGPU/OptSchedDDGWrapperGCN.cpp index 0aaf5bc4..57aa0713 100644 --- a/lib/Wrapper/AMDGPU/OptSchedDDGWrapperGCN.cpp +++ b/lib/Wrapper/AMDGPU/OptSchedDDGWrapperGCN.cpp @@ -182,7 +182,7 @@ void OptSchedDDGWrapperGCN::convertRegFiles() { } LLVM_DEBUG(DAG->dumpLLVMRegisters()); - //LLVM_DEBUG(dumpOptSchedRegisters()); + LLVM_DEBUG(dumpOptSchedRegisters()); } void OptSchedDDGWrapperGCN::addSubRegDefs(SchedInstruction *Instr, unsigned Reg, diff --git a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp index 6dfdb4a9..4a808010 100644 --- a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp +++ b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp @@ -210,7 +210,7 @@ void OptSchedDDGWrapperBasic::addDefsAndUses() { } LLVM_DEBUG(DAG->dumpLLVMRegisters()); - //LLVM_DEBUG(dumpOptSchedRegisters()); + LLVM_DEBUG(dumpOptSchedRegisters()); } void OptSchedDDGWrapperBasic::addUse(unsigned RegUnit, InstCount Index) { @@ -505,14 +505,18 @@ void 
OptSchedDDGWrapperBasic::countBoundaryLiveness( } } -/// Partially copied from -/// https://github.com/RadeonOpenCompute/llvm/blob/roc-ocl-2.4.0/lib/CodeGen/MachineScheduler.cpp#L1554 +// Iterate through all chains found by LLVm and verify that the instructions +// are actually able to be clustered together. +// Partially copied from +// https://github.com/RadeonOpenCompute/llvm/blob/roc-ocl-2.4.0/lib/CodeGen/MachineScheduler.cpp#L1554 int OptSchedDDGWrapperBasic::clusterNeighboringMemOps( ArrayRef MemOps) { - SmallVector MemOpRecords; + // Will be set to true if clustering was found to be possible in this chain. bool ClusterPossible = false; + // Keep track of the count of instructions that are able to be clustered + // and return the number. int TotalInstructionsPossible = 0; - + SmallVector MemOpRecords; for (const SUnit *SU : MemOps) { MachineOperand *BaseOp; int64_t Offset; @@ -537,15 +541,15 @@ int OptSchedDDGWrapperBasic::clusterNeighboringMemOps( dbgs() << " Cluster possible at SU(" << SUa->NodeNum << ")- SU(" << SUb->NodeNum << ")\n"; // If clustering was possible then increase the cluster count. This only - // happens once every cluster + // happens once every new cluster if (!ClusterPossible) { ClusterPossible = true; ClusterCount++; - setMaxClusterCount(ClusterCount); + setMinClusterCount(ClusterCount); dbgs() << " Setting total cluster count to " << ClusterCount << "\n"; } - // Tell the instructions what cluster number they are in + // Tell the instructions what cluster group they are in if (insts_[SUa->NodeNum]->GetClusterGroup() == 0) { insts_[SUa->NodeNum]->SetMayCluster(ClusterCount); TotalInstructionsPossible++; @@ -560,17 +564,23 @@ int OptSchedDDGWrapperBasic::clusterNeighboringMemOps( } else ClusterLength = 1; } + // Save the total instructions possible in this cluster. This number will be + // used in enumeration to estimate an optimistic cost on the remaining + // cluster blocks. 
MaxInstructionsInEachClusters.insert(std::make_pair(ClusterCount, TotalInstructionsPossible)); + + // Return the total number of instructions in this cluster block return TotalInstructionsPossible; } -/// Iterate through SUnits and find all possible clustering then transfer -/// the information so that our scheduler can access it. -/// Partially copied from https://github.com/RadeonOpenCompute/llvm/blob/roc-ocl-2.4.0/lib/CodeGen/MachineScheduler.cpp#L1595 +// Iterate through SUnits and find all possible clustering using LLVM/AMD's +// method for possible clustering detection then transfer the information to +// our scheduler so that our scheduler can access it during enumeration. +// Partially copied from https://github.com/RadeonOpenCompute/llvm/blob/roc-ocl-2.4.0/lib/CodeGen/MachineScheduler.cpp#L1595 int OptSchedDDGWrapperBasic::findPossibleClusters(bool IsLoad) { - // TODO: Add For-loop to also do store clusters. Currently only does load - // clusters + // The count of all of the instructions that are in a load/store cluster. int TotalInstructionsPossible = 0; + // Map DAG NodeNum to store chain ID. DenseMap StoreChainIDs; // Map each store chain to a set of dependent MemOps. SmallVector, 32> StoreChainDependents; @@ -580,7 +590,10 @@ int OptSchedDDGWrapperBasic::findPossibleClusters(bool IsLoad) { continue; auto MI = SU.getInstr(); - dbgs() << "Instruction (" << SU.NodeNum << ") " << DAG->TII->getName(MI->getOpcode()) << " may " << (IsLoad ? "load" : "store") << "\n"; + // Print which instruction may load or store. Used for debugging purposes. + dbgs() << "Instruction (" << SU.NodeNum << ") " << + DAG->TII->getName(MI->getOpcode()) << " may " << + (IsLoad ? "load" : "store") << "\n"; unsigned ChainPredID = DAG->SUnits.size(); for (const SDep &Pred : SU.Preds) { @@ -601,14 +614,15 @@ int OptSchedDDGWrapperBasic::findPossibleClusters(bool IsLoad) { // Iterate over the store chains. 
for (auto &SCD : StoreChainDependents) { + // Print the chain that LLVM has found dbgs() << "Printing the Node ID of the current chain: "; for (auto SU1 : SCD) dbgs() << SU1->NodeNum << " "; dbgs() << '\n'; + TotalInstructionsPossible += clusterNeighboringMemOps(SCD); } return TotalInstructionsPossible; -// setMaxInstructionsInAllClusters(TotalInstructionsPossible); } LLVMRegTypeFilter::LLVMRegTypeFilter( diff --git a/lib/Wrapper/OptimizingScheduler.cpp b/lib/Wrapper/OptimizingScheduler.cpp index ee75b2e3..528801fc 100644 --- a/lib/Wrapper/OptimizingScheduler.cpp +++ b/lib/Wrapper/OptimizingScheduler.cpp @@ -49,7 +49,7 @@ constexpr struct { } HeuristicNames[] = { {"CP", LSH_CP}, {"LUC", LSH_LUC}, {"UC", LSH_UC}, {"NID", LSH_NID}, {"CPR", LSH_CPR}, {"ISO", LSH_ISO}, {"SC", LSH_SC}, {"LS", LSH_LS}, - {"LLVM", LSH_LLVM}, {"MEM", LSH_MEM} + {"LLVM", LSH_LLVM}, {"CLUSTER", LSH_CLUSTER} }; // Default path to the the configuration directory for opt-sched. @@ -378,24 +378,37 @@ void ScheduleDAGOptSched::schedule() { OST->createDDGWrapper(C, this, MM.get(), LatencyPrecision, RegionName); DDG->convertSUnits(); DDG->convertRegFiles(); + + // Find all clusterable instructions for the second pass. 
if (SecondPass) { dbgs() << "Finding load clusters.\n"; int TotalLoadsInstructionsClusterable = DDG->findPossibleClusters(true); if (TotalLoadsInstructionsClusterable == 0) dbgs() << " No load clustering possible\n"; + dbgs() << "Finding store clusters.\n"; int TotalStoreInstructionsClusterable = DDG->findPossibleClusters(false); if (TotalStoreInstructionsClusterable == 0) dbgs() << " No store clustering possible\n"; - auto DDG2 = static_cast(DDG.get()); - Logger::Info("Total clusterable instructions: %d loads, %d stores", TotalLoadsInstructionsClusterable, TotalStoreInstructionsClusterable); - DDG2->setMaxInstructionsInAllClusters(TotalLoadsInstructionsClusterable + TotalStoreInstructionsClusterable); - int end = DDG2->getMaxClusterCount(); + Logger::Info("Total clusterable instructions: %d loads, %d stores", + TotalLoadsInstructionsClusterable, TotalStoreInstructionsClusterable); + + // Get the DDG instance so that we can set and get information that will be + // read later on during enumeration. + auto DataDepGraphInstance = static_cast(DDG.get()); + // Store total instructions in all clusters in the DDG instance. + DataDepGraphInstance->setTotalInstructionsInAllClusters( + TotalLoadsInstructionsClusterable + TotalStoreInstructionsClusterable); + int end = DataDepGraphInstance->getMinClusterCount(); + + // Iterate through all of the cluster blocks and print the total + // instructions in each block. 
if (end > 0) { Logger::Info("Total clusters in region: %d", end); for (int begin = 1; begin <= end; begin++) { - Logger::Info(" Cluster %d has total instructions %d", begin, DDG2->getMaxInstructionsInCluster(begin)); + Logger::Info(" Cluster %d has total instructions %d", begin, + DataDepGraphInstance->getTotalInstructionsInCluster(begin)); } } } @@ -576,8 +589,6 @@ void ScheduleDAGOptSched::loadOptSchedConfig() { LowerBoundAlgorithm = parseLowerBoundAlgorithm(); HeuristicPriorities = parseHeuristic(schedIni.GetString("HEURISTIC")); EnumPriorities = parseHeuristic(schedIni.GetString("ENUM_HEURISTIC")); - SecondPassPriorities = - parseHeuristic(schedIni.GetString("SECOND_PASS_HEURISTIC")); SecondPassEnumPriorities = parseHeuristic(schedIni.GetString("SECOND_PASS_ENUM_HEURISTIC")); SCF = parseSpillCostFunc(); @@ -686,7 +697,7 @@ SchedPriorities ScheduleDAGOptSched::parseHeuristic(const std::string &Str) { Priorities.vctr[Priorities.cnt++] = LSH; switch (LSH) { // Is LUC still the only dynamic heuristic? - case LSH_MEM: + case LSH_CLUSTER: case LSH_LUC: Priorities.isDynmc = true; break; @@ -841,7 +852,6 @@ void ScheduleDAGOptSched::scheduleOptSchedBalanced() { // Set the heuristic for the enumerator in the second pass. EnumPriorities = SecondPassEnumPriorities; - HeuristicPriorities = SecondPassPriorities; // Force the input to the balanced scheduler to be the sequential order of the // (hopefully) good register pressure schedule. We don’t want the list From 46b9542a77e7d38c49635e311832c3d3819d9d3a Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Thu, 30 Apr 2020 12:16:46 -0700 Subject: [PATCH 29/40] Fix not accounting for multiple clusters within the same store-chain. 
--- lib/Wrapper/OptSchedDDGWrapperBasic.cpp | 61 ++++++++++++++++--------- 1 file changed, 40 insertions(+), 21 deletions(-) diff --git a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp index 4a808010..a976a8c4 100644 --- a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp +++ b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp @@ -512,15 +512,17 @@ void OptSchedDDGWrapperBasic::countBoundaryLiveness( int OptSchedDDGWrapperBasic::clusterNeighboringMemOps( ArrayRef MemOps) { // Will be set to true if clustering was found to be possible in this chain. - bool ClusterPossible = false; + bool InitForNewCluster = true; // Keep track of the count of instructions that are able to be clustered // and return the number. int TotalInstructionsPossible = 0; + int InstructionsInEachCluster = 0; SmallVector MemOpRecords; for (const SUnit *SU : MemOps) { MachineOperand *BaseOp; int64_t Offset; - if (DAG->TII->getMemOperandWithOffset(*SU->getInstr(), BaseOp, Offset, DAG->TRI)) + if (DAG->TII->getMemOperandWithOffset(*SU->getInstr(), BaseOp, Offset, + DAG->TRI)) MemOpRecords.push_back(MemOpInfo(SU, BaseOp, Offset)); } @@ -534,16 +536,18 @@ int OptSchedDDGWrapperBasic::clusterNeighboringMemOps( for (unsigned Idx = 0, End = MemOpRecords.size(); Idx < (End - 1); ++Idx) { const SUnit *SUa = MemOpRecords[Idx].SU; const SUnit *SUb = MemOpRecords[Idx + 1].SU; - dbgs() << " Checking possible clustering of (" << SUa->NodeNum << ") and (" << SUb->NodeNum << ")\n"; + dbgs() << " Checking possible clustering of (" << SUa->NodeNum << ") and (" + << SUb->NodeNum << ")\n"; if (DAG->TII->shouldClusterMemOps(*MemOpRecords[Idx].BaseOp, - *MemOpRecords[Idx + 1].BaseOp, - ClusterLength)) { - dbgs() << " Cluster possible at SU(" << SUa->NodeNum << ")- SU(" << SUb->NodeNum << ")\n"; - - // If clustering was possible then increase the cluster count. 
This only + *MemOpRecords[Idx + 1].BaseOp, + ClusterLength)) { + dbgs() << " Cluster possible at SU(" << SUa->NodeNum << ")- SU(" + << SUb->NodeNum << ")\n"; + + // If clustering is possible then increase the cluster count. This only // happens once every new cluster - if (!ClusterPossible) { - ClusterPossible = true; + if (InitForNewCluster) { + InitForNewCluster = false; ClusterCount++; setMinClusterCount(ClusterCount); dbgs() << " Setting total cluster count to " << ClusterCount << "\n"; @@ -552,22 +556,36 @@ int OptSchedDDGWrapperBasic::clusterNeighboringMemOps( // Tell the instructions what cluster group they are in if (insts_[SUa->NodeNum]->GetClusterGroup() == 0) { insts_[SUa->NodeNum]->SetMayCluster(ClusterCount); - TotalInstructionsPossible++; + InstructionsInEachCluster++; } if (insts_[SUb->NodeNum]->GetClusterGroup() == 0) { insts_[SUb->NodeNum]->SetMayCluster(ClusterCount); - TotalInstructionsPossible++; + InstructionsInEachCluster++; } ++ClusterLength; - } else + } else { + if (!InitForNewCluster) { + // If a cluster was initialized and started then the information before + // starting a new one. + MaxInstructionsInEachClusters.insert( + std::make_pair(ClusterCount, InstructionsInEachCluster)); + TotalInstructionsPossible += InstructionsInEachCluster; + InitForNewCluster = true; + InstructionsInEachCluster = 0; + } ClusterLength = 1; + } } // Save the total instructions possible in this cluster. This number will be // used in enumeration to estimate an optimistic cost on the remaining - // cluster blocks. 
- MaxInstructionsInEachClusters.insert(std::make_pair(ClusterCount, TotalInstructionsPossible)); + // cluster blocks.i + if (!InitForNewCluster) { + MaxInstructionsInEachClusters.insert( + std::make_pair(ClusterCount, InstructionsInEachCluster)); + TotalInstructionsPossible += InstructionsInEachCluster; + } // Return the total number of instructions in this cluster block return TotalInstructionsPossible; @@ -576,7 +594,8 @@ int OptSchedDDGWrapperBasic::clusterNeighboringMemOps( // Iterate through SUnits and find all possible clustering using LLVM/AMD's // method for possible clustering detection then transfer the information to // our scheduler so that our scheduler can access it during enumeration. -// Partially copied from https://github.com/RadeonOpenCompute/llvm/blob/roc-ocl-2.4.0/lib/CodeGen/MachineScheduler.cpp#L1595 +// Partially copied from +// https://github.com/RadeonOpenCompute/llvm/blob/roc-ocl-2.4.0/lib/CodeGen/MachineScheduler.cpp#L1595 int OptSchedDDGWrapperBasic::findPossibleClusters(bool IsLoad) { // The count of all of the instructions that are in a load/store cluster. int TotalInstructionsPossible = 0; @@ -591,9 +610,9 @@ int OptSchedDDGWrapperBasic::findPossibleClusters(bool IsLoad) { auto MI = SU.getInstr(); // Print which instruction may load or store. Used for debugging purposes. - dbgs() << "Instruction (" << SU.NodeNum << ") " << - DAG->TII->getName(MI->getOpcode()) << " may " << - (IsLoad ? "load" : "store") << "\n"; + dbgs() << "Instruction (" << SU.NodeNum << ") " + << DAG->TII->getName(MI->getOpcode()) << " may " + << (IsLoad ? 
"load" : "store") << "\n"; unsigned ChainPredID = DAG->SUnits.size(); for (const SDep &Pred : SU.Preds) { @@ -617,12 +636,12 @@ int OptSchedDDGWrapperBasic::findPossibleClusters(bool IsLoad) { // Print the chain that LLVM has found dbgs() << "Printing the Node ID of the current chain: "; for (auto SU1 : SCD) - dbgs() << SU1->NodeNum << " "; + dbgs() << SU1->NodeNum << " "; dbgs() << '\n'; TotalInstructionsPossible += clusterNeighboringMemOps(SCD); } - return TotalInstructionsPossible; + return TotalInstructionsPossible; } LLVMRegTypeFilter::LLVMRegTypeFilter( From 19184f523f8c52a0754e3c6442ca26c15f5b5310 Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Wed, 3 Jun 2020 23:29:36 -0500 Subject: [PATCH 30/40] Working implementation of clustering using B&B. No history domination. Currently working implementation of clustering with B&B. No hard limits on cluster size when using AMD's shouldClusterMemOps() function but there is a hard limit of 15 during B&B. Currently still debugging history domination. 
--- include/opt-sched/Scheduler/bb_spill.h | 48 ++- include/opt-sched/Scheduler/enumerator.h | 67 ++- include/opt-sched/Scheduler/graph.h | 2 +- include/opt-sched/Scheduler/hist_table.h | 5 +- include/opt-sched/Scheduler/sched_region.h | 4 + lib/Scheduler/bb_spill.cpp | 462 ++++++++++++--------- lib/Scheduler/enumerator.cpp | 73 +++- lib/Scheduler/hist_table.cpp | 47 +++ lib/Scheduler/ready_list.cpp | 18 +- lib/Scheduler/sched_basic_data.cpp | 2 - lib/Scheduler/sched_region.cpp | 1 + lib/Wrapper/OptSchedDDGWrapperBasic.cpp | 7 +- 12 files changed, 478 insertions(+), 258 deletions(-) diff --git a/include/opt-sched/Scheduler/bb_spill.h b/include/opt-sched/Scheduler/bb_spill.h index 5a5b1ced..e0a55a8f 100644 --- a/include/opt-sched/Scheduler/bb_spill.h +++ b/include/opt-sched/Scheduler/bb_spill.h @@ -35,21 +35,53 @@ class BBWithSpill : public SchedRegion { InstCount crntSpillCost_; InstCount optmlSpillCost_; + int CurrentClusterCost; + + /// Used to calculate the dynamic lower bound for clustering. + llvm::SmallVector ClusterCount; + llvm::SmallVector ClusterInstrRemainderCount; + int ClusterGroupCount; + + /// Print the current clusters found so far in the schedule. + void printCurrentClustering(); + + void initForClustering(); + + /// Calculate the lower bound cost for memory operations clustering and + /// return the lower bound cost. Does not take into account the clustering + /// weight. + int calculateClusterStaticLB(); + + /// Helper function for clustering to save the state of the current cluster. + void saveCluster(SchedInstruction *inst); + + /// Helper function for clustering to start a new clustering. + void initCluster(SchedInstruction *inst); + + /// Reset the active cluster to 0 (none). + void resetActiveCluster(SchedInstruction *inst); + + /// Helper function to restore the previous cluster. 
+ void restorePreviousCluster(SchedInstruction *inst); + + bool isClusterFinished(); + + int calculateClusterDLB(); /// Current cluster size unsigned int CurrentClusterSize; - MapVector InstructionsScheduledInEachCluster; - /// The minimum amount of cluster blocks possible. int MinClusterBlocks; /// The minimum amount of cluster blocks + the optimistic expected cluster /// blocks remaining. - int CurrentClusterBlocks; + int DynamicClusterLowerBound; /// Current active cluster group. - int ActiveClusterGroup; + int ClusterActiveGroup; + + int StartCycle; /// Flag to enable or disable clustering memory operations in the ILP pass. /// Reads from the sched.ini file then set the flag accordingly. @@ -70,13 +102,15 @@ class BBWithSpill : public SchedRegion { /// restore the cluster state when backtracking. int InstNum; + int Start; + /// Contains the actual names of the instructions in the cluster. Only used /// for printing and debugging purposes. std::unique_ptr> InstrList; /// Constructor for this struct - PastClusters(int Cluster, int Size, int Instructions) - : ClusterGroup(Cluster), ClusterSize(Size), InstNum(Instructions) {} + PastClusters(int Cluster, int Size, int Instructions, int CycleStart) + : ClusterGroup(Cluster), ClusterSize(Size), InstNum(Instructions), Start(CycleStart) {} }; /// Vector containing the (n-1) past clusters @@ -161,7 +195,7 @@ class BBWithSpill : public SchedRegion { void InitForCostCmputtn_(); InstCount CmputDynmcCost_(); - void UpdateSpillInfoForSchdul_(SchedInstruction *inst, bool trackCnflcts); + void UpdateSpillInfoForSchdul_(SchedInstruction *inst, bool trackCnflcts, int Start); void UpdateSpillInfoForUnSchdul_(SchedInstruction *inst); void SetupPhysRegs_(); void CmputCrntSpillCost_(); diff --git a/include/opt-sched/Scheduler/enumerator.h b/include/opt-sched/Scheduler/enumerator.h index be2f376f..d165ddd0 100644 --- a/include/opt-sched/Scheduler/enumerator.h +++ b/include/opt-sched/Scheduler/enumerator.h @@ -153,6 +153,12 @@ class 
EnumTreeNode { InstCount peakSpillCost_; InstCount spillCostSum_; InstCount totalCost_ = -1; + int ClusterCost; + int ClusterActiveGroup; + int ClusterAbsorbCount; + int ClusterDLB; + int ClusterTotalCost = -1; + int ClusterBestCost; bool totalCostIsActualCost_ = false; ReserveSlot *rsrvSlots_; @@ -276,6 +282,18 @@ class EnumTreeNode { inline void SetSpillCostSum(InstCount cost); inline InstCount GetSpillCostSum(); + inline void setClusteringCost(int Cost); + inline int getClusteringCost(); + inline void setCurClusteringGroup(int Group); + inline int getCurClusteringGroup(); + inline void setClusterAbsorbCount(int Absorb); + inline int getClusterAbsorbCount(); + inline void setClusterLwrBound(int ClusterDynamicLowerBound); + inline int getClusterLwrBound(); + inline void setTotalClusterCost(int Cost); + inline int getTotalClusterCost(); + inline bool isClustering(); + bool ChkInstRdndncy(SchedInstruction *inst, int brnchNum); bool IsNxtSlotStall(); @@ -317,6 +335,9 @@ class Enumerator : public ConstrainedScheduler { friend class HistEnumTreeNode; friend class CostHistEnumTreeNode; + // Should we cluster memory operations + bool Clustering; + // TODO(max): Document. bool isCnstrctd_; @@ -508,7 +529,7 @@ class Enumerator : public ConstrainedScheduler { InstCount schedUprBound, int16_t sigHashSize, SchedPriorities prirts, Pruning PruningStrategy, bool SchedForRPOnly, bool enblStallEnum, Milliseconds timeout, - InstCount preFxdInstCnt = 0, + bool ClusteringEnabled, InstCount preFxdInstCnt = 0, SchedInstruction *preFxdInsts[] = NULL); virtual ~Enumerator(); virtual void Reset(); @@ -525,6 +546,8 @@ class Enumerator : public ConstrainedScheduler { // (Chris) inline bool IsSchedForRPOnly() const { return SchedForRPOnly_; } + inline bool isClustering() const { return Clustering; } + // Calculates the schedule and returns it in the passed argument. 
FUNC_RESULT FindSchedule(InstSchedule *sched, SchedRegion *rgn) { return RES_ERROR; @@ -586,6 +609,7 @@ class LengthCostEnumerator : public Enumerator { bool WasObjctvMet_(); bool BackTrack_(); InstCount GetBestCost_(); + int GetBestClusterCost_(); void CreateRootNode_(); // Check if branching from the current node by scheduling this instruction @@ -603,7 +627,7 @@ class LengthCostEnumerator : public Enumerator { SchedPriorities prirts, Pruning PruningStrategy, bool SchedForRPOnly, bool enblStallEnum, Milliseconds timeout, SPILL_COST_FUNCTION spillCostFunc, - InstCount preFxdInstCnt = 0, + bool ClusteringEnabled, InstCount preFxdInstCnt = 0, SchedInstruction *preFxdInsts[] = NULL); virtual ~LengthCostEnumerator(); void Reset(); @@ -616,6 +640,7 @@ class LengthCostEnumerator : public Enumerator { bool IsCostEnum(); SPILL_COST_FUNCTION GetSpillCostFunc() { return spillCostFunc_; } inline InstCount GetBestCost() { return GetBestCost_(); } + int getBestClusterCost() { return GetBestClusterCost_(); } }; /*****************************************************************************/ @@ -851,6 +876,44 @@ void EnumTreeNode::SetSpillCostSum(InstCount cost) { InstCount EnumTreeNode::GetSpillCostSum() { return spillCostSum_; } /*****************************************************************************/ +void EnumTreeNode::setClusteringCost(int Cost) { + assert(Cost >= 0); + ClusterCost = Cost; +} + +int EnumTreeNode::getClusteringCost() { return ClusterCost; } + +void EnumTreeNode::setCurClusteringGroup(int Group) { + assert(Group >= 0); + ClusterActiveGroup = Group; +} + +int EnumTreeNode::getCurClusteringGroup() { return ClusterActiveGroup; } + +void EnumTreeNode::setClusterAbsorbCount(int Absorb) { + assert(Absorb >= 0); + ClusterAbsorbCount = Absorb; +} + +int EnumTreeNode::getClusterAbsorbCount() { return ClusterAbsorbCount; } + +void EnumTreeNode::setClusterLwrBound(int ClusterDynamicLowerBound) { + assert(ClusterDynamicLowerBound >= 0); + ClusterDLB = 
ClusterDynamicLowerBound; +} + +int EnumTreeNode::getClusterLwrBound() { return ClusterDLB; } + +void EnumTreeNode::setTotalClusterCost(int Cost) { + assert(Cost >= 0); + ClusterTotalCost = Cost; +} + +int EnumTreeNode::getTotalClusterCost() { return ClusterTotalCost; } + +bool EnumTreeNode::isClustering() { return enumrtr_->isClustering(); } +/*****************************************************************************/ + bool EnumTreeNode::IsNxtCycleNew_() { if (enumrtr_->issuRate_ == 1) { return true; diff --git a/include/opt-sched/Scheduler/graph.h b/include/opt-sched/Scheduler/graph.h index af8ba8f2..790b7164 100644 --- a/include/opt-sched/Scheduler/graph.h +++ b/include/opt-sched/Scheduler/graph.h @@ -512,7 +512,7 @@ inline UDT_GEDGES GraphNode::GetRcrsvScsrCnt() const { } inline LinkedList *GraphNode::GetNghbrLst(DIRECTION dir) { - return dir == DIR_FRWRD ? scsrLst_ : prdcsrLst_; + return dir == DIR_FRWRD ? prdcsrLst_ : scsrLst_; } inline GraphEdge *GraphNode::GetFrstScsrEdge() { diff --git a/include/opt-sched/Scheduler/hist_table.h b/include/opt-sched/Scheduler/hist_table.h index 982c87a6..85f6592b 100644 --- a/include/opt-sched/Scheduler/hist_table.h +++ b/include/opt-sched/Scheduler/hist_table.h @@ -109,6 +109,10 @@ class CostHistEnumTreeNode : public HistEnumTreeNode { InstCount cost_; InstCount peakSpillCost_; InstCount spillCostSum_; + int ClusterCost; + int ClusterActiveGroup; + int ClusterAbsorbCount; + int ClusterTotalCost; // (Chris) InstCount totalCost_ = -1; @@ -119,7 +123,6 @@ class CostHistEnumTreeNode : public HistEnumTreeNode { #ifdef IS_DEBUG bool costInfoSet_; #endif - bool ChkCostDmntnForBBSpill_(EnumTreeNode *node, Enumerator *enumrtr); bool ChkCostDmntn_(EnumTreeNode *node, Enumerator *enumrtr, InstCount &maxShft); diff --git a/include/opt-sched/Scheduler/sched_region.h b/include/opt-sched/Scheduler/sched_region.h index de36f85b..553d73b8 100644 --- a/include/opt-sched/Scheduler/sched_region.h +++ 
b/include/opt-sched/Scheduler/sched_region.h @@ -58,6 +58,7 @@ class SchedRegion { inline int GetCostLwrBound() { return costLwrBound_; } // Returns the best cost found so far for this region. inline InstCount GetBestCost() { return bestCost_; } + inline int getBestClusterCost() { return BestClusterCost; } // Returns a pointer to the list scheduler heurisitcs. inline SchedPriorities GetHeuristicPriorities() { return hurstcPrirts_; } // Get the number of simulated spills code added for this block. @@ -132,6 +133,7 @@ class SchedRegion { // The best results found so far. InstCount bestCost_; + int BestClusterCost; InstCount bestSchedLngth_; // (Chris): The cost function. Defaults to PERP. @@ -180,6 +182,8 @@ class SchedRegion { void SetBestCost(InstCount bestCost) { bestCost_ = bestCost; } + void setBestClusterCost(int BestCost) { BestClusterCost = BestCost; } + void SetBestSchedLength(InstCount bestSchedLngth) { bestSchedLngth_ = bestSchedLngth; } const SchedPriorities& GetEnumPriorities() const { return enumPrirts_; } diff --git a/lib/Scheduler/bb_spill.cpp b/lib/Scheduler/bb_spill.cpp index 2b89f76b..6920eae8 100644 --- a/lib/Scheduler/bb_spill.cpp +++ b/lib/Scheduler/bb_spill.cpp @@ -27,6 +27,9 @@ using namespace llvm::opt_sched; // The denominator used when calculating cost weight. 
static const int COST_WGHT_BASE = 10; +// The max number of instructions in a cluster +static const unsigned MAX_INSTR_IN_CLUSTER = 15; + BBWithSpill::BBWithSpill(const OptSchedTarget *OST_, DataDepGraph *dataDepGraph, long rgnNum, int16_t sigHashSize, LB_ALG lbAlg, SchedPriorities hurstcPrirts, @@ -67,23 +70,37 @@ BBWithSpill::BBWithSpill(const OptSchedTarget *OST_, DataDepGraph *dataDepGraph, schduldEntryInstCnt_ = 0; schduldExitInstCnt_ = 0; schduldInstCnt_ = 0; - - // Memory clustering variables initialization - CurrentClusterSize = 0; - ActiveClusterGroup = 0; - PastClustersList.clear(); - LastCluster = nullptr; Config &schedIni = SchedulerOptions::getInstance(); ClusterMemoryOperations = schedIni.GetBool("CLUSTER_MEMORY_OPS"); ClusteringWeight = schedIni.GetInt("CLUSTER_WEIGHT"); - MinClusterBlocks = dataDepGraph_->getMinClusterCount(); - CurrentClusterBlocks = MinClusterBlocks; - for (int begin = 1; begin <= MinClusterBlocks; begin++) { - InstructionsScheduledInEachCluster[begin] = 0; + ClusterGroupCount = dataDepGraph_->getMinClusterCount(); + MinClusterBlocks = 0; + if (ClusterMemoryOperations && ClusterGroupCount > 0) { + ClusterCount.resize(ClusterGroupCount+1); + ClusterInstrRemainderCount.resize(ClusterGroupCount+1); + MinClusterBlocks = calculateClusterStaticLB(); + initForClustering(); } } /****************************************************************************/ +void BBWithSpill::initForClustering() { + // Memory clustering variables initialization + SchedInstruction::SetActiveCluster(0); + CurrentClusterSize = 0; + ClusterActiveGroup = 0; + CurrentClusterCost = 0; + PastClustersList.clear(); + LastCluster.reset(); + InstrList.reset(); + DynamicClusterLowerBound = 0; + + for (int begin = 1; begin <= ClusterGroupCount; begin++) { + ClusterCount[begin] = 0; + ClusterInstrRemainderCount[begin] = dataDepGraph_->getTotalInstructionsInCluster(begin); + } +} + BBWithSpill::~BBWithSpill() { if (enumrtr_ != NULL) { delete enumrtr_; @@ -96,6 +113,25 @@ 
BBWithSpill::~BBWithSpill() { } /*****************************************************************************/ +int BBWithSpill::calculateClusterStaticLB() { + // No cluster in this scheduling region + if (ClusterGroupCount == 0) + return 0; + + // Calculate the minimum cluster blocks that will be needed to cluster all of + // the instructions. The maximum amount in a cluster block is determined by + // the constant MAX_INSTR_IN_CLUSTER. + int ClusterCost = 0; + for (int begin = 1; begin <= ClusterGroupCount; begin++) { + int InstructionCount = dataDepGraph_->getTotalInstructionsInCluster(begin); + int CurrentClusterCost = std::ceil(double(InstructionCount) / MAX_INSTR_IN_CLUSTER); + Logger::Info("Cost for block %d is %d", begin, CurrentClusterCost); + ClusterCost += CurrentClusterCost; + } + + return ClusterCost; +} + bool BBWithSpill::EnableEnum_() { return true; /* @@ -338,8 +374,6 @@ InstCount BBWithSpill::CmputCostLwrBound() { void BBWithSpill::InitForSchdulng() { InitForCostCmputtn_(); - InstrList.reset(); - schduldEntryInstCnt_ = 0; schduldExitInstCnt_ = 0; schduldInstCnt_ = 0; @@ -347,18 +381,8 @@ void BBWithSpill::InitForSchdulng() { /*****************************************************************************/ void BBWithSpill::InitForCostCmputtn_() { - // Init/Reset memory clustering values if it is enabled - if (IsSecondPass() && ClusterMemoryOperations) { - SchedInstruction::SetActiveCluster(0); - CurrentClusterSize = 0; - ActiveClusterGroup = 0; - PastClustersList.clear(); - LastCluster.reset(); - CurrentClusterBlocks = MinClusterBlocks; - for (int begin = 1; begin <= MinClusterBlocks; begin++) { - InstructionsScheduledInEachCluster[begin] = 0; - } -} + if (IsSecondPass() && ClusterMemoryOperations) + initForClustering(); int i; @@ -424,8 +448,11 @@ InstCount BBWithSpill::CmputCost_(InstSchedule *sched, COST_COMP_MODE compMode, execCost = cost; cost += crntSpillCost_ * SCW_; // Add the current clustering cost - if (IsSecondPass() && 
ClusterMemoryOperations) - cost += CurrentClusterBlocks * ClusteringWeight; + if (IsSecondPass() && ClusterMemoryOperations) { + cost += CurrentClusterCost * ClusteringWeight; + assert(calculateClusterDLB() == CurrentClusterCost); + } + sched->SetSpillCosts(spillCosts_); sched->SetPeakRegPressures(peakRegPressures_); sched->SetSpillCost(crntSpillCost_); @@ -458,8 +485,85 @@ void BBWithSpill::CmputCrntSpillCost_() { } /*****************************************************************************/ +void BBWithSpill::saveCluster(SchedInstruction *inst) { + if (LastCluster) + // Save previous clusters in a vector except the last cluster + // that we just exited out of. + PastClustersList.push_back(std::move(LastCluster)); + + // Last cluster that we just exited out of, used for fast accessing + // to its contents. + LastCluster = llvm::make_unique( + ClusterActiveGroup, CurrentClusterSize, inst->GetNum(), StartCycle); + + LastCluster->InstrList = std::move(InstrList); +} + +void BBWithSpill::initCluster(SchedInstruction *inst) { + ClusterActiveGroup = inst->GetClusterGroup(); + inst->SetActiveCluster(ClusterActiveGroup); + CurrentClusterSize = 1; + ClusterInstrRemainderCount[ClusterActiveGroup]--; + InstrList = llvm::make_unique>(); + InstrList->push_back(inst->GetName()); + ClusterCount[ClusterActiveGroup]++; + CurrentClusterCost++; +} + +void BBWithSpill::resetActiveCluster(SchedInstruction *inst) { + ClusterActiveGroup = 0; + inst->SetActiveCluster(0); + CurrentClusterSize = 0; +} + +void BBWithSpill::restorePreviousCluster(SchedInstruction *inst) { + CurrentClusterSize = LastCluster->ClusterSize; + ClusterActiveGroup = LastCluster->ClusterGroup; + StartCycle = LastCluster->Start; + inst->SetActiveCluster(ClusterActiveGroup); + InstrList = std::move(LastCluster->InstrList); + LastCluster.reset(); // Release current cluster pointer + + // Get previous cluster from vector list + if (!PastClustersList.empty()) { + LastCluster = std::move(PastClustersList.back()); + 
PastClustersList.pop_back(); + } +} + +bool BBWithSpill::isClusterFinished() { + assert(ClusterActiveGroup != 0); + if (ClusterInstrRemainderCount[ClusterActiveGroup] == 0 || + CurrentClusterSize == MAX_INSTR_IN_CLUSTER) { + return true; + } + return false; +} + +int BBWithSpill::calculateClusterDLB() { + int OptimisticLowerBound = 0; + + for (int begin = 1; begin <= ClusterGroupCount; begin++) { + if (begin != ClusterActiveGroup) + OptimisticLowerBound += std::ceil(double(ClusterInstrRemainderCount[begin])/MAX_INSTR_IN_CLUSTER); + else { + // The amount of instructions remaining that the current open cluster can add + int AbsorbCount = MAX_INSTR_IN_CLUSTER - CurrentClusterSize; + // Assume the current open cluster can add the max amount of instructions + // that a cluster can contain. + int Remainder = ClusterInstrRemainderCount[begin] - AbsorbCount; + // If the remainder is negative then that indicates the open cluster can absorb all of the remaining instructions. + if (Remainder < 0) + Remainder = 0; + // Estimate the optimistic dynamic lower bound for the current cluster + OptimisticLowerBound += std::ceil(double(Remainder)/MAX_INSTR_IN_CLUSTER); + } + } + return CurrentClusterCost + OptimisticLowerBound; +} + void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, - bool trackCnflcts) { + bool trackCnflcts, int Start) { int16_t regType; int defCnt, useCnt, regNum, physRegNum; Register **defs, **uses; @@ -467,8 +571,17 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, int liveRegs; InstCount newSpillCost; +// Conditions for creating a cluster: +// 1.) If a block is ended before it reaches 15 && there are remaining instructions + +// Conditions for removing a cluster: +// 1.) If the block is not 15 && there are remaining instructions + // Scheduling cases for clustering project: // 1.) Same Cluster -> Same Cluster + // If size == MAX_INSTR_IN_CLUSTER + // Save cluster to restore + // Set active to 0 // 2.) 
Cluster -> Different Cluster // 3.) Non-Cluster -> Cluster // 4.) Cluster -> Non-Cluster @@ -479,100 +592,47 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, // Check if the current instruction is part of a cluster if (inst->GetMayCluster()) { // Check if there is a current active cluster - if (CurrentClusterSize > 0) { + // A ClusterActiveGroup == 0 indicates that there is no currently active clustering + // While ClusterActiveGroup != 0 indicates that there is active clustering + if (ClusterActiveGroup != 0) { // Check if the instruction is in the same cluster group as the active // cluster - if (ActiveClusterGroup == inst->GetClusterGroup()) { + if (ClusterActiveGroup == inst->GetClusterGroup()) { // Case 1: Simple case where the current instruction is part of an // already active cluster. CurrentClusterSize++; - InstructionsScheduledInEachCluster[ActiveClusterGroup]++; - + ClusterInstrRemainderCount[ClusterActiveGroup]--; InstrList->push_back(inst->GetName()); + // If we reach the max amount for this cluster then save the cluster + // and reset. + if (isClusterFinished()) + { + saveCluster(inst); + resetActiveCluster(inst); + } } else { // Case 2: Else the instruction is part of different cluster that // is not currently active. Store information of the old cluster // group and start clustering for the new cluster. - if (LastCluster) { - // Save previous clusters in a vector except the last cluster - // that we just exited out of. - PastClustersList.push_back(std::move(LastCluster)); - - // Last cluster that we just exited out of, used for fast accessing - // to its contents - LastCluster = llvm::make_unique( - ActiveClusterGroup, CurrentClusterSize, inst->GetNum()); - } else - // This is the first cluster block that we exited out of. 
- LastCluster = llvm::make_unique( - ActiveClusterGroup, CurrentClusterSize, inst->GetNum()); - - LastCluster->InstrList = std::move(InstrList); - - // If the old cluster did not finish clustering all possible - // instructions in its cluster then that means there have to be an - // extra cluster block to finish all of the instructions in the - // cluster. - if (InstructionsScheduledInEachCluster[ActiveClusterGroup] < - dataDepGraph_->getTotalInstructionsInCluster( - ActiveClusterGroup)) { - CurrentClusterBlocks++; - } + saveCluster(inst); // Finish setting up the new cluster - ActiveClusterGroup = inst->GetClusterGroup(); - inst->SetActiveCluster(ActiveClusterGroup); - CurrentClusterSize = 1; - InstructionsScheduledInEachCluster[ActiveClusterGroup]++; - InstrList = llvm::make_unique< - llvm::SmallVector>(); - InstrList->push_back(inst->GetName()); + initCluster(inst); + StartCycle = Start; } } else { // Case 3: Not currently clustering. Initialize clustering - ActiveClusterGroup = inst->GetClusterGroup(); - inst->SetActiveCluster(ActiveClusterGroup); - CurrentClusterSize = 1; - InstructionsScheduledInEachCluster[ActiveClusterGroup]++; - InstrList = llvm::make_unique>(); - InstrList->push_back(inst->GetName()); + initCluster(inst); + StartCycle = Start; } - } else if (CurrentClusterSize > 0) { + } else if (ClusterActiveGroup != 0) { // Case 4: Exiting out of an active cluster // Save the cluster to restore when backtracking. - if (LastCluster) { - // Save previous clusters in a vector except the last cluster - // that we just exited out of. - PastClustersList.push_back(std::move(LastCluster)); - - // Last cluster that we just exited out of, used for fast accessing - // to its contents. - LastCluster = llvm::make_unique( - ActiveClusterGroup, CurrentClusterSize, inst->GetNum()); - } else - // This is the first cluster block that we exited out of. 
- LastCluster = llvm::make_unique( - ActiveClusterGroup, CurrentClusterSize, inst->GetNum()); - - LastCluster->InstrList = std::move(InstrList); - - // If this cluster did not finish then that means there have to be an - // extra cluster block to finish all of the instructions in the cluster. - if (InstructionsScheduledInEachCluster[ActiveClusterGroup] < - dataDepGraph_->getTotalInstructionsInCluster(ActiveClusterGroup)) { - CurrentClusterBlocks++; - } - - // Assert that the total instructions accounted for doesn't exceed the - // expected total instructions in the cluster - assert(InstructionsScheduledInEachCluster[ActiveClusterGroup] <= - dataDepGraph_->getTotalInstructionsInCluster(ActiveClusterGroup)); + saveCluster(inst); // Reset active cluster - ActiveClusterGroup = 0; - inst->SetActiveCluster(0); - CurrentClusterSize = 0; + resetActiveCluster(inst); } } @@ -775,84 +835,45 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst) { if (IsSecondPass() && ClusterMemoryOperations) { // If the instruction we are backtracking from is part of a cluster if (inst->GetMayCluster()) { - // Case 1, 2, and 3 - // Reduce the cluster size - CurrentClusterSize--; - // Decrement instructions scheduled in this cluster - InstructionsScheduledInEachCluster[ActiveClusterGroup]--; - assert(InstructionsScheduledInEachCluster[ActiveClusterGroup] >= 0); - - // Remove instruction's name from the list - InstrList->pop_back(); - - // Case 2: If there are no more instructions in the currently active - // cluster then it indicates that we backtracked out of a cluster. - if (CurrentClusterSize == 0) { - // Set active cluster to none. 
- ActiveClusterGroup = 0; - inst->SetActiveCluster(0); - - // Case 3: Check If this instruction ended another cluster - if (LastCluster) { - // If so, then we need to restore the state of the previous cluster - if (LastCluster->InstNum == inst->GetNum()) { - CurrentClusterSize = LastCluster->ClusterSize; - ActiveClusterGroup = LastCluster->ClusterGroup; - inst->SetActiveCluster(ActiveClusterGroup); - - InstrList = std::move(LastCluster->InstrList); - - LastCluster.reset(); // Release current cluster pointer - - // Get previous cluster from vector list - if (!PastClustersList.empty()) { - LastCluster = std::move(PastClustersList.back()); - PastClustersList.pop_back(); - } - - // If we backtracked into another cluster that has not yet - // scheduled all of its instructions in the cluster, then undo our - // remaining cluster block estimate. There is a possibility that it - // is able to cluster all of the instructions in its cluster block - // and does not need an extra block. - if (InstructionsScheduledInEachCluster[ActiveClusterGroup] != - dataDepGraph_->getTotalInstructionsInCluster( - ActiveClusterGroup)) { - CurrentClusterBlocks--; - assert(CurrentClusterBlocks >= MinClusterBlocks); - } + if (CurrentClusterSize != 0) { + // Case 1, 2, and 3 + // Reduce the cluster size + CurrentClusterSize--; + ClusterInstrRemainderCount[ClusterActiveGroup]++; + // Remove instruction's name from the list + InstrList->pop_back(); + + // Case 2: If there are no more instructions in the currently active + // cluster then it indicates that we backtracked out of a cluster. + if (CurrentClusterSize == 0) { + ClusterCount[ClusterActiveGroup]--; + assert(ClusterCount[ClusterActiveGroup] >= 0); + CurrentClusterCost--; + // Set active cluster to none. 
+ resetActiveCluster(inst); + + // Case 3: Check If this instruction ended another cluster + if (LastCluster && LastCluster->InstNum == inst->GetNum()) { + // If so, then we need to restore the state of the previous cluster + restorePreviousCluster(inst); } } } - } else if (LastCluster) { - if (LastCluster->InstNum == inst->GetNum()) { - // Case 4: If there was a previous cluster and this instruction - // ended the cluster then restore the previous cluster's state - CurrentClusterSize = LastCluster->ClusterSize; - ActiveClusterGroup = LastCluster->ClusterGroup; - inst->SetActiveCluster(ActiveClusterGroup); - - InstrList = std::move(LastCluster->InstrList); - - LastCluster.reset(); - - // Get previous cluster from vector list - if (!PastClustersList.empty()) { - LastCluster = std::move(PastClustersList.back()); - PastClustersList.pop_back(); - } - - // If we backtracked into another cluster that has not yet - // scheduled all of its instructions in the cluster, then undo our - // remaining cluster block estimate. There is a possibility that it is - // able to cluster all of the instructions in its cluster block and - // does not need an extra block. 
- if (InstructionsScheduledInEachCluster[ActiveClusterGroup] != - dataDepGraph_->getTotalInstructionsInCluster(ActiveClusterGroup)) { - CurrentClusterBlocks--; - assert(CurrentClusterBlocks >= MinClusterBlocks); - } + // A cluster size of 0 while an instruction may cluster indicates that + // the current instruction is at the end of a finished cluster + else if (CurrentClusterSize == 0) { + assert(inst->GetNum() == LastCluster->InstNum); + restorePreviousCluster(inst); + + CurrentClusterSize--; + ClusterInstrRemainderCount[ClusterActiveGroup]++; + // Remove instruction's name from the list + InstrList->pop_back(); } + } else if (LastCluster && LastCluster->InstNum == inst->GetNum()) { + // Case 4: If there was a previous cluster and this instruction + // ended the cluster then restore the previous cluster's state + restorePreviousCluster(inst); } } @@ -963,7 +984,7 @@ void BBWithSpill::SchdulInst(SchedInstruction *inst, InstCount cycleNum, if (inst == NULL) return; assert(inst != NULL); - UpdateSpillInfoForSchdul_(inst, trackCnflcts); + UpdateSpillInfoForSchdul_(inst, trackCnflcts, crntCycleNum_); } /*****************************************************************************/ @@ -999,7 +1020,7 @@ void BBWithSpill::FinishHurstc_() { void BBWithSpill::FinishOptml_() { #ifdef IS_DEBUG_BBSPILL_COST - stats::traceOptimalCost.Record(bestCost_); + stats::traceOptimalCost.Record(GetBestCost()); stats::traceOptimalScheduleLength.Record(bestSchedLngth_); #endif } @@ -1007,6 +1028,7 @@ void BBWithSpill::FinishOptml_() { Enumerator *BBWithSpill::AllocEnumrtr_(Milliseconds timeout) { bool enblStallEnum = enblStallEnum_; + bool ClusteringEnabled = IsSecondPass() && ClusterMemoryOperations; /* if (!dataDepGraph_->IncludesUnpipelined()) { enblStallEnum = false; }*/ @@ -1014,7 +1036,7 @@ Enumerator *BBWithSpill::AllocEnumrtr_(Milliseconds timeout) { enumrtr_ = new LengthCostEnumerator( dataDepGraph_, machMdl_, schedUprBound_, GetSigHashSize(), GetEnumPriorities(), 
GetPruningStrategy(), SchedForRPOnly_, enblStallEnum, - timeout, GetSpillCostFunc(), 0, NULL); + timeout, GetSpillCostFunc(), ClusteringEnabled, 0, NULL); return enumrtr_; } @@ -1047,7 +1069,7 @@ FUNC_RESULT BBWithSpill::Enumerate_(Milliseconds startTime, timeout = true; HandlEnumrtrRslt_(rslt, trgtLngth); - if (bestCost_ == 0 || rslt == RES_ERROR || + if (GetBestCost() == 0 || rslt == RES_ERROR || (lngthDeadline == rgnDeadline && rslt == RES_TIMEOUT)) { //|| //(rslt == RES_SUCCESS && IsSecondPass())) { @@ -1116,44 +1138,49 @@ InstCount BBWithSpill::UpdtOptmlSched(InstSchedule *crntSched, Logger::Info("$$$ GOOD_HIT: Better spill cost for a longer schedule"); SetBestCost(crntCost); + if (IsSecondPass() && ClusterMemoryOperations) + setBestClusterCost(CurrentClusterCost); optmlSpillCost_ = crntSpillCost_; SetBestSchedLength(crntSched->GetCrntLngth()); enumBestSched_->Copy(crntSched); bestSched_ = enumBestSched_; + printCurrentClustering(); + } - // Print the instructions in the clusters after finding a schedule. - if (IsSecondPass() && ClusterMemoryOperations) { - dbgs() << "Printing clustered instructions:\n"; - int i = 1; - for (const auto &clusters : PastClustersList) { - dbgs() << "Printing cluster " << i << ": "; - for (const auto &instr : *clusters->InstrList) { - dbgs() << instr << " "; - } - i++; - dbgs() << '\n'; + return GetBestCost(); +} + +void BBWithSpill::printCurrentClustering() { + // Print the instructions in the clusters after finding a schedule. 
+ if (IsSecondPass() && ClusterMemoryOperations) { + dbgs() << "Printing clustered instructions:\n"; + int i = 1; + for (const auto &clusters : PastClustersList) { + dbgs() << "Printing cluster " << i << ", start cycle (" << clusters->Start << "): "; + for (const auto &instr : *clusters->InstrList) { + dbgs() << instr << " "; } + i++; + dbgs() << '\n'; + } - if (LastCluster) { - dbgs() << "Printing cluster " << i << ": "; - for (const auto &instr : *(LastCluster->InstrList)) { - dbgs() << instr << " "; - } - i++; - dbgs() << '\n'; + if (LastCluster) { + dbgs() << "Printing cluster " << i << ", start cycle (" << LastCluster->Start << "): "; + for (const auto &instr : *(LastCluster->InstrList)) { + dbgs() << instr << " "; } + i++; + dbgs() << '\n'; + } - if (InstrList && InstrList->size() > 0) { - dbgs() << "Printing cluster " << i << ": "; - for (const auto &instr : *InstrList) { - dbgs() << instr << " "; - } - dbgs() << '\n'; + if (InstrList && InstrList->size() > 0) { + dbgs() << "Printing cluster " << i << ", start cycle (" << StartCycle << "): "; + for (const auto &instr : *InstrList) { + dbgs() << instr << " "; } + dbgs() << '\n'; } } - - return GetBestCost(); } /*****************************************************************************/ @@ -1181,21 +1208,31 @@ void BBWithSpill::SetupForSchdulng_() { bool BBWithSpill::ChkCostFsblty(InstCount trgtLngth, EnumTreeNode *node) { bool fsbl = true; InstCount crntCost, dynmcCostLwrBound; + int ClusterDynamicLowerBound; if (GetSpillCostFunc() == SCF_SLIL) { crntCost = dynamicSlilLowerBound_ * SCW_ + trgtLngth * schedCostFactor_; } else { crntCost = crntSpillCost_ * SCW_ + trgtLngth * schedCostFactor_; } // Add the cost of clustering - if (IsSecondPass() && ClusterMemoryOperations) - crntCost += CurrentClusterBlocks * ClusteringWeight; + if (IsSecondPass() && ClusterMemoryOperations) { + ClusterDynamicLowerBound = calculateClusterDLB(); + crntCost += ClusterDynamicLowerBound * ClusteringWeight; + } - crntCost -= 
costLwrBound_; + crntCost -= GetCostLwrBound(); dynmcCostLwrBound = crntCost; // assert(cost >= 0); assert(dynmcCostLwrBound >= 0); +/* + if (IsSecondPass() && ClusterMemoryOperations) { + dbgs() << "Current cycle: " << node->GetTime() <<", current cost is: " << dynmcCostLwrBound << ". Current best is: " << GetBestCost() << '\n'; + printCurrentClustering(); + } +*/ + fsbl = dynmcCostLwrBound < GetBestCost(); // FIXME: RP tracking should be limited to the current SCF. We need RP @@ -1205,6 +1242,17 @@ bool BBWithSpill::ChkCostFsblty(InstCount trgtLngth, EnumTreeNode *node) { node->SetCostLwrBound(dynmcCostLwrBound); node->SetPeakSpillCost(peakSpillCost_); node->SetSpillCostSum(totSpillCost_); + if (IsSecondPass() && ClusterMemoryOperations) { + node->setClusteringCost(CurrentClusterCost); + node->setCurClusteringGroup(ClusterActiveGroup); + node->setClusterLwrBound(ClusterDynamicLowerBound); + if (ClusterActiveGroup != 0) { + node->setClusterAbsorbCount(15 - CurrentClusterSize); + } + else { + node->setClusterAbsorbCount(0); + } + } } return fsbl; } diff --git a/lib/Scheduler/enumerator.cpp b/lib/Scheduler/enumerator.cpp index d9c4e3b1..43bf6ed6 100644 --- a/lib/Scheduler/enumerator.cpp +++ b/lib/Scheduler/enumerator.cpp @@ -64,6 +64,12 @@ void EnumTreeNode::Init_() { isLeaf_ = false; cost_ = INVALID_VALUE; costLwrBound_ = INVALID_VALUE; + ClusterCost = INVALID_VALUE; + ClusterActiveGroup = INVALID_VALUE; + ClusterAbsorbCount = INVALID_VALUE; + ClusterDLB = INVALID_VALUE; + ClusterTotalCost = -1; + ClusterBestCost = 99999999; crntCycleBlkd_ = false; rsrvSlots_ = NULL; totalCostIsActualCost_ = false; @@ -434,8 +440,8 @@ Enumerator::Enumerator(DataDepGraph *dataDepGraph, MachineModel *machMdl, InstCount schedUprBound, int16_t sigHashSize, SchedPriorities prirts, Pruning PruningStrategy, bool SchedForRPOnly, bool enblStallEnum, - Milliseconds timeout, InstCount preFxdInstCnt, - SchedInstruction *preFxdInsts[]) + Milliseconds timeout, bool ClusteringEnabled, + InstCount 
preFxdInstCnt, SchedInstruction *preFxdInsts[]) : ConstrainedScheduler(dataDepGraph, machMdl, schedUprBound) { memAllocBlkSize_ = (int)timeout / TIMEOUT_TO_MEMBLOCK_RATIO; assert(preFxdInstCnt >= 0); @@ -454,6 +460,7 @@ Enumerator::Enumerator(DataDepGraph *dataDepGraph, MachineModel *machMdl, prune_ = PruningStrategy; SchedForRPOnly_ = SchedForRPOnly; enblStallEnum_ = enblStallEnum; + Clustering = ClusteringEnabled; isEarlySubProbDom_ = true; @@ -1316,17 +1323,27 @@ void SetTotalCostsAndSuffixes(EnumTreeNode *const currentNode, Logger::Info("Leaf node total cost %d", currentNode->GetCost()); #endif currentNode->SetTotalCost(currentNode->GetCost()); + if (currentNode->isClustering()) + currentNode->setTotalClusterCost(currentNode->getClusteringCost()); currentNode->SetTotalCostIsActualCost(true); } else { - if (!currentNode->GetTotalCostIsActualCost() && - (currentNode->GetTotalCost() == -1 || - currentNode->GetCostLwrBound() < currentNode->GetTotalCost())) { -#if defined(IS_DEBUG_ARCHIVE) - Logger::Info("Inner node doesn't have a real cost yet. Setting total " - "cost to dynamic lower bound %d", - currentNode->GetCostLwrBound()); -#endif - currentNode->SetTotalCost(currentNode->GetCostLwrBound()); + if (!currentNode->GetTotalCostIsActualCost()) { + // Set overall weighted sum cost + if (currentNode->GetTotalCost() == -1 || + currentNode->GetCostLwrBound() < currentNode->GetTotalCost()) { + #if defined(IS_DEBUG_ARCHIVE) + Logger::Info("Inner node doesn't have a real cost yet. 
Setting total " + "cost to dynamic lower bound %d", + currentNode->GetCostLwrBound()); + #endif + currentNode->SetTotalCost(currentNode->GetCostLwrBound()); + } + + // Set clustering cost + if ((currentNode->isClustering() && currentNode->getTotalClusterCost() == -1) || + (currentNode->getClusterLwrBound() < currentNode->getTotalClusterCost())) { + currentNode->setTotalClusterCost(currentNode->getClusterLwrBound()); + } } } @@ -1359,16 +1376,25 @@ void SetTotalCostsAndSuffixes(EnumTreeNode *const currentNode, currentNode->GetTotalCost()); #endif parentNode->SetTotalCost(currentNode->GetTotalCost()); + if (currentNode->isClustering()) + parentNode->setTotalClusterCost(currentNode->getTotalClusterCost()); parentNode->SetTotalCostIsActualCost(true); parentNode->SetSuffix(std::move(parentSuffix)); - } else if (currentNode->GetTotalCost() < parentNode->GetTotalCost()) { -#if defined(IS_DEBUG_ARCHIVE) - Logger::Info( - "Current node has a real cost (%d), and so does parent. (%d)", - currentNode->GetTotalCost(), parentNode->GetTotalCost()); -#endif - parentNode->SetTotalCost(currentNode->GetTotalCost()); - parentNode->SetSuffix(std::move(parentSuffix)); + } else { + if (currentNode->GetTotalCost() < parentNode->GetTotalCost()) { + #if defined(IS_DEBUG_ARCHIVE) + Logger::Info( + "Current node has a real cost (%d), and so does parent. 
(%d)", + currentNode->GetTotalCost(), parentNode->GetTotalCost()); + #endif + parentNode->SetTotalCost(currentNode->GetTotalCost()); + parentNode->SetSuffix(std::move(parentSuffix)); + } + + // Set clustering cost + if (currentNode->isClustering() && currentNode->getTotalClusterCost() < parentNode->getTotalClusterCost()) { + parentNode->setTotalClusterCost(currentNode->getTotalClusterCost()); + } } } } @@ -1856,7 +1882,7 @@ LengthEnumerator::LengthEnumerator( bool SchedForRPOnly, bool enblStallEnum, Milliseconds timeout, InstCount preFxdInstCnt, SchedInstruction *preFxdInsts[]) : Enumerator(dataDepGraph, machMdl, schedUprBound, sigHashSize, prirts, - PruningStrategy, SchedForRPOnly, enblStallEnum, timeout, + PruningStrategy, SchedForRPOnly, enblStallEnum, timeout, false, preFxdInstCnt, preFxdInsts) { SetupAllocators_(); tmpHstryNode_ = new HistEnumTreeNode; @@ -1941,11 +1967,11 @@ LengthCostEnumerator::LengthCostEnumerator( DataDepGraph *dataDepGraph, MachineModel *machMdl, InstCount schedUprBound, int16_t sigHashSize, SchedPriorities prirts, Pruning PruningStrategy, bool SchedForRPOnly, bool enblStallEnum, Milliseconds timeout, - SPILL_COST_FUNCTION spillCostFunc, InstCount preFxdInstCnt, - SchedInstruction *preFxdInsts[]) + SPILL_COST_FUNCTION spillCostFunc, bool ClusteringEnabled, + InstCount preFxdInstCnt, SchedInstruction *preFxdInsts[]) : Enumerator(dataDepGraph, machMdl, schedUprBound, sigHashSize, prirts, PruningStrategy, SchedForRPOnly, enblStallEnum, timeout, - preFxdInstCnt, preFxdInsts) { + ClusteringEnabled, preFxdInstCnt, preFxdInsts) { SetupAllocators_(); costChkCnt_ = 0; @@ -2141,6 +2167,7 @@ bool LengthCostEnumerator::BackTrack_() { /*****************************************************************************/ InstCount LengthCostEnumerator::GetBestCost_() { return rgn_->GetBestCost(); } +int LengthCostEnumerator::GetBestClusterCost_() { return rgn_->getBestClusterCost(); } 
/*****************************************************************************/ void LengthCostEnumerator::CreateRootNode_() { diff --git a/lib/Scheduler/hist_table.cpp b/lib/Scheduler/hist_table.cpp index a4c1cae7..8a9ff356 100644 --- a/lib/Scheduler/hist_table.cpp +++ b/lib/Scheduler/hist_table.cpp @@ -400,6 +400,10 @@ void CostHistEnumTreeNode::Init_() { costInfoSet_ = false; #endif cost_ = 0; + ClusterCost = 9999999; + ClusterTotalCost = 9999999; + ClusterActiveGroup = 0; + ClusterAbsorbCount = 0; } bool CostHistEnumTreeNode::DoesDominate(EnumTreeNode *node, @@ -467,6 +471,41 @@ static bool doesHistoryPeakCostDominate(InstCount OtherPrefixCost, return LCE->GetBestCost() <= OtherPrefixCost; } +static bool doesClusterCostDominate(EnumTreeNode *CurEnumNode, + int ClusterActiveGroup, int ClusterCost, + int ClusterAbsorbCount, int ClusterTotalCost, + int ClusterBest) { + // Correct but too restrictive + if (CurEnumNode->getCurClusteringGroup() != ClusterActiveGroup) + return false; + + // Count the instructions only if there is an instruction in the ready list that belongs + // to the open cluster. If there is none, you can't add any instructions. If there are no instructions + // on the ready list that belong to the open cluster, we can set the cluster absorb count to 0. + if (CurEnumNode->getClusteringCost() >= ClusterCost && + CurEnumNode->getClusterAbsorbCount() <= ClusterAbsorbCount) + return true; + + // More room in the open cluster can reduce the number clusters by at most one + if (CurEnumNode->getClusteringCost() >= ClusterCost + 1) + return true; + + int improvement = ClusterCost - CurEnumNode->getClusteringCost(); + + // If the current node has a better absorb count then we optimistically assume it may + // improve the number of clusters by 1 + if (CurEnumNode->getClusterAbsorbCount() < ClusterAbsorbCount) + improvement++; + + // Two cases for a history node, + // 1.) One without a full schedule below it. Look at DLB. + // 2.) 
One with a full schedule below it. Look at the best found below the history node. + if (ClusterBest != INVALID_VALUE && improvement <= ClusterTotalCost - ClusterBest) + return true; + + return false; +} + // Should we prune the other node based on RP cost. bool CostHistEnumTreeNode::ChkCostDmntnForBBSpill_(EnumTreeNode *Node, Enumerator *E) { @@ -502,6 +541,10 @@ bool CostHistEnumTreeNode::ChkCostDmntnForBBSpill_(EnumTreeNode *Node, ShouldPrune = spillCostSum_ % instCnt >= Node->GetSpillCostSum() % instCnt; } + if (!ShouldPrune && LCE->isClustering()) { + int ClusterBest = LCE->getBestClusterCost(); + ShouldPrune = doesClusterCostDominate(Node, ClusterActiveGroup, ClusterCost, ClusterAbsorbCount, ClusterTotalCost, ClusterBest); + } } return ShouldPrune; } @@ -511,6 +554,10 @@ void CostHistEnumTreeNode::SetCostInfo(EnumTreeNode *node, bool, Enumerator *) { peakSpillCost_ = node->GetPeakSpillCost(); spillCostSum_ = node->GetSpillCostSum(); isLngthFsbl_ = node->IsLngthFsbl(); + ClusterCost = node->getClusteringCost(); + ClusterActiveGroup = node->getCurClusteringGroup(); + ClusterAbsorbCount = node->getClusterAbsorbCount(); + ClusterTotalCost = node->getTotalClusterCost(); // (Chris) partialCost_ = node->GetCostLwrBound(); diff --git a/lib/Scheduler/ready_list.cpp b/lib/Scheduler/ready_list.cpp index f04f467f..6bee513b 100644 --- a/lib/Scheduler/ready_list.cpp +++ b/lib/Scheduler/ready_list.cpp @@ -78,9 +78,6 @@ ReadyList::ReadyList(DataDepGraph *dataDepGraph, SchedPriorities prirts) { ClusterBit = Utilities::clcltBitsNeededToHoldNum(1); totKeyBits += ClusterBit; break; - - default: - break; } // end switch } // end for @@ -128,9 +125,6 @@ ReadyList::ReadyList(DataDepGraph *dataDepGraph, SchedPriorities prirts) { case LSH_CLUSTER: AddPrirtyToKey_(maxPriority_, keySize, ClusterBit, 1, 1); break; - - default: - break; } } } @@ -186,7 +180,6 @@ unsigned long ReadyList::CmputKey_(SchedInstruction *inst, bool isUpdate, newLastUseCnt = inst->CmputLastUseCnt(); if 
(newLastUseCnt != oldLastUseCnt) changed = true; - } AddPrirtyToKey_(key, keySize, useCntBits_, newLastUseCnt, maxUseCnt_); break; @@ -234,9 +227,6 @@ unsigned long ReadyList::CmputKey_(SchedInstruction *inst, bool isUpdate, } AddPrirtyToKey_(key, keySize, ClusterBit, ValueForKey, 1); break; - - default: - break; } } return key; @@ -253,15 +243,17 @@ void ReadyList::AddLatestSubLists(LinkedList *lst1, } void ReadyList::Print(std::ostream &out) { + PriorityList *OutList = new PriorityList; + OutList->CopyList(prirtyLst_, nullptr); out << "Ready List: "; - for (auto *crntInst = prirtyLst_->GetFrstElmnt(); crntInst != NULL; - crntInst = prirtyLst_->GetNxtElmnt()) { + for (auto *crntInst = OutList->GetFrstElmnt(); crntInst != NULL; + crntInst = OutList->GetNxtElmnt()) { out << " " << crntInst->GetNum() << "(" << crntInst->GetClusterGroup() << ")"; } out << '\n'; - prirtyLst_->ResetIterator(); + delete OutList; } void ReadyList::AddLatestSubList_(LinkedList *lst) { diff --git a/lib/Scheduler/sched_basic_data.cpp b/lib/Scheduler/sched_basic_data.cpp index a529e530..e301893e 100644 --- a/lib/Scheduler/sched_basic_data.cpp +++ b/lib/Scheduler/sched_basic_data.cpp @@ -75,8 +75,6 @@ SchedInstruction::~SchedInstruction() { } bool SchedInstruction::computeWasActive() { - if (ClusterGroup == 0) return false; - WasActive = GetActiveCluster() == GetClusterGroup(); return WasActive; } diff --git a/lib/Scheduler/sched_region.cpp b/lib/Scheduler/sched_region.cpp index a885145f..9f0f5535 100644 --- a/lib/Scheduler/sched_region.cpp +++ b/lib/Scheduler/sched_region.cpp @@ -39,6 +39,7 @@ SchedRegion::SchedRegion(MachineModel *machMdl, DataDepGraph *dataDepGraph, totalSimSpills_ = INVALID_VALUE; bestCost_ = INVALID_VALUE; + BestClusterCost = INVALID_VALUE; bestSchedLngth_ = INVALID_VALUE; hurstcCost_ = INVALID_VALUE; enumCrntSched_ = NULL; diff --git a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp index a976a8c4..2c9f55d7 100644 --- 
a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp +++ b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp @@ -538,9 +538,12 @@ int OptSchedDDGWrapperBasic::clusterNeighboringMemOps( const SUnit *SUb = MemOpRecords[Idx + 1].SU; dbgs() << " Checking possible clustering of (" << SUa->NodeNum << ") and (" << SUb->NodeNum << ")\n"; + + // Pass constant of 1 to AMD's function to determine clustering to remove + // the limit of 15. Our enumerator can determine when it has reached the + // limit instead of depending on AMD. if (DAG->TII->shouldClusterMemOps(*MemOpRecords[Idx].BaseOp, - *MemOpRecords[Idx + 1].BaseOp, - ClusterLength)) { + *MemOpRecords[Idx + 1].BaseOp, 1u)) { dbgs() << " Cluster possible at SU(" << SUa->NodeNum << ")- SU(" << SUb->NodeNum << ")\n"; From 4bfbc61a79c57416a7e6b0293e0b948c993502d2 Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Tue, 9 Jun 2020 22:43:23 -0500 Subject: [PATCH 31/40] Copy in dag mutation fix. --- .../Scheduler/OptSchedDDGWrapperBase.h | 3 ++- lib/Scheduler/sched_region.cpp | 15 ++++++++++++ lib/Wrapper/OptSchedDDGWrapperBasic.cpp | 23 +++++++++++++++---- lib/Wrapper/OptSchedDDGWrapperBasic.h | 20 ++++++++-------- lib/Wrapper/OptimizingScheduler.cpp | 14 +++++++---- 5 files changed, 57 insertions(+), 18 deletions(-) diff --git a/include/opt-sched/Scheduler/OptSchedDDGWrapperBase.h b/include/opt-sched/Scheduler/OptSchedDDGWrapperBase.h index b10c9248..6180e344 100644 --- a/include/opt-sched/Scheduler/OptSchedDDGWrapperBase.h +++ b/include/opt-sched/Scheduler/OptSchedDDGWrapperBase.h @@ -14,7 +14,8 @@ class OptSchedDDGWrapperBase { public: virtual ~OptSchedDDGWrapperBase() = default; - virtual void convertSUnits() = 0; + virtual void convertSUnits(bool IgnoreRealEdges, + bool IgnoreArtificialEdges) = 0; virtual void convertRegFiles() = 0; diff --git a/lib/Scheduler/sched_region.cpp b/lib/Scheduler/sched_region.cpp index 9f0f5535..762b2625 100644 --- a/lib/Scheduler/sched_region.cpp +++ b/lib/Scheduler/sched_region.cpp @@ -2,6 +2,7 @@ #include #include 
+#include "Wrapper/OptSchedDDGWrapperBasic.h" #include "opt-sched/Scheduler/aco.h" #include "opt-sched/Scheduler/bb_spill.h" #include "opt-sched/Scheduler/config.h" @@ -245,6 +246,19 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( #endif } + // After the sequential scheduler in the second pass, add the artificial edges + // to the DDG. Some mutations were adding artificial edges which caused a + // conflict with the sequential scheduler. Therefore, wait until the + // sequential scheduler is done before adding artificial edges. + if (IsSecondPass()) { + static_cast(dataDepGraph_)->addArtificialEdges(); + rslt = dataDepGraph_->UpdateSetupForSchdulng(needTransitiveClosure); + if (rslt != RES_SUCCESS) { + Logger::Info("Invalid DAG after adding artificial cluster edges"); + return rslt; + } + } + // Step #2: Use ACO to find a schedule if enabled and no optimal schedule is // yet to be found. if (AcoBeforeEnum && !isLstOptml) { @@ -649,6 +663,7 @@ FUNC_RESULT SchedRegion::Optimize_(Milliseconds startTime, } stats::unsolvedProblemSize.Record(dataDepGraph_->GetInstCnt()); } + return rslt; } diff --git a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp index 2c9f55d7..e2ccd8b5 100644 --- a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp +++ b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp @@ -78,7 +78,8 @@ OptSchedDDGWrapperBasic::OptSchedDDGWrapperBasic( ClusterCount = 0; } -void OptSchedDDGWrapperBasic::convertSUnits() { +void OptSchedDDGWrapperBasic::convertSUnits(bool IgnoreRealEdges, + bool IgnoreArtificialEdges) { LLVM_DEBUG(dbgs() << "Building opt_sched DAG\n"); // The extra 2 are for the artifical root and leaf nodes. instCnt_ = nodeCnt_ = DAG->SUnits.size() + 2; @@ -94,7 +95,7 @@ void OptSchedDDGWrapperBasic::convertSUnits() { // Create edges. for (const auto &SU : DAG->SUnits) { - convertEdges(SU); + convertEdges(SU, IgnoreRealEdges, IgnoreArtificialEdges); } // Add artificial root and leaf nodes and edges. 
@@ -412,13 +413,27 @@ inline void OptSchedDDGWrapperBasic::setupLeaf() { CreateEdge_(i, LeafNum, 0, DEP_OTHER); } -void OptSchedDDGWrapperBasic::convertEdges(const SUnit &SU) { +void OptSchedDDGWrapperBasic::addArtificialEdges() { + for (const auto &SU : DAG->SUnits) { + convertEdges(SU, true, false); + } +} + +void OptSchedDDGWrapperBasic::convertEdges(const SUnit &SU, + bool IgnoreRealEdges, + bool IgnoreArtificialEdges) { const MachineInstr *instr = SU.getInstr(); SUnit::const_succ_iterator I, E; for (I = SU.Succs.begin(), E = SU.Succs.end(); I != E; ++I) { if (I->getSUnit()->isBoundaryNode()) continue; + bool IsArtificial = I->isArtificial() || I->isCluster(); + if (IgnoreArtificialEdges && IsArtificial) + continue; + else if (IgnoreRealEdges && !IsArtificial) + continue; + DependenceType DepType; switch (I->getKind()) { case SDep::Data: @@ -538,7 +553,7 @@ int OptSchedDDGWrapperBasic::clusterNeighboringMemOps( const SUnit *SUb = MemOpRecords[Idx + 1].SU; dbgs() << " Checking possible clustering of (" << SUa->NodeNum << ") and (" << SUb->NodeNum << ")\n"; - + // Pass constant of 1 to AMD's function to determine clustering to remove // the limit of 15. Our enumerator can determine when it has reached the // limit instead of depending on AMD. diff --git a/lib/Wrapper/OptSchedDDGWrapperBasic.h b/lib/Wrapper/OptSchedDDGWrapperBasic.h index 373ddc52..0679e2b8 100644 --- a/lib/Wrapper/OptSchedDDGWrapperBasic.h +++ b/lib/Wrapper/OptSchedDDGWrapperBasic.h @@ -13,8 +13,8 @@ #include "opt-sched/Scheduler/graph_trans.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineScheduler.h" -#include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" #include #include #include @@ -48,7 +48,8 @@ class OptSchedDDGWrapperBasic : public DataDepGraph { /// Dump Optsched register def/use information for the region. 
void dumpOptSchedRegisters() const; - void convertSUnits() override; + void convertSUnits(bool IgnoreRealEdges, bool IgnoreArtificialEdges) override; + void addArtificialEdges(); void convertRegFiles() override; int findPossibleClusters(bool IsLoad) override; @@ -125,7 +126,8 @@ class OptSchedDDGWrapperBasic : public DataDepGraph { void convertSUnit(const llvm::SUnit &SU); // Create edges between optsched graph nodes using SUnit successors. - void convertEdges(const llvm::SUnit &SU); + void convertEdges(const llvm::SUnit &SU, bool IgnoreRealEdges, + bool IgnoreArtificialEdges); // Count number or registers defined by the region boundary. void countBoundaryLiveness(std::vector &RegDefCounts, @@ -145,11 +147,11 @@ class OptSchedDDGWrapperBasic : public DataDepGraph { std::vector consumers; }; - /// Count of the total clusters possible + /// Count of the total clusters possible int ClusterCount; -// Copied from -// https://github.com/RadeonOpenCompute/llvm/blob/roc-ocl-2.4.0/lib/CodeGen/MachineScheduler.cpp#L1467 + // Copied from + // https://github.com/RadeonOpenCompute/llvm/blob/roc-ocl-2.4.0/lib/CodeGen/MachineScheduler.cpp#L1467 struct MemOpInfo { const SUnit *SU; MachineOperand *BaseOp; @@ -191,9 +193,9 @@ class OptSchedDDGWrapperBasic : public DataDepGraph { }; // Exclude certain registers from being visible to the scheduler. Use LLVM's -// register pressure tracker to find the MAX register pressure for each register -// type (pressure set). If the MAX pressure is below a certain threshold don't -// track that register. +// register pressure tracker to find the MAX register pressure for each +// register type (pressure set). If the MAX pressure is below a certain +// threshold don't track that register. 
class LLVMRegTypeFilter { private: const MachineModel *MM; diff --git a/lib/Wrapper/OptimizingScheduler.cpp b/lib/Wrapper/OptimizingScheduler.cpp index 528801fc..d12e8294 100644 --- a/lib/Wrapper/OptimizingScheduler.cpp +++ b/lib/Wrapper/OptimizingScheduler.cpp @@ -126,7 +126,8 @@ nextIfDebug(MachineBasicBlock::iterator I, return I; } -static bool scheduleSpecificRegion(const StringRef RegionName, const Config &SchedIni) { +static bool scheduleSpecificRegion(const StringRef RegionName, + const Config &SchedIni) { const bool ScheduleSpecificRegions = SchedIni.GetBool("SCHEDULE_SPECIFIC_REGIONS"); @@ -376,11 +377,13 @@ void ScheduleDAGOptSched::schedule() { // Convert graph auto DDG = OST->createDDGWrapper(C, this, MM.get(), LatencyPrecision, RegionName); - DDG->convertSUnits(); - DDG->convertRegFiles(); // Find all clusterable instructions for the second pass. if (SecondPass) { + // In the second pass, ignore artificial edges before running the sequential + // heuristic list scheduler. + DDG->convertSUnits(false, true); + dbgs() << "Finding load clusters.\n"; int TotalLoadsInstructionsClusterable = DDG->findPossibleClusters(true); if (TotalLoadsInstructionsClusterable == 0) @@ -411,7 +414,10 @@ void ScheduleDAGOptSched::schedule() { DataDepGraphInstance->getTotalInstructionsInCluster(begin)); } } - } + } else + DDG->convertSUnits(false, false); + + DDG->convertRegFiles(); auto *BDDG = static_cast(DDG.get()); addGraphTransformations(BDDG); From 0d80260cea45033520b30dcec25a5524c12d5c59 Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Wed, 10 Jun 2020 08:49:22 -0500 Subject: [PATCH 32/40] Copy verify schedule bugfix patch for dag mutation fix. 
--- include/opt-sched/Scheduler/data_dep.h | 6 ++++-- include/opt-sched/Scheduler/graph.h | 8 ++++++-- include/opt-sched/Scheduler/sched_basic_data.h | 12 +++++------- lib/Scheduler/data_dep.cpp | 14 ++++++++++---- lib/Scheduler/sched_basic_data.cpp | 12 +++++++++--- lib/Wrapper/OptSchedDDGWrapperBasic.cpp | 3 ++- 6 files changed, 36 insertions(+), 19 deletions(-) diff --git a/include/opt-sched/Scheduler/data_dep.h b/include/opt-sched/Scheduler/data_dep.h index 5dd5c1e8..2fcd19be 100644 --- a/include/opt-sched/Scheduler/data_dep.h +++ b/include/opt-sched/Scheduler/data_dep.h @@ -295,7 +295,9 @@ class DataDepGraph : public llvm::opt_sched::OptSchedDDGWrapperBase, // Memory clustering helper functions int getMinClusterCount() { return MinClusterCount; } void setMinClusterCount(int Max) { MinClusterCount = Max; } - int getTotalInstructionsInAllClusters() { return TotalInstructionsInAllClusters; } + int getTotalInstructionsInAllClusters() { + return TotalInstructionsInAllClusters; + } void setTotalInstructionsInAllClusters(int Max) { TotalInstructionsInAllClusters = Max; } @@ -407,7 +409,7 @@ class DataDepGraph : public llvm::opt_sched::OptSchedDDGWrapperBase, InstCount fileUB, int blkNum); FUNC_RESULT FinishNode_(InstCount nodeNum, InstCount edgeCnt = -1); void CreateEdge_(InstCount frmInstNum, InstCount toInstNum, int ltncy, - DependenceType depType); + DependenceType depType, bool IsArtificial = false); FUNC_RESULT Finish_(); diff --git a/include/opt-sched/Scheduler/graph.h b/include/opt-sched/Scheduler/graph.h index 790b7164..fea0576f 100644 --- a/include/opt-sched/Scheduler/graph.h +++ b/include/opt-sched/Scheduler/graph.h @@ -49,11 +49,15 @@ struct GraphEdge { UDT_GEDGES predOrder; // The second node's order in the first node's successor list. UDT_GEDGES succOrder; + // Whether or not the edge is an artificial dependency meaning it isn't + // required to be correct + bool IsArtificial; // Creates an edge between two nodes with labels label and label2. 
GraphEdge(GraphNode *from, GraphNode *to, UDT_GLABEL label, - UDT_GLABEL label2 = 0) - : from(from), to(to), label(label), label2(label2) {} + UDT_GLABEL label2 = 0, bool IsArtificial = false) + : from(from), to(to), label(label), label2(label2), + IsArtificial(IsArtificial) {} // Returns the node on the other side of the edge from the provided node. // Assumes that the argument is one of the nodes on the sides of the edge. diff --git a/include/opt-sched/Scheduler/sched_basic_data.h b/include/opt-sched/Scheduler/sched_basic_data.h index d2c3518a..46117e9e 100644 --- a/include/opt-sched/Scheduler/sched_basic_data.h +++ b/include/opt-sched/Scheduler/sched_basic_data.h @@ -8,14 +8,11 @@ Last Update: Sept. 2013 #ifndef OPTSCHED_BASIC_SCHED_BASIC_DATA_H #define OPTSCHED_BASIC_SCHED_BASIC_DATA_H -// For class string. -#include -// For class ostream. #include "opt-sched/Scheduler/defines.h" #include "opt-sched/Scheduler/graph.h" #include "opt-sched/Scheduler/hash_table.h" #include "opt-sched/Scheduler/machine_model.h" -#include +#include namespace llvm { namespace opt_sched { @@ -208,12 +205,14 @@ class SchedInstruction : public GraphNode { // depType: the type of dependence between this node and the successor. SchedInstruction *GetFrstScsr(InstCount *prdcsrNum = NULL, UDT_GLABEL *ltncy = NULL, - DependenceType *depType = NULL); + DependenceType *depType = NULL, + bool *IsArtificial = nullptr); // Returns the next successor of this instruction node and moves the // successor iterator forward. Fills parameters as above. SchedInstruction *GetNxtScsr(InstCount *prdcsrNum = NULL, UDT_GLABEL *ltncy = NULL, - DependenceType *depType = NULL); + DependenceType *depType = NULL, + bool *IsArtificial = nullptr); // Returns the last successor of this instruction node and moves the // successor iterator to the end of the list. 
If prdcsrNum is provided, this @@ -436,7 +435,6 @@ class SchedInstruction : public GraphNode { string opCode_; bool WasActive; - /// The cluster group that the current instruction is a part of. /// Default of 0 means that it is not part of any cluster. diff --git a/lib/Scheduler/data_dep.cpp b/lib/Scheduler/data_dep.cpp index c58b4d92..513e8d9e 100644 --- a/lib/Scheduler/data_dep.cpp +++ b/lib/Scheduler/data_dep.cpp @@ -214,7 +214,7 @@ DataDepGraph::~DataDepGraph() { delete[] instCntPerType_; } -int DataDepGraph::getTotalInstructionsInCluster(int Cluster) { +int DataDepGraph::getTotalInstructionsInCluster(int Cluster) { assert(Cluster > 0); return MaxInstructionsInEachClusters[Cluster]; } @@ -2980,8 +2980,15 @@ bool InstSchedule::VerifyDataDeps_(DataDepGraph *dataDepGraph) { UDT_GLABEL ltncy; DependenceType depType; - for (SchedInstruction *scsr = inst->GetFrstScsr(NULL, <ncy, &depType); - scsr != NULL; scsr = inst->GetNxtScsr(NULL, <ncy, &depType)) { + bool IsArtificial; + for (SchedInstruction *scsr = + inst->GetFrstScsr(NULL, <ncy, &depType, &IsArtificial); + scsr != NULL; + scsr = inst->GetNxtScsr(NULL, <ncy, &depType, &IsArtificial)) { + // Artificial nodes are not required for the schedule to be correct + if (IsArtificial) + continue; + InstCount scsrCycle = GetSchedCycle(scsr); if (scsrCycle < (instCycle + ltncy)) { Logger::Error("Invalid schedule: Latency from %d to %d not satisfied", @@ -3213,7 +3220,6 @@ bool DataDepGraph::DoesFeedUser(SchedInstruction *inst) { // If there is a successor instruction that decreases live intervals // or one that does not increase live intervals, then return true. return true; - } // Return false if there is no recursive successor of inst // that uses a live register. 
diff --git a/lib/Scheduler/sched_basic_data.cpp b/lib/Scheduler/sched_basic_data.cpp index e301893e..4aec6ec6 100644 --- a/lib/Scheduler/sched_basic_data.cpp +++ b/lib/Scheduler/sched_basic_data.cpp @@ -19,7 +19,7 @@ SchedInstruction::SchedInstruction(InstCount num, const string &name, opCode_ = opCode; instType_ = instType; ClusterGroup = 0; - MayCluster = false; + MayCluster = false; frwrdLwrBound_ = INVALID_VALUE; bkwrdLwrBound_ = INVALID_VALUE; @@ -384,7 +384,8 @@ SchedInstruction *SchedInstruction::GetNxtPrdcsr(InstCount *scsrNum, SchedInstruction *SchedInstruction::GetFrstScsr(InstCount *prdcsrNum, UDT_GLABEL *ltncy, - DependenceType *depType) { + DependenceType *depType, + bool *IsArtificial) { GraphEdge *edge = GetFrstScsrEdge(); if (!edge) return NULL; @@ -394,12 +395,15 @@ SchedInstruction *SchedInstruction::GetFrstScsr(InstCount *prdcsrNum, *ltncy = edge->label; if (depType) *depType = (DependenceType)edge->label2; + if (IsArtificial) + *IsArtificial = edge->IsArtificial; return (SchedInstruction *)(edge->to); } SchedInstruction *SchedInstruction::GetNxtScsr(InstCount *prdcsrNum, UDT_GLABEL *ltncy, - DependenceType *depType) { + DependenceType *depType, + bool *IsArtificial) { GraphEdge *edge = GetNxtScsrEdge(); if (!edge) return NULL; @@ -409,6 +413,8 @@ SchedInstruction *SchedInstruction::GetNxtScsr(InstCount *prdcsrNum, *ltncy = edge->label; if (depType) *depType = (DependenceType)edge->label2; + if (IsArtificial) + *IsArtificial = edge->IsArtificial; return (SchedInstruction *)(edge->to); } diff --git a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp index e2ccd8b5..94126a51 100644 --- a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp +++ b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp @@ -460,7 +460,8 @@ void OptSchedDDGWrapperBasic::convertEdges(const SUnit &SU, else Latency = 1; // unit latency = ignore ilp - CreateEdge_(SU.NodeNum, I->getSUnit()->NodeNum, Latency, DepType); + CreateEdge_(SU.NodeNum, I->getSUnit()->NodeNum, 
Latency, DepType, + IsArtificial); } } From 58978df53231779f72c6450b6234fcbeabde47e6 Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Wed, 10 Jun 2020 08:54:03 -0500 Subject: [PATCH 33/40] Missed a file to copy over. --- lib/Scheduler/data_dep.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/Scheduler/data_dep.cpp b/lib/Scheduler/data_dep.cpp index 513e8d9e..14a38ee7 100644 --- a/lib/Scheduler/data_dep.cpp +++ b/lib/Scheduler/data_dep.cpp @@ -907,7 +907,8 @@ void DataDepGraph::CreateEdge(SchedInstruction *frmNode, } void DataDepGraph::CreateEdge_(InstCount frmNodeNum, InstCount toNodeNum, - int ltncy, DependenceType depType) { + int ltncy, DependenceType depType, + bool IsArtificial) { GraphEdge *edge; assert(frmNodeNum < instCnt_); @@ -936,7 +937,7 @@ void DataDepGraph::CreateEdge_(InstCount frmNodeNum, InstCount toNodeNum, Logger::Info("Creating edge from %d to %d of type %d and latency %d", frmNodeNum, toNodeNum, depType, ltncy); #endif - edge = new GraphEdge(frmNode, toNode, ltncy, depType); + edge = new GraphEdge(frmNode, toNode, ltncy, depType, IsArtificial); frmNode->AddScsr(edge); toNode->AddPrdcsr(edge); From ee1d32f9fb2d6b39f3aab9e9e4420cf7d80d52fd Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Fri, 12 Jun 2020 06:37:09 -0500 Subject: [PATCH 34/40] Ignore artificial edges for potential clustering and display clusters after sequential scheduler. 
--- include/opt-sched/Scheduler/bb_spill.h | 14 ++-- include/opt-sched/Scheduler/sched_region.h | 9 ++- lib/Scheduler/bb_spill.cpp | 91 ++++++++++++---------- lib/Scheduler/sched_region.cpp | 3 + lib/Wrapper/OptSchedDDGWrapperBasic.cpp | 2 +- lib/Wrapper/OptimizingScheduler.cpp | 18 +++-- 6 files changed, 79 insertions(+), 58 deletions(-) diff --git a/include/opt-sched/Scheduler/bb_spill.h b/include/opt-sched/Scheduler/bb_spill.h index e0a55a8f..5d7bd0c8 100644 --- a/include/opt-sched/Scheduler/bb_spill.h +++ b/include/opt-sched/Scheduler/bb_spill.h @@ -43,10 +43,10 @@ class BBWithSpill : public SchedRegion { int ClusterGroupCount; /// Print the current clusters found so far in the schedule. - void printCurrentClustering(); + void printCurrentClustering() override; void initForClustering(); - + /// Calculate the lower bound cost for memory operations clustering and /// return the lower bound cost. Does not take into account the clustering /// weight. @@ -69,7 +69,7 @@ class BBWithSpill : public SchedRegion { int calculateClusterDLB(); /// Current cluster size - unsigned int CurrentClusterSize; + unsigned int CurrentClusterSize; /// The minimum amount of cluster blocks possible. int MinClusterBlocks; @@ -100,7 +100,7 @@ class BBWithSpill : public SchedRegion { /// Instruction number that ended this cluster. Used to check if we should /// restore the cluster state when backtracking. 
- int InstNum; + int InstNum; int Start; @@ -110,7 +110,8 @@ class BBWithSpill : public SchedRegion { /// Constructor for this struct PastClusters(int Cluster, int Size, int Instructions, int CycleStart) - : ClusterGroup(Cluster), ClusterSize(Size), InstNum(Instructions), Start(CycleStart) {} + : ClusterGroup(Cluster), ClusterSize(Size), InstNum(Instructions), + Start(CycleStart) {} }; /// Vector containing the (n-1) past clusters @@ -195,7 +196,8 @@ class BBWithSpill : public SchedRegion { void InitForCostCmputtn_(); InstCount CmputDynmcCost_(); - void UpdateSpillInfoForSchdul_(SchedInstruction *inst, bool trackCnflcts, int Start); + void UpdateSpillInfoForSchdul_(SchedInstruction *inst, bool trackCnflcts, + int Start); void UpdateSpillInfoForUnSchdul_(SchedInstruction *inst); void SetupPhysRegs_(); void CmputCrntSpillCost_(); diff --git a/include/opt-sched/Scheduler/sched_region.h b/include/opt-sched/Scheduler/sched_region.h index 553d73b8..d5e6a9e2 100644 --- a/include/opt-sched/Scheduler/sched_region.h +++ b/include/opt-sched/Scheduler/sched_region.h @@ -52,6 +52,7 @@ class SchedRegion { // Destroys the region. Must be overriden by child classes. virtual ~SchedRegion() {} + virtual void printCurrentClustering() = 0; // Returns the dependence graph of this region. inline DataDepGraph *GetDepGraph() { return dataDepGraph_; } // Returns the lower bound on the cost of this region. @@ -130,7 +131,7 @@ class SchedRegion { // The absolute cost lower bound to be used as a ref for normalized costs. InstCount costLwrBound_ = 0; - + // The best results found so far. 
InstCount bestCost_; int BestClusterCost; @@ -184,9 +185,11 @@ class SchedRegion { void setBestClusterCost(int BestCost) { BestClusterCost = BestCost; } - void SetBestSchedLength(InstCount bestSchedLngth) { bestSchedLngth_ = bestSchedLngth; } + void SetBestSchedLength(InstCount bestSchedLngth) { + bestSchedLngth_ = bestSchedLngth; + } - const SchedPriorities& GetEnumPriorities() const { return enumPrirts_; } + const SchedPriorities &GetEnumPriorities() const { return enumPrirts_; } int16_t GetSigHashSize() const { return sigHashSize_; } diff --git a/lib/Scheduler/bb_spill.cpp b/lib/Scheduler/bb_spill.cpp index 6920eae8..f62326da 100644 --- a/lib/Scheduler/bb_spill.cpp +++ b/lib/Scheduler/bb_spill.cpp @@ -76,8 +76,8 @@ BBWithSpill::BBWithSpill(const OptSchedTarget *OST_, DataDepGraph *dataDepGraph, ClusterGroupCount = dataDepGraph_->getMinClusterCount(); MinClusterBlocks = 0; if (ClusterMemoryOperations && ClusterGroupCount > 0) { - ClusterCount.resize(ClusterGroupCount+1); - ClusterInstrRemainderCount.resize(ClusterGroupCount+1); + ClusterCount.resize(ClusterGroupCount + 1); + ClusterInstrRemainderCount.resize(ClusterGroupCount + 1); MinClusterBlocks = calculateClusterStaticLB(); initForClustering(); } @@ -97,7 +97,8 @@ void BBWithSpill::initForClustering() { for (int begin = 1; begin <= ClusterGroupCount; begin++) { ClusterCount[begin] = 0; - ClusterInstrRemainderCount[begin] = dataDepGraph_->getTotalInstructionsInCluster(begin); + ClusterInstrRemainderCount[begin] = + dataDepGraph_->getTotalInstructionsInCluster(begin); } } @@ -124,11 +125,12 @@ int BBWithSpill::calculateClusterStaticLB() { int ClusterCost = 0; for (int begin = 1; begin <= ClusterGroupCount; begin++) { int InstructionCount = dataDepGraph_->getTotalInstructionsInCluster(begin); - int CurrentClusterCost = std::ceil(double(InstructionCount) / MAX_INSTR_IN_CLUSTER); + int CurrentClusterCost = + std::ceil(double(InstructionCount) / MAX_INSTR_IN_CLUSTER); Logger::Info("Cost for block %d is %d", begin, 
CurrentClusterCost); ClusterCost += CurrentClusterCost; } - + return ClusterCost; } @@ -357,7 +359,7 @@ InstCount BBWithSpill::CmputCostLwrBound() { // Add the minimum of the possible clusters to the lower bound if (IsSecondPass() && ClusterMemoryOperations) { - staticLowerBound += MinClusterBlocks * ClusteringWeight; + staticLowerBound += MinClusterBlocks * ClusteringWeight; } #if defined(IS_DEBUG_STATIC_LOWER_BOUND) @@ -452,7 +454,7 @@ InstCount BBWithSpill::CmputCost_(InstSchedule *sched, COST_COMP_MODE compMode, cost += CurrentClusterCost * ClusteringWeight; assert(calculateClusterDLB() == CurrentClusterCost); } - + sched->SetSpillCosts(spillCosts_); sched->SetPeakRegPressures(peakRegPressures_); sched->SetSpillCost(crntSpillCost_); @@ -495,7 +497,7 @@ void BBWithSpill::saveCluster(SchedInstruction *inst) { // to its contents. LastCluster = llvm::make_unique( ClusterActiveGroup, CurrentClusterSize, inst->GetNum(), StartCycle); - + LastCluster->InstrList = std::move(InstrList); } @@ -535,28 +537,32 @@ bool BBWithSpill::isClusterFinished() { assert(ClusterActiveGroup != 0); if (ClusterInstrRemainderCount[ClusterActiveGroup] == 0 || CurrentClusterSize == MAX_INSTR_IN_CLUSTER) { - return true; + return true; } return false; } int BBWithSpill::calculateClusterDLB() { int OptimisticLowerBound = 0; - + for (int begin = 1; begin <= ClusterGroupCount; begin++) { if (begin != ClusterActiveGroup) - OptimisticLowerBound += std::ceil(double(ClusterInstrRemainderCount[begin])/MAX_INSTR_IN_CLUSTER); + OptimisticLowerBound += std::ceil( + double(ClusterInstrRemainderCount[begin]) / MAX_INSTR_IN_CLUSTER); else { - // The amount of instructions remaining that the current open cluster can add + // The amount of instructions remaining that the current open cluster can + // add int AbsorbCount = MAX_INSTR_IN_CLUSTER - CurrentClusterSize; // Assume the current open cluster can add the max amount of instructions // that a cluster can contain. 
int Remainder = ClusterInstrRemainderCount[begin] - AbsorbCount; - // If the remainder is negative then that indicates the open cluster can absorb all of the remaining instructions. + // If the remainder is negative then that indicates the open cluster can + // absorb all of the remaining instructions. if (Remainder < 0) Remainder = 0; // Estimate the optimistic dynamic lower bound for the current cluster - OptimisticLowerBound += std::ceil(double(Remainder)/MAX_INSTR_IN_CLUSTER); + OptimisticLowerBound += + std::ceil(double(Remainder) / MAX_INSTR_IN_CLUSTER); } } return CurrentClusterCost + OptimisticLowerBound; @@ -571,11 +577,12 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, int liveRegs; InstCount newSpillCost; -// Conditions for creating a cluster: -// 1.) If a block is ended before it reaches 15 && there are remaining instructions + // Conditions for creating a cluster: + // 1.) If a block is ended before it reaches 15 && there are remaining + // instructions -// Conditions for removing a cluster: -// 1.) If the block is not 15 && there are remaining instructions + // Conditions for removing a cluster: + // 1.) If the block is not 15 && there are remaining instructions // Scheduling cases for clustering project: // 1.) 
Same Cluster -> Same Cluster @@ -592,8 +599,9 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, // Check if the current instruction is part of a cluster if (inst->GetMayCluster()) { // Check if there is a current active cluster - // A ClusterActiveGroup == 0 indicates that there is no currently active clustering - // While ClusterActiveGroup != 0 indicates that there is active clustering + // A ClusterActiveGroup == 0 indicates that there is no currently active + // clustering While ClusterActiveGroup != 0 indicates that there is active + // clustering if (ClusterActiveGroup != 0) { // Check if the instruction is in the same cluster group as the active // cluster @@ -602,12 +610,11 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, // already active cluster. CurrentClusterSize++; ClusterInstrRemainderCount[ClusterActiveGroup]--; - InstrList->push_back(inst->GetName()); + InstrList->push_back(inst->GetName()); // If we reach the max amount for this cluster then save the cluster // and reset. - if (isClusterFinished()) - { + if (isClusterFinished()) { saveCluster(inst); resetActiveCluster(inst); } @@ -835,7 +842,7 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst) { if (IsSecondPass() && ClusterMemoryOperations) { // If the instruction we are backtracking from is part of a cluster if (inst->GetMayCluster()) { - if (CurrentClusterSize != 0) { + if (CurrentClusterSize != 0) { // Case 1, 2, and 3 // Reduce the cluster size CurrentClusterSize--; @@ -1071,7 +1078,7 @@ FUNC_RESULT BBWithSpill::Enumerate_(Milliseconds startTime, if (GetBestCost() == 0 || rslt == RES_ERROR || (lngthDeadline == rgnDeadline && rslt == RES_TIMEOUT)) { //|| - //(rslt == RES_SUCCESS && IsSecondPass())) { + //(rslt == RES_SUCCESS && IsSecondPass())) { // If doing two pass optsched and on the second pass then terminate if a // schedule is found with the same min-RP found in first pass. 
@@ -1080,7 +1087,8 @@ FUNC_RESULT BBWithSpill::Enumerate_(Milliseconds startTime, Logger::Info("Schedule found in second pass, terminating BB loop."); if (trgtLngth < schedUprBound_) - Logger::Info("Schedule found with length %d is shorter than current schedule with length %d.", trgtLngth, schedUprBound_); + Logger::Info("Schedule found with length %d is shorter than current + schedule with length %d.", trgtLngth, schedUprBound_); }*/ break; @@ -1156,29 +1164,32 @@ void BBWithSpill::printCurrentClustering() { dbgs() << "Printing clustered instructions:\n"; int i = 1; for (const auto &clusters : PastClustersList) { - dbgs() << "Printing cluster " << i << ", start cycle (" << clusters->Start << "): "; + dbgs() << "Printing cluster " << i << ", start cycle (" << clusters->Start + << "): "; for (const auto &instr : *clusters->InstrList) { dbgs() << instr << " "; } i++; - dbgs() << '\n'; + dbgs() << '\n'; } if (LastCluster) { - dbgs() << "Printing cluster " << i << ", start cycle (" << LastCluster->Start << "): "; + dbgs() << "Printing cluster " << i << ", start cycle (" + << LastCluster->Start << "): "; for (const auto &instr : *(LastCluster->InstrList)) { dbgs() << instr << " "; } i++; - dbgs() << '\n'; + dbgs() << '\n'; } if (InstrList && InstrList->size() > 0) { - dbgs() << "Printing cluster " << i << ", start cycle (" << StartCycle << "): "; + dbgs() << "Printing cluster " << i << ", start cycle (" << StartCycle + << "): "; for (const auto &instr : *InstrList) { dbgs() << instr << " "; } - dbgs() << '\n'; + dbgs() << '\n'; } } } @@ -1226,12 +1237,13 @@ bool BBWithSpill::ChkCostFsblty(InstCount trgtLngth, EnumTreeNode *node) { // assert(cost >= 0); assert(dynmcCostLwrBound >= 0); -/* - if (IsSecondPass() && ClusterMemoryOperations) { - dbgs() << "Current cycle: " << node->GetTime() <<", current cost is: " << dynmcCostLwrBound << ". 
Current best is: " << GetBestCost() << '\n'; - printCurrentClustering(); - } -*/ + /* + if (IsSecondPass() && ClusterMemoryOperations) { + dbgs() << "Current cycle: " << node->GetTime() <<", current cost is: " << + dynmcCostLwrBound << ". Current best is: " << GetBestCost() << '\n'; + printCurrentClustering(); + } + */ fsbl = dynmcCostLwrBound < GetBestCost(); @@ -1248,8 +1260,7 @@ bool BBWithSpill::ChkCostFsblty(InstCount trgtLngth, EnumTreeNode *node) { node->setClusterLwrBound(ClusterDynamicLowerBound); if (ClusterActiveGroup != 0) { node->setClusterAbsorbCount(15 - CurrentClusterSize); - } - else { + } else { node->setClusterAbsorbCount(0); } } diff --git a/lib/Scheduler/sched_region.cpp b/lib/Scheduler/sched_region.cpp index 762b2625..8d52bb76 100644 --- a/lib/Scheduler/sched_region.cpp +++ b/lib/Scheduler/sched_region.cpp @@ -210,6 +210,9 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( hurstcTime = Utilities::GetProcessorTime() - hurstcStart; stats::heuristicTime.Record(hurstcTime); + if (IsSecondPass()) + printCurrentClustering(); + if (hurstcTime > 0) Logger::Info("Heuristic_Time %d", hurstcTime); diff --git a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp index 94126a51..62305fa3 100644 --- a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp +++ b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp @@ -635,7 +635,7 @@ int OptSchedDDGWrapperBasic::findPossibleClusters(bool IsLoad) { unsigned ChainPredID = DAG->SUnits.size(); for (const SDep &Pred : SU.Preds) { - if (Pred.isCtrl()) { + if (Pred.isCtrl() && !(Pred.isArtificial() || Pred.isCluster())) { ChainPredID = Pred.getSUnit()->NodeNum; break; } diff --git a/lib/Wrapper/OptimizingScheduler.cpp b/lib/Wrapper/OptimizingScheduler.cpp index d12e8294..8d6e1d77 100644 --- a/lib/Wrapper/OptimizingScheduler.cpp +++ b/lib/Wrapper/OptimizingScheduler.cpp @@ -44,13 +44,13 @@ bool OPTSCHED_gPrintSpills; // An array of possible OptSched heuristic names constexpr struct { - const char* Name; + const char 
*Name; LISTSCHED_HEURISTIC HID; -} HeuristicNames[] = { - {"CP", LSH_CP}, {"LUC", LSH_LUC}, {"UC", LSH_UC}, {"NID", LSH_NID}, - {"CPR", LSH_CPR}, {"ISO", LSH_ISO}, {"SC", LSH_SC}, {"LS", LSH_LS}, - {"LLVM", LSH_LLVM}, {"CLUSTER", LSH_CLUSTER} -}; +} HeuristicNames[] = {{"CP", LSH_CP}, {"LUC", LSH_LUC}, + {"UC", LSH_UC}, {"NID", LSH_NID}, + {"CPR", LSH_CPR}, {"ISO", LSH_ISO}, + {"SC", LSH_SC}, {"LS", LSH_LS}, + {"LLVM", LSH_LLVM}, {"CLUSTER", LSH_CLUSTER}}; // Default path to the the configuration directory for opt-sched. static constexpr const char *DEFAULT_CFG_DIR = "~/.optsched-cfg/"; @@ -395,7 +395,8 @@ void ScheduleDAGOptSched::schedule() { dbgs() << " No store clustering possible\n"; Logger::Info("Total clusterable instructions: %d loads, %d stores", - TotalLoadsInstructionsClusterable, TotalStoreInstructionsClusterable); + TotalLoadsInstructionsClusterable, + TotalStoreInstructionsClusterable); // Get the DDG instance so that we can set and get information that will be // read later on during enumeration. @@ -410,7 +411,8 @@ void ScheduleDAGOptSched::schedule() { if (end > 0) { Logger::Info("Total clusters in region: %d", end); for (int begin = 1; begin <= end; begin++) { - Logger::Info(" Cluster %d has total instructions %d", begin, + Logger::Info( + " Cluster %d has total instructions %d", begin, DataDepGraphInstance->getTotalInstructionsInCluster(begin)); } } From decb49f889d3ff7f3a7631e20a055d2ec434798f Mon Sep 17 00:00:00 2001 From: vang thao Date: Sat, 18 Jul 2020 10:09:09 -0700 Subject: [PATCH 35/40] Add option to print cluster information after scheduling and revert changes to upper bound calculation. 
--- example/optsched-cfg/sched.ini | 5 ++ include/opt-sched/Scheduler/bb_spill.h | 9 +-- include/opt-sched/Scheduler/data_dep.h | 5 ++ include/opt-sched/Scheduler/sched_region.h | 9 +++ lib/Scheduler/bb_spill.cpp | 47 +++++++------ lib/Scheduler/data_dep.cpp | 4 ++ lib/Scheduler/sched_region.cpp | 78 +++++++++++++--------- 7 files changed, 97 insertions(+), 60 deletions(-) diff --git a/example/optsched-cfg/sched.ini b/example/optsched-cfg/sched.ini index 11370b31..89a79d2b 100644 --- a/example/optsched-cfg/sched.ini +++ b/example/optsched-cfg/sched.ini @@ -8,6 +8,11 @@ USE_OPT_SCHED YES # Same options as use optimal scheduling. PRINT_SPILL_COUNTS YES +# Print clustering information +# YES +# NO +PRINT_CLUSTER YES + # Use two pass scheduling approach. # First pass minimizes RP and second pass tries to balances RP and ILP. # YES diff --git a/include/opt-sched/Scheduler/bb_spill.h b/include/opt-sched/Scheduler/bb_spill.h index 5d7bd0c8..ef536b85 100644 --- a/include/opt-sched/Scheduler/bb_spill.h +++ b/include/opt-sched/Scheduler/bb_spill.h @@ -42,6 +42,8 @@ class BBWithSpill : public SchedRegion { llvm::SmallVector ClusterInstrRemainderCount; int ClusterGroupCount; + void computeAndPrintClustering(InstSchedule *Sched) override; + /// Print the current clusters found so far in the schedule. void printCurrentClustering() override; @@ -83,13 +85,6 @@ class BBWithSpill : public SchedRegion { int StartCycle; - /// Flag to enable or disable clustering memory operations in the ILP pass. - /// Reads from the sched.ini file then set the flag accordingly. - bool ClusterMemoryOperations; - - /// The weight for memory ops clustering. 
- int ClusteringWeight; - /// Data struct to contain information about the previous clusters struct PastClusters { /// The cluster group diff --git a/include/opt-sched/Scheduler/data_dep.h b/include/opt-sched/Scheduler/data_dep.h index 2fcd19be..4a1494ed 100644 --- a/include/opt-sched/Scheduler/data_dep.h +++ b/include/opt-sched/Scheduler/data_dep.h @@ -647,6 +647,9 @@ class InstSchedule { // The schedule's spill cost according to the cost function used InstCount spillCost_; + // The number of clusters + int ClusterSize; + // An array of peak reg pressures for all reg types in the schedule InstCount *peakRegPressures_; @@ -694,6 +697,8 @@ class InstSchedule { InstCount GetExecCost() const; void SetSpillCost(InstCount cost); InstCount GetSpillCost() const; + void setClusterSize(int size); + int getClusterSize() const; void ResetInstIter(); InstCount GetFrstInst(InstCount &cycleNum, InstCount &slotNum); diff --git a/include/opt-sched/Scheduler/sched_region.h b/include/opt-sched/Scheduler/sched_region.h index d5e6a9e2..0cc0a40e 100644 --- a/include/opt-sched/Scheduler/sched_region.h +++ b/include/opt-sched/Scheduler/sched_region.h @@ -52,6 +52,10 @@ class SchedRegion { // Destroys the region. Must be overriden by child classes. virtual ~SchedRegion() {} + bool PrintClustering; + + virtual void computeAndPrintClustering(InstSchedule *Sched) = 0; + virtual void printCurrentClustering() = 0; // Returns the dependence graph of this region. inline DataDepGraph *GetDepGraph() { return dataDepGraph_; } @@ -163,6 +167,11 @@ class SchedRegion { InstSchedule *enumBestSched_; // The best schedule found so far (may be heuristic or enumerator generated) InstSchedule *bestSched_; + /// Flag to enable or disable clustering memory operations in the ILP pass. + /// Reads from the sched.ini file then set the flag accordingly. + bool ClusterMemoryOperations; + /// The weight for memory ops clustering. + int ClusteringWeight; // TODO(max): Document. 
InstCount schedLwrBound_; diff --git a/lib/Scheduler/bb_spill.cpp b/lib/Scheduler/bb_spill.cpp index f62326da..92d93fe2 100644 --- a/lib/Scheduler/bb_spill.cpp +++ b/lib/Scheduler/bb_spill.cpp @@ -25,7 +25,7 @@ extern bool OPTSCHED_gPrintSpills; using namespace llvm::opt_sched; // The denominator used when calculating cost weight. -static const int COST_WGHT_BASE = 10; +static const int COST_WGHT_BASE = 100; // The max number of instructions in a cluster static const unsigned MAX_INSTR_IN_CLUSTER = 15; @@ -70,12 +70,10 @@ BBWithSpill::BBWithSpill(const OptSchedTarget *OST_, DataDepGraph *dataDepGraph, schduldEntryInstCnt_ = 0; schduldExitInstCnt_ = 0; schduldInstCnt_ = 0; - Config &schedIni = SchedulerOptions::getInstance(); - ClusterMemoryOperations = schedIni.GetBool("CLUSTER_MEMORY_OPS"); - ClusteringWeight = schedIni.GetInt("CLUSTER_WEIGHT"); ClusterGroupCount = dataDepGraph_->getMinClusterCount(); MinClusterBlocks = 0; - if (ClusterMemoryOperations && ClusterGroupCount > 0) { +// if (ClusterMemoryOperations && ClusterGroupCount > 0) { + if (ClusterGroupCount > 0) { ClusterCount.resize(ClusterGroupCount + 1); ClusterInstrRemainderCount.resize(ClusterGroupCount + 1); MinClusterBlocks = calculateClusterStaticLB(); @@ -453,6 +451,7 @@ InstCount BBWithSpill::CmputCost_(InstSchedule *sched, COST_COMP_MODE compMode, if (IsSecondPass() && ClusterMemoryOperations) { cost += CurrentClusterCost * ClusteringWeight; assert(calculateClusterDLB() == CurrentClusterCost); + sched->setClusterSize(CurrentClusterCost); } sched->SetSpillCosts(spillCosts_); @@ -487,6 +486,25 @@ void BBWithSpill::CmputCrntSpillCost_() { } /*****************************************************************************/ +void BBWithSpill::computeAndPrintClustering(InstSchedule *Sched) { + InstCount instNum; + InstCount cycleNum; + InstCount slotNum; + SchedInstruction *inst; + bool temp = ClusterMemoryOperations; + + ClusterMemoryOperations = true; + InitForCostCmputtn_(); + for (instNum = 
Sched->GetFrstInst(cycleNum, slotNum); + instNum != INVALID_VALUE; + instNum = Sched->GetNxtInst(cycleNum, slotNum)) { + inst = dataDepGraph_->GetInstByIndx(instNum); + SchdulInst(inst, cycleNum, slotNum, false); + } + printCurrentClustering(); + ClusterMemoryOperations = temp; +} + void BBWithSpill::saveCluster(SchedInstruction *inst) { if (LastCluster) // Save previous clusters in a vector except the last cluster @@ -1077,28 +1095,14 @@ FUNC_RESULT BBWithSpill::Enumerate_(Milliseconds startTime, HandlEnumrtrRslt_(rslt, trgtLngth); if (GetBestCost() == 0 || rslt == RES_ERROR || - (lngthDeadline == rgnDeadline && rslt == RES_TIMEOUT)) { //|| - //(rslt == RES_SUCCESS && IsSecondPass())) { - - // If doing two pass optsched and on the second pass then terminate if a - // schedule is found with the same min-RP found in first pass. - /* - if (rslt == RES_SUCCESS && IsSecondPass()) { - Logger::Info("Schedule found in second pass, terminating BB loop."); - - if (trgtLngth < schedUprBound_) - Logger::Info("Schedule found with length %d is shorter than current - schedule with length %d.", trgtLngth, schedUprBound_); - }*/ - + (lngthDeadline == rgnDeadline && rslt == RES_TIMEOUT)) { break; } enumrtr_->Reset(); enumCrntSched_->Reset(); - if (!IsSecondPass()) - CmputSchedUprBound_(); + CmputSchedUprBound_(); iterCnt++; costLwrBound += 1; @@ -1152,7 +1156,6 @@ InstCount BBWithSpill::UpdtOptmlSched(InstSchedule *crntSched, SetBestSchedLength(crntSched->GetCrntLngth()); enumBestSched_->Copy(crntSched); bestSched_ = enumBestSched_; - printCurrentClustering(); } return GetBestCost(); diff --git a/lib/Scheduler/data_dep.cpp b/lib/Scheduler/data_dep.cpp index 14a38ee7..7e6b3502 100644 --- a/lib/Scheduler/data_dep.cpp +++ b/lib/Scheduler/data_dep.cpp @@ -3059,6 +3059,10 @@ void InstSchedule::SetSpillCost(InstCount cost) { spillCost_ = cost; } InstCount InstSchedule::GetSpillCost() const { return spillCost_; } +void InstSchedule::setClusterSize(int size) { ClusterSize = size; } + +int 
InstSchedule::getClusterSize() const { return ClusterSize; } + /******************************************************************************* * Previously inlined functions ******************************************************************************/ diff --git a/lib/Scheduler/sched_region.cpp b/lib/Scheduler/sched_region.cpp index 8d52bb76..d40cf81d 100644 --- a/lib/Scheduler/sched_region.cpp +++ b/lib/Scheduler/sched_region.cpp @@ -49,6 +49,7 @@ SchedRegion::SchedRegion(MachineModel *machMdl, DataDepGraph *dataDepGraph, schedUprBound_ = INVALID_VALUE; spillCostFunc_ = spillCostFunc; + PrintClustering = false; } void SchedRegion::UseFileBounds_() { @@ -124,6 +125,9 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( // heuristic scheduler or ACO before the branch & bound enumerator must be // enabled. Config &schedIni = SchedulerOptions::getInstance(); + PrintClustering = schedIni.GetBool("PRINT_CLUSTER"); + ClusterMemoryOperations = schedIni.GetBool("CLUSTER_MEMORY_OPS"); + ClusteringWeight = schedIni.GetInt("CLUSTER_WEIGHT"); bool HeuristicSchedulerEnabled = schedIni.GetBool("HEUR_ENABLED"); bool AcoSchedulerEnabled = schedIni.GetBool("ACO_ENABLED"); bool BbSchedulerEnabled = isBbEnabled(schedIni, rgnTimeout); @@ -178,17 +182,6 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( CmputAbslutUprBound_(); schedLwrBound_ = dataDepGraph_->GetSchedLwrBound(); - // We can calculate lower bounds here since it is only dependent - // on schedLwrBound_ - if (!BbSchedulerEnabled) - costLwrBound_ = CmputCostLwrBound(); - else - CmputLwrBounds_(false); - - // Log the lower bound on the cost, allowing tools reading the log to compare - // absolute rather than relative costs. - Logger::Info("Lower bound of cost before scheduling: %d", costLwrBound_); - // Step #1: Find the heuristic schedule if enabled. 
// Note: Heuristic scheduler is required for the two-pass scheduler // to use the sequential list scheduler which inserts stalls into @@ -210,12 +203,37 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( hurstcTime = Utilities::GetProcessorTime() - hurstcStart; stats::heuristicTime.Record(hurstcTime); - if (IsSecondPass()) - printCurrentClustering(); if (hurstcTime > 0) Logger::Info("Heuristic_Time %d", hurstcTime); + } + // After the sequential scheduler in the second pass, add the artificial edges + // to the DDG. Some mutations were adding artificial edges which caused a + // conflict with the sequential scheduler. Therefore, wait until the + // sequential scheduler is done before adding artificial edges. + if (IsSecondPass()) { + static_cast(dataDepGraph_)->addArtificialEdges(); + rslt = dataDepGraph_->UpdateSetupForSchdulng(needTransitiveClosure); + if (rslt != RES_SUCCESS) { + Logger::Info("Invalid DAG after adding artificial cluster edges"); + return rslt; + } + } + + // This must be done after SetupForSchdulng() or UpdateSetupForSchdulng() to + // avoid resetting lower bound values. + if (!BbSchedulerEnabled) + costLwrBound_ = CmputCostLwrBound(); + else + CmputLwrBounds_(false); + + // Log the lower bound on the cost, allowing tools reading the log to compare + // absolute rather than relative costs. + Logger::Info("Lower bound of cost before scheduling: %d", costLwrBound_); + + // Cost calculation must be below lower bounds calculation + if (HeuristicSchedulerEnabled || IsSecondPass()) { heuristicScheduleLength = lstSched->GetCrntLngth(); InstCount hurstcExecCost; // Compute cost for Heuristic list scheduler, this must be called before @@ -223,6 +241,9 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( CmputNormCost_(lstSched, CCM_DYNMC, hurstcExecCost, true); hurstcCost_ = lstSched->GetCost(); + if (IsSecondPass() && PrintClustering) + computeAndPrintClustering(lstSched); + // This schedule is optimal so ACO will not be run // so set bestSched here. 
if (hurstcCost_ == 0) { @@ -230,6 +251,8 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( bestSched = bestSched_ = lstSched; bestSchedLngth_ = heuristicScheduleLength; bestCost_ = hurstcCost_; + if (IsSecondPass() && ClusterMemoryOperations) + bestSched->setClusterSize(lstSched->getClusterSize()); } FinishHurstc_(); @@ -249,19 +272,6 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( #endif } - // After the sequential scheduler in the second pass, add the artificial edges - // to the DDG. Some mutations were adding artificial edges which caused a - // conflict with the sequential scheduler. Therefore, wait until the - // sequential scheduler is done before adding artificial edges. - if (IsSecondPass()) { - static_cast(dataDepGraph_)->addArtificialEdges(); - rslt = dataDepGraph_->UpdateSetupForSchdulng(needTransitiveClosure); - if (rslt != RES_SUCCESS) { - Logger::Info("Invalid DAG after adding artificial cluster edges"); - return rslt; - } - } - // Step #2: Use ACO to find a schedule if enabled and no optimal schedule is // yet to be found. if (AcoBeforeEnum && !isLstOptml) { @@ -297,6 +307,8 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( bestSched = bestSched_ = AcoSchedule; bestSchedLngth_ = AcoScheduleLength_; bestCost_ = AcoScheduleCost_; + if (IsSecondPass() && ClusterMemoryOperations) + bestSched->setClusterSize(AcoSchedule->getClusterSize()); } } @@ -312,6 +324,8 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( bestSched = bestSched_ = lstSched; bestSchedLngth_ = heuristicScheduleLength; bestCost_ = hurstcCost_; + if (IsSecondPass() && ClusterMemoryOperations) + bestSched->setClusterSize(lstSched->getClusterSize()); } // B) Heuristic was never run. In that case, just use ACO and run with its // results, into B&B. 
@@ -319,6 +333,8 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( bestSched = bestSched_ = AcoSchedule; bestSchedLngth_ = AcoScheduleLength_; bestCost_ = AcoScheduleCost_; + if (IsSecondPass() && ClusterMemoryOperations) + bestSched->setClusterSize(AcoSchedule->getClusterSize()); // C) Neither scheduler was optimal. In that case, compare the two // schedules and use the one that's better as the input (initialSched) for // B&B. @@ -327,6 +343,8 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( bestSched = bestSched_; bestSchedLngth_ = bestSched_->GetCrntLngth(); bestCost_ = bestSched_->GetCost(); + if (IsSecondPass() && ClusterMemoryOperations) + bestSched->setClusterSize(bestSched_->getClusterSize()); } } // Step #3: Compute the cost upper bound. @@ -453,6 +471,9 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( enumTime = Utilities::GetProcessorTime() - enumStart; stats::enumerationTime.Record(enumTime); + + if (IsSecondPass() && PrintClustering && enumBestSched_ != NULL) + computeAndPrintClustering(enumBestSched_); } // Step 5: Run ACO if schedule from enumerator is not optimal @@ -727,11 +748,6 @@ bool SchedRegion::CmputUprBounds_(InstSchedule *schedule, bool useFileBounds) { // If the heuristic schedule is optimal, we are done! schedUprBound_ = bestSchedLngth_; return true; - } else if (IsSecondPass()) { - // In the second pass, the upper bound is the length of the min-RP schedule - // that was found in the first pass with stalls inserted. 
- schedUprBound_ = schedule->GetCrntLngth(); - return false; } else { CmputSchedUprBound_(); return false; From 913f83d150c24bc053948338e6ec568e6df99fc5 Mon Sep 17 00:00:00 2001 From: vang thao Date: Sat, 15 Aug 2020 17:27:26 -0700 Subject: [PATCH 36/40] Added 2nd ILP pass with lower target occupancy --- include/opt-sched/Scheduler/data_dep.h | 1 + include/opt-sched/Scheduler/sched_region.h | 7 ++- lib/CMakeLists.txt | 1 + lib/Scheduler/bb_spill.cpp | 27 ++++++-- lib/Scheduler/data_dep.cpp | 39 ++++++++++++ lib/Scheduler/sched_region.cpp | 15 +++-- lib/Wrapper/AMDGPU/GCNOptSched.cpp | 45 ++++++++++++- lib/Wrapper/AMDGPU/GCNOptSched.h | 10 ++- lib/Wrapper/AMDGPU/OptSchedGCNTarget.cpp | 71 +++++---------------- lib/Wrapper/AMDGPU/OptSchedGCNTarget.h | 73 ++++++++++++++++++++++ lib/Wrapper/OptSchedDDGWrapperBasic.cpp | 16 ++--- lib/Wrapper/OptimizingScheduler.cpp | 51 ++++++++++----- lib/Wrapper/OptimizingScheduler.h | 13 +++- 13 files changed, 274 insertions(+), 95 deletions(-) create mode 100644 lib/Wrapper/AMDGPU/OptSchedGCNTarget.h diff --git a/include/opt-sched/Scheduler/data_dep.h b/include/opt-sched/Scheduler/data_dep.h index 4a1494ed..5b021145 100644 --- a/include/opt-sched/Scheduler/data_dep.h +++ b/include/opt-sched/Scheduler/data_dep.h @@ -722,6 +722,7 @@ class InstSchedule { void Print(std::ostream &out, char const *const title); void PrintInstList(FILE *file, DataDepGraph *dataDepGraph, const char *title) const; + void Print(std::ostream &out, char const *const title, DataDepGraph *ddg); void PrintRegPressures() const; bool Verify(MachineModel *machMdl, DataDepGraph *dataDepGraph); void PrintClassData(); diff --git a/include/opt-sched/Scheduler/sched_region.h b/include/opt-sched/Scheduler/sched_region.h index 0cc0a40e..2685b7d0 100644 --- a/include/opt-sched/Scheduler/sched_region.h +++ b/include/opt-sched/Scheduler/sched_region.h @@ -53,7 +53,7 @@ class SchedRegion { virtual ~SchedRegion() {} bool PrintClustering; - + bool TwoPassEnabled; virtual void 
computeAndPrintClustering(InstSchedule *Sched) = 0; virtual void printCurrentClustering() = 0; @@ -113,6 +113,9 @@ class SchedRegion { // Initialie variables for the second pass of the two-pass-optsched void InitSecondPass(); + bool enumFoundSchedule() { return EnumFoundSchedule; } + void setEnumFoundSchedule() { EnumFoundSchedule = true; } + private: // The algorithm to use for calculated lower bounds. LB_ALG lbAlg_; @@ -133,6 +136,8 @@ class SchedRegion { // Used for two-pass-optsched to enable second pass functionalies. bool isSecondPass_; + bool EnumFoundSchedule; + // The absolute cost lower bound to be used as a ref for normalized costs. InstCount costLwrBound_ = 0; diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index 847cc6e5..9102bf94 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -5,3 +5,4 @@ ELSE() ENDIF() add_dependencies(OptSched ${OPT_SCHED_TARGET_DEPS}) +target_link_libraries(OptSched -L/home/vang/src/ROCm-2.4/opencl/build/lib/ libamdocl64.so) diff --git a/lib/Scheduler/bb_spill.cpp b/lib/Scheduler/bb_spill.cpp index 92d93fe2..ee817ec7 100644 --- a/lib/Scheduler/bb_spill.cpp +++ b/lib/Scheduler/bb_spill.cpp @@ -381,7 +381,7 @@ void BBWithSpill::InitForSchdulng() { /*****************************************************************************/ void BBWithSpill::InitForCostCmputtn_() { - if (IsSecondPass() && ClusterMemoryOperations) + if (ClusterMemoryOperations && (IsSecondPass() || !TwoPassEnabled)) initForClustering(); int i; @@ -434,8 +434,23 @@ InstCount BBWithSpill::CmputNormCost_(InstSchedule *sched, InstCount BBWithSpill::CmputCost_(InstSchedule *sched, COST_COMP_MODE compMode, InstCount &execCost, bool trackCnflcts) { + + InstCount instNum; + InstCount cycleNum; + InstCount slotNum; + SchedInstruction *inst; + if (compMode == CCM_STTC) { - if (GetSpillCostFunc() == SCF_SPILLS) { + if (GetSpillCostFunc() != SCF_SPILLS) { + InitForCostCmputtn_(); + + for (instNum = sched->GetFrstInst(cycleNum, slotNum); + instNum != 
INVALID_VALUE; + instNum = sched->GetNxtInst(cycleNum, slotNum)) { + inst = dataDepGraph_->GetInstByIndx(instNum); + SchdulInst(inst, cycleNum, slotNum, trackCnflcts); + } + } else { LocalRegAlloc regAlloc(sched, dataDepGraph_); regAlloc.SetupForRegAlloc(); regAlloc.AllocRegs(); @@ -613,7 +628,7 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, // Possibly keep track of the current memory clustering size here // and in UpdateSpillInfoForUnSchdul_() - if (IsSecondPass() && ClusterMemoryOperations) { + if (ClusterMemoryOperations && (IsSecondPass() || !TwoPassEnabled)) { // Check if the current instruction is part of a cluster if (inst->GetMayCluster()) { // Check if there is a current active cluster @@ -857,7 +872,7 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst) { // 2.) Non-Cluster <- Cluster // 3.) Different Cluster <- Cluster // 4.) Cluster <- Non-cluster - if (IsSecondPass() && ClusterMemoryOperations) { + if (ClusterMemoryOperations && (IsSecondPass() || !TwoPassEnabled)) { // If the instruction we are backtracking from is part of a cluster if (inst->GetMayCluster()) { if (CurrentClusterSize != 0) { @@ -1156,6 +1171,8 @@ InstCount BBWithSpill::UpdtOptmlSched(InstSchedule *crntSched, SetBestSchedLength(crntSched->GetCrntLngth()); enumBestSched_->Copy(crntSched); bestSched_ = enumBestSched_; + if (!enumFoundSchedule()) + setEnumFoundSchedule(); } return GetBestCost(); @@ -1163,7 +1180,7 @@ InstCount BBWithSpill::UpdtOptmlSched(InstSchedule *crntSched, void BBWithSpill::printCurrentClustering() { // Print the instructions in the clusters after finding a schedule. 
- if (IsSecondPass() && ClusterMemoryOperations) { + if (ClusterMemoryOperations && (IsSecondPass() || !TwoPassEnabled)) { dbgs() << "Printing clustered instructions:\n"; int i = 1; for (const auto &clusters : PastClustersList) { diff --git a/lib/Scheduler/data_dep.cpp b/lib/Scheduler/data_dep.cpp index 7e6b3502..ef6e2cda 100644 --- a/lib/Scheduler/data_dep.cpp +++ b/lib/Scheduler/data_dep.cpp @@ -2762,6 +2762,7 @@ void InstSchedule::Copy(InstSchedule *src) { SetSpillCosts(src->spillCosts_); SetPeakRegPressures(src->peakRegPressures_); + setClusterSize(src->getClusterSize()); cost_ = src->cost_; execCost_ = src->execCost_; spillCost_ = src->spillCost_; @@ -2836,6 +2837,44 @@ void InstSchedule::Print(std::ostream &out, char const *const label) { } } + + void InstSchedule::Print(std::ostream &out, char const *const title, + DataDepGraph *ddg) { + InstCount slotInCycle = 0; + InstCount cycleNum = 0; + InstCount i; + + // out << '\n' << label << " Schedule"; + Logger::Info("Printing Schedule"); + + for (i = 0; i < crntSlotNum_; i++) { + if (slotInCycle == 0) { + if (instInSlot_[i] != SCHD_STALL) { + InstCount instNum = instInSlot_[i]; + SchedInstruction *inst = ddg->GetInstByIndx(instNum); + Logger::Info("Cycle# %d : %d - %s", cycleNum, instInSlot_[i], inst->GetName()); + } else + Logger::Info("Cycle# %d : %d -", cycleNum, instInSlot_[i]); + } + /* + out << "\nCycle# " << cycleNum << ": "; + + if (instInSlot_[i] == SCHD_STALL) { + out << "X "; + } else { + out << instInSlot_[i] << ' '; + } + */ + + slotInCycle++; + + if (slotInCycle == issuRate_) { + slotInCycle = 0; + cycleNum++; + } + } + } + #if defined(IS_DEBUG_PEAK_PRESSURE) || defined(IS_DEBUG_OPTSCHED_PRESSURES) void InstSchedule::PrintRegPressures() const { Logger::Info("OptSched max reg pressures:"); diff --git a/lib/Scheduler/sched_region.cpp b/lib/Scheduler/sched_region.cpp index d40cf81d..cb545bc7 100644 --- a/lib/Scheduler/sched_region.cpp +++ b/lib/Scheduler/sched_region.cpp @@ -50,6 +50,7 @@ 
SchedRegion::SchedRegion(MachineModel *machMdl, DataDepGraph *dataDepGraph, spillCostFunc_ = spillCostFunc; PrintClustering = false; + EnumFoundSchedule = false; } void SchedRegion::UseFileBounds_() { @@ -126,6 +127,7 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( // enabled. Config &schedIni = SchedulerOptions::getInstance(); PrintClustering = schedIni.GetBool("PRINT_CLUSTER"); + TwoPassEnabled = schedIni.GetBool("USE_TWO_PASS"); ClusterMemoryOperations = schedIni.GetBool("CLUSTER_MEMORY_OPS"); ClusteringWeight = schedIni.GetInt("CLUSTER_WEIGHT"); bool HeuristicSchedulerEnabled = schedIni.GetBool("HEUR_ENABLED"); @@ -241,9 +243,6 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( CmputNormCost_(lstSched, CCM_DYNMC, hurstcExecCost, true); hurstcCost_ = lstSched->GetCost(); - if (IsSecondPass() && PrintClustering) - computeAndPrintClustering(lstSched); - // This schedule is optimal so ACO will not be run // so set bestSched here. if (hurstcCost_ == 0) { @@ -471,9 +470,6 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( enumTime = Utilities::GetProcessorTime() - enumStart; stats::enumerationTime.Record(enumTime); - - if (IsSecondPass() && PrintClustering && enumBestSched_ != NULL) - computeAndPrintClustering(enumBestSched_); } // Step 5: Run ACO if schedule from enumerator is not optimal @@ -646,6 +642,13 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( } #endif + if (PrintClustering && bestSched != NULL && (IsSecondPass() || !TwoPassEnabled)) { + computeAndPrintClustering(bestSched); + } + + //if (bestSched != NULL) + //bestSched->Print(Logger::GetLogStream(), "FinalSched", dataDepGraph_); + return rslt; } diff --git a/lib/Wrapper/AMDGPU/GCNOptSched.cpp b/lib/Wrapper/AMDGPU/GCNOptSched.cpp index 46d6c1a3..8b987e3e 100644 --- a/lib/Wrapper/AMDGPU/GCNOptSched.cpp +++ b/lib/Wrapper/AMDGPU/GCNOptSched.cpp @@ -8,7 +8,10 @@ #include "AMDGPUMacroFusion.h" #include "GCNSchedStrategy.h" #include "SIMachineFunctionInfo.h" +#include "OptSchedGCNTarget.h" +//#include 
"llvm/CodeGen/OptSequential.h" #include "llvm/Support/Debug.h" +#include #define DEBUG_TYPE "optsched" @@ -60,6 +63,7 @@ void ScheduleDAGOptSchedGCN::initSchedulers() { SchedPasses.push_back(OptSchedMaxOcc); // Second SchedPasses.push_back(OptSchedBalanced); + SchedPasses.push_back(OptSchedReschedule); } // Execute scheduling passes. @@ -67,15 +71,42 @@ void ScheduleDAGOptSchedGCN::initSchedulers() { void ScheduleDAGOptSchedGCN::finalizeSchedule() { if (TwoPassEnabled && OptSchedEnabled) { initSchedulers(); + RescheduleRegions.resize(Regions.size()); + RescheduleRegions.set(); LLVM_DEBUG(dbgs() << "Starting two pass scheduling approach\n"); TwoPassSchedulingStarted = true; for (const SchedPassStrategy &S : SchedPasses) { MachineBasicBlock *MBB = nullptr; // Reset - RegionNumber = ~0u; + RegionIdx = 0; + if (S == OptSchedReschedule) { + if (RescheduleRegions.none()) { + dbgs() << "No regions to reschedule.\n"; + continue; + } else { + auto GCNOST = static_cast(OST.get()); + unsigned TargetOccupancy = GCNOST->getTargetOcc(); + if (TargetOccupancy == 1u) { + dbgs() << "Cannot lower occupancy to below 1.\n"; + continue; + } + + dbgs() << "Beginning rescheduling of regions.\n"; + unsigned NewTarget = TargetOccupancy - 1u; + dbgs() << "Decreasing current target occupancy " << TargetOccupancy + << " to new target " << NewTarget << '\n'; + GCNOST->limitOccupancy(NewTarget); + } + } for (auto &Region : Regions) { + /*if (S == OptSchedReschedule && !RescheduleRegions[RegionIdx]) { + dbgs() << "Region " << RegionIdx << " does not need to be rescheduled.\n"; + ++RegionIdx; + continue; + }*/ + RegionBegin = Region.first; RegionEnd = Region.second; @@ -98,6 +129,7 @@ void ScheduleDAGOptSchedGCN::finalizeSchedule() { LLVM_DEBUG(getRealRegionPressure(RegionBegin, RegionEnd, LIS, "After")); Region = std::make_pair(RegionBegin, RegionEnd); exitRegion(); + ++RegionIdx; } finishBlock(); } @@ -114,6 +146,7 @@ void ScheduleDAGOptSchedGCN::finalizeSchedule() { } void 
ScheduleDAGOptSchedGCN::runSchedPass(SchedPassStrategy S) { + RescheduleRegions[RegionIdx] = false; switch (S) { case GCNMaxOcc: scheduleGCNMaxOcc(); @@ -124,6 +157,9 @@ void ScheduleDAGOptSchedGCN::runSchedPass(SchedPassStrategy S) { case OptSchedBalanced: scheduleOptSchedBalanced(); break; + case OptSchedReschedule: + scheduleOptSchedReschedule(); + break; } } @@ -144,3 +180,10 @@ void ScheduleDAGOptSchedGCN::scheduleOptSchedMaxOcc() { void ScheduleDAGOptSchedGCN::scheduleOptSchedBalanced() { ScheduleDAGOptSched::scheduleOptSchedBalanced(); } + +void ScheduleDAGOptSchedGCN::scheduleOptSchedReschedule() { + IsThirdPass = true; + ScheduleDAGOptSched::scheduleOptSchedBalanced(); + Logger::Info("End of third pass through\n"); +} + diff --git a/lib/Wrapper/AMDGPU/GCNOptSched.h b/lib/Wrapper/AMDGPU/GCNOptSched.h index f08056aa..3d2646af 100644 --- a/lib/Wrapper/AMDGPU/GCNOptSched.h +++ b/lib/Wrapper/AMDGPU/GCNOptSched.h @@ -9,13 +9,19 @@ #include "../OptimizingScheduler.h" #include "GCNRegPressure.h" +#include "OptSchedGCNTarget.h" namespace llvm { namespace opt_sched { class ScheduleDAGOptSchedGCN : public ScheduleDAGOptSched { private: - enum SchedPassStrategy { GCNMaxOcc, OptSchedMaxOcc, OptSchedBalanced }; + enum SchedPassStrategy { + GCNMaxOcc, + OptSchedMaxOcc, + OptSchedBalanced, + OptSchedReschedule + }; // Vector of scheduling passes to execute. SmallVector SchedPasses; @@ -45,6 +51,8 @@ class ScheduleDAGOptSchedGCN : public ScheduleDAGOptSched { // Run OptSched in ILP/RP balanced mode. void scheduleOptSchedBalanced() override; + + void scheduleOptSchedReschedule(); }; } // namespace opt_sched diff --git a/lib/Wrapper/AMDGPU/OptSchedGCNTarget.cpp b/lib/Wrapper/AMDGPU/OptSchedGCNTarget.cpp index 21faf51e..9f63a720 100644 --- a/lib/Wrapper/AMDGPU/OptSchedGCNTarget.cpp +++ b/lib/Wrapper/AMDGPU/OptSchedGCNTarget.cpp @@ -3,6 +3,7 @@ // AMDGCN OptSched target. 
// //===----------------------------------------------------------------------===// +#include "OptSchedGCNTarget.h" #include "OptSchedDDGWrapperGCN.h" #include "SIMachineFunctionInfo.h" #include "Wrapper/OptSchedMachineWrapper.h" @@ -22,7 +23,7 @@ using namespace llvm::opt_sched; // This is necessary because we cannot perfectly predict the number of registers // of each type that will be allocated. -static const unsigned GPRErrorMargin = 3; +static const unsigned GPRErrorMargin = 0; #ifndef NDEBUG static unsigned getOccupancyWeight(unsigned Occupancy) { @@ -62,56 +63,6 @@ static unsigned getAdjustedOccupancy(const GCNSubtarget *ST, unsigned VGPRCount, namespace { -class OptSchedGCNTarget : public OptSchedTarget { -public: - std::unique_ptr - createMachineModel(const char *ConfigPath) override { - return llvm::make_unique(ConfigPath); - } - - std::unique_ptr - createDDGWrapper(llvm::MachineSchedContext *Context, ScheduleDAGOptSched *DAG, - OptSchedMachineModel *MM, LATENCY_PRECISION LatencyPrecision, - const std::string &RegionID) override { - return llvm::make_unique(Context, DAG, MM, - LatencyPrecision, RegionID); - } - - void initRegion(llvm::ScheduleDAGInstrs *DAG, MachineModel *MM_) override; - - void finalizeRegion(const InstSchedule *Schedule) override; - - // Returns occupancy cost with number of VGPRs and SGPRs from PRP for - // a partial or complete schedule. - InstCount getCost(const llvm::SmallVectorImpl &PRP) const override; - - void dumpOccupancyInfo(const InstSchedule *Schedule) const; - - // Revert scheduing if we decrease occupancy. 
- bool shouldKeepSchedule() override; - -private: - const llvm::MachineFunction *MF; - SIMachineFunctionInfo *MFI; - ScheduleDAGOptSched *DAG; - const GCNSubtarget *ST; - - unsigned RegionStartingOccupancy; - unsigned RegionEndingOccupancy; - unsigned TargetOccupancy; - - // Max occupancy with local memory size; - unsigned MaxOccLDS; - - // In RP only (max occupancy) scheduling mode we should try to find - // a min-RP schedule without considering perf hints which suggest limiting - // occupancy. Returns true if we should consider perf hints. - bool shouldLimitWaves() const; - - // Find occupancy with spill cost. - unsigned getOccupancyWithCost(const InstCount Cost) const; -}; - std::unique_ptr createOptSchedGCNTarget() { return llvm::make_unique(); } @@ -161,9 +112,9 @@ void OptSchedGCNTarget::initRegion(llvm::ScheduleDAGInstrs *DAG_, TargetOccupancy = shouldLimitWaves() ? MFI->getMinAllowedOccupancy() : MFI->getOccupancy(); - LLVM_DEBUG(dbgs() << "Region starting occupancy is " + dbgs() << "Region starting occupancy is " << RegionStartingOccupancy << "\n" - << "Target occupancy is " << TargetOccupancy << "\n"); + << "Target occupancy is " << TargetOccupancy << "\n"; } bool OptSchedGCNTarget::shouldLimitWaves() const { @@ -173,6 +124,16 @@ bool OptSchedGCNTarget::shouldLimitWaves() const { return false; } +void OptSchedGCNTarget::setTargetOcc(unsigned Target) { + dbgs() << "Setting target occupancy to " << Target << '\n'; + TargetOccupancy = Target; +} +void OptSchedGCNTarget::limitOccupancy(unsigned Limit) { + dbgs() << "Limiting occupancy to " << Limit << '\n'; + MFI->limitOccupancy(Limit); + TargetOccupancy = MFI->getOccupancy(); +} + unsigned OptSchedGCNTarget::getOccupancyWithCost(const InstCount Cost) const { return TargetOccupancy - Cost; } @@ -184,9 +145,9 @@ void OptSchedGCNTarget::finalizeRegion(const InstSchedule *Schedule) { // If we decrease occupancy we may revert scheduling. 
unsigned RegionOccupancy = std::max(RegionStartingOccupancy, RegionEndingOccupancy); - LLVM_DEBUG(if (RegionOccupancy < MFI->getOccupancy()) dbgs() + if (RegionOccupancy < MFI->getOccupancy()) dbgs() << "Limiting occupancy to " << RegionEndingOccupancy - << " waves.\n"); + << " waves.\n"; MFI->limitOccupancy(RegionOccupancy); } diff --git a/lib/Wrapper/AMDGPU/OptSchedGCNTarget.h b/lib/Wrapper/AMDGPU/OptSchedGCNTarget.h new file mode 100644 index 00000000..996caaff --- /dev/null +++ b/lib/Wrapper/AMDGPU/OptSchedGCNTarget.h @@ -0,0 +1,73 @@ +#ifndef LLVM_GCN_OPT_SCHED_TARGET_H +#define LLVM_GCN_OPT_SCHED_TARGET_H + +#include "OptSchedDDGWrapperGCN.h" +#include "SIMachineFunctionInfo.h" +#include "Wrapper/OptSchedMachineWrapper.h" +#include "opt-sched/Scheduler/OptSchedTarget.h" +#include "opt-sched/Scheduler/data_dep.h" +#include "opt-sched/Scheduler/defines.h" +#include "opt-sched/Scheduler/machine_model.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/CodeGen/ScheduleDAGInstrs.h" +#include +#include + +using namespace llvm; +using namespace llvm::opt_sched; + +class OptSchedGCNTarget : public OptSchedTarget { +public: + std::unique_ptr + createMachineModel(const char *ConfigPath) override { + return llvm::make_unique(ConfigPath); + } + + std::unique_ptr + createDDGWrapper(llvm::MachineSchedContext *Context, ScheduleDAGOptSched *DAG, + OptSchedMachineModel *MM, LATENCY_PRECISION LatencyPrecision, + const std::string &RegionID) override { + return llvm::make_unique(Context, DAG, MM, + LatencyPrecision, RegionID); + } + + void initRegion(llvm::ScheduleDAGInstrs *DAG, MachineModel *MM_) override; + + void finalizeRegion(const InstSchedule *Schedule) override; + + // Returns occupancy cost with number of VGPRs and SGPRs from PRP for + // a partial or complete schedule. + InstCount getCost(const llvm::SmallVectorImpl &PRP) const override; + + void dumpOccupancyInfo(const InstSchedule *Schedule) const; + + // Revert scheduing if we decrease occupancy. 
+ bool shouldKeepSchedule() override; + + void limitOccupancy(unsigned Limit); + unsigned getTargetOcc() { return TargetOccupancy; } + void setTargetOcc(unsigned Target); + +private: + const llvm::MachineFunction *MF; + SIMachineFunctionInfo *MFI; + ScheduleDAGOptSched *DAG; + const GCNSubtarget *ST; + + unsigned RegionStartingOccupancy; + unsigned RegionEndingOccupancy; + unsigned TargetOccupancy; + + // Max occupancy with local memory size; + unsigned MaxOccLDS; + + // In RP only (max occupancy) scheduling mode we should try to find + // a min-RP schedule without considering perf hints which suggest limiting + // occupancy. Returns true if we should consider perf hints. + bool shouldLimitWaves() const; + + // Find occupancy with spill cost. + unsigned getOccupancyWithCost(const InstCount Cost) const; +}; + +#endif diff --git a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp index 62305fa3..f5b03fe7 100644 --- a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp +++ b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp @@ -543,7 +543,7 @@ int OptSchedDDGWrapperBasic::clusterNeighboringMemOps( } if (MemOpRecords.size() < 2) { - dbgs() << " Unable to cluster memop cluster of 1.\n"; + LLVM_DEBUG(dbgs() << " Unable to cluster memop cluster of 1.\n"); return 0; } @@ -552,16 +552,16 @@ int OptSchedDDGWrapperBasic::clusterNeighboringMemOps( for (unsigned Idx = 0, End = MemOpRecords.size(); Idx < (End - 1); ++Idx) { const SUnit *SUa = MemOpRecords[Idx].SU; const SUnit *SUb = MemOpRecords[Idx + 1].SU; - dbgs() << " Checking possible clustering of (" << SUa->NodeNum << ") and (" - << SUb->NodeNum << ")\n"; + LLVM_DEBUG(dbgs() << " Checking possible clustering of (" << SUa->NodeNum << ") and (" + << SUb->NodeNum << ")\n"); // Pass constant of 1 to AMD's function to determine clustering to remove // the limit of 15. Our enumerator can determine when it has reached the // limit instead of depending on AMD. 
if (DAG->TII->shouldClusterMemOps(*MemOpRecords[Idx].BaseOp, *MemOpRecords[Idx + 1].BaseOp, 1u)) { - dbgs() << " Cluster possible at SU(" << SUa->NodeNum << ")- SU(" - << SUb->NodeNum << ")\n"; + LLVM_DEBUG(dbgs() << " Cluster possible at SU(" << SUa->NodeNum << ")- SU(" + << SUb->NodeNum << ")\n"); // If clustering is possible then increase the cluster count. This only // happens once every new cluster @@ -653,10 +653,10 @@ int OptSchedDDGWrapperBasic::findPossibleClusters(bool IsLoad) { // Iterate over the store chains. for (auto &SCD : StoreChainDependents) { // Print the chain that LLVM has found - dbgs() << "Printing the Node ID of the current chain: "; + LLVM_DEBUG(dbgs() << "Printing the Node ID of the current chain: "); for (auto SU1 : SCD) - dbgs() << SU1->NodeNum << " "; - dbgs() << '\n'; + LLVM_DEBUG(dbgs() << SU1->NodeNum << " "); + LLVM_DEBUG(dbgs() << '\n'); TotalInstructionsPossible += clusterNeighboringMemOps(SCD); } diff --git a/lib/Wrapper/OptimizingScheduler.cpp b/lib/Wrapper/OptimizingScheduler.cpp index 8d6e1d77..ab09f7c2 100644 --- a/lib/Wrapper/OptimizingScheduler.cpp +++ b/lib/Wrapper/OptimizingScheduler.cpp @@ -16,10 +16,14 @@ #include "opt-sched/Scheduler/register.h" #include "opt-sched/Scheduler/sched_region.h" #include "opt-sched/Scheduler/utilities.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/MachineScheduler.h" +/*#include "llvm/CodeGen/OptSequential.h"*/ #include "llvm/CodeGen/RegisterClassInfo.h" #include "llvm/CodeGen/RegisterPressure.h" #include "llvm/CodeGen/ScheduleDAG.h" @@ -34,11 +38,15 @@ #include #include #include +#include "AMDGPU/OptSchedGCNTarget.h" #define DEBUG_TYPE "optsched" using namespace llvm::opt_sched; +llvm::SmallVector UniqueRegionNames; +llvm::DenseMap RegionCounter; + // hack to print spills bool 
OPTSCHED_gPrintSpills; @@ -71,8 +79,8 @@ static ScheduleDAGInstrs *createOptSched(MachineSchedContext *C) { // Register the machine scheduler. static MachineSchedRegistry OptSchedMIRegistry("optsched", - "Use the OptSched scheduler.", - createOptSched); + "Use the OptSched scheduler.", + createOptSched); // Command line options for opt-sched. static cl::opt OptSchedCfg( @@ -258,10 +266,9 @@ void ScheduleDAGOptSched::schedule() { ShouldTrackLaneMasks = true; Config &schedIni = SchedulerOptions::getInstance(); - ++RegionNumber; const std::string RegionName = C->MF->getFunction().getName().data() + std::string(":") + - std::to_string(RegionNumber); + std::to_string(RegionIdx); // If two pass scheduling is enabled then // first just record the scheduling region. @@ -374,16 +381,21 @@ void ScheduleDAGOptSched::schedule() { // Build LLVM DAG SetupLLVMDag(); OST->initRegion(this, MM.get()); + // Convert graph auto DDG = OST->createDDGWrapper(C, this, MM.get(), LatencyPrecision, RegionName); - // Find all clusterable instructions for the second pass. - if (SecondPass) { - // In the second pass, ignore artificial edges before running the sequential - // heuristic list scheduler. - DDG->convertSUnits(false, true); + // In the second pass, ignore artificial edges before running the sequential + // heuristic list scheduler. + if (SecondPass) + DDG->convertSUnits(/*IgnoreRealEdges=*/false, + /*IgnoreArtificialEdges=*/true); + else + DDG->convertSUnits(false, false); + // Find all clusterable instructions for the second pass. 
+ if (SecondPass || (!TwoPassEnabled && schedIni.GetBool("PRINT_CLUSTER"))) { dbgs() << "Finding load clusters.\n"; int TotalLoadsInstructionsClusterable = DDG->findPossibleClusters(true); if (TotalLoadsInstructionsClusterable == 0) @@ -416,8 +428,7 @@ void ScheduleDAGOptSched::schedule() { DataDepGraphInstance->getTotalInstructionsInCluster(begin)); } } - } else - DDG->convertSUnits(false, false); + } DDG->convertRegFiles(); @@ -469,10 +480,18 @@ void ScheduleDAGOptSched::schedule() { return; } + // BB Enumerator did not find a schedule. + // Add the region to the list to be rescheduled. + if (SecondPass && !region->enumFoundSchedule() && !IsEasy && !IsThirdPass) + RescheduleRegions[RegionIdx] = true; + LLVM_DEBUG(Logger::Info("OptSched succeeded.")); - OST->finalizeRegion(Sched); - if (!OST->shouldKeepSchedule()) - return; + + if (!IsThirdPass) { + OST->finalizeRegion(Sched); + if (!OST->shouldKeepSchedule()) + return; + } // Count simulated spills. if (isSimRegAllocEnabled()) { @@ -570,6 +589,7 @@ void ScheduleDAGOptSched::loadOptSchedConfig() { TwoPassEnabled = isTwoPassEnabled(); TwoPassSchedulingStarted = false; SecondPass = false; + IsThirdPass = false; LatencyPrecision = fetchLatencyPrecision(); TreatOrderAsDataDeps = schedIni.GetBool("TREAT_ORDER_DEPS_AS_DATA_DEPS"); @@ -784,13 +804,14 @@ bool ScheduleDAGOptSched::rpMismatch(InstSchedule *sched) { void ScheduleDAGOptSched::finalizeSchedule() { if (TwoPassEnabled && OptSchedEnabled) { initSchedulers(); + RescheduleRegions.resize(Regions.size()); LLVM_DEBUG(dbgs() << "Starting two pass scheduling approach\n"); TwoPassSchedulingStarted = true; for (const SchedPassStrategy &S : SchedPasses) { MachineBasicBlock *MBB = nullptr; // Reset - RegionNumber = ~0u; + RegionIdx = 0; for (auto &Region : Regions) { RegionBegin = Region.first; diff --git a/lib/Wrapper/OptimizingScheduler.h b/lib/Wrapper/OptimizingScheduler.h index 784c0681..aadca182 100644 --- a/lib/Wrapper/OptimizingScheduler.h +++ 
b/lib/Wrapper/OptimizingScheduler.h @@ -14,6 +14,7 @@ #include "opt-sched/Scheduler/data_dep.h" #include "opt-sched/Scheduler/graph_trans.h" #include "opt-sched/Scheduler/sched_region.h" +#include "llvm/ADT/BitVector.h" #include "llvm/ADT/SmallString.h" #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/Support/Debug.h" @@ -59,13 +60,19 @@ class ScheduleDAGOptSched : public ScheduleDAGMILive { // pass. Used for the two pass scheduling approach. bool SecondPass; + bool IsThirdPass; + // Region number uniquely identifies DAGs. - unsigned RegionNumber = ~0u; + size_t RegionIdx; + + // Records if a region is not yet scheduled, or schedule has been reverted, + // or we generally desire to reschedule it. + llvm::BitVector RescheduleRegions; MachineSchedContext *C; // The OptSched target machine. - std::unique_ptr OST; + std::shared_ptr OST; // into the OptSched machine model std::unique_ptr MM; @@ -251,7 +258,7 @@ class ScheduleDAGOptSched : public ScheduleDAGMILive { void dumpLLVMRegisters() const; // Getter for region number - int getRegionNum() const { return RegionNumber; } + int getRegionNum() const { return RegionIdx; } // Return the boundary instruction for this region if it is not a sentinel // value. From b01eeff892f23af377febdf3ce637d91b984869c Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Mon, 17 Aug 2020 15:51:41 -0500 Subject: [PATCH 37/40] Add two conditions for re-scheduling ILP pass; Minimum occupancy and minimum ILP improvements. 
--- example/optsched-cfg/sched.ini | 14 +++- lib/Wrapper/AMDGPU/GCNOptSched.cpp | 116 +++++++++++++++++++++------- lib/Wrapper/AMDGPU/GCNOptSched.h | 21 ++++- lib/Wrapper/OptimizingScheduler.cpp | 50 ++++++++---- lib/Wrapper/OptimizingScheduler.h | 18 ++++- 5 files changed, 170 insertions(+), 49 deletions(-) diff --git a/example/optsched-cfg/sched.ini b/example/optsched-cfg/sched.ini index 89a79d2b..d1c88a18 100644 --- a/example/optsched-cfg/sched.ini +++ b/example/optsched-cfg/sched.ini @@ -19,7 +19,19 @@ PRINT_CLUSTER YES # NO USE_TWO_PASS NO -# Allow enumerator to try to cluster memory operations together in the second pass. +# Sets a limit for occupancy in the second ILP pass. We will not go below this +# occupancy when attempting rescheduling. +# Valid values: 1-10 (whole integers) +MIN_OCCUPANCY_FOR_RESCHEDULE 3 + +# Sets the required schedule length improvement percentage for the second ILP +# pass. If we do not meet this minimum improvement then we do not keep the +# lower occupancy schedules. +# Valid values: 0-100 (whole integers) +MIN_ILP_IMPROVEMENT 10 + +# Allow enumerator to try to cluster memory operations together in the second +# pass. 
# YES # NO CLUSTER_MEMORY_OPS NO diff --git a/lib/Wrapper/AMDGPU/GCNOptSched.cpp b/lib/Wrapper/AMDGPU/GCNOptSched.cpp index 8b987e3e..c2f65463 100644 --- a/lib/Wrapper/AMDGPU/GCNOptSched.cpp +++ b/lib/Wrapper/AMDGPU/GCNOptSched.cpp @@ -7,8 +7,8 @@ #include "GCNOptSched.h" #include "AMDGPUMacroFusion.h" #include "GCNSchedStrategy.h" -#include "SIMachineFunctionInfo.h" #include "OptSchedGCNTarget.h" +#include "SIMachineFunctionInfo.h" //#include "llvm/CodeGen/OptSequential.h" #include "llvm/Support/Debug.h" #include @@ -46,7 +46,29 @@ static void getRealRegionPressure(MachineBasicBlock::const_iterator Begin, ScheduleDAGOptSchedGCN::ScheduleDAGOptSchedGCN( llvm::MachineSchedContext *C, std::unique_ptr S) - : ScheduleDAGOptSched(C, std::move(S)) {} + : ScheduleDAGOptSched(C, std::move(S)) { + MinOcc = getMinOcc(); +} + +unsigned ScheduleDAGOptSchedGCN::getMinOcc() { + SchedulerOptions &schedIni = SchedulerOptions::getInstance(); + int MinOcc = schedIni.GetInt("MIN_OCCUPANCY_FOR_RESCHEDULE"); + if (MinOcc <= 10 || MinOcc >= 1) + return MinOcc; + + llvm_unreachable( + "Unrecognized option for MIN_OCCUPANCY_FOR_RESCHEDULE setting.") +} + +int ScheduleDAGOptSchedGCN::getMinILPImprovement() { + SchedulerOptions &schedIni = SchedulerOptions::getInstance(); + int MinIlpImprovement = schedIni.GetInt("MIN_ILP_IMPROVEMENT"); + if (MinIlpImprovement <= 100 || MinIlpImprovement >= 1) + return MinIlpImprovement; + + llvm_unreachable( + "Unrecognized option for MIN_OCCUPANCY_FOR_RESCHEDULE setting.") +} void ScheduleDAGOptSchedGCN::initSchedulers() { // Add DAG mutations that apply to both GCN and OptSched DAG's @@ -61,10 +83,11 @@ void ScheduleDAGOptSchedGCN::initSchedulers() { // First SchedPasses.push_back(OptSchedMaxOcc); - // Second + // Second ILP passes SchedPasses.push_back(OptSchedBalanced); - SchedPasses.push_back(OptSchedReschedule); -} + SchedPasses.push_back(OptSchedLowerOccAnalysis); + SchedPasses.push_back(OptSchedCommitLowerOcc); +} // Execute scheduling passes. 
// Partially copied GCNScheduleDAGMILive::finalizeSchedule @@ -72,6 +95,8 @@ void ScheduleDAGOptSchedGCN::finalizeSchedule() { if (TwoPassEnabled && OptSchedEnabled) { initSchedulers(); RescheduleRegions.resize(Regions.size()); + ILPAnalysis.resize(Regions.size()); + CostAnalysis.resize(Regions.size()); RescheduleRegions.set(); LLVM_DEBUG(dbgs() << "Starting two pass scheduling approach\n"); @@ -80,32 +105,37 @@ void ScheduleDAGOptSchedGCN::finalizeSchedule() { MachineBasicBlock *MBB = nullptr; // Reset RegionIdx = 0; - if (S == OptSchedReschedule) { + if (S == OptSchedLowerOccAnalysis) { if (RescheduleRegions.none()) { - dbgs() << "No regions to reschedule.\n"; - continue; - } else { + dbgs() << "No regions to reschedule.\n"; + break; + } else { auto GCNOST = static_cast(OST.get()); unsigned TargetOccupancy = GCNOST->getTargetOcc(); - if (TargetOccupancy == 1u) { - dbgs() << "Cannot lower occupancy to below 1.\n"; - continue; - } + if (TargetOccupancy <= MinOcc) { + dbgs() << "Cannot lower occupancy to below minimum occupancy of " + << MinOCc << '\n'; + break; + } dbgs() << "Beginning rescheduling of regions.\n"; - unsigned NewTarget = TargetOccupancy - 1u; - dbgs() << "Decreasing current target occupancy " << TargetOccupancy + unsigned NewTarget = TargetOccupancy - 1u; + dbgs() << "Decreasing current target occupancy " << TargetOccupancy << " to new target " << NewTarget << '\n'; - GCNOST->limitOccupancy(NewTarget); - } + GCNOST->limitOccupancy(NewTarget); + } + } else if (S == OptSchedCommitLowerOcc) { + if (!shouldCommitLowerOccSched()) + break; } for (auto &Region : Regions) { - /*if (S == OptSchedReschedule && !RescheduleRegions[RegionIdx]) { - dbgs() << "Region " << RegionIdx << " does not need to be rescheduled.\n"; - ++RegionIdx; - continue; - }*/ + /*if (S == OptSchedLowerOccAnalysis && !RescheduleRegions[RegionIdx]) { + dbgs() << "Region " << RegionIdx << " does not need to be + rescheduled.\n"; + ++RegionIdx; + continue; + }*/ RegionBegin = 
Region.first; RegionEnd = Region.second; @@ -124,7 +154,8 @@ void ScheduleDAGOptSchedGCN::finalizeSchedule() { exitRegion(); continue; } - LLVM_DEBUG(getRealRegionPressure(RegionBegin, RegionEnd, LIS, "Before")); + LLVM_DEBUG( + getRealRegionPressure(RegionBegin, RegionEnd, LIS, "Before")); runSchedPass(S); LLVM_DEBUG(getRealRegionPressure(RegionBegin, RegionEnd, LIS, "After")); Region = std::make_pair(RegionBegin, RegionEnd); @@ -153,12 +184,19 @@ void ScheduleDAGOptSchedGCN::runSchedPass(SchedPassStrategy S) { break; case OptSchedMaxOcc: scheduleOptSchedMaxOcc(); + Logger::Info("End of first pass through"); break; case OptSchedBalanced: scheduleOptSchedBalanced(); + Logger::Info("End of second pass through"); + break; + case OptSchedLowerOccAnalysis: + scheduleOptSchedLowerOccAnalysis(); + Logger::Info("End of third pass through"); break; - case OptSchedReschedule: - scheduleOptSchedReschedule(); + case OptSchedCommitLowerOcc: + scheduleCommitLowerOcc(); + Logger::Info("End of fourth pass through"); break; } } @@ -181,9 +219,33 @@ void ScheduleDAGOptSchedGCN::scheduleOptSchedBalanced() { ScheduleDAGOptSched::scheduleOptSchedBalanced(); } -void ScheduleDAGOptSchedGCN::scheduleOptSchedReschedule() { +void ScheduleDAGOptSchedGCN::scheduleOptSchedLowerOccAnalysis() { IsThirdPass = true; ScheduleDAGOptSched::scheduleOptSchedBalanced(); - Logger::Info("End of third pass through\n"); + IsThirdPass = false; } +void ScheduleDAGOptSchedGCN::scheduleCommitLowerOcc() { + IsFourthPass = true; + ScheduleDAGOptSched::scheduleOptSchedBalanced(); + IsFourthPass = false; +} + +bool ScheduleDAGOptSchedGCN::shouldCommitLowerOccSched() { + // First analyze ILP improvements + int FirstPassILP = 0; + int SecondPassILP = 0; + int MinILPImprovement = getMinILPImprovement(); + for (std::pair &RegionLength : ILPAnalysis) { + FirstPassILP += RegionLength.first; + SecondPassILP += RegionLength.second; + } + double ILPImprovement = + ((FirstPassILP - SecondPassILP) / (double)FirstPassILP) * 
100.0; + dbgs() << "ILPImprovement from second ILP pass is " << ILPImprovement + << ", min improvement is: " << MinILPImprovement << '\n'; + if (ILPImprovement >= MinILPImprovement) + return true; + + return false; +} \ No newline at end of file diff --git a/lib/Wrapper/AMDGPU/GCNOptSched.h b/lib/Wrapper/AMDGPU/GCNOptSched.h index 3d2646af..0a8df221 100644 --- a/lib/Wrapper/AMDGPU/GCNOptSched.h +++ b/lib/Wrapper/AMDGPU/GCNOptSched.h @@ -20,12 +20,24 @@ class ScheduleDAGOptSchedGCN : public ScheduleDAGOptSched { GCNMaxOcc, OptSchedMaxOcc, OptSchedBalanced, - OptSchedReschedule + OptSchedLowerOccAnalysis, + OptSchedCommitLowerOcc }; + /// Get the minimum occupancy value from the sched.ini settings file. Check + /// if the value is between 1-10 and gives an error if it is not between the + /// valid range. + unsigned getMinOcc(); + + /// Analyze the possible improvements from lowering the target occupancy + /// and decide if we should keep the schedules. + bool shouldCommitLowerOccSched(); + // Vector of scheduling passes to execute. SmallVector SchedPasses; + unsigned MinOcc; + public: ScheduleDAGOptSchedGCN(llvm::MachineSchedContext *C, std::unique_ptr S); @@ -52,7 +64,12 @@ class ScheduleDAGOptSchedGCN : public ScheduleDAGOptSched { // Run OptSched in ILP/RP balanced mode. void scheduleOptSchedBalanced() override; - void scheduleOptSchedReschedule(); + // Lower occupancy and run OptSched in ILP/RP balanced mode for analysis. + void scheduleOptSchedLowerOccAnalysis(); + + // Lower occupancy and run OptSched in ILP/RP balanced mode to commit + // scheduling in analysis pass. 
+ void scheduleCommitLowerOcc(); }; } // namespace opt_sched diff --git a/lib/Wrapper/OptimizingScheduler.cpp b/lib/Wrapper/OptimizingScheduler.cpp index ab09f7c2..356e5c32 100644 --- a/lib/Wrapper/OptimizingScheduler.cpp +++ b/lib/Wrapper/OptimizingScheduler.cpp @@ -24,6 +24,7 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/MachineScheduler.h" /*#include "llvm/CodeGen/OptSequential.h"*/ +#include "AMDGPU/OptSchedGCNTarget.h" #include "llvm/CodeGen/RegisterClassInfo.h" #include "llvm/CodeGen/RegisterPressure.h" #include "llvm/CodeGen/ScheduleDAG.h" @@ -38,7 +39,6 @@ #include #include #include -#include "AMDGPU/OptSchedGCNTarget.h" #define DEBUG_TYPE "optsched" @@ -79,8 +79,8 @@ static ScheduleDAGInstrs *createOptSched(MachineSchedContext *C) { // Register the machine scheduler. static MachineSchedRegistry OptSchedMIRegistry("optsched", - "Use the OptSched scheduler.", - createOptSched); + "Use the OptSched scheduler.", + createOptSched); // Command line options for opt-sched. static cl::opt OptSchedCfg( @@ -267,8 +267,7 @@ void ScheduleDAGOptSched::schedule() { Config &schedIni = SchedulerOptions::getInstance(); const std::string RegionName = C->MF->getFunction().getName().data() + - std::string(":") + - std::to_string(RegionIdx); + std::string(":") + std::to_string(RegionIdx); // If two pass scheduling is enabled then // first just record the scheduling region. @@ -388,14 +387,14 @@ void ScheduleDAGOptSched::schedule() { // In the second pass, ignore artificial edges before running the sequential // heuristic list scheduler. - if (SecondPass) + if (IsSecondPass) DDG->convertSUnits(/*IgnoreRealEdges=*/false, /*IgnoreArtificialEdges=*/true); else DDG->convertSUnits(false, false); // Find all clusterable instructions for the second pass. 
- if (SecondPass || (!TwoPassEnabled && schedIni.GetBool("PRINT_CLUSTER"))) { + if (IsSecondPass || (!TwoPassEnabled && schedIni.GetBool("PRINT_CLUSTER"))) { dbgs() << "Finding load clusters.\n"; int TotalLoadsInstructionsClusterable = DDG->findPossibleClusters(true); if (TotalLoadsInstructionsClusterable == 0) @@ -460,7 +459,7 @@ void ScheduleDAGOptSched::schedule() { } // Used for two-pass-optsched to alter upper bound value. - if (SecondPass) + if (IsSecondPass) region->InitSecondPass(); // Setup time before scheduling @@ -482,15 +481,29 @@ void ScheduleDAGOptSched::schedule() { // BB Enumerator did not find a schedule. // Add the region to the list to be rescheduled. - if (SecondPass && !region->enumFoundSchedule() && !IsEasy && !IsThirdPass) + if (IsSecondPass && !region->enumFoundSchedule() && !IsEasy && !IsThirdPass && + !IsFourthPass) RescheduleRegions[RegionIdx] = true; LLVM_DEBUG(Logger::Info("OptSched succeeded.")); - if (!IsThirdPass) { - OST->finalizeRegion(Sched); - if (!OST->shouldKeepSchedule()) + OST->finalizeRegion(Sched); + + if (IsFirstPass || IsSecondPass) + if (!OST->shouldKeepSchedule()) { + if (IsSecondPass) { + // We do not keep the schedule so the results of the sequential + // heuristic scheduler is the final result for the second pass. + ILPAnalysis[RegionIdx].first = HurstcSchedLngth; + } return; + } + + if (IsSecondPass) + ILPAnalysis[RegionIdx].first = BestSchedLngth; + else if (IsThirdPass) { + ILPAnalysis[RegionIdx].second = BestSchedLngth; + return; } // Count simulated spills. 
@@ -588,8 +601,10 @@ void ScheduleDAGOptSched::loadOptSchedConfig() { OptSchedEnabled = isOptSchedEnabled(); TwoPassEnabled = isTwoPassEnabled(); TwoPassSchedulingStarted = false; - SecondPass = false; + IsFirstPass = false; + IsSecondPass = false; IsThirdPass = false; + IsFourthPass = false; LatencyPrecision = fetchLatencyPrecision(); TreatOrderAsDataDeps = schedIni.GetBool("TREAT_ORDER_DEPS_AS_DATA_DEPS"); @@ -853,14 +868,17 @@ void ScheduleDAGOptSched::runSchedPass(SchedPassStrategy S) { switch (S) { case OptSchedMinRP: scheduleOptSchedMinRP(); + Logger::Info("End of first pass through"); break; case OptSchedBalanced: scheduleOptSchedBalanced(); + Logger::Info("End of second pass through"); break; } } void ScheduleDAGOptSched::scheduleOptSchedMinRP() { + IsFirstPass = true; LatencyPrecision = LTP_UNITY; // Set times for the first pass RegionTimeout = FirstPassRegionTimeout; @@ -868,11 +886,11 @@ void ScheduleDAGOptSched::scheduleOptSchedMinRP() { HeurSchedType = SCHED_LIST; schedule(); - Logger::Info("End of first pass through\n"); + IsFirstPass = false; } void ScheduleDAGOptSched::scheduleOptSchedBalanced() { - SecondPass = true; + IsSecondPass = true; LatencyPrecision = LTP_ROUGH; // Set times for the second pass @@ -899,7 +917,7 @@ void ScheduleDAGOptSched::scheduleOptSchedBalanced() { MultiPassStaticNodeSup = false; schedule(); - Logger::Info("End of second pass through"); + IsSecondPass = false; } bool ScheduleDAGOptSched::isSimRegAllocEnabled() const { diff --git a/lib/Wrapper/OptimizingScheduler.h b/lib/Wrapper/OptimizingScheduler.h index aadca182..872b3f81 100644 --- a/lib/Wrapper/OptimizingScheduler.h +++ b/lib/Wrapper/OptimizingScheduler.h @@ -37,8 +37,15 @@ class ScheduleDAGOptSched : public ScheduleDAGMILive { // Vector of scheduling passes to execute. SmallVector SchedPasses; -protected: + /// Contains the results of the first ILP pass and second analysis ILP pass. 
+ /// Used to calculate if we should keep the lower target occupancy schedules + /// in the second ILP pass. First element is the first ILP pass and second + /// element is the second analysis ILP pass. + SmallVector, 32> ILPAnalysis; + /// TODO: Same as above for cost analysis. + SmallVector, 32> CostAnalysis; +protected: // Vector of regions recorded for later rescheduling SmallVector< std::pair, 32> @@ -56,12 +63,16 @@ class ScheduleDAGOptSched : public ScheduleDAGMILive { // Path to the machine model specification file for opt-sched. SmallString<128> PathCfgMM; + bool IsFirstPass; + // Bool value indicating that the scheduler is in the second // pass. Used for the two pass scheduling approach. - bool SecondPass; + bool IsSecondPass; bool IsThirdPass; + bool isFourthPass; + // Region number uniquely identifies DAGs. size_t RegionIdx; @@ -167,7 +178,8 @@ class ScheduleDAGOptSched : public ScheduleDAGMILive { SchedPriorities SecondPassPriorities; - // The heuristic used for the second pass enumerator in the two-pass scheduling approach. + // The heuristic used for the second pass enumerator in the two-pass + // scheduling approach. SchedPriorities SecondPassEnumPriorities; // Static node superiority RP only graph transformation. 
From 9bbb91d4d5bfe0bc597724b5677f0823134b2ef1 Mon Sep 17 00:00:00 2001 From: vang thao Date: Wed, 19 Aug 2020 09:11:15 -0700 Subject: [PATCH 38/40] Fix ILP Improvement calculation bugs --- lib/CMakeLists.txt | 1 - lib/Scheduler/sched_region.cpp | 5 ++- lib/Wrapper/AMDGPU/GCNOptSched.cpp | 47 +++++++++++++++++++---------- lib/Wrapper/AMDGPU/GCNOptSched.h | 2 ++ lib/Wrapper/OptimizingScheduler.cpp | 8 +++-- lib/Wrapper/OptimizingScheduler.h | 15 ++++----- 6 files changed, 51 insertions(+), 27 deletions(-) diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index 9102bf94..847cc6e5 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -5,4 +5,3 @@ ELSE() ENDIF() add_dependencies(OptSched ${OPT_SCHED_TARGET_DEPS}) -target_link_libraries(OptSched -L/home/vang/src/ROCm-2.4/opencl/build/lib/ libamdocl64.so) diff --git a/lib/Scheduler/sched_region.cpp b/lib/Scheduler/sched_region.cpp index cb545bc7..64e4bc56 100644 --- a/lib/Scheduler/sched_region.cpp +++ b/lib/Scheduler/sched_region.cpp @@ -411,6 +411,9 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( InitialSchedule = bestSched_; InitialScheduleCost = bestCost_; InitialScheduleLength = bestSchedLngth_; + /*Logger::Info("Printing Initiial schedule"); + InitialSchedule->Print(Logger::GetLogStream(), "InitialSched", dataDepGraph_); + Logger::Info("Finish printing initial schedule");*/ // Step #4: Find the optimal schedule if the heuristc and ACO was not optimal. 
if (BbSchedulerEnabled) { @@ -647,7 +650,7 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( } //if (bestSched != NULL) - //bestSched->Print(Logger::GetLogStream(), "FinalSched", dataDepGraph_); + // bestSched->Print(Logger::GetLogStream(), "FinalSched", dataDepGraph_); return rslt; } diff --git a/lib/Wrapper/AMDGPU/GCNOptSched.cpp b/lib/Wrapper/AMDGPU/GCNOptSched.cpp index c2f65463..0a9434af 100644 --- a/lib/Wrapper/AMDGPU/GCNOptSched.cpp +++ b/lib/Wrapper/AMDGPU/GCNOptSched.cpp @@ -53,21 +53,23 @@ ScheduleDAGOptSchedGCN::ScheduleDAGOptSchedGCN( unsigned ScheduleDAGOptSchedGCN::getMinOcc() { SchedulerOptions &schedIni = SchedulerOptions::getInstance(); int MinOcc = schedIni.GetInt("MIN_OCCUPANCY_FOR_RESCHEDULE"); - if (MinOcc <= 10 || MinOcc >= 1) + if (MinOcc <= 10 && MinOcc >= 1) return MinOcc; - llvm_unreachable( - "Unrecognized option for MIN_OCCUPANCY_FOR_RESCHEDULE setting.") + Logger::Fatal( + "Unrecognized option for MIN_OCCUPANCY_FOR_RESCHEDULE setting: %d", + MinOcc); } int ScheduleDAGOptSchedGCN::getMinILPImprovement() { SchedulerOptions &schedIni = SchedulerOptions::getInstance(); int MinIlpImprovement = schedIni.GetInt("MIN_ILP_IMPROVEMENT"); - if (MinIlpImprovement <= 100 || MinIlpImprovement >= 1) + if (MinIlpImprovement <= 100 && MinIlpImprovement >= 0) return MinIlpImprovement; - llvm_unreachable( - "Unrecognized option for MIN_OCCUPANCY_FOR_RESCHEDULE setting.") + Logger::Fatal( + "Unrecognized option for MIN_OCCUPANCY_FOR_RESCHEDULE setting: %d", + MinIlpImprovement); } void ScheduleDAGOptSchedGCN::initSchedulers() { @@ -114,7 +116,7 @@ void ScheduleDAGOptSchedGCN::finalizeSchedule() { unsigned TargetOccupancy = GCNOST->getTargetOcc(); if (TargetOccupancy <= MinOcc) { dbgs() << "Cannot lower occupancy to below minimum occupancy of " - << MinOCc << '\n'; + << MinOcc << '\n'; break; } @@ -125,8 +127,14 @@ void ScheduleDAGOptSchedGCN::finalizeSchedule() { GCNOST->limitOccupancy(NewTarget); } } else if (S == OptSchedCommitLowerOcc) { - if 
(!shouldCommitLowerOccSched()) + dbgs() + << "Analyzing if we should commit the lower occupancy schedule\n"; + if (!shouldCommitLowerOccSched()) { + dbgs() + << "Lower occupancy schedule did not meet minimum improvement.\n"; break; + } + dbgs() << "Lower occupancy met minimum improvement requirement!\n"; } for (auto &Region : Regions) { @@ -233,19 +241,26 @@ void ScheduleDAGOptSchedGCN::scheduleCommitLowerOcc() { bool ScheduleDAGOptSchedGCN::shouldCommitLowerOccSched() { // First analyze ILP improvements - int FirstPassILP = 0; - int SecondPassILP = 0; + int FirstPassLengthSum = 0; + int SecondPassLengthSum = 0; int MinILPImprovement = getMinILPImprovement(); for (std::pair &RegionLength : ILPAnalysis) { - FirstPassILP += RegionLength.first; - SecondPassILP += RegionLength.second; + dbgs() << "First length -- " << RegionLength.first << ", Second length -- " + << RegionLength.second << '\n'; + FirstPassLengthSum += RegionLength.first; + SecondPassLengthSum += RegionLength.second; } - double ILPImprovement = - ((FirstPassILP - SecondPassILP) / (double)FirstPassILP) * 100.0; + dbgs() << "First pass length sum: " << FirstPassLengthSum << '\n'; + dbgs() << "Second pass length sum: " << SecondPassLengthSum << '\n'; + double FirstPassAverageLength = (double)FirstPassLengthSum / Regions.size(); + double SecondPassAverageLength = (double)SecondPassLengthSum / Regions.size(); + double ILPImprovement = ((FirstPassAverageLength - SecondPassAverageLength) / + FirstPassAverageLength) * + 100.0; dbgs() << "ILPImprovement from second ILP pass is " << ILPImprovement << ", min improvement is: " << MinILPImprovement << '\n'; - if (ILPImprovement >= MinILPImprovement) + if (ILPImprovement - MinILPImprovement >= 0) return true; return false; -} \ No newline at end of file +} diff --git a/lib/Wrapper/AMDGPU/GCNOptSched.h b/lib/Wrapper/AMDGPU/GCNOptSched.h index 0a8df221..c24c93c1 100644 --- a/lib/Wrapper/AMDGPU/GCNOptSched.h +++ b/lib/Wrapper/AMDGPU/GCNOptSched.h @@ -29,6 +29,8 @@ class 
ScheduleDAGOptSchedGCN : public ScheduleDAGOptSched { /// valid range. unsigned getMinOcc(); + int getMinILPImprovement(); + /// Analyze the possible improvements from lowering the target occupancy /// and decide if we should keep the schedules. bool shouldCommitLowerOccSched(); diff --git a/lib/Wrapper/OptimizingScheduler.cpp b/lib/Wrapper/OptimizingScheduler.cpp index 356e5c32..3e9a03b7 100644 --- a/lib/Wrapper/OptimizingScheduler.cpp +++ b/lib/Wrapper/OptimizingScheduler.cpp @@ -380,6 +380,10 @@ void ScheduleDAGOptSched::schedule() { // Build LLVM DAG SetupLLVMDag(); OST->initRegion(this, MM.get()); + /*if (IsSecondPass && !IsThirdPass && !IsFourthPass) { + auto GCNOST = static_cast(OST.get()); + GCNOST->setTargetOcc(5); + }*/ // Convert graph auto DDG = @@ -489,7 +493,7 @@ void ScheduleDAGOptSched::schedule() { OST->finalizeRegion(Sched); - if (IsFirstPass || IsSecondPass) + if (!IsThirdPass && !IsFourthPass && (IsFirstPass || IsSecondPass)) if (!OST->shouldKeepSchedule()) { if (IsSecondPass) { // We do not keep the schedule so the results of the sequential @@ -499,7 +503,7 @@ void ScheduleDAGOptSched::schedule() { return; } - if (IsSecondPass) + if (IsSecondPass && !IsThirdPass && !IsFourthPass) ILPAnalysis[RegionIdx].first = BestSchedLngth; else if (IsThirdPass) { ILPAnalysis[RegionIdx].second = BestSchedLngth; diff --git a/lib/Wrapper/OptimizingScheduler.h b/lib/Wrapper/OptimizingScheduler.h index 872b3f81..502a7cd2 100644 --- a/lib/Wrapper/OptimizingScheduler.h +++ b/lib/Wrapper/OptimizingScheduler.h @@ -37,6 +37,13 @@ class ScheduleDAGOptSched : public ScheduleDAGMILive { // Vector of scheduling passes to execute. SmallVector SchedPasses; + +protected: + // Vector of regions recorded for later rescheduling + SmallVector< + std::pair, 32> + Regions; + /// Contains the results of the first ILP pass and second analysis ILP pass. /// Used to calculate if we should keep the lower target occupancy schedules /// in the second ILP pass. 
First element is the first ILP pass and second @@ -45,12 +52,6 @@ class ScheduleDAGOptSched : public ScheduleDAGMILive { /// TODO: Same as above for cost analysis. SmallVector, 32> CostAnalysis; -protected: - // Vector of regions recorded for later rescheduling - SmallVector< - std::pair, 32> - Regions; - // Path to opt-sched config options directory. SmallString<128> PathCfg; @@ -71,7 +72,7 @@ class ScheduleDAGOptSched : public ScheduleDAGMILive { bool IsThirdPass; - bool isFourthPass; + bool IsFourthPass; // Region number uniquely identifies DAGs. size_t RegionIdx; From 6b28d0d9ec8709711325d511dd2d899cdae6d4a1 Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Thu, 20 Aug 2020 21:06:31 -0500 Subject: [PATCH 39/40] Disable heuristic scheduler and B&B enumerator in 3rd ILP pass. --- lib/Wrapper/AMDGPU/GCNOptSched.cpp | 47 +++++++++-------------------- lib/Wrapper/OptimizingScheduler.cpp | 36 +++++++++++++--------- lib/Wrapper/OptimizingScheduler.h | 3 +- 3 files changed, 38 insertions(+), 48 deletions(-) diff --git a/lib/Wrapper/AMDGPU/GCNOptSched.cpp b/lib/Wrapper/AMDGPU/GCNOptSched.cpp index 0a9434af..55067d32 100644 --- a/lib/Wrapper/AMDGPU/GCNOptSched.cpp +++ b/lib/Wrapper/AMDGPU/GCNOptSched.cpp @@ -11,7 +11,9 @@ #include "SIMachineFunctionInfo.h" //#include "llvm/CodeGen/OptSequential.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" #include +#include #define DEBUG_TYPE "optsched" @@ -56,9 +58,9 @@ unsigned ScheduleDAGOptSchedGCN::getMinOcc() { if (MinOcc <= 10 && MinOcc >= 1) return MinOcc; - Logger::Fatal( + llvm::report_fatal_error( "Unrecognized option for MIN_OCCUPANCY_FOR_RESCHEDULE setting: %d", - MinOcc); + std::to_string(MinOcc), false); } int ScheduleDAGOptSchedGCN::getMinILPImprovement() { @@ -67,9 +69,9 @@ int ScheduleDAGOptSchedGCN::getMinILPImprovement() { if (MinIlpImprovement <= 100 && MinIlpImprovement >= 0) return MinIlpImprovement; - Logger::Fatal( + llvm::report_fatal_error( "Unrecognized option for 
MIN_OCCUPANCY_FOR_RESCHEDULE setting: %d", - MinIlpImprovement); + std::to_string(MinIlpImprovement), false); } void ScheduleDAGOptSchedGCN::initSchedulers() { @@ -99,6 +101,7 @@ void ScheduleDAGOptSchedGCN::finalizeSchedule() { RescheduleRegions.resize(Regions.size()); ILPAnalysis.resize(Regions.size()); CostAnalysis.resize(Regions.size()); + LowerOccScheds.resize(Regions.size()); RescheduleRegions.set(); LLVM_DEBUG(dbgs() << "Starting two pass scheduling approach\n"); @@ -107,28 +110,24 @@ void ScheduleDAGOptSchedGCN::finalizeSchedule() { MachineBasicBlock *MBB = nullptr; // Reset RegionIdx = 0; + if (S == OptSchedLowerOccAnalysis) { - if (RescheduleRegions.none()) { - dbgs() << "No regions to reschedule.\n"; + if (RescheduleRegions.none()) break; - } else { + else { auto GCNOST = static_cast(OST.get()); unsigned TargetOccupancy = GCNOST->getTargetOcc(); - if (TargetOccupancy <= MinOcc) { - dbgs() << "Cannot lower occupancy to below minimum occupancy of " - << MinOcc << '\n'; + if (TargetOccupancy <= MinOcc) break; - } - dbgs() << "Beginning rescheduling of regions.\n"; unsigned NewTarget = TargetOccupancy - 1u; dbgs() << "Decreasing current target occupancy " << TargetOccupancy << " to new target " << NewTarget << '\n'; GCNOST->limitOccupancy(NewTarget); } - } else if (S == OptSchedCommitLowerOcc) { - dbgs() - << "Analyzing if we should commit the lower occupancy schedule\n"; + } + + if (S == OptSchedCommitLowerOcc) { if (!shouldCommitLowerOccSched()) { dbgs() << "Lower occupancy schedule did not meet minimum improvement.\n"; @@ -138,13 +137,6 @@ void ScheduleDAGOptSchedGCN::finalizeSchedule() { } for (auto &Region : Regions) { - /*if (S == OptSchedLowerOccAnalysis && !RescheduleRegions[RegionIdx]) { - dbgs() << "Region " << RegionIdx << " does not need to be - rescheduled.\n"; - ++RegionIdx; - continue; - }*/ - RegionBegin = Region.first; RegionEnd = Region.second; @@ -175,13 +167,6 @@ void ScheduleDAGOptSchedGCN::finalizeSchedule() { } 
ScheduleDAGMILive::finalizeSchedule(); - - LLVM_DEBUG(if (isSimRegAllocEnabled()) { - dbgs() << "*************************************\n"; - dbgs() << "Function: " << MF.getName() - << "\nTotal Simulated Spills: " << SimulatedSpills << "\n"; - dbgs() << "*************************************\n"; - }); } void ScheduleDAGOptSchedGCN::runSchedPass(SchedPassStrategy S) { @@ -245,13 +230,9 @@ bool ScheduleDAGOptSchedGCN::shouldCommitLowerOccSched() { int SecondPassLengthSum = 0; int MinILPImprovement = getMinILPImprovement(); for (std::pair &RegionLength : ILPAnalysis) { - dbgs() << "First length -- " << RegionLength.first << ", Second length -- " - << RegionLength.second << '\n'; FirstPassLengthSum += RegionLength.first; SecondPassLengthSum += RegionLength.second; } - dbgs() << "First pass length sum: " << FirstPassLengthSum << '\n'; - dbgs() << "Second pass length sum: " << SecondPassLengthSum << '\n'; double FirstPassAverageLength = (double)FirstPassLengthSum / Regions.size(); double SecondPassAverageLength = (double)SecondPassLengthSum / Regions.size(); double ILPImprovement = ((FirstPassAverageLength - SecondPassAverageLength) / diff --git a/lib/Wrapper/OptimizingScheduler.cpp b/lib/Wrapper/OptimizingScheduler.cpp index 3e9a03b7..a3ad4e1c 100644 --- a/lib/Wrapper/OptimizingScheduler.cpp +++ b/lib/Wrapper/OptimizingScheduler.cpp @@ -413,8 +413,8 @@ void ScheduleDAGOptSched::schedule() { TotalLoadsInstructionsClusterable, TotalStoreInstructionsClusterable); - // Get the DDG instance so that we can set and get information that will be - // read later on during enumeration. + // Get the DDG instance so that we can set and get information that will + // be read later on during enumeration. auto DataDepGraphInstance = static_cast(DDG.get()); // Store total instructions in all clusters in the DDG instance. 
DataDepGraphInstance->setTotalInstructionsInAllClusters( @@ -469,18 +469,25 @@ void ScheduleDAGOptSched::schedule() { // Setup time before scheduling Utilities::startTime = std::chrono::high_resolution_clock::now(); // Schedule region. - Rslt = region->FindOptimalSchedule(CurrentRegionTimeout, CurrentLengthTimeout, - IsEasy, NormBestCost, BestSchedLngth, - NormHurstcCost, HurstcSchedLngth, Sched, - FilterByPerp, blocksToKeep(schedIni)); - - if ((!(Rslt == RES_SUCCESS || Rslt == RES_TIMEOUT) || Sched == NULL)) { - LLVM_DEBUG( - Logger::Info("OptSched run failed: rslt=%d, sched=%p. Falling back.", - Rslt, (void *)Sched)); - // Scheduling with opt-sched failed. - // fallbackScheduler(); - return; + if (!IsFourthPass) { + Rslt = region->FindOptimalSchedule( + CurrentRegionTimeout, CurrentLengthTimeout, IsEasy, NormBestCost, + BestSchedLngth, NormHurstcCost, HurstcSchedLngth, Sched, FilterByPerp, + blocksToKeep(schedIni)); + + if ((!(Rslt == RES_SUCCESS || Rslt == RES_TIMEOUT) || Sched == NULL)) { + LLVM_DEBUG( + Logger::Info("OptSched run failed: rslt=%d, sched=%p. Falling back.", + Rslt, (void *)Sched)); + // Scheduling with opt-sched failed. + // fallbackScheduler(); + return; + } + } else { + dbgs() << "Processing DAG " << RegionName << '\n'; + dbgs() << "Restoring schedule from second ILP pass: \n"; + Sched = LowerOccScheds[RegionIdx]; + dbgs() << "Applying lower occupancy schedule\n"; } // BB Enumerator did not find a schedule. @@ -507,6 +514,7 @@ void ScheduleDAGOptSched::schedule() { ILPAnalysis[RegionIdx].first = BestSchedLngth; else if (IsThirdPass) { ILPAnalysis[RegionIdx].second = BestSchedLngth; + LowerOccScheds[RegionIdx] = Sched; return; } diff --git a/lib/Wrapper/OptimizingScheduler.h b/lib/Wrapper/OptimizingScheduler.h index 502a7cd2..72191801 100644 --- a/lib/Wrapper/OptimizingScheduler.h +++ b/lib/Wrapper/OptimizingScheduler.h @@ -37,7 +37,6 @@ class ScheduleDAGOptSched : public ScheduleDAGMILive { // Vector of scheduling passes to execute. 
SmallVector SchedPasses; - protected: // Vector of regions recorded for later rescheduling SmallVector< @@ -51,6 +50,8 @@ class ScheduleDAGOptSched : public ScheduleDAGMILive { SmallVector, 32> ILPAnalysis; /// TODO: Same as above for cost analysis. SmallVector, 32> CostAnalysis; + /// Store the lower occupancy schedules from the second ILP pass. + SmallVector LowerOccScheds; // Path to opt-sched config options directory. SmallString<128> PathCfg; From 527d08f60bc5ea561ffc8f8e60c82a598502c1f1 Mon Sep 17 00:00:00 2001 From: vang thao Date: Sun, 23 Aug 2020 19:15:17 -0700 Subject: [PATCH 40/40] Fix incorrect statement --- lib/Wrapper/AMDGPU/GCNOptSched.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/Wrapper/AMDGPU/GCNOptSched.cpp b/lib/Wrapper/AMDGPU/GCNOptSched.cpp index 55067d32..915f4e6b 100644 --- a/lib/Wrapper/AMDGPU/GCNOptSched.cpp +++ b/lib/Wrapper/AMDGPU/GCNOptSched.cpp @@ -59,7 +59,7 @@ unsigned ScheduleDAGOptSchedGCN::getMinOcc() { return MinOcc; llvm::report_fatal_error( - "Unrecognized option for MIN_OCCUPANCY_FOR_RESCHEDULE setting: %d", + "Unrecognized option for MIN_OCCUPANCY_FOR_RESCHEDULE setting: " + std::to_string(MinOcc), false); } @@ -70,7 +70,7 @@ int ScheduleDAGOptSchedGCN::getMinILPImprovement() { return MinIlpImprovement; llvm::report_fatal_error( - "Unrecognized option for MIN_OCCUPANCY_FOR_RESCHEDULE setting: %d", + "Unrecognized option for MIN_ILP_IMPROVEMENT setting: " + std::to_string(MinIlpImprovement), false); }