From 215e6f962d8d6cdec210aea3231474c20d4ac64f Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Fri, 6 Mar 2020 23:35:25 -0800 Subject: [PATCH 01/40] Added notes on how to possibly start. --- include/opt-sched/Scheduler/data_dep.h | 1 + .../opt-sched/Scheduler/sched_basic_data.h | 21 +++++++++++++++ lib/Scheduler/bb_spill.cpp | 8 ++++++ lib/Scheduler/data_dep.cpp | 6 +++-- lib/Scheduler/enumerator.cpp | 12 +++++++++ lib/Scheduler/sched_basic_data.cpp | 4 +++ lib/Wrapper/OptSchedDDGWrapperBasic.cpp | 26 +++++++++++++++++++ lib/Wrapper/OptimizingScheduler.cpp | 1 + 8 files changed, 77 insertions(+), 2 deletions(-) diff --git a/include/opt-sched/Scheduler/data_dep.h b/include/opt-sched/Scheduler/data_dep.h index 48dda038..d0885fd0 100644 --- a/include/opt-sched/Scheduler/data_dep.h +++ b/include/opt-sched/Scheduler/data_dep.h @@ -386,6 +386,7 @@ class DataDepGraph : public llvm::opt_sched::OptSchedDDGWrapperBase, SchedInstruction *CreateNode_(InstCount instNum, char const *const instName, InstType instType, char const *const opCode, + /* bool InstrMayLoad, bool InstrMayStore,*/ int nodeID, InstCount fileSchedOrder, InstCount fileSchedCycle, InstCount fileLB, InstCount fileUB, int blkNum); diff --git a/include/opt-sched/Scheduler/sched_basic_data.h b/include/opt-sched/Scheduler/sched_basic_data.h index 52306e82..6e3ed08b 100644 --- a/include/opt-sched/Scheduler/sched_basic_data.h +++ b/include/opt-sched/Scheduler/sched_basic_data.h @@ -425,6 +425,18 @@ class SchedInstruction : public GraphNode { InstType GetCrtclPathFrmRoot() { return crtclPathFrmRoot_; } + /// Return true if this instruction could possibly read memory + /// Copied from https://llvm.org/doxygen/classllvm_1_1MachineInstr.html + // bool mayLoad() { return MayLoad; } + + /// Return true if this instruction could possibly modify memory. 
+ /// Copied from https://llvm.org/doxygen/classllvm_1_1MachineInstr.html + // bool mayStore() { return MayStore; } + + /// Set MayCluster to true if clustering memory operations was found + /// to be possible. + // void setMayCluster () { MayCluster = true; } + friend class SchedRange; protected: @@ -432,6 +444,15 @@ class SchedInstruction : public GraphNode { string name_; // The mnemonic of this instruction, e.g. "add" or "jmp". string opCode_; + /// Indicate if this instruction may be a load operation + // bool MayLoad; + /// Indicate if this instruction may be a store operation + // bool MayStore; + /// Data structure to store a possible clustering with other isntructions. + /// This data structure should have a fast lookup operation. + // dataStructure PossibleClustures; + /// This value should be set to true if clustering may be possible. + // bool MayCluster; // A numberical ID for this instruction. int nodeID_; // The type of this instruction. diff --git a/lib/Scheduler/bb_spill.cpp b/lib/Scheduler/bb_spill.cpp index 736a38ad..815dc277 100644 --- a/lib/Scheduler/bb_spill.cpp +++ b/lib/Scheduler/bb_spill.cpp @@ -431,6 +431,14 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, int liveRegs; InstCount newSpillCost; + // Possibly keep track of the current memory clustering size here + // and in UpdateSpillInfoForUnSchdul_() + // if inst->mayCluster() then + // if current instruction is already part of a cluster then + // increment cluster size by 1 + // else if not in a cluster then + // start clustering by initializing cluster values + defCnt = inst->GetDefs(defs); useCnt = inst->GetUses(uses); diff --git a/lib/Scheduler/data_dep.cpp b/lib/Scheduler/data_dep.cpp index 5974f496..65d2f0b8 100644 --- a/lib/Scheduler/data_dep.cpp +++ b/lib/Scheduler/data_dep.cpp @@ -824,8 +824,10 @@ FUNC_RESULT DataDepGraph::SkipGraph(SpecsBuffer *buf, bool &endOfFileReached) { SchedInstruction *DataDepGraph::CreateNode_( InstCount instNum, char const *const 
instName, InstType instType, - char const *const opCode, int nodeID, InstCount fileSchedOrder, - InstCount fileSchedCycle, InstCount fileLB, InstCount fileUB, int blkNum) { + char const *const opCode, + /* bool InstrMayLoad, bool InstrMayStore,*/ int nodeID, + InstCount fileSchedOrder, InstCount fileSchedCycle, InstCount fileLB, + InstCount fileUB, int blkNum) { SchedInstruction *newInstPtr; newInstPtr = new SchedInstruction(instNum, instName, instType, opCode, diff --git a/lib/Scheduler/enumerator.cpp b/lib/Scheduler/enumerator.cpp index d9c4e3b1..e94f2170 100644 --- a/lib/Scheduler/enumerator.cpp +++ b/lib/Scheduler/enumerator.cpp @@ -982,6 +982,18 @@ bool Enumerator::FindNxtFsblBrnch_(EnumTreeNode *&newNode) { if (crntBrnchNum == 0 && SchedForRPOnly_) crntNode_->SetFoundInstWithUse(IsUseInRdyLst_()); + // Note: This is just a thought, we might not need this here. + // Check if clustering is possible. + // We want to only do memory clustering in the second pass for now. + // if (crntBrnchNum == 0 && EnableMemClustering && SecondPass) + // // TODO: Implement these functions/attributes + // // and implement cost. Also keep track of current + // // cluster size since we do not want to exceed 15 + // // memory operations in a cluster (This and the cost + // // is probably done somewhere else and not here). 
+ // ClusteringPossible = crntNode_->CheckForClustering(); + // crntNode_->SetClusteringPossible(ClusteringPossible); + for (i = crntBrnchNum; i < brnchCnt && crntNode_->IsFeasible(); i++) { #ifdef IS_DEBUG_FLOW Logger::Info("Probing branch %d out of %d", i, brnchCnt); diff --git a/lib/Scheduler/sched_basic_data.cpp b/lib/Scheduler/sched_basic_data.cpp index ef552365..bdec48cb 100644 --- a/lib/Scheduler/sched_basic_data.cpp +++ b/lib/Scheduler/sched_basic_data.cpp @@ -6,6 +6,7 @@ using namespace llvm::opt_sched; SchedInstruction::SchedInstruction(InstCount num, const string &name, InstType instType, const string &opCode, + /* bool InstrMayLoad, bool InstrMayStore,*/ InstCount maxInstCnt, int nodeID, InstCount fileSchedOrder, InstCount fileSchedCycle, InstCount fileLB, @@ -15,6 +16,9 @@ SchedInstruction::SchedInstruction(InstCount num, const string &name, name_ = name; opCode_ = opCode; instType_ = instType; + // MayLoad = InstrMayLoad; + // MayStore = InstrMayStore; + // MayCluster = false; frwrdLwrBound_ = INVALID_VALUE; bkwrdLwrBound_ = INVALID_VALUE; diff --git a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp index ba6985cf..b7d3444f 100644 --- a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp +++ b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp @@ -376,6 +376,8 @@ inline void OptSchedDDGWrapperBasic::setupRoot() { int RootNum = DAG->SUnits.size(); root_ = CreateNode_(RootNum, "artificial", MM->GetInstTypeByName("artificial"), "__optsched_entry", + // mayLoad = false; + // mayStore = false; RootNum, // nodeID RootNum, // fileSchedOrder RootNum, // fileSchedCycle @@ -394,6 +396,8 @@ inline void OptSchedDDGWrapperBasic::setupLeaf() { int LeafNum = DAG->SUnits.size() + 1; CreateNode_(LeafNum, "artificial", MM->GetInstTypeByName("artificial"), "__optsched_exit", + // mayLoad = false; + // mayStore = false; LeafNum, // nodeID LeafNum, // fileSchedOrder LeafNum, // fileSchedCycle @@ -467,6 +471,8 @@ void OptSchedDDGWrapperBasic::convertSUnit(const 
SUnit &SU) { } CreateNode_(SU.NodeNum, InstName.c_str(), InstType, InstName.c_str(), + // MI->mayLoad() + // MI->mayStore() SU.NodeNum, // nodeID SU.NodeNum, // fileSchedOrder SU.NodeNum, // fileSchedCycle @@ -500,6 +506,26 @@ void OptSchedDDGWrapperBasic::countBoundaryLiveness( } } + +/// Iterate through SUnits and find all possible clustering then transfer +/// the information over to the SchedInstruction class as a bitvector. +/// Partially copied from https://github.com/llvm/llvm-project/blob/master/llvm/lib/CodeGen/MachineScheduler.cpp#L1615 +// void findPossibleClusters() { +// Copy how LLVM handles clustering except instead of actually +// modifying the DAG, we can possibly set MayCluster to true. +// Then add the nodes that can be clustered together into a +// data structure. + +// for (auto &SU : DAG->SUnits) { +// if ((IsLoad && !SU.getInstr()->mayLoad()) || +// (!IsLoad && !SU.getInstr()->mayStore())) +// continue; +// ... +// ... +// } +// ... +// } + LLVMRegTypeFilter::LLVMRegTypeFilter( const MachineModel *MM, const llvm::TargetRegisterInfo *TRI, const std::vector &RegionPressure, float RegFilterFactor) diff --git a/lib/Wrapper/OptimizingScheduler.cpp b/lib/Wrapper/OptimizingScheduler.cpp index 5d0416c5..be70dfa2 100644 --- a/lib/Wrapper/OptimizingScheduler.cpp +++ b/lib/Wrapper/OptimizingScheduler.cpp @@ -380,6 +380,7 @@ void ScheduleDAGOptSched::schedule() { OST->createDDGWrapper(C, this, MM.get(), LatencyPrecision, RegionName); DDG->convertSUnits(); DDG->convertRegFiles(); + // DDG->findPossibleClusters(); auto *BDDG = static_cast(DDG.get()); addGraphTransformations(BDDG); From 449456b09f718c04107fd21b65d526646c24ff07 Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Mon, 9 Mar 2020 18:21:50 -0700 Subject: [PATCH 02/40] Idea on how to implement checking if an instruction is part of a cluster and potential issues. 
--- lib/Scheduler/bb_spill.cpp | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/lib/Scheduler/bb_spill.cpp b/lib/Scheduler/bb_spill.cpp index 815dc277..527ac0cd 100644 --- a/lib/Scheduler/bb_spill.cpp +++ b/lib/Scheduler/bb_spill.cpp @@ -434,10 +434,29 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, // Possibly keep track of the current memory clustering size here // and in UpdateSpillInfoForUnSchdul_() // if inst->mayCluster() then - // if current instruction is already part of a cluster then + + // // Can use bit operations to check if it is part of an active clustering + // // Possible implementation: if (curClusterBitVector[inst->GetNum]) + // if curInst is part of an active cluster then // increment cluster size by 1 // else if not in a cluster then // start clustering by initializing cluster values + // // Possibly use bit operations to activate part of cluster + // // Ex: + // // Instr 0, 3, 4 can be clustered and there are 5 total instructions + // // curClusterBitVector Bitvector: 11001 + // + // Potential Issues: + // 1. How to implement this when un-scheduling? Need to keep track if new instruction disable a cluster + // so that when we backtrack, we can re-activate the cluster. + // 2. Keeping track of the average clustering size when we aren't done scheduling. + // Cost function that was discussed during the meeting on Friday: + // (15 - averageClusteringSize) * ClusteringWeight + // We want to minimize this cost but there is an issue in the following example + // Ex: Partial schedule was able to cluster a block of 15. averageClusteringSize : 15, CostFnc: (15-15)*Weight = 0 + // Any cluster block below size 15 will decrease the average cluster size and increase the cost. + // This makes our B&B enumerator actually favor not doing clustering. 
+ defCnt = inst->GetDefs(defs); useCnt = inst->GetUses(uses); From a6376ab44178f8f04456a9f37c36007ca5d33843 Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Tue, 10 Mar 2020 19:33:36 -0700 Subject: [PATCH 03/40] Added LLVM's method to check if we should cluster MemOps --- .../Scheduler/OptSchedDDGWrapperBase.h | 2 + lib/Wrapper/AMDGPU/OptSchedDDGWrapperGCN.cpp | 2 +- lib/Wrapper/OptSchedDDGWrapperBasic.cpp | 100 +++++++++++++++--- lib/Wrapper/OptSchedDDGWrapperBasic.h | 46 ++++++++ lib/Wrapper/OptimizingScheduler.cpp | 3 +- 5 files changed, 139 insertions(+), 14 deletions(-) diff --git a/include/opt-sched/Scheduler/OptSchedDDGWrapperBase.h b/include/opt-sched/Scheduler/OptSchedDDGWrapperBase.h index 8eb1499d..4db4673c 100644 --- a/include/opt-sched/Scheduler/OptSchedDDGWrapperBase.h +++ b/include/opt-sched/Scheduler/OptSchedDDGWrapperBase.h @@ -17,6 +17,8 @@ class OptSchedDDGWrapperBase { virtual void convertSUnits() = 0; virtual void convertRegFiles() = 0; + + virtual void findPossibleClusters() = 0; }; } // namespace opt_sched diff --git a/lib/Wrapper/AMDGPU/OptSchedDDGWrapperGCN.cpp b/lib/Wrapper/AMDGPU/OptSchedDDGWrapperGCN.cpp index 57aa0713..0aaf5bc4 100644 --- a/lib/Wrapper/AMDGPU/OptSchedDDGWrapperGCN.cpp +++ b/lib/Wrapper/AMDGPU/OptSchedDDGWrapperGCN.cpp @@ -182,7 +182,7 @@ void OptSchedDDGWrapperGCN::convertRegFiles() { } LLVM_DEBUG(DAG->dumpLLVMRegisters()); - LLVM_DEBUG(dumpOptSchedRegisters()); + //LLVM_DEBUG(dumpOptSchedRegisters()); } void OptSchedDDGWrapperGCN::addSubRegDefs(SchedInstruction *Instr, unsigned Reg, diff --git a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp index b7d3444f..a26b254a 100644 --- a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp +++ b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp @@ -9,6 +9,8 @@ #include "opt-sched/Scheduler/logger.h" #include "opt-sched/Scheduler/register.h" #include "opt-sched/Scheduler/sched_basic_data.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" #include 
"llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" @@ -20,12 +22,14 @@ #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Function.h" #include "llvm/Target/TargetMachine.h" +#include #include #include #include #include #include #include +#include #include #define DEBUG_TYPE "optsched-ddg-wrapper" @@ -205,7 +209,7 @@ void OptSchedDDGWrapperBasic::addDefsAndUses() { } LLVM_DEBUG(DAG->dumpLLVMRegisters()); - LLVM_DEBUG(dumpOptSchedRegisters()); + //LLVM_DEBUG(dumpOptSchedRegisters()); } void OptSchedDDGWrapperBasic::addUse(unsigned RegUnit, InstCount Index) { @@ -506,25 +510,97 @@ void OptSchedDDGWrapperBasic::countBoundaryLiveness( } } +// Partially copied from +// https://github.com/RadeonOpenCompute/llvm/blob/roc-ocl-2.4.0/lib/CodeGen/MachineScheduler.cpp#L1554 +void OptSchedDDGWrapperBasic::clusterNeighboringMemOps_( + ArrayRef MemOps) { + SmallVector MemOpRecords; + dbgs() << "Processing possible clusters\n"; + for (const SUnit *SU : MemOps) { + dbgs() << " " << SU->NodeNum << " is in the chain.\n"; + MachineOperand *BaseOp; + int64_t Offset; + if (DAG->TII->getMemOperandWithOffset(*SU->getInstr(), BaseOp, Offset, DAG->TRI)) + MemOpRecords.push_back(MemOpInfo(SU, BaseOp, Offset)); + } + + if (MemOpRecords.size() < 2) { + dbgs() << " Unable to cluster memop cluster of 1.\n"; + return; + } + + llvm::sort(MemOpRecords); + unsigned ClusterLength = 1; + for (unsigned Idx = 0, End = MemOpRecords.size(); Idx < (End - 1); ++Idx) { + const SUnit *SUa = MemOpRecords[Idx].SU; + const SUnit *SUb = MemOpRecords[Idx + 1].SU; + dbgs() << " Checking possible clustering of (" << SUa->NodeNum << ") and (" << SUb->NodeNum << ")\n"; + if (DAG->TII->shouldClusterMemOps(*MemOpRecords[Idx].BaseOp, + *MemOpRecords[Idx + 1].BaseOp, + ClusterLength)) { + dbgs() << " Cluster possible at SU(" << SUa->NodeNum << ")- SU(" << SUb->NodeNum << ")\n"; + ++ClusterLength; + } else + ClusterLength = 1; + } +} /// Iterate through 
SUnits and find all possible clustering then transfer /// the information over to the SchedInstruction class as a bitvector. -/// Partially copied from https://github.com/llvm/llvm-project/blob/master/llvm/lib/CodeGen/MachineScheduler.cpp#L1615 -// void findPossibleClusters() { +/// Partially copied from https://github.com/RadeonOpenCompute/llvm/blob/roc-ocl-2.4.0/lib/CodeGen/MachineScheduler.cpp#L1595 +void OptSchedDDGWrapperBasic::findPossibleClusters() { // Copy how LLVM handles clustering except instead of actually // modifying the DAG, we can possibly set MayCluster to true. // Then add the nodes that can be clustered together into a // data structure. -// for (auto &SU : DAG->SUnits) { -// if ((IsLoad && !SU.getInstr()->mayLoad()) || -// (!IsLoad && !SU.getInstr()->mayStore())) -// continue; -// ... -// ... -// } -// ... -// } + // Experiment with clustering loads first + bool IsLoad = true; + + dbgs() << "Looking for load clusters\n"; + DenseMap StoreChainIDs; + // Map each store chain to a set of dependent MemOps. + SmallVector, 32> StoreChainDependents; + for (const SUnit &SU : DAG->SUnits) { + if ((IsLoad && !SU.getInstr()->mayLoad()) || + (!IsLoad && !SU.getInstr()->mayStore())) + continue; + auto MI = SU.getInstr(); + dbgs() << " Instruction (" << SU.NodeNum << ") " << DAG->TII->getName(MI->getOpcode()) << " may load.\n"; + + unsigned ChainPredID = DAG->SUnits.size(); + for (const SDep &Pred : SU.Preds) { + if (Pred.isCtrl()) { + auto PredMI = Pred.getSUnit()->getInstr(); + dbgs() << " Breaking chain at (" << Pred.getSUnit()->NodeNum << ") " << DAG->TII->getName(PredMI->getOpcode()) << '\n'; + ChainPredID = Pred.getSUnit()->NodeNum; + break; + } + } + // Check if this chain-like pred has been seen + // before. ChainPredID==MaxNodeID at the top of the schedule. 
+ unsigned NumChains = StoreChainDependents.size(); + dbgs() << " ChainPredID " << ChainPredID << ", NumChains " << NumChains << '\n'; + std::pair::iterator, bool> Result = + StoreChainIDs.insert(std::make_pair(ChainPredID, NumChains)); + if (Result.second) + StoreChainDependents.resize(NumChains + 1); + dbgs() << " Pushing (" << SU.NodeNum << ") on the chain.\n"; + StoreChainDependents[Result.first->second].push_back(&SU); + dbgs() << " inPrinting size of SCD: " << StoreChainDependents.size() << '\n'; + } + + + dbgs() << " outPrinting size of SCD: " << StoreChainDependents.size() << '\n'; + // Iterate over the store chains. + for (auto &SCD : StoreChainDependents) { + dbgs() << " Printing the list before clustering: "; + for (auto SU1 : SCD) + dbgs() << SU1->NodeNum << " "; + dbgs() << '\n'; + clusterNeighboringMemOps_(SCD); + } +} LLVMRegTypeFilter::LLVMRegTypeFilter( const MachineModel *MM, const llvm::TargetRegisterInfo *TRI, diff --git a/lib/Wrapper/OptSchedDDGWrapperBasic.h b/lib/Wrapper/OptSchedDDGWrapperBasic.h index 88631511..9970fab9 100644 --- a/lib/Wrapper/OptSchedDDGWrapperBasic.h +++ b/lib/Wrapper/OptSchedDDGWrapperBasic.h @@ -14,6 +14,7 @@ #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include #include #include @@ -49,6 +50,7 @@ class OptSchedDDGWrapperBasic : public DataDepGraph { void convertSUnits() override; void convertRegFiles() override; + void findPossibleClusters() override; protected: // A convenience machMdl_ pointer casted to OptSchedMachineModel*. @@ -133,6 +135,9 @@ class OptSchedDDGWrapperBasic : public DataDepGraph { // Find liveness info generated by the region boundary. void discoverBoundaryLiveness(const llvm::MachineInstr *MI); + void clusterNeighboringMemOps_( + ArrayRef MemOps); + // Holds a register live range, mapping a producer to a set of consumers. 
struct LiveRange { // The node which defines the register tracked by this live range. @@ -140,6 +145,47 @@ class OptSchedDDGWrapperBasic : public DataDepGraph { // The nodes which use the register tracked by this live range. std::vector consumers; }; + +// Copied from +// https://github.com/RadeonOpenCompute/llvm/blob/roc-ocl-2.4.0/lib/CodeGen/MachineScheduler.cpp#L1467 + struct MemOpInfo { + const SUnit *SU; + MachineOperand *BaseOp; + int64_t Offset; + + MemOpInfo(const SUnit *su, MachineOperand *Op, int64_t ofs) + : SU(su), BaseOp(Op), Offset(ofs) {} + + bool operator<(const MemOpInfo &RHS) const { + if (BaseOp->getType() != RHS.BaseOp->getType()) + return BaseOp->getType() < RHS.BaseOp->getType(); + + if (BaseOp->isReg()) + return std::make_tuple(BaseOp->getReg(), Offset, SU->NodeNum) < + std::make_tuple(RHS.BaseOp->getReg(), RHS.Offset, + RHS.SU->NodeNum); + if (BaseOp->isFI()) { + const MachineFunction &MF = + *BaseOp->getParent()->getParent()->getParent(); + const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering(); + bool StackGrowsDown = TFI.getStackGrowthDirection() == + TargetFrameLowering::StackGrowsDown; + // Can't use tuple comparison here since we might need to use a + // different order when the stack grows down. + if (BaseOp->getIndex() != RHS.BaseOp->getIndex()) + return StackGrowsDown ? BaseOp->getIndex() > RHS.BaseOp->getIndex() + : BaseOp->getIndex() < RHS.BaseOp->getIndex(); + + if (Offset != RHS.Offset) + return StackGrowsDown ? Offset > RHS.Offset : Offset < RHS.Offset; + + return SU->NodeNum < RHS.SU->NodeNum; + } + + llvm_unreachable("MemOpClusterMutation only supports register or frame " + "index bases."); + } + }; }; // Exclude certain registers from being visible to the scheduler. 
Use LLVM's diff --git a/lib/Wrapper/OptimizingScheduler.cpp b/lib/Wrapper/OptimizingScheduler.cpp index be70dfa2..c9877c62 100644 --- a/lib/Wrapper/OptimizingScheduler.cpp +++ b/lib/Wrapper/OptimizingScheduler.cpp @@ -380,7 +380,8 @@ void ScheduleDAGOptSched::schedule() { OST->createDDGWrapper(C, this, MM.get(), LatencyPrecision, RegionName); DDG->convertSUnits(); DDG->convertRegFiles(); - // DDG->findPossibleClusters(); + dbgs() << "Printing possible clusters\n"; + DDG->findPossibleClusters(); auto *BDDG = static_cast(DDG.get()); addGraphTransformations(BDDG); From 186a1f3c3b041da856bbaba4ad067f9e4a920f3d Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Thu, 12 Mar 2020 19:13:13 -0700 Subject: [PATCH 04/40] Idea for implementation (WIP) --- include/opt-sched/Scheduler/bb_spill.h | 33 +++++ .../opt-sched/Scheduler/sched_basic_data.h | 10 +- lib/Scheduler/bb_spill.cpp | 123 ++++++++++++++---- lib/Scheduler/sched_basic_data.cpp | 11 +- lib/Wrapper/OptSchedDDGWrapperBasic.cpp | 19 ++- 5 files changed, 167 insertions(+), 29 deletions(-) diff --git a/include/opt-sched/Scheduler/bb_spill.h b/include/opt-sched/Scheduler/bb_spill.h index 27e3cbed..d857189e 100644 --- a/include/opt-sched/Scheduler/bb_spill.h +++ b/include/opt-sched/Scheduler/bb_spill.h @@ -9,11 +9,13 @@ Last Update: Apr. 
2011 #ifndef OPTSCHED_SPILL_BB_SPILL_H #define OPTSCHED_SPILL_BB_SPILL_H +#include "opt-sched/Scheduler/bit_vector.h" #include "opt-sched/Scheduler/OptSchedTarget.h" #include "opt-sched/Scheduler/defines.h" #include "opt-sched/Scheduler/sched_region.h" #include "llvm/ADT/SmallVector.h" #include +#include #include #include @@ -33,6 +35,37 @@ class BBWithSpill : public SchedRegion { InstCount crntSpillCost_; InstCount optmlSpillCost_; + /// May not need this variable + bool CurrentlyClustering; + + /// Current cluster size + unsigned int CurrentClusterSize; + + /// Bitvector containing active bits for instructions that can be clustered + /// together + std::shared_ptr CurrentClusterVector; + + /// Experimental variables and values for cost adjustment + int ClusteringWeight; + int ClusterInitialCost; + + // Data struct to contain information about the previous clusters + struct PastClusters { + std::shared_ptr ClusterVector; + int ClusterSize; + int InstNum; // Instruction number that ended this cluster + + // Constructor + PastClusters(std::shared_ptr Cluster, int size, int num) + : ClusterVector(Cluster), ClusterSize(size), InstNum(num) {} + }; + + /// Vector containing the (n-1) past clusters + llvm::SmallVector> PastClustersList; + + /// Pointer to the latest past cluster + std::unique_ptr LastCluster; + // The target machine const OptSchedTarget *OST; diff --git a/include/opt-sched/Scheduler/sched_basic_data.h b/include/opt-sched/Scheduler/sched_basic_data.h index 6e3ed08b..01ff8882 100644 --- a/include/opt-sched/Scheduler/sched_basic_data.h +++ b/include/opt-sched/Scheduler/sched_basic_data.h @@ -16,6 +16,7 @@ Last Update: Sept. 2013 #include "opt-sched/Scheduler/hash_table.h" #include "opt-sched/Scheduler/machine_model.h" #include +#include namespace llvm { namespace opt_sched { @@ -435,8 +436,9 @@ class SchedInstruction : public GraphNode { /// Set MayCluster to true if clustering memory operations was found /// to be possible. 
- // void setMayCluster () { MayCluster = true; } - + void SetMayCluster(std::shared_ptr PossibleClustersVector); + bool GetMayCluster() { return MayCluster; } + auto GetClusterVector(); friend class SchedRange; protected: @@ -450,9 +452,9 @@ class SchedInstruction : public GraphNode { // bool MayStore; /// Data structure to store a possible clustering with other isntructions. /// This data structure should have a fast lookup operation. - // dataStructure PossibleClustures; + std::shared_ptr PossibleClusturesBitVector; /// This value should be set to true if clustering may be possible. - // bool MayCluster; + bool MayCluster; // A numberical ID for this instruction. int nodeID_; // The type of this instruction. diff --git a/lib/Scheduler/bb_spill.cpp b/lib/Scheduler/bb_spill.cpp index 527ac0cd..8986ed4b 100644 --- a/lib/Scheduler/bb_spill.cpp +++ b/lib/Scheduler/bb_spill.cpp @@ -69,6 +69,13 @@ BBWithSpill::BBWithSpill(const OptSchedTarget *OST_, DataDepGraph *dataDepGraph, schduldEntryInstCnt_ = 0; schduldExitInstCnt_ = 0; schduldInstCnt_ = 0; + + CurrentClusterSize = 0; + CurrentClusterVector = nullptr; + ClusteringWeight = 10000; + ClusterInitialCost = 10000000; + PastClustersList.clear(); + LastCluster = nullptr; } /****************************************************************************/ @@ -431,33 +438,63 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, int liveRegs; InstCount newSpillCost; + // Scheduling cases for clustering project: + // 1.) Cluster -> Cluster + // Simple case, just increment 1 from cluster size + // 2.) Cluster -> Non-Cluster + // ?? End clustering + // 3.) 
Non-Cluster -> Cluster + // Simple case, initialize clustering + // Possibly keep track of the current memory clustering size here // and in UpdateSpillInfoForUnSchdul_() - // if inst->mayCluster() then - - // // Can use bit operations to check if it is part of an active clustering - // // Possible implementation: if (curClusterBitVector[inst->GetNum]) - // if curInst is part of an active cluster then - // increment cluster size by 1 - // else if not in a cluster then - // start clustering by initializing cluster values - // // Possibly use bit operations to activate part of cluster - // // Ex: - // // Instr 0, 3, 4 can be clustered and there are 5 total instructions - // // curClusterBitVector Bitvector: 11001 - // - // Potential Issues: - // 1. How to implement this when un-scheduling? Need to keep track if new instruction disable a cluster - // so that when we backtrack, we can re-activate the cluster. - // 2. Keeping track of the average clustering size when we aren't done scheduling. + if (isSecondPass) { + if (inst->GetMayCluster()) { + if (CurrentClusterSize > 0 && CurrentClusterVector->GetBit(inst->GetNum()) { + // Case 1: Currently clustering and this current instruction is part of + // the cluster + CurrentClusterSize++; + if (CurrentClusterSize > 1) + // Only decrement the cost if we cluster at least 2 operations + // together (EXPERIMENTAL FOR NOW) + ClusterInitialCost -= ClusteringWeight; + } else { + // Case 3: Not currently clustering. Initialize clustering + // Sidenote: What if we go from current cluster to a different cluster? + CurrentClusterVector.reset(); // Clear cluster vector + CurrentClusterVector = inst->GetClusterVector(); // Set active cluster + CurrentClusterSize = 1; // Current size is 1 + } + } else if (CurrentClusterSize > 1) { + // Case 2: Exiting out of an active cluster + // Save the cluster to restore when backtracking. 
+ if (LastCluster) { + // List of previous clusters + PastClustersList.push_back(std::move(LastCluster)); + + // Current previous cluster + LastCluster = llvm::make_unique( + CurrentClusterVector, CurrentClusterSize, inst->GetNum()); + } else + LastCluster = llvm::make_unique( + CurrentClusterVector, CurrentClusterSize, inst->GetNum()); + CurrentClusterVector.reset(); // Reset active cluster + CurrentClusterSize = 0; // Set cluster size to 0 + } + } + // Potential Issues: + // 1. Keeping track of the average clustering size when we aren't done + // scheduling. // Cost function that was discussed during the meeting on Friday: // (15 - averageClusteringSize) * ClusteringWeight - // We want to minimize this cost but there is an issue in the following example - // Ex: Partial schedule was able to cluster a block of 15. averageClusteringSize : 15, CostFnc: (15-15)*Weight = 0 - // Any cluster block below size 15 will decrease the average cluster size and increase the cost. - // This makes our B&B enumerator actually favor not doing clustering. + // We want to minimize this cost but there is an issue in the following + // example + // Ex: Partial schedule was able to cluster a block of 15. + // averageClusteringSize : 15, CostFnc: (15-15)*Weight = 0 + // Any cluster block below size 15 will decrease the average + // cluster size and increase the cost. This makes our B&B + // enumerator actually favor not doing clustering. - defCnt = inst->GetDefs(defs); useCnt = inst->GetUses(uses); @@ -649,6 +686,48 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst) { inst->GetNum()); #endif + // Backtracking cases for clustering project: + // 1.) Cluster <- Cluster + // Simple case, just decrement 1 from cluster size + // 2.) Cluster <- Non-Cluster + // Have to restore state of Cluster and ?? + // Can/should we use a stack to restore state? + // 3.) 
Non-Cluster <- Cluster + // Simple case, just decrement 1 from cluster size + // If cluster size == 0, delete CurrentClusterVector + if (isSecondPass) { + if (inst->GetMayCluster()) { + // Case 1 + if (CurrentClusterSize > 0 && CurrentClusterVector->GetBit(inst->GetNum()) { + // Currently clustering and this current instruction is part of the + // cluter + if (CurrentClusterSize > 1) + ClusterInitialCost += ClusteringWeight; // Re-add the cost + CurrentClusterSize--; + } else { + // Case 3 + CurrentClusterSize--; + if (CurrentClusterSize == 0) + CurrentClusterVector.reset(); + } + } else if (LastCluster) { + if (LastCluster->InstNum == inst->GetNum()) { + // Case 2: If there was a previous cluster and + // this instruction ended the cluster then restore the previous + // cluster's state + CurrentClusterSize = LastCluster->ClusterSize; + CurrentClusterVector = LastCluster->ClusterVector; + LastCluster.reset(); // Release current cluster pointer + + // Get previous cluster from vector list + if (!PastClustersList.empty()) { + LastCluster = std::move(PastClustersList.back()); + PastClustersList.pop_back(); + } + } + } + + defCnt = inst->GetDefs(defs); useCnt = inst->GetUses(uses); diff --git a/lib/Scheduler/sched_basic_data.cpp b/lib/Scheduler/sched_basic_data.cpp index bdec48cb..2ff63c33 100644 --- a/lib/Scheduler/sched_basic_data.cpp +++ b/lib/Scheduler/sched_basic_data.cpp @@ -18,7 +18,7 @@ SchedInstruction::SchedInstruction(InstCount num, const string &name, instType_ = instType; // MayLoad = InstrMayLoad; // MayStore = InstrMayStore; - // MayCluster = false; + MayCluster = false; frwrdLwrBound_ = INVALID_VALUE; bkwrdLwrBound_ = INVALID_VALUE; @@ -742,6 +742,15 @@ int16_t SchedInstruction::CmputLastUseCnt() { return lastUseCnt_; } +void SchedInstruction::SetMayCluster(std::shared_ptr PossibleClustersVector) { + if (PossibleClustersVector->GetOneCnt > 0) { + PossibleClusturesBitVector = PossibleClustersVector; + MayCluster = true; + } +} + +auto 
SchedInstruction::GetClusterVector() { return PossibleClusturesBitVector; } + /****************************************************************************** * SchedRange * ******************************************************************************/ diff --git a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp index a26b254a..cd4f394f 100644 --- a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp +++ b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp @@ -5,6 +5,7 @@ //===----------------------------------------------------------------------===// #include "OptSchedDDGWrapperBasic.h" +#include "opt-sched/Scheduler/bit_vector.h" #include "opt-sched/Scheduler/config.h" #include "opt-sched/Scheduler/logger.h" #include "opt-sched/Scheduler/register.h" @@ -22,7 +23,6 @@ #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Function.h" #include "llvm/Target/TargetMachine.h" -#include #include #include #include @@ -516,6 +516,7 @@ void OptSchedDDGWrapperBasic::clusterNeighboringMemOps_( ArrayRef MemOps) { SmallVector MemOpRecords; dbgs() << "Processing possible clusters\n"; + for (const SUnit *SU : MemOps) { dbgs() << " " << SU->NodeNum << " is in the chain.\n"; MachineOperand *BaseOp; @@ -529,6 +530,8 @@ void OptSchedDDGWrapperBasic::clusterNeighboringMemOps_( return; } + auto ClusterVector = llvm::make_unique(DAG->SUnits.size()); + llvm::sort(MemOpRecords); unsigned ClusterLength = 1; for (unsigned Idx = 0, End = MemOpRecords.size(); Idx < (End - 1); ++Idx) { @@ -538,11 +541,23 @@ void OptSchedDDGWrapperBasic::clusterNeighboringMemOps_( if (DAG->TII->shouldClusterMemOps(*MemOpRecords[Idx].BaseOp, *MemOpRecords[Idx + 1].BaseOp, ClusterLength)) { - dbgs() << " Cluster possible at SU(" << SUa->NodeNum << ")- SU(" << SUb->NodeNum << ")\n"; + dbgs() << " Cluster possible at SU(" << SUa->NodeNum << ")- SU(" << SUb->NodeNum << ")\n"; ++ClusterLength; + ClusterVector->SetBit(SUa->NodeNum); + ClusterVector->SetBit(SUb->NodeNum); } else ClusterLength = 1; } 
+ dbgs () << "Printing bit vector: "; + for (int i = ClusterVector->GetSize() - 1; i >= 0; i--) { + if (ClusterVector->GetBit(i)) + dbgs() << "1"; + else + dbgs() << "0"; + } + dbgs() << '\n'; + insts_[SUa->NodeNum]->SetMayCluster(ClusterVector); + insts_[SUb->NodeNum]->SetMayCluster(ClusterVector); } /// Iterate through SUnits and find all possible clustering then transfer From c4b097344aeffdc818f7a465deb61c7798bb0bc6 Mon Sep 17 00:00:00 2001 From: vang Date: Thu, 12 Mar 2020 20:12:45 -0700 Subject: [PATCH 05/40] Fixed some compilation issues --- include/opt-sched/Scheduler/bb_spill.h | 2 +- include/opt-sched/Scheduler/sched_basic_data.h | 2 +- lib/Scheduler/bb_spill.cpp | 6 +++--- lib/Scheduler/sched_basic_data.cpp | 4 ++-- lib/Wrapper/OptSchedDDGWrapperBasic.cpp | 4 ++-- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/include/opt-sched/Scheduler/bb_spill.h b/include/opt-sched/Scheduler/bb_spill.h index d857189e..ee2325da 100644 --- a/include/opt-sched/Scheduler/bb_spill.h +++ b/include/opt-sched/Scheduler/bb_spill.h @@ -61,7 +61,7 @@ class BBWithSpill : public SchedRegion { }; /// Vector containing the (n-1) past clusters - llvm::SmallVector> PastClustersList; + llvm::SmallVector, 0> PastClustersList; /// Pointer to the latest past cluster std::unique_ptr LastCluster; diff --git a/include/opt-sched/Scheduler/sched_basic_data.h b/include/opt-sched/Scheduler/sched_basic_data.h index 01ff8882..2eededb2 100644 --- a/include/opt-sched/Scheduler/sched_basic_data.h +++ b/include/opt-sched/Scheduler/sched_basic_data.h @@ -438,7 +438,7 @@ class SchedInstruction : public GraphNode { /// to be possible. 
void SetMayCluster(std::shared_ptr PossibleClustersVector); bool GetMayCluster() { return MayCluster; } - auto GetClusterVector(); + std::shared_ptr GetClusterVector(); friend class SchedRange; protected: diff --git a/lib/Scheduler/bb_spill.cpp b/lib/Scheduler/bb_spill.cpp index 8986ed4b..ccaad4dc 100644 --- a/lib/Scheduler/bb_spill.cpp +++ b/lib/Scheduler/bb_spill.cpp @@ -450,7 +450,7 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, // and in UpdateSpillInfoForUnSchdul_() if (isSecondPass) { if (inst->GetMayCluster()) { - if (CurrentClusterSize > 0 && CurrentClusterVector->GetBit(inst->GetNum()) { + if (CurrentClusterSize > 0 && CurrentClusterVector->GetBit(inst->GetNum())) { // Case 1: Currently clustering and this current instruction is part of // the cluster CurrentClusterSize++; @@ -698,7 +698,7 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst) { if (isSecondPass) { if (inst->GetMayCluster()) { // Case 1 - if (CurrentClusterSize > 0 && CurrentClusterVector->GetBit(inst->GetNum()) { + if (CurrentClusterSize > 0 && CurrentClusterVector->GetBit(inst->GetNum())) { // Currently clustering and this current instruction is part of the // cluter if (CurrentClusterSize > 1) @@ -725,7 +725,7 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst) { PastClustersList.pop_back(); } } - } + }} defCnt = inst->GetDefs(defs); diff --git a/lib/Scheduler/sched_basic_data.cpp b/lib/Scheduler/sched_basic_data.cpp index 2ff63c33..f74b605f 100644 --- a/lib/Scheduler/sched_basic_data.cpp +++ b/lib/Scheduler/sched_basic_data.cpp @@ -743,13 +743,13 @@ int16_t SchedInstruction::CmputLastUseCnt() { } void SchedInstruction::SetMayCluster(std::shared_ptr PossibleClustersVector) { - if (PossibleClustersVector->GetOneCnt > 0) { + if (PossibleClustersVector->GetOneCnt() > 0) { PossibleClusturesBitVector = PossibleClustersVector; MayCluster = true; } } -auto SchedInstruction::GetClusterVector() { return PossibleClusturesBitVector; } 
+std::shared_ptr SchedInstruction::GetClusterVector() { return PossibleClusturesBitVector; } /****************************************************************************** * SchedRange * diff --git a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp index cd4f394f..c2e38723 100644 --- a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp +++ b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp @@ -545,6 +545,8 @@ void OptSchedDDGWrapperBasic::clusterNeighboringMemOps_( ++ClusterLength; ClusterVector->SetBit(SUa->NodeNum); ClusterVector->SetBit(SUb->NodeNum); + insts_[SUa->NodeNum]->SetMayCluster(ClusterVector); + insts_[SUb->NodeNum]->SetMayCluster(ClusterVector); } else ClusterLength = 1; } @@ -556,8 +558,6 @@ void OptSchedDDGWrapperBasic::clusterNeighboringMemOps_( dbgs() << "0"; } dbgs() << '\n'; - insts_[SUa->NodeNum]->SetMayCluster(ClusterVector); - insts_[SUb->NodeNum]->SetMayCluster(ClusterVector); } /// Iterate through SUnits and find all possible clustering then transfer From 75b02f4eb9f00d768efccea3af6f5d1944370701 Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Fri, 13 Mar 2020 08:42:28 -0700 Subject: [PATCH 06/40] Fixed some compiler bugs, and added experimental cost. 
--- include/opt-sched/Scheduler/bb_spill.h | 3 --- lib/Scheduler/bb_spill.cpp | 15 +++++++++++---- lib/Scheduler/enumerator.cpp | 2 +- lib/Scheduler/sched_region.cpp | 8 +++++++- lib/Wrapper/OptSchedDDGWrapperBasic.cpp | 2 +- 5 files changed, 20 insertions(+), 10 deletions(-) diff --git a/include/opt-sched/Scheduler/bb_spill.h b/include/opt-sched/Scheduler/bb_spill.h index ee2325da..84f6282e 100644 --- a/include/opt-sched/Scheduler/bb_spill.h +++ b/include/opt-sched/Scheduler/bb_spill.h @@ -35,9 +35,6 @@ class BBWithSpill : public SchedRegion { InstCount crntSpillCost_; InstCount optmlSpillCost_; - /// May not need this variable - bool CurrentlyClustering; - /// Current cluster size unsigned int CurrentClusterSize; diff --git a/lib/Scheduler/bb_spill.cpp b/lib/Scheduler/bb_spill.cpp index ccaad4dc..5f9696cb 100644 --- a/lib/Scheduler/bb_spill.cpp +++ b/lib/Scheduler/bb_spill.cpp @@ -72,8 +72,8 @@ BBWithSpill::BBWithSpill(const OptSchedTarget *OST_, DataDepGraph *dataDepGraph, CurrentClusterSize = 0; CurrentClusterVector = nullptr; - ClusteringWeight = 10000; - ClusterInitialCost = 10000000; + ClusteringWeight = 1000; + ClusterInitialCost = 1000000; PastClustersList.clear(); LastCluster = nullptr; } @@ -376,6 +376,9 @@ InstCount BBWithSpill::CmputNormCost_(InstSchedule *sched, cost -= costLwrBound_; execCost -= costLwrBound_; + if (isSecondPass) + cost += ClusterInitialCost; + sched->SetCost(cost); sched->SetExecCost(execCost); return cost; @@ -454,10 +457,12 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, // Case 1: Currently clustering and this current instruction is part of // the cluster CurrentClusterSize++; - if (CurrentClusterSize > 1) + if (CurrentClusterSize > 2) { // Only decrement the cost if we cluster at least 2 operations // together (EXPERIMENTAL FOR NOW) ClusterInitialCost -= ClusteringWeight; + Logger::Info("More than 2 instructions clustered together!"); + } } else { // Case 3: Not currently clustering. 
Initialize clustering // Sidenote: What if we go from current cluster to a different cluster? @@ -701,8 +706,10 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst) { if (CurrentClusterSize > 0 && CurrentClusterVector->GetBit(inst->GetNum())) { // Currently clustering and this current instruction is part of the // cluter - if (CurrentClusterSize > 1) + if (CurrentClusterSize > 2) { ClusterInitialCost += ClusteringWeight; // Re-add the cost + Logger::Info("More than 2 instructions clustered together. Undoing!!"); + } CurrentClusterSize--; } else { // Case 3 diff --git a/lib/Scheduler/enumerator.cpp b/lib/Scheduler/enumerator.cpp index e94f2170..89d157b9 100644 --- a/lib/Scheduler/enumerator.cpp +++ b/lib/Scheduler/enumerator.cpp @@ -974,7 +974,7 @@ bool Enumerator::FindNxtFsblBrnch_(EnumTreeNode *&newNode) { #ifdef IS_DEBUG_READY_LIST Logger::Info("Ready List Size is %d", rdyInstCnt); // Warning! That will reset the instruction iterator! - // rdyLst_->Print(Logger::GetLogStream()); + rdyLst_->Print(Logger::GetLogStream()); stats::maxReadyListSize.SetMax(rdyInstCnt); #endif diff --git a/lib/Scheduler/sched_region.cpp b/lib/Scheduler/sched_region.cpp index 4353dc18..c0c8f0e4 100644 --- a/lib/Scheduler/sched_region.cpp +++ b/lib/Scheduler/sched_region.cpp @@ -600,6 +600,12 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( Logger::Info("DAG %s PEAK %d", dataDepGraph_->GetDagID(), maxSpillCost); } #endif + + if (isSecondPass) { + Logger::Info("Printing final schedule."); + bestSched->Print(Logger::GetLogStream(), "Best Sched"); + } + return rslt; } @@ -641,7 +647,7 @@ FUNC_RESULT SchedRegion::Optimize_(Milliseconds startTime, } stats::unsolvedProblemSize.Record(dataDepGraph_->GetInstCnt()); } - + return rslt; } diff --git a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp index c2e38723..cd0fd07b 100644 --- a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp +++ b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp @@ -530,7 +530,7 @@ void 
OptSchedDDGWrapperBasic::clusterNeighboringMemOps_( return; } - auto ClusterVector = llvm::make_unique(DAG->SUnits.size()); + auto ClusterVector = std::make_shared(DAG->SUnits.size()); llvm::sort(MemOpRecords); unsigned ClusterLength = 1; From 00501ae26353b8d44322f5be4170987482f489f5 Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Fri, 13 Mar 2020 09:28:20 -0700 Subject: [PATCH 07/40] Cleaned up debug statements. NFC --- lib/Scheduler/enumerator.cpp | 2 +- lib/Scheduler/sched_region.cpp | 5 ++-- lib/Wrapper/OptSchedDDGWrapperBasic.cpp | 40 ++++++++++++------------- lib/Wrapper/OptimizingScheduler.cpp | 1 - 4 files changed, 24 insertions(+), 24 deletions(-) diff --git a/lib/Scheduler/enumerator.cpp b/lib/Scheduler/enumerator.cpp index 89d157b9..e94f2170 100644 --- a/lib/Scheduler/enumerator.cpp +++ b/lib/Scheduler/enumerator.cpp @@ -974,7 +974,7 @@ bool Enumerator::FindNxtFsblBrnch_(EnumTreeNode *&newNode) { #ifdef IS_DEBUG_READY_LIST Logger::Info("Ready List Size is %d", rdyInstCnt); // Warning! That will reset the instruction iterator! 
- rdyLst_->Print(Logger::GetLogStream()); + // rdyLst_->Print(Logger::GetLogStream()); stats::maxReadyListSize.SetMax(rdyInstCnt); #endif diff --git a/lib/Scheduler/sched_region.cpp b/lib/Scheduler/sched_region.cpp index c0c8f0e4..5ff6ae94 100644 --- a/lib/Scheduler/sched_region.cpp +++ b/lib/Scheduler/sched_region.cpp @@ -601,11 +601,13 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( } #endif +#ifdef IS_DEBUG_MEMORY_CLUSTERING if (isSecondPass) { Logger::Info("Printing final schedule."); bestSched->Print(Logger::GetLogStream(), "Best Sched"); } - +#endif + return rslt; } @@ -647,7 +649,6 @@ FUNC_RESULT SchedRegion::Optimize_(Milliseconds startTime, } stats::unsolvedProblemSize.Record(dataDepGraph_->GetInstCnt()); } - return rslt; } diff --git a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp index cd0fd07b..8f607c68 100644 --- a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp +++ b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp @@ -515,10 +515,10 @@ void OptSchedDDGWrapperBasic::countBoundaryLiveness( void OptSchedDDGWrapperBasic::clusterNeighboringMemOps_( ArrayRef MemOps) { SmallVector MemOpRecords; - dbgs() << "Processing possible clusters\n"; + LLVM_DEBUG(dbgs() << "Processing possible clusters\n"); for (const SUnit *SU : MemOps) { - dbgs() << " " << SU->NodeNum << " is in the chain.\n"; + LLVM_DEBUG(dbgs() << " " << SU->NodeNum << " is in the chain.\n"); MachineOperand *BaseOp; int64_t Offset; if (DAG->TII->getMemOperandWithOffset(*SU->getInstr(), BaseOp, Offset, DAG->TRI)) @@ -526,7 +526,7 @@ void OptSchedDDGWrapperBasic::clusterNeighboringMemOps_( } if (MemOpRecords.size() < 2) { - dbgs() << " Unable to cluster memop cluster of 1.\n"; + LLVM_DEBUG(dbgs() << " Unable to cluster memop cluster of 1.\n"); return; } @@ -537,11 +537,11 @@ void OptSchedDDGWrapperBasic::clusterNeighboringMemOps_( for (unsigned Idx = 0, End = MemOpRecords.size(); Idx < (End - 1); ++Idx) { const SUnit *SUa = MemOpRecords[Idx].SU; const SUnit *SUb = 
MemOpRecords[Idx + 1].SU; - dbgs() << " Checking possible clustering of (" << SUa->NodeNum << ") and (" << SUb->NodeNum << ")\n"; + LLVM_DEBUG(dbgs() << " Checking possible clustering of (" << SUa->NodeNum << ") and (" << SUb->NodeNum << ")\n"); if (DAG->TII->shouldClusterMemOps(*MemOpRecords[Idx].BaseOp, *MemOpRecords[Idx + 1].BaseOp, ClusterLength)) { - dbgs() << " Cluster possible at SU(" << SUa->NodeNum << ")- SU(" << SUb->NodeNum << ")\n"; + LLVM_DEBUG(dbgs() << " Cluster possible at SU(" << SUa->NodeNum << ")- SU(" << SUb->NodeNum << ")\n"); ++ClusterLength; ClusterVector->SetBit(SUa->NodeNum); ClusterVector->SetBit(SUb->NodeNum); @@ -550,14 +550,16 @@ void OptSchedDDGWrapperBasic::clusterNeighboringMemOps_( } else ClusterLength = 1; } - dbgs () << "Printing bit vector: "; +#ifdef IS_DEBUG_MEMORY_CLUSTERING + LLVM_DEBUG(dbgs () << "Printing bit vector: "); for (int i = ClusterVector->GetSize() - 1; i >= 0; i--) { if (ClusterVector->GetBit(i)) - dbgs() << "1"; + LLVM_DEBUG(dbgs() << "1"); else - dbgs() << "0"; + LLVM_DEBUG(dbgs() << "0"); } - dbgs() << '\n'; + LLVM_DEBUG(dbgs() << '\n'); +#endif } /// Iterate through SUnits and find all possible clustering then transfer @@ -572,7 +574,7 @@ void OptSchedDDGWrapperBasic::findPossibleClusters() { // Experiment with clustering loads first bool IsLoad = true; - dbgs() << "Looking for load clusters\n"; + LLVM_DEBUG(dbgs() << "Looking for load clusters\n"); DenseMap StoreChainIDs; // Map each store chain to a set of dependent MemOps. 
SmallVector, 32> StoreChainDependents; @@ -581,13 +583,13 @@ void OptSchedDDGWrapperBasic::findPossibleClusters() { (!IsLoad && !SU.getInstr()->mayStore())) continue; auto MI = SU.getInstr(); - dbgs() << " Instruction (" << SU.NodeNum << ") " << DAG->TII->getName(MI->getOpcode()) << " may load.\n"; + LLVM_DEBUG(dbgs() << " Instruction (" << SU.NodeNum << ") " << DAG->TII->getName(MI->getOpcode()) << " may load.\n"); unsigned ChainPredID = DAG->SUnits.size(); for (const SDep &Pred : SU.Preds) { if (Pred.isCtrl()) { auto PredMI = Pred.getSUnit()->getInstr(); - dbgs() << " Breaking chain at (" << Pred.getSUnit()->NodeNum << ") " << DAG->TII->getName(PredMI->getOpcode()) << '\n'; + LLVM_DEBUG(dbgs() << " Breaking chain at (" << Pred.getSUnit()->NodeNum << ") " << DAG->TII->getName(PredMI->getOpcode()) << '\n'); ChainPredID = Pred.getSUnit()->NodeNum; break; } @@ -595,24 +597,22 @@ void OptSchedDDGWrapperBasic::findPossibleClusters() { // Check if this chain-like pred has been seen // before. ChainPredID==MaxNodeID at the top of the schedule. unsigned NumChains = StoreChainDependents.size(); - dbgs() << " ChainPredID " << ChainPredID << ", NumChains " << NumChains << '\n'; + LLVM_DEBUG(dbgs() << " ChainPredID " << ChainPredID << ", NumChains " << NumChains << '\n'); std::pair::iterator, bool> Result = StoreChainIDs.insert(std::make_pair(ChainPredID, NumChains)); if (Result.second) StoreChainDependents.resize(NumChains + 1); - dbgs() << " Pushing (" << SU.NodeNum << ") on the chain.\n"; StoreChainDependents[Result.first->second].push_back(&SU); - dbgs() << " inPrinting size of SCD: " << StoreChainDependents.size() << '\n'; } - - dbgs() << " outPrinting size of SCD: " << StoreChainDependents.size() << '\n'; // Iterate over the store chains. 
for (auto &SCD : StoreChainDependents) { - dbgs() << " Printing the list before clustering: "; +#ifdef IS_DEBUG_MEMORY_CLUSTERING + LLVM_DEBUG(dbgs() << " Printing the list before clustering: "); for (auto SU1 : SCD) - dbgs() << SU1->NodeNum << " "; - dbgs() << '\n'; + LLVM_DEBUG(dbgs() << SU1->NodeNum << " "); + LLVM_DEBUG(dbgs() << '\n'); +#endif clusterNeighboringMemOps_(SCD); } } diff --git a/lib/Wrapper/OptimizingScheduler.cpp b/lib/Wrapper/OptimizingScheduler.cpp index c9877c62..ca383ac6 100644 --- a/lib/Wrapper/OptimizingScheduler.cpp +++ b/lib/Wrapper/OptimizingScheduler.cpp @@ -380,7 +380,6 @@ void ScheduleDAGOptSched::schedule() { OST->createDDGWrapper(C, this, MM.get(), LatencyPrecision, RegionName); DDG->convertSUnits(); DDG->convertRegFiles(); - dbgs() << "Printing possible clusters\n"; DDG->findPossibleClusters(); auto *BDDG = static_cast(DDG.get()); From 3603da9cb7e75bc3942d427586d466a8de169732 Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Fri, 13 Mar 2020 10:47:39 -0700 Subject: [PATCH 08/40] Added clustering cost to ChkCostFsblty, and added TODOs. 
--- include/opt-sched/Scheduler/bb_spill.h | 17 ++++++--- .../opt-sched/Scheduler/sched_basic_data.h | 14 +------- lib/Scheduler/bb_spill.cpp | 35 ++++++++++++------- lib/Scheduler/enumerator.cpp | 12 ------- lib/Wrapper/OptSchedDDGWrapperBasic.cpp | 12 +++---- 5 files changed, 39 insertions(+), 51 deletions(-) diff --git a/include/opt-sched/Scheduler/bb_spill.h b/include/opt-sched/Scheduler/bb_spill.h index 84f6282e..62771561 100644 --- a/include/opt-sched/Scheduler/bb_spill.h +++ b/include/opt-sched/Scheduler/bb_spill.h @@ -42,25 +42,32 @@ class BBWithSpill : public SchedRegion { /// together std::shared_ptr CurrentClusterVector; + // TODO: Implement cost function for clustering /// Experimental variables and values for cost adjustment int ClusteringWeight; int ClusterInitialCost; - // Data struct to contain information about the previous clusters + /// Data struct to contain information about the previous clusters struct PastClusters { std::shared_ptr ClusterVector; + /// Size of the cluster when it was ended by an instruction not in the + /// cluster int ClusterSize; - int InstNum; // Instruction number that ended this cluster - // Constructor + /// Instruction number that ended this cluster + int InstNum; + + /// Constructor for this struct PastClusters(std::shared_ptr Cluster, int size, int num) : ClusterVector(Cluster), ClusterSize(size), InstNum(num) {} }; /// Vector containing the (n-1) past clusters - llvm::SmallVector, 0> PastClustersList; + llvm::SmallVector, 4> PastClustersList; - /// Pointer to the latest past cluster + /// Pointer to the last cluster. This is kept out of the vector to + /// avoid having to fetch it every time we compare the current instruction + /// number to the one that ended the cluster. 
std::unique_ptr LastCluster; // The target machine diff --git a/include/opt-sched/Scheduler/sched_basic_data.h b/include/opt-sched/Scheduler/sched_basic_data.h index 2eededb2..166dbbad 100644 --- a/include/opt-sched/Scheduler/sched_basic_data.h +++ b/include/opt-sched/Scheduler/sched_basic_data.h @@ -426,14 +426,6 @@ class SchedInstruction : public GraphNode { InstType GetCrtclPathFrmRoot() { return crtclPathFrmRoot_; } - /// Return true if this instruction could possibly read memory - /// Copied from https://llvm.org/doxygen/classllvm_1_1MachineInstr.html - // bool mayLoad() { return MayLoad; } - - /// Return true if this instruction could possibly modify memory. - /// Copied from https://llvm.org/doxygen/classllvm_1_1MachineInstr.html - // bool mayStore() { return MayStore; } - /// Set MayCluster to true if clustering memory operations was found /// to be possible. void SetMayCluster(std::shared_ptr PossibleClustersVector); @@ -446,11 +438,7 @@ class SchedInstruction : public GraphNode { string name_; // The mnemonic of this instruction, e.g. "add" or "jmp". string opCode_; - /// Indicate if this instruction may be a load operation - // bool MayLoad; - /// Indicate if this instruction may be a store operation - // bool MayStore; - /// Data structure to store a possible clustering with other isntructions. + /// Data structure to store a possible clustering with other instructions. /// This data structure should have a fast lookup operation. std::shared_ptr PossibleClusturesBitVector; /// This value should be set to true if clustering may be possible. 
diff --git a/lib/Scheduler/bb_spill.cpp b/lib/Scheduler/bb_spill.cpp index 5f9696cb..c91bf552 100644 --- a/lib/Scheduler/bb_spill.cpp +++ b/lib/Scheduler/bb_spill.cpp @@ -376,6 +376,7 @@ InstCount BBWithSpill::CmputNormCost_(InstSchedule *sched, cost -= costLwrBound_; execCost -= costLwrBound_; + // TODO: Implement cost function for clustering if (isSecondPass) cost += ClusterInitialCost; @@ -443,17 +444,19 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, // Scheduling cases for clustering project: // 1.) Cluster -> Cluster - // Simple case, just increment 1 from cluster size + // Simple case, just increment 1 from cluster size // 2.) Cluster -> Non-Cluster - // ?? End clustering + // ?? End clustering // 3.) Non-Cluster -> Cluster - // Simple case, initialize clustering + // Simple case, initialize clustering // Possibly keep track of the current memory clustering size here // and in UpdateSpillInfoForUnSchdul_() if (isSecondPass) { if (inst->GetMayCluster()) { - if (CurrentClusterSize > 0 && CurrentClusterVector->GetBit(inst->GetNum())) { + // TODO: Check for different cluster to different cluster scheduling + if (CurrentClusterSize > 0 && + CurrentClusterVector->GetBit(inst->GetNum())) { // Case 1: Currently clustering and this current instruction is part of // the cluster CurrentClusterSize++; @@ -461,15 +464,14 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, // Only decrement the cost if we cluster at least 2 operations // together (EXPERIMENTAL FOR NOW) ClusterInitialCost -= ClusteringWeight; - Logger::Info("More than 2 instructions clustered together!"); - } - } else { + //Logger::Info("More than 2 instructions clustered together!"); + } + } else { // Case 3: Not currently clustering. Initialize clustering - // Sidenote: What if we go from current cluster to a different cluster? 
- CurrentClusterVector.reset(); // Clear cluster vector + CurrentClusterVector.reset(); // Clear cluster vector CurrentClusterVector = inst->GetClusterVector(); // Set active cluster - CurrentClusterSize = 1; // Current size is 1 - } + CurrentClusterSize = 1; // Current size is 1 + } } else if (CurrentClusterSize > 1) { // Case 2: Exiting out of an active cluster // Save the cluster to restore when backtracking. @@ -701,14 +703,16 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst) { // Simple case, just decrement 1 from cluster size // If cluster size == 0, delete CurrentClusterVector if (isSecondPass) { + // TODO: Check for different cluster to different cluster + // backtracking. if (inst->GetMayCluster()) { // Case 1 if (CurrentClusterSize > 0 && CurrentClusterVector->GetBit(inst->GetNum())) { // Currently clustering and this current instruction is part of the - // cluter + // cluster if (CurrentClusterSize > 2) { ClusterInitialCost += ClusteringWeight; // Re-add the cost - Logger::Info("More than 2 instructions clustered together. Undoing!!"); + //Logger::Info("More than 2 instructions clustered together. 
Undoing!!"); } CurrentClusterSize--; } else { @@ -1027,6 +1031,7 @@ void BBWithSpill::SetupForSchdulng_() { bool BBWithSpill::ChkCostFsblty(InstCount trgtLngth, EnumTreeNode *node) { bool fsbl = true; InstCount crntCost, dynmcCostLwrBound; + if (spillCostFunc_ == SCF_SLIL) { crntCost = dynamicSlilLowerBound_ * SCW_ + trgtLngth * schedCostFactor_; } else { @@ -1035,6 +1040,10 @@ bool BBWithSpill::ChkCostFsblty(InstCount trgtLngth, EnumTreeNode *node) { crntCost -= costLwrBound_; dynmcCostLwrBound = crntCost; + // TODO: Implement cost function for clustering + if (isSecondPass) + cost += ClusterInitialCost; + // assert(cost >= 0); assert(dynmcCostLwrBound >= 0); diff --git a/lib/Scheduler/enumerator.cpp b/lib/Scheduler/enumerator.cpp index e94f2170..d9c4e3b1 100644 --- a/lib/Scheduler/enumerator.cpp +++ b/lib/Scheduler/enumerator.cpp @@ -982,18 +982,6 @@ bool Enumerator::FindNxtFsblBrnch_(EnumTreeNode *&newNode) { if (crntBrnchNum == 0 && SchedForRPOnly_) crntNode_->SetFoundInstWithUse(IsUseInRdyLst_()); - // Note: This is just a thought, we might not need this here. - // Check if clustering is possible. - // We want to only do memory clustering in the second pass for now. - // if (crntBrnchNum == 0 && EnableMemClustering && SecondPass) - // // TODO: Implement these functions/attributes - // // and implement cost. Also keep track of current - // // cluster size since we do not want to exceed 15 - // // memory operations in a cluster (This and the cost - // // is probably done somewhere else and not here). 
- // ClusteringPossible = crntNode_->CheckForClustering(); - // crntNode_->SetClusteringPossible(ClusteringPossible); - for (i = crntBrnchNum; i < brnchCnt && crntNode_->IsFeasible(); i++) { #ifdef IS_DEBUG_FLOW Logger::Info("Probing branch %d out of %d", i, brnchCnt); diff --git a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp index 8f607c68..247294e9 100644 --- a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp +++ b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp @@ -510,8 +510,8 @@ void OptSchedDDGWrapperBasic::countBoundaryLiveness( } } -// Partially copied from -// https://github.com/RadeonOpenCompute/llvm/blob/roc-ocl-2.4.0/lib/CodeGen/MachineScheduler.cpp#L1554 +/// Partially copied from +/// https://github.com/RadeonOpenCompute/llvm/blob/roc-ocl-2.4.0/lib/CodeGen/MachineScheduler.cpp#L1554 void OptSchedDDGWrapperBasic::clusterNeighboringMemOps_( ArrayRef MemOps) { SmallVector MemOpRecords; @@ -566,12 +566,8 @@ void OptSchedDDGWrapperBasic::clusterNeighboringMemOps_( /// the information over to the SchedInstruction class as a bitvector. /// Partially copied from https://github.com/RadeonOpenCompute/llvm/blob/roc-ocl-2.4.0/lib/CodeGen/MachineScheduler.cpp#L1595 void OptSchedDDGWrapperBasic::findPossibleClusters() { -// Copy how LLVM handles clustering except instead of actually -// modifying the DAG, we can possibly set MayCluster to true. -// Then add the nodes that can be clustered together into a -// data structure. - - // Experiment with clustering loads first + // TODO: Add For-loop to also do store clusters. 
Currently only does load + // clusters bool IsLoad = true; LLVM_DEBUG(dbgs() << "Looking for load clusters\n"); From 035272bc36327dc15ad688ab0625c71ffe05cb12 Mon Sep 17 00:00:00 2001 From: vang thao Date: Fri, 13 Mar 2020 16:37:16 -0700 Subject: [PATCH 09/40] Fix typo for variable and disabled terminating enumerator when we find a schedule in the ILP pass --- lib/Scheduler/bb_spill.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/lib/Scheduler/bb_spill.cpp b/lib/Scheduler/bb_spill.cpp index c91bf552..63190abe 100644 --- a/lib/Scheduler/bb_spill.cpp +++ b/lib/Scheduler/bb_spill.cpp @@ -464,7 +464,7 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, // Only decrement the cost if we cluster at least 2 operations // together (EXPERIMENTAL FOR NOW) ClusterInitialCost -= ClusteringWeight; - //Logger::Info("More than 2 instructions clustered together!"); + Logger::Info("More than 2 instructions clustered together!"); } } else { // Case 3: Not currently clustering. Initialize clustering @@ -712,7 +712,7 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst) { // cluster if (CurrentClusterSize > 2) { ClusterInitialCost += ClusteringWeight; // Re-add the cost - //Logger::Info("More than 2 instructions clustered together. Undoing!!"); + Logger::Info("More than 2 instructions clustered together. Undoing!!"); } CurrentClusterSize--; } else { @@ -930,17 +930,18 @@ FUNC_RESULT BBWithSpill::Enumerate_(Milliseconds startTime, HandlEnumrtrRslt_(rslt, trgtLngth); if (bestCost_ == 0 || rslt == RES_ERROR || - (lngthDeadline == rgnDeadline && rslt == RES_TIMEOUT) || - (rslt == RES_SUCCESS && isSecondPass)) { + (lngthDeadline == rgnDeadline && rslt == RES_TIMEOUT)) { //|| + //(rslt == RES_SUCCESS && isSecondPass)) { // If doing two pass optsched and on the second pass then terminate if a // schedule is found with the same min-RP found in first pass. 
+ /* if (rslt == RES_SUCCESS && isSecondPass) { Logger::Info("Schedule found in second pass, terminating BB loop."); if (trgtLngth < schedUprBound_) Logger::Info("Schedule found with length %d is shorter than current schedule with length %d.", trgtLngth, schedUprBound_); - } + }*/ break; } @@ -1042,7 +1043,7 @@ bool BBWithSpill::ChkCostFsblty(InstCount trgtLngth, EnumTreeNode *node) { // TODO: Implement cost function for clustering if (isSecondPass) - cost += ClusterInitialCost; + crntCost += ClusterInitialCost; // assert(cost >= 0); assert(dynmcCostLwrBound >= 0); From a2cd231990157acdce17769d6e428acdb4f439ba Mon Sep 17 00:00:00 2001 From: vang thao Date: Tue, 17 Mar 2020 13:07:34 -0700 Subject: [PATCH 10/40] Debugging statements and reset mem clustering info in InitForSchduling --- lib/Scheduler/bb_spill.cpp | 10 ++++++++-- lib/Scheduler/enumerator.cpp | 11 ++++++++++- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/lib/Scheduler/bb_spill.cpp b/lib/Scheduler/bb_spill.cpp index 63190abe..bb06fc18 100644 --- a/lib/Scheduler/bb_spill.cpp +++ b/lib/Scheduler/bb_spill.cpp @@ -327,6 +327,12 @@ InstCount BBWithSpill::CmputCostLwrBound() { void BBWithSpill::InitForSchdulng() { InitForCostCmputtn_(); + CurrentClusterSize = 0; + CurrentClusterVector.reset(); + ClusterInitialCost = 1000000; + PastClustersList.clear(); + LastCluster.reset(); + schduldEntryInstCnt_ = 0; schduldExitInstCnt_ = 0; schduldInstCnt_ = 0; @@ -464,7 +470,7 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, // Only decrement the cost if we cluster at least 2 operations // together (EXPERIMENTAL FOR NOW) ClusterInitialCost -= ClusteringWeight; - Logger::Info("More than 2 instructions clustered together!"); + Logger::Info("Currently clustering %d instructions together", CurrentClusterSize); } } else { // Case 3: Not currently clustering. 
Initialize clustering @@ -712,9 +718,9 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst) { // cluster if (CurrentClusterSize > 2) { ClusterInitialCost += ClusteringWeight; // Re-add the cost - Logger::Info("More than 2 instructions clustered together. Undoing!!"); } CurrentClusterSize--; + Logger::Info("Undoing an instruction from the cluster. Current size: %d", CurrentClusterSize); } else { // Case 3 CurrentClusterSize--; diff --git a/lib/Scheduler/enumerator.cpp b/lib/Scheduler/enumerator.cpp index d9c4e3b1..39019034 100644 --- a/lib/Scheduler/enumerator.cpp +++ b/lib/Scheduler/enumerator.cpp @@ -1065,6 +1065,7 @@ bool Enumerator::ProbeBranch_(SchedInstruction *inst, EnumTreeNode *&newNode, if (inst != NULL) if (inst->GetPreFxdCycle() != INVALID_VALUE) if (inst->GetPreFxdCycle() != crntCycleNum_) { + Logger::Info("Pruned due to prefixed cycle"); return false; } @@ -1073,6 +1074,7 @@ bool Enumerator::ProbeBranch_(SchedInstruction *inst, EnumTreeNode *&newNode, #ifdef IS_DEBUG_INFSBLTY_TESTS stats::forwardLBInfeasibilityHits++; #endif + Logger::Info("Pruned due to forward lowerbound"); return false; } @@ -1080,6 +1082,7 @@ bool Enumerator::ProbeBranch_(SchedInstruction *inst, EnumTreeNode *&newNode, #ifdef IS_DEBUG_INFSBLTY_TESTS stats::backwardLBInfeasibilityHits++; #endif + Logger::Info("Pruned due to backward lowerbound"); return false; } } @@ -1100,6 +1103,7 @@ bool Enumerator::ProbeBranch_(SchedInstruction *inst, EnumTreeNode *&newNode, stats::nodeSuperiorityInfeasibilityHits++; #endif isNodeDmntd = true; + Logger::Info("Pruned due to node superiority"); return false; } } @@ -1117,6 +1121,7 @@ bool Enumerator::ProbeBranch_(SchedInstruction *inst, EnumTreeNode *&newNode, #ifdef IS_DEBUG_INFSBLTY_TESTS stats::slotCountInfeasibilityHits++; #endif + Logger::Info("Pruned due to issue slot infeasibility"); return false; } @@ -1127,6 +1132,7 @@ bool Enumerator::ProbeBranch_(SchedInstruction *inst, EnumTreeNode *&newNode, #ifdef 
IS_DEBUG_INFSBLTY_TESTS stats::rangeTighteningInfeasibilityHits++; #endif + Logger::Info("Pruned due to range tightening infeasibility"); return false; } @@ -1144,6 +1150,7 @@ bool Enumerator::ProbeBranch_(SchedInstruction *inst, EnumTreeNode *&newNode, #ifdef IS_DEBUG_INFSBLTY_TESTS stats::historyDominationInfeasibilityHits++; #endif + Logger::Info("Pruned due to history domination"); return false; } } @@ -1158,7 +1165,7 @@ bool Enumerator::ProbeBranch_(SchedInstruction *inst, EnumTreeNode *&newNode, stats::relaxedSchedulingInfeasibilityHits++; #endif isRlxInfsbl = true; - + Logger::Info("Pruned due to relaxed schedule"); return false; } } @@ -2071,6 +2078,7 @@ bool LengthCostEnumerator::ProbeBranch_(SchedInstruction *inst, isFsbl = ChkCostFsblty_(inst, newNode); if (isFsbl == false) { + Logger::Info("Pruned due to cost infeasibility"); return false; } @@ -2088,6 +2096,7 @@ bool LengthCostEnumerator::ProbeBranch_(SchedInstruction *inst, #endif rgn_->UnschdulInst(inst, crntCycleNum_, crntSlotNum_, parent); + Logger::Info("Pruned due to history domination"); return false; } } From 760c38d5a4153f73f5d85724581524ddba90f2b0 Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Tue, 17 Mar 2020 14:09:14 -0700 Subject: [PATCH 11/40] Added setting or memory clustering in settings. Fixed clustering for cluster to cluster mem-ops. More Debug statements. --- example/optsched-cfg/sched.ini | 5 ++ include/opt-sched/Scheduler/bb_spill.h | 4 ++ lib/Scheduler/bb_spill.cpp | 94 +++++++++++++++++--------- 3 files changed, 72 insertions(+), 31 deletions(-) diff --git a/example/optsched-cfg/sched.ini b/example/optsched-cfg/sched.ini index 07e9a626..8addb5f5 100644 --- a/example/optsched-cfg/sched.ini +++ b/example/optsched-cfg/sched.ini @@ -14,6 +14,11 @@ PRINT_SPILL_COUNTS YES # NO USE_TWO_PASS NO +# Cluster memory operations together in the second pass +# YES +# NO +CLUSTER_MEMORY_OPS NO + # These 3 flags control which schedulers will be used. # Each one can be individually toggled. 
The heuristic # list scheduler or ACO must be run before the diff --git a/include/opt-sched/Scheduler/bb_spill.h b/include/opt-sched/Scheduler/bb_spill.h index 62771561..df798d43 100644 --- a/include/opt-sched/Scheduler/bb_spill.h +++ b/include/opt-sched/Scheduler/bb_spill.h @@ -42,6 +42,10 @@ class BBWithSpill : public SchedRegion { /// together std::shared_ptr CurrentClusterVector; + /// Flag to enable or disable clustering memory operations + /// in the ILP pass. + bool ClusterMemoryOperations; + // TODO: Implement cost function for clustering /// Experimental variables and values for cost adjustment int ClusteringWeight; diff --git a/lib/Scheduler/bb_spill.cpp b/lib/Scheduler/bb_spill.cpp index bb06fc18..eaf577f3 100644 --- a/lib/Scheduler/bb_spill.cpp +++ b/lib/Scheduler/bb_spill.cpp @@ -76,6 +76,8 @@ BBWithSpill::BBWithSpill(const OptSchedTarget *OST_, DataDepGraph *dataDepGraph, ClusterInitialCost = 1000000; PastClustersList.clear(); LastCluster = nullptr; + + ClusterMemoryOperations = schedIni.GetBool("CLUSTER_MEMORY_OPS"); } /****************************************************************************/ @@ -383,7 +385,7 @@ InstCount BBWithSpill::CmputNormCost_(InstSchedule *sched, execCost -= costLwrBound_; // TODO: Implement cost function for clustering - if (isSecondPass) + if (isSecondPass && ClusterMemoryOperations) cost += ClusterInitialCost; sched->SetCost(cost); @@ -458,19 +460,38 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, // Possibly keep track of the current memory clustering size here // and in UpdateSpillInfoForUnSchdul_() - if (isSecondPass) { + if (isSecondPass && ClusterMemoryOperations) { if (inst->GetMayCluster()) { - // TODO: Check for different cluster to different cluster scheduling - if (CurrentClusterSize > 0 && - CurrentClusterVector->GetBit(inst->GetNum())) { - // Case 1: Currently clustering and this current instruction is part of - // the cluster - CurrentClusterSize++; - if (CurrentClusterSize > 2) { - 
// Only decrement the cost if we cluster at least 2 operations - // together (EXPERIMENTAL FOR NOW) - ClusterInitialCost -= ClusteringWeight; - Logger::Info("Currently clustering %d instructions together", CurrentClusterSize); + // If there is a current active cluster + if (CurrentClusterSize > 0) { + // The instruction is in the current active cluster + if (CurrentClusterVector->GetBit(inst->GetNum())) { + // Case 1: Currently clustering and this current instruction is part + // of the cluster + CurrentClusterSize++; + if (CurrentClusterSize > 2) { + // Only decrement the cost if we cluster at least 2 operations + // together (EXPERIMENTAL FOR NOW) + ClusterInitialCost -= ClusteringWeight; + Logger::Info("Currently clustering %d instructions together", + CurrentClusterSize); + } + } else { + Logger::Info("Inst %d pushing cluster size %d onto the stack due to " + "cluster to cluster op", + inst->GetNum(), CurrentClusterSize); + // The instruction is in another cluster that is not currently active. + // Exit out of the currently active cluster into a new one. + if (LastCluster) { + PastClustersList.push_back(std::move(LastCluster)); + LastCluster = llvm::make_unique( + CurrentClusterVector, CurrentClusterSize, inst->GetNum()); + } else + LastCluster = llvm::make_unique( + CurrentClusterVector, CurrentClusterSize, inst->GetNum()); + CurrentClusterVector.reset(); + CurrentClusterVector = inst->GetClusterVector(); + CurrentClusterSize = 1; } } else { // Case 3: Not currently clustering. Initialize clustering @@ -479,6 +500,8 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, CurrentClusterSize = 1; // Current size is 1 } } else if (CurrentClusterSize > 1) { + Logger::Info("Inst %d pushing cluster size %d onto the stack", + inst->GetNum(), CurrentClusterSize); // Case 2: Exiting out of an active cluster // Save the cluster to restore when backtracking. 
if (LastCluster) { @@ -708,24 +731,31 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst) { // 3.) Non-Cluster <- Cluster // Simple case, just decrement 1 from cluster size // If cluster size == 0, delete CurrentClusterVector - if (isSecondPass) { + if (isSecondPass && ClusterMemoryOperations) { // TODO: Check for different cluster to different cluster // backtracking. if (inst->GetMayCluster()) { - // Case 1 - if (CurrentClusterSize > 0 && CurrentClusterVector->GetBit(inst->GetNum())) { - // Currently clustering and this current instruction is part of the - // cluster - if (CurrentClusterSize > 2) { - ClusterInitialCost += ClusteringWeight; // Re-add the cost - } - CurrentClusterSize--; - Logger::Info("Undoing an instruction from the cluster. Current size: %d", CurrentClusterSize); - } else { - // Case 3 - CurrentClusterSize--; - if (CurrentClusterSize == 0) - CurrentClusterVector.reset(); + // Case 1 and 3 + if (CurrentClusterSize > 2) { + ClusterInitialCost += ClusteringWeight; // Re-add the cost + } + CurrentClusterSize--; + Logger::Info("Undoing an instruction from the cluster. 
Current size: %d", + CurrentClusterSize); + + if (CurrentClusterSize == 0) { + CurrentClusterVector.reset(); + if (LastCluster->InstNum == inst->GetNum()) { + CurrentClusterSize = LastCluster->ClusterSize; + CurrentClusterVector = LastCluster->ClusterVector; + LastCluster.reset(); // Release current cluster pointer + + // Get previous cluster from vector list + if (!PastClustersList.empty()) { + LastCluster = std::move(PastClustersList.back()); + PastClustersList.pop_back(); + } + } } } else if (LastCluster) { if (LastCluster->InstNum == inst->GetNum()) { @@ -741,9 +771,11 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst) { LastCluster = std::move(PastClustersList.back()); PastClustersList.pop_back(); } + Logger::Info("Inst %d popping cluster size %d off the stack", + inst->GetNum(), CurrentClusterSize); + } } - }} - + } defCnt = inst->GetDefs(defs); useCnt = inst->GetUses(uses); @@ -1048,7 +1080,7 @@ bool BBWithSpill::ChkCostFsblty(InstCount trgtLngth, EnumTreeNode *node) { dynmcCostLwrBound = crntCost; // TODO: Implement cost function for clustering - if (isSecondPass) + if (isSecondPass && ClusterMemoryOperations) crntCost += ClusterInitialCost; // assert(cost >= 0); From 8b5e2cc5811129bf643e1bb029d804171f857936 Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Tue, 17 Mar 2020 14:11:11 -0700 Subject: [PATCH 12/40] Fix missing var. 
--- lib/Scheduler/bb_spill.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/Scheduler/bb_spill.cpp b/lib/Scheduler/bb_spill.cpp index eaf577f3..706eeedb 100644 --- a/lib/Scheduler/bb_spill.cpp +++ b/lib/Scheduler/bb_spill.cpp @@ -77,6 +77,7 @@ BBWithSpill::BBWithSpill(const OptSchedTarget *OST_, DataDepGraph *dataDepGraph, PastClustersList.clear(); LastCluster = nullptr; + Config &schedIni = SchedulerOptions::getInstance(); ClusterMemoryOperations = schedIni.GetBool("CLUSTER_MEMORY_OPS"); } /****************************************************************************/ From 93f01e3c167d6a610e7eabb58e866e872adbac95 Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Tue, 17 Mar 2020 14:26:14 -0700 Subject: [PATCH 13/40] Fix memory segmentation --- lib/Scheduler/bb_spill.cpp | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/lib/Scheduler/bb_spill.cpp b/lib/Scheduler/bb_spill.cpp index 706eeedb..e141bf83 100644 --- a/lib/Scheduler/bb_spill.cpp +++ b/lib/Scheduler/bb_spill.cpp @@ -744,17 +744,24 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst) { Logger::Info("Undoing an instruction from the cluster. 
Current size: %d", CurrentClusterSize); + // If there is no more member in the currently active cluster then disable + // the cluster if (CurrentClusterSize == 0) { CurrentClusterVector.reset(); - if (LastCluster->InstNum == inst->GetNum()) { - CurrentClusterSize = LastCluster->ClusterSize; - CurrentClusterVector = LastCluster->ClusterVector; - LastCluster.reset(); // Release current cluster pointer - - // Get previous cluster from vector list - if (!PastClustersList.empty()) { - LastCluster = std::move(PastClustersList.back()); - PastClustersList.pop_back(); + + // If there was a previously active cluster, check last cluster to see + // if we need to restore the state + if (LastCluster) { + if (LastCluster->InstNum == inst->GetNum()) { + CurrentClusterSize = LastCluster->ClusterSize; + CurrentClusterVector = LastCluster->ClusterVector; + LastCluster.reset(); // Release current cluster pointer + + // Get previous cluster from vector list + if (!PastClustersList.empty()) { + LastCluster = std::move(PastClustersList.back()); + PastClustersList.pop_back(); + } } } } From 111d5eb167ecdd67fdcb629e06359e06d95d6177 Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Tue, 17 Mar 2020 15:17:44 -0700 Subject: [PATCH 14/40] Use an integer instead of a vector for cluster groups. 
--- include/opt-sched/Scheduler/bb_spill.h | 11 +++---- .../opt-sched/Scheduler/sched_basic_data.h | 15 +++++---- lib/Scheduler/bb_spill.cpp | 33 +++++++++---------- lib/Scheduler/sched_basic_data.cpp | 13 +++----- lib/Wrapper/OptSchedDDGWrapperBasic.cpp | 29 ++++++++-------- lib/Wrapper/OptSchedDDGWrapperBasic.h | 3 ++ 6 files changed, 54 insertions(+), 50 deletions(-) diff --git a/include/opt-sched/Scheduler/bb_spill.h b/include/opt-sched/Scheduler/bb_spill.h index df798d43..91c13f37 100644 --- a/include/opt-sched/Scheduler/bb_spill.h +++ b/include/opt-sched/Scheduler/bb_spill.h @@ -38,9 +38,8 @@ class BBWithSpill : public SchedRegion { /// Current cluster size unsigned int CurrentClusterSize; - /// Bitvector containing active bits for instructions that can be clustered - /// together - std::shared_ptr CurrentClusterVector; + /// Current active cluster group + int ActiveClusterGroup; /// Flag to enable or disable clustering memory operations /// in the ILP pass. @@ -53,7 +52,7 @@ class BBWithSpill : public SchedRegion { /// Data struct to contain information about the previous clusters struct PastClusters { - std::shared_ptr ClusterVector; + int ClusterGroup; /// Size of the cluster when it was ended by an instruction not in the /// cluster int ClusterSize; @@ -62,8 +61,8 @@ class BBWithSpill : public SchedRegion { int InstNum; /// Constructor for this struct - PastClusters(std::shared_ptr Cluster, int size, int num) - : ClusterVector(Cluster), ClusterSize(size), InstNum(num) {} + PastClusters(int Cluster, int size, int num) + : ClusterGroup(Cluster), ClusterSize(size), InstNum(num) {} }; /// Vector containing the (n-1) past clusters diff --git a/include/opt-sched/Scheduler/sched_basic_data.h b/include/opt-sched/Scheduler/sched_basic_data.h index 166dbbad..9a6d631b 100644 --- a/include/opt-sched/Scheduler/sched_basic_data.h +++ b/include/opt-sched/Scheduler/sched_basic_data.h @@ -16,7 +16,6 @@ Last Update: Sept. 
2013 #include "opt-sched/Scheduler/hash_table.h" #include "opt-sched/Scheduler/machine_model.h" #include -#include namespace llvm { namespace opt_sched { @@ -428,9 +427,11 @@ class SchedInstruction : public GraphNode { /// Set MayCluster to true if clustering memory operations was found /// to be possible. - void SetMayCluster(std::shared_ptr PossibleClustersVector); + void SetMayCluster(int ClusteringGroup); bool GetMayCluster() { return MayCluster; } - std::shared_ptr GetClusterVector(); + int GetClusterGroup() { return ClusterGroup; } + static int GetActiveCluster() { return ActiveCluster; } + static int SetActiveCluster(int Active) { ActiveCluster = Active; } friend class SchedRange; protected: @@ -438,11 +439,13 @@ class SchedInstruction : public GraphNode { string name_; // The mnemonic of this instruction, e.g. "add" or "jmp". string opCode_; - /// Data structure to store a possible clustering with other instructions. - /// This data structure should have a fast lookup operation. - std::shared_ptr PossibleClusturesBitVector; + /// The cluster group that the current instruction is a part of. + /// Default of 0 means that it is not part of any cluster. + int ClusterGroup; /// This value should be set to true if clustering may be possible. bool MayCluster; + /// Currently active cluster. Used for ready list. + static int ActiveCluster; // A numberical ID for this instruction. int nodeID_; // The type of this instruction. 
diff --git a/lib/Scheduler/bb_spill.cpp b/lib/Scheduler/bb_spill.cpp index e141bf83..4c14eb78 100644 --- a/lib/Scheduler/bb_spill.cpp +++ b/lib/Scheduler/bb_spill.cpp @@ -71,7 +71,7 @@ BBWithSpill::BBWithSpill(const OptSchedTarget *OST_, DataDepGraph *dataDepGraph, schduldInstCnt_ = 0; CurrentClusterSize = 0; - CurrentClusterVector = nullptr; + ActiveClusterGroup = 0; ClusteringWeight = 1000; ClusterInitialCost = 1000000; PastClustersList.clear(); @@ -331,7 +331,7 @@ void BBWithSpill::InitForSchdulng() { InitForCostCmputtn_(); CurrentClusterSize = 0; - CurrentClusterVector.reset(); + ActiveClusterGroup = 0; ClusterInitialCost = 1000000; PastClustersList.clear(); LastCluster.reset(); @@ -466,7 +466,7 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, // If there is a current active cluster if (CurrentClusterSize > 0) { // The instruction is in the current active cluster - if (CurrentClusterVector->GetBit(inst->GetNum())) { + if (ActiveClusterGroup == inst->GetClusterGroup()) { // Case 1: Currently clustering and this current instruction is part // of the cluster CurrentClusterSize++; @@ -486,19 +486,18 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, if (LastCluster) { PastClustersList.push_back(std::move(LastCluster)); LastCluster = llvm::make_unique( - CurrentClusterVector, CurrentClusterSize, inst->GetNum()); + ActiveClusterGroup, CurrentClusterSize, inst->GetNum()); } else LastCluster = llvm::make_unique( - CurrentClusterVector, CurrentClusterSize, inst->GetNum()); - CurrentClusterVector.reset(); - CurrentClusterVector = inst->GetClusterVector(); + ActiveClusterGroup, CurrentClusterSize, inst->GetNum()); + + ActiveClusterGroup = inst->GetClusterGroup(); CurrentClusterSize = 1; } } else { // Case 3: Not currently clustering. 
Initialize clustering - CurrentClusterVector.reset(); // Clear cluster vector - CurrentClusterVector = inst->GetClusterVector(); // Set active cluster - CurrentClusterSize = 1; // Current size is 1 + ActiveClusterGroup = inst->GetClusterGroup(); + CurrentClusterSize = 1; } } else if (CurrentClusterSize > 1) { Logger::Info("Inst %d pushing cluster size %d onto the stack", @@ -511,11 +510,11 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, // Current previous cluster LastCluster = llvm::make_unique( - CurrentClusterVector, CurrentClusterSize, inst->GetNum()); + ActiveClusterGroup, CurrentClusterSize, inst->GetNum()); } else LastCluster = llvm::make_unique( - CurrentClusterVector, CurrentClusterSize, inst->GetNum()); - CurrentClusterVector.reset(); // Reset active cluster + ActiveClusterGroup, CurrentClusterSize, inst->GetNum()); + ActiveClusterGroup = 0; // Reset active cluster CurrentClusterSize = 0; // Set cluster size to 0 } } @@ -731,7 +730,7 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst) { // Can/should we use a stack to restore state? // 3.) Non-Cluster <- Cluster // Simple case, just decrement 1 from cluster size - // If cluster size == 0, delete CurrentClusterVector + // If cluster size == 0, set ActiveClusterGroup = 0; if (isSecondPass && ClusterMemoryOperations) { // TODO: Check for different cluster to different cluster // backtracking. 
@@ -747,14 +746,14 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst) { // If there is no more member in the currently active cluster then disable // the cluster if (CurrentClusterSize == 0) { - CurrentClusterVector.reset(); + ActiveClusterGroup = 0; // If there was a previously active cluster, check last cluster to see // if we need to restore the state if (LastCluster) { if (LastCluster->InstNum == inst->GetNum()) { CurrentClusterSize = LastCluster->ClusterSize; - CurrentClusterVector = LastCluster->ClusterVector; + ActiveClusterGroup = LastCluster->ClusterGroup; LastCluster.reset(); // Release current cluster pointer // Get previous cluster from vector list @@ -771,7 +770,7 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst) { // this instruction ended the cluster then restore the previous // cluster's state CurrentClusterSize = LastCluster->ClusterSize; - CurrentClusterVector = LastCluster->ClusterVector; + ActiveClusterGroup = LastCluster->ClusterGroup; LastCluster.reset(); // Release current cluster pointer // Get previous cluster from vector list diff --git a/lib/Scheduler/sched_basic_data.cpp b/lib/Scheduler/sched_basic_data.cpp index f74b605f..536cd99c 100644 --- a/lib/Scheduler/sched_basic_data.cpp +++ b/lib/Scheduler/sched_basic_data.cpp @@ -6,7 +6,6 @@ using namespace llvm::opt_sched; SchedInstruction::SchedInstruction(InstCount num, const string &name, InstType instType, const string &opCode, - /* bool InstrMayLoad, bool InstrMayStore,*/ InstCount maxInstCnt, int nodeID, InstCount fileSchedOrder, InstCount fileSchedCycle, InstCount fileLB, @@ -16,8 +15,8 @@ SchedInstruction::SchedInstruction(InstCount num, const string &name, name_ = name; opCode_ = opCode; instType_ = instType; - // MayLoad = InstrMayLoad; - // MayStore = InstrMayStore; + ClusterGroup = 0; + ActiveCluster = 0; MayCluster = false; frwrdLwrBound_ = INVALID_VALUE; @@ -742,15 +741,13 @@ int16_t SchedInstruction::CmputLastUseCnt() { return 
lastUseCnt_; } -void SchedInstruction::SetMayCluster(std::shared_ptr PossibleClustersVector) { - if (PossibleClustersVector->GetOneCnt() > 0) { - PossibleClusturesBitVector = PossibleClustersVector; +void SchedInstruction::SetMayCluster(int ClusteringGroup) { + if (ClusteringGroup > 0) { + ClusterGroup = ClusteringGroup; MayCluster = true; } } -std::shared_ptr SchedInstruction::GetClusterVector() { return PossibleClusturesBitVector; } - /****************************************************************************** * SchedRange * ******************************************************************************/ diff --git a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp index 247294e9..cdbd8d1a 100644 --- a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp +++ b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp @@ -75,6 +75,8 @@ OptSchedDDGWrapperBasic::OptSchedDDGWrapperBasic( if (ShouldFilterRegisterTypes) RTFilter = createLLVMRegTypeFilter(MM, DAG->TRI, DAG->getRegPressure().MaxSetPressure); + + ClusterCount = 0; } void OptSchedDDGWrapperBasic::convertSUnits() { @@ -380,8 +382,6 @@ inline void OptSchedDDGWrapperBasic::setupRoot() { int RootNum = DAG->SUnits.size(); root_ = CreateNode_(RootNum, "artificial", MM->GetInstTypeByName("artificial"), "__optsched_entry", - // mayLoad = false; - // mayStore = false; RootNum, // nodeID RootNum, // fileSchedOrder RootNum, // fileSchedCycle @@ -400,8 +400,6 @@ inline void OptSchedDDGWrapperBasic::setupLeaf() { int LeafNum = DAG->SUnits.size() + 1; CreateNode_(LeafNum, "artificial", MM->GetInstTypeByName("artificial"), "__optsched_exit", - // mayLoad = false; - // mayStore = false; LeafNum, // nodeID LeafNum, // fileSchedOrder LeafNum, // fileSchedCycle @@ -475,8 +473,6 @@ void OptSchedDDGWrapperBasic::convertSUnit(const SUnit &SU) { } CreateNode_(SU.NodeNum, InstName.c_str(), InstType, InstName.c_str(), - // MI->mayLoad() - // MI->mayStore() SU.NodeNum, // nodeID SU.NodeNum, // fileSchedOrder SU.NodeNum, // 
fileSchedCycle @@ -515,8 +511,9 @@ void OptSchedDDGWrapperBasic::countBoundaryLiveness( void OptSchedDDGWrapperBasic::clusterNeighboringMemOps_( ArrayRef MemOps) { SmallVector MemOpRecords; + bool ClusterPossible = false; + LLVM_DEBUG(dbgs() << "Processing possible clusters\n"); - for (const SUnit *SU : MemOps) { LLVM_DEBUG(dbgs() << " " << SU->NodeNum << " is in the chain.\n"); MachineOperand *BaseOp; @@ -530,8 +527,6 @@ void OptSchedDDGWrapperBasic::clusterNeighboringMemOps_( return; } - auto ClusterVector = std::make_shared(DAG->SUnits.size()); - llvm::sort(MemOpRecords); unsigned ClusterLength = 1; for (unsigned Idx = 0, End = MemOpRecords.size(); Idx < (End - 1); ++Idx) { @@ -542,11 +537,19 @@ void OptSchedDDGWrapperBasic::clusterNeighboringMemOps_( *MemOpRecords[Idx + 1].BaseOp, ClusterLength)) { LLVM_DEBUG(dbgs() << " Cluster possible at SU(" << SUa->NodeNum << ")- SU(" << SUb->NodeNum << ")\n"); + + // If clustering was possible then increase the cluster count. This only + // happens once every cluster + if (!ClusterPossible) { + ClusterPossible = true; + ClusterCount++; + } + + // Tell the instructions what cluster number they are in + insts_[SUa->NodeNum]->SetMayCluster(ClusterCount); + insts_[SUb->NodeNum]->SetMayCluster(ClusterCount); + ++ClusterLength; - ClusterVector->SetBit(SUa->NodeNum); - ClusterVector->SetBit(SUb->NodeNum); - insts_[SUa->NodeNum]->SetMayCluster(ClusterVector); - insts_[SUb->NodeNum]->SetMayCluster(ClusterVector); } else ClusterLength = 1; } diff --git a/lib/Wrapper/OptSchedDDGWrapperBasic.h b/lib/Wrapper/OptSchedDDGWrapperBasic.h index 9970fab9..76d5d7ea 100644 --- a/lib/Wrapper/OptSchedDDGWrapperBasic.h +++ b/lib/Wrapper/OptSchedDDGWrapperBasic.h @@ -146,6 +146,9 @@ class OptSchedDDGWrapperBasic : public DataDepGraph { std::vector consumers; }; + /// Count of the total clusters possible + int ClusterCount; + // Copied from // https://github.com/RadeonOpenCompute/llvm/blob/roc-ocl-2.4.0/lib/CodeGen/MachineScheduler.cpp#L1467 
struct MemOpInfo { From 298fb0fe2d063bcaa2ae99c5a015f84c6a17ca59 Mon Sep 17 00:00:00 2001 From: vang thao Date: Wed, 18 Mar 2020 00:01:26 -0700 Subject: [PATCH 15/40] Fix error with static variable. --- include/opt-sched/Scheduler/sched_basic_data.h | 2 +- lib/Scheduler/sched_basic_data.cpp | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/include/opt-sched/Scheduler/sched_basic_data.h b/include/opt-sched/Scheduler/sched_basic_data.h index 9a6d631b..3a737d8e 100644 --- a/include/opt-sched/Scheduler/sched_basic_data.h +++ b/include/opt-sched/Scheduler/sched_basic_data.h @@ -431,7 +431,7 @@ class SchedInstruction : public GraphNode { bool GetMayCluster() { return MayCluster; } int GetClusterGroup() { return ClusterGroup; } static int GetActiveCluster() { return ActiveCluster; } - static int SetActiveCluster(int Active) { ActiveCluster = Active; } + static void SetActiveCluster(int Active) { ActiveCluster = Active; } friend class SchedRange; protected: diff --git a/lib/Scheduler/sched_basic_data.cpp b/lib/Scheduler/sched_basic_data.cpp index 536cd99c..2fa5f09d 100644 --- a/lib/Scheduler/sched_basic_data.cpp +++ b/lib/Scheduler/sched_basic_data.cpp @@ -4,6 +4,8 @@ using namespace llvm::opt_sched; +int SchedInstruction::ActiveCluster = 0; + SchedInstruction::SchedInstruction(InstCount num, const string &name, InstType instType, const string &opCode, InstCount maxInstCnt, int nodeID, @@ -16,7 +18,6 @@ SchedInstruction::SchedInstruction(InstCount num, const string &name, opCode_ = opCode; instType_ = instType; ClusterGroup = 0; - ActiveCluster = 0; MayCluster = false; frwrdLwrBound_ = INVALID_VALUE; From 30c7d9cf598d3c7fe7b1b34cade690d02839f284 Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Wed, 18 Mar 2020 17:44:02 -0700 Subject: [PATCH 16/40] Added MEM heuristic priority. Not yet implemented. 
--- include/opt-sched/Scheduler/data_dep.h | 5 +++ .../opt-sched/Scheduler/sched_basic_data.h | 5 ++- lib/Scheduler/bb_spill.cpp | 42 ++++++++++++------- lib/Scheduler/data_dep.cpp | 2 + lib/Scheduler/ready_list.cpp | 20 +++++++++ lib/Wrapper/OptSchedDDGWrapperBasic.cpp | 2 + lib/Wrapper/OptimizingScheduler.cpp | 8 ++-- 7 files changed, 64 insertions(+), 20 deletions(-) diff --git a/include/opt-sched/Scheduler/data_dep.h b/include/opt-sched/Scheduler/data_dep.h index d0885fd0..803574c9 100644 --- a/include/opt-sched/Scheduler/data_dep.h +++ b/include/opt-sched/Scheduler/data_dep.h @@ -291,7 +291,12 @@ class DataDepGraph : public llvm::opt_sched::OptSchedDDGWrapperBase, RegisterFile *getRegFiles() { return RegFiles.get(); } + int getMaxClusterCount() { return MaxClusterCount; } + void setMaxClusterCount(int Max) { MaxClusterCount = Max; } + protected: + int MaxClusterCount; + // TODO(max): Get rid of this. // Number of basic blocks int32_t bscBlkCnt_; diff --git a/include/opt-sched/Scheduler/sched_basic_data.h b/include/opt-sched/Scheduler/sched_basic_data.h index 3a737d8e..cdfad226 100644 --- a/include/opt-sched/Scheduler/sched_basic_data.h +++ b/include/opt-sched/Scheduler/sched_basic_data.h @@ -51,7 +51,10 @@ enum LISTSCHED_HEURISTIC { LSH_LS = 7, // LLVM list scheduler order - LSH_LLVM = 8 + LSH_LLVM = 8, + + // Memory clustering + LSH_MEM = 9 }; #define MAX_SCHED_PRIRTS 10 diff --git a/lib/Scheduler/bb_spill.cpp b/lib/Scheduler/bb_spill.cpp index 4c14eb78..ebad0f9e 100644 --- a/lib/Scheduler/bb_spill.cpp +++ b/lib/Scheduler/bb_spill.cpp @@ -492,32 +492,43 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, ActiveClusterGroup, CurrentClusterSize, inst->GetNum()); ActiveClusterGroup = inst->GetClusterGroup(); + inst->SetActiveCluster(ActiveClusterGroup); CurrentClusterSize = 1; } } else { // Case 3: Not currently clustering. 
Initialize clustering ActiveClusterGroup = inst->GetClusterGroup(); + inst->SetActiveCluster(ActiveClusterGroup); CurrentClusterSize = 1; } - } else if (CurrentClusterSize > 1) { - Logger::Info("Inst %d pushing cluster size %d onto the stack", - inst->GetNum(), CurrentClusterSize); + } else if (CurrentClusterSize > 0) { // Case 2: Exiting out of an active cluster - // Save the cluster to restore when backtracking. - if (LastCluster) { - // List of previous clusters - PastClustersList.push_back(std::move(LastCluster)); - - // Current previous cluster - LastCluster = llvm::make_unique( - ActiveClusterGroup, CurrentClusterSize, inst->GetNum()); - } else - LastCluster = llvm::make_unique( - ActiveClusterGroup, CurrentClusterSize, inst->GetNum()); + // Only save the state if we cluster 2 or more instructions together + // already + if (CurrentClusterSize > 1) { + Logger::Info("Inst %d pushing cluster size %d onto the stack", + inst->GetNum(), CurrentClusterSize); + + // Save the cluster to restore when backtracking. + if (LastCluster) { + // Save previous current cluster in a vector + PastClustersList.push_back(std::move(LastCluster)); + + // Current cluster + LastCluster = llvm::make_unique( + ActiveClusterGroup, CurrentClusterSize, inst->GetNum()); + } else + // This is the first cluster that we are saving + LastCluster = llvm::make_unique( + ActiveClusterGroup, CurrentClusterSize, inst->GetNum()); + } + ActiveClusterGroup = 0; // Reset active cluster + inst->SetActiveCluster(0); CurrentClusterSize = 0; // Set cluster size to 0 } } + // Potential Issues: // 1. Keeping track of the average clustering size when we aren't done // scheduling. 
@@ -747,6 +758,7 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst) { // the cluster if (CurrentClusterSize == 0) { ActiveClusterGroup = 0; + inst->SetActiveCluster(0); // If there was a previously active cluster, check last cluster to see // if we need to restore the state @@ -754,6 +766,7 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst) { if (LastCluster->InstNum == inst->GetNum()) { CurrentClusterSize = LastCluster->ClusterSize; ActiveClusterGroup = LastCluster->ClusterGroup; + inst->SetActiveCluster(ActiveClusterGroup); LastCluster.reset(); // Release current cluster pointer // Get previous cluster from vector list @@ -771,6 +784,7 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst) { // cluster's state CurrentClusterSize = LastCluster->ClusterSize; ActiveClusterGroup = LastCluster->ClusterGroup; + inst->SetActiveCluster(ActiveClusterGroup); LastCluster.reset(); // Release current cluster pointer // Get previous cluster from vector list diff --git a/lib/Scheduler/data_dep.cpp b/lib/Scheduler/data_dep.cpp index 65d2f0b8..a6652c9f 100644 --- a/lib/Scheduler/data_dep.cpp +++ b/lib/Scheduler/data_dep.cpp @@ -197,6 +197,8 @@ DataDepGraph::DataDepGraph(MachineModel *machMdl, LATENCY_PRECISION ltncyPrcsn) exitInstCnt_ = 0; RegFiles = llvm::make_unique(machMdl_->GetRegTypeCnt()); + + MaxClusterCount = 0; } DataDepGraph::~DataDepGraph() { diff --git a/lib/Scheduler/ready_list.cpp b/lib/Scheduler/ready_list.cpp index 398ca5ed..553238aa 100644 --- a/lib/Scheduler/ready_list.cpp +++ b/lib/Scheduler/ready_list.cpp @@ -68,6 +68,13 @@ ReadyList::ReadyList(DataDepGraph *dataDepGraph, SchedPriorities prirts) { ltncySumBits_ = Utilities::clcltBitsNeededToHoldNum(maxLtncySum_); totKeyBits += ltncySumBits_; break; + + case LSH_MEM: + Logger::Info("MEM heuristic detected"); + break; + + default: + break; } // end switch } // end for @@ -111,6 +118,13 @@ ReadyList::ReadyList(DataDepGraph *dataDepGraph, 
SchedPriorities prirts) { AddPrirtyToKey_(maxPriority_, keySize, ltncySumBits_, maxLtncySum_, maxLtncySum_); break; + + case LSH_MEM: + Logger::Info("MEM heuristic detected"); + break; + + default: + break; } } } @@ -190,6 +204,12 @@ unsigned long ReadyList::CmputKey_(SchedInstruction *inst, bool isUpdate, AddPrirtyToKey_(key, keySize, ltncySumBits_, inst->GetLtncySum(), maxLtncySum_); break; + + case LSH_MEM: + break; + + default: + break; } } return key; diff --git a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp index cdbd8d1a..6d50fc8f 100644 --- a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp +++ b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp @@ -543,6 +543,8 @@ void OptSchedDDGWrapperBasic::clusterNeighboringMemOps_( if (!ClusterPossible) { ClusterPossible = true; ClusterCount++; + setMaxClusterCount(ClusterCount); + Logger::Info("Setting max cluster count to %d", ClusterCount); } // Tell the instructions what cluster number they are in diff --git a/lib/Wrapper/OptimizingScheduler.cpp b/lib/Wrapper/OptimizingScheduler.cpp index ca383ac6..08a12466 100644 --- a/lib/Wrapper/OptimizingScheduler.cpp +++ b/lib/Wrapper/OptimizingScheduler.cpp @@ -47,11 +47,9 @@ constexpr struct { const char* Name; LISTSCHED_HEURISTIC HID; } HeuristicNames[] = { - {"CP", LSH_CP}, {"LUC", LSH_LUC}, - {"UC", LSH_UC}, {"NID", LSH_NID}, - {"CPR", LSH_CPR}, {"ISO", LSH_ISO}, - {"SC", LSH_SC}, {"LS", LSH_LS}, - {"LLVM", LSH_LLVM} + {"CP", LSH_CP}, {"LUC", LSH_LUC}, {"UC", LSH_UC}, {"NID", LSH_NID}, + {"CPR", LSH_CPR}, {"ISO", LSH_ISO}, {"SC", LSH_SC}, {"LS", LSH_LS}, + {"LLVM", LSH_LLVM}, {"MEM", LSH_MEM} }; // Default path to the the configuration directory for opt-sched. From d71246088d33d4a583281d0cdd485e09f89a382c Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Wed, 18 Mar 2020 18:36:47 -0700 Subject: [PATCH 17/40] ALso save state for cluster of size 1. 
--- lib/Scheduler/bb_spill.cpp | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/lib/Scheduler/bb_spill.cpp b/lib/Scheduler/bb_spill.cpp index ebad0f9e..534ef298 100644 --- a/lib/Scheduler/bb_spill.cpp +++ b/lib/Scheduler/bb_spill.cpp @@ -503,25 +503,21 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, } } else if (CurrentClusterSize > 0) { // Case 2: Exiting out of an active cluster - // Only save the state if we cluster 2 or more instructions together - // already - if (CurrentClusterSize > 1) { - Logger::Info("Inst %d pushing cluster size %d onto the stack", - inst->GetNum(), CurrentClusterSize); + Logger::Info("Inst %d pushing cluster size %d onto the stack", + inst->GetNum(), CurrentClusterSize); - // Save the cluster to restore when backtracking. - if (LastCluster) { - // Save previous current cluster in a vector - PastClustersList.push_back(std::move(LastCluster)); - - // Current cluster - LastCluster = llvm::make_unique( - ActiveClusterGroup, CurrentClusterSize, inst->GetNum()); - } else - // This is the first cluster that we are saving - LastCluster = llvm::make_unique( - ActiveClusterGroup, CurrentClusterSize, inst->GetNum()); - } + // Save the cluster to restore when backtracking. + if (LastCluster) { + // Save previous current cluster in a vector + PastClustersList.push_back(std::move(LastCluster)); + + // Current cluster + LastCluster = llvm::make_unique( + ActiveClusterGroup, CurrentClusterSize, inst->GetNum()); + } else + // This is the first cluster that we are saving + LastCluster = llvm::make_unique( + ActiveClusterGroup, CurrentClusterSize, inst->GetNum()); ActiveClusterGroup = 0; // Reset active cluster inst->SetActiveCluster(0); From 91967badf67e129063621e1351b0516fb7f7219d Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Thu, 19 Mar 2020 00:50:34 -0700 Subject: [PATCH 18/40] First implementation of MEM heuristic. 
--- example/optsched-cfg/sched.ini | 3 +++ include/opt-sched/Scheduler/ready_list.h | 1 + lib/Scheduler/bb_spill.cpp | 1 + lib/Scheduler/ready_list.cpp | 9 +++++++-- lib/Wrapper/OptimizingScheduler.cpp | 3 +++ lib/Wrapper/OptimizingScheduler.h | 2 ++ 6 files changed, 17 insertions(+), 2 deletions(-) diff --git a/example/optsched-cfg/sched.ini b/example/optsched-cfg/sched.ini index 8addb5f5..8968dbdb 100644 --- a/example/optsched-cfg/sched.ini +++ b/example/optsched-cfg/sched.ini @@ -84,6 +84,9 @@ TIMEOUT_PER INSTR # Example: LUC_CP_NID HEURISTIC LUC_CP_NID +# Same as HEURISTIC except with MEM_ prefix. +SECOND_PASS_HEURISTIC MEM_LUC_CP_NID + # The heuristic used for the enumerator. If the two pass scheduling # approach is enabled, then this value will be used for the first pass. # Same valid values as HEURISTIC. diff --git a/include/opt-sched/Scheduler/ready_list.h b/include/opt-sched/Scheduler/ready_list.h index 3c7bb1a6..054b19f1 100644 --- a/include/opt-sched/Scheduler/ready_list.h +++ b/include/opt-sched/Scheduler/ready_list.h @@ -115,6 +115,7 @@ class ReadyList { int16_t ltncySumBits_; int16_t nodeID_Bits_; int16_t inptSchedOrderBits_; + int16_t ClusterBit; // Constructs the priority-list key based on the schemes listed in prirts_. 
unsigned long CmputKey_(SchedInstruction *inst, bool isUpdate, bool &changed); diff --git a/lib/Scheduler/bb_spill.cpp b/lib/Scheduler/bb_spill.cpp index 534ef298..f774ce03 100644 --- a/lib/Scheduler/bb_spill.cpp +++ b/lib/Scheduler/bb_spill.cpp @@ -330,6 +330,7 @@ InstCount BBWithSpill::CmputCostLwrBound() { void BBWithSpill::InitForSchdulng() { InitForCostCmputtn_(); + SchedInstruction::SetActiveCluster(0); CurrentClusterSize = 0; ActiveClusterGroup = 0; ClusterInitialCost = 1000000; diff --git a/lib/Scheduler/ready_list.cpp b/lib/Scheduler/ready_list.cpp index 553238aa..421e7034 100644 --- a/lib/Scheduler/ready_list.cpp +++ b/lib/Scheduler/ready_list.cpp @@ -70,7 +70,8 @@ ReadyList::ReadyList(DataDepGraph *dataDepGraph, SchedPriorities prirts) { break; case LSH_MEM: - Logger::Info("MEM heuristic detected"); + ClusterBit = Utilities::clcltBitsNeededToHoldNum(1); + totKeyBits += ClusterBit break; default: @@ -120,7 +121,7 @@ ReadyList::ReadyList(DataDepGraph *dataDepGraph, SchedPriorities prirts) { break; case LSH_MEM: - Logger::Info("MEM heuristic detected"); + AddPrirtyToKey_(maxPriority_, keySize, ClusterBit, 1, 1); break; default: @@ -206,6 +207,10 @@ unsigned long ReadyList::CmputKey_(SchedInstruction *inst, bool isUpdate, break; case LSH_MEM: + unsigned long ValueForKey = + inst->GetClusterGroup() == SchedInstruction::GetActiveCluster() ? 
1 + : 0; + AddPrirtyToKey_(key, keySize, ClusterBit, ValueForKey, 1); break; default: diff --git a/lib/Wrapper/OptimizingScheduler.cpp b/lib/Wrapper/OptimizingScheduler.cpp index 08a12466..e8edbdee 100644 --- a/lib/Wrapper/OptimizingScheduler.cpp +++ b/lib/Wrapper/OptimizingScheduler.cpp @@ -556,6 +556,8 @@ void ScheduleDAGOptSched::loadOptSchedConfig() { LowerBoundAlgorithm = parseLowerBoundAlgorithm(); HeuristicPriorities = parseHeuristic(schedIni.GetString("HEURISTIC")); EnumPriorities = parseHeuristic(schedIni.GetString("ENUM_HEURISTIC")); + SecondPassPriorities = + parseHeuristic(schedIni.GetString("SECOND_PASS_HEURISTIC")) SecondPassEnumPriorities = parseHeuristic(schedIni.GetString("SECOND_PASS_ENUM_HEURISTIC")); SCF = parseSpillCostFunc(); @@ -818,6 +820,7 @@ void ScheduleDAGOptSched::scheduleOptSchedBalanced() { // Set the heuristic for the enumerator in the second pass. EnumPriorities = SecondPassEnumPriorities; + HeuristicPriorities = SecondPassPriorities; // Force the input to the balanced scheduler to be the sequential order of the // (hopefully) good register pressure schedule. We don’t want the list diff --git a/lib/Wrapper/OptimizingScheduler.h b/lib/Wrapper/OptimizingScheduler.h index 13b92e7d..784c0681 100644 --- a/lib/Wrapper/OptimizingScheduler.h +++ b/lib/Wrapper/OptimizingScheduler.h @@ -158,6 +158,8 @@ class ScheduleDAGOptSched : public ScheduleDAGMILive { // The heuristic used for the enumerator. SchedPriorities EnumPriorities; + SchedPriorities SecondPassPriorities; + // The heuristic used for the second pass enumerator in the two-pass scheduling approach. 
SchedPriorities SecondPassEnumPriorities; From ed248f094568d30a77fe00122271a89233736075 Mon Sep 17 00:00:00 2001 From: vang thao Date: Thu, 19 Mar 2020 23:05:55 -0700 Subject: [PATCH 19/40] Print out ready list and changes to linked list (Vlad) --- include/opt-sched/Scheduler/lnkd_lst.h | 70 ++++++++++++++++++-------- lib/Scheduler/bb_spill.cpp | 3 +- lib/Scheduler/enumerator.cpp | 10 ++-- lib/Scheduler/list_sched.cpp | 2 + lib/Scheduler/ready_list.cpp | 9 ++-- lib/Wrapper/OptimizingScheduler.cpp | 2 +- 6 files changed, 63 insertions(+), 33 deletions(-) diff --git a/include/opt-sched/Scheduler/lnkd_lst.h b/include/opt-sched/Scheduler/lnkd_lst.h index ee553398..3c311f9d 100644 --- a/include/opt-sched/Scheduler/lnkd_lst.h +++ b/include/opt-sched/Scheduler/lnkd_lst.h @@ -571,43 +571,69 @@ inline T *PriorityList::GetNxtPriorityElmnt(K &key) { } } +//(Vlad) added functionality to decrease priority +//used for decreasing priority of clusterable instrs +//when leaving a cluster template void PriorityList::BoostEntry(KeyedEntry *entry, K newKey) { KeyedEntry *crnt; KeyedEntry *next = entry->GetNext(); KeyedEntry *prev = entry->GetPrev(); - assert(newKey > entry->key); assert(LinkedList::topEntry_ != NULL); - entry->key = newKey; + if (entry->key < newKey) //behave normally + { + entry->key = newKey; - // If it is already at the top, or its previous still has a larger key, - // then the entry is already in place and no boosting is needed - if (entry == LinkedList::topEntry_ || prev->key >= newKey) - return; + // If it is already at the top, or its previous still has a larger key, + // then the entry is already in place and no boosting is needed + if (entry == LinkedList::topEntry_ || prev->key >= newKey) + return; - prev = NULL; + prev = NULL; - for (crnt = entry->GetPrev(); crnt != NULL; crnt = crnt->GetPrev()) { - if (crnt->key >= newKey) { - assert(crnt != entry); - assert(crnt != entry->GetPrev()); - prev = crnt; - break; + for (crnt = entry->GetPrev(); crnt != NULL; 
crnt = crnt->GetPrev()) { + if (crnt->key >= newKey) { + assert(crnt != entry); + assert(crnt != entry->GetPrev()); + prev = crnt; + break; + } } - } - if (prev == NULL) { - next = (KeyedEntry *)LinkedList::topEntry_; - } else { - next = prev->GetNext(); - assert(next != NULL); + if (prev == NULL) { + next = (KeyedEntry *)LinkedList::topEntry_; + } else { + next = prev->GetNext(); + assert(next != NULL); + } + + assert(next != entry->GetNext()); + LinkedList::RmvEntry_(entry, false); + InsrtEntry_(entry, next); } + else //move entry down on priority list + { + entry->key = newKey; + + //if it is at the bottom or next entry still has a smaller key, + //then the entry is already in place + if (entry == LinkedList::bottomEntry_ || next->key <= newKey) + return; - assert(next != entry->GetNext()); - LinkedList::RmvEntry_(entry, false); - InsrtEntry_(entry, next); + for (crnt = entry->GetNext(); crnt != NULL; crnt = crnt->GetNext()) + { + if (crnt->key <= newKey) + { + next = crnt; + break; + } + } + + LinkedList::RmvEntry_(entry, false); + InsrtEntry_(entry, next); + } this->itrtrReset_ = true; } diff --git a/lib/Scheduler/bb_spill.cpp b/lib/Scheduler/bb_spill.cpp index f774ce03..66b569b8 100644 --- a/lib/Scheduler/bb_spill.cpp +++ b/lib/Scheduler/bb_spill.cpp @@ -525,7 +525,7 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, CurrentClusterSize = 0; // Set cluster size to 0 } } - + Logger::Info("Currently active cluster %d", ActiveClusterGroup); // Potential Issues: // 1. Keeping track of the average clustering size when we aren't done // scheduling. 
@@ -794,6 +794,7 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst) { } } } + Logger::Info("Currently active cluster %d", ActiveClusterGroup); defCnt = inst->GetDefs(defs); useCnt = inst->GetUses(uses); diff --git a/lib/Scheduler/enumerator.cpp b/lib/Scheduler/enumerator.cpp index 39019034..728df961 100644 --- a/lib/Scheduler/enumerator.cpp +++ b/lib/Scheduler/enumerator.cpp @@ -971,13 +971,13 @@ bool Enumerator::FindNxtFsblBrnch_(EnumTreeNode *&newNode) { assert(crntNode_->IsLeaf() || (brnchCnt != rdyInstCnt) ? 1 : rdyInstCnt); // brnchCnt == rdyInstCnt == 0 ? 1 : rdyInstCnt); -#ifdef IS_DEBUG_READY_LIST - Logger::Info("Ready List Size is %d", rdyInstCnt); +//#ifdef IS_DEBUG_READY_LIST +// Logger::Info("Ready List Size is %d", rdyInstCnt); // Warning! That will reset the instruction iterator! - // rdyLst_->Print(Logger::GetLogStream()); + rdyLst_->Print(Logger::GetLogStream()); - stats::maxReadyListSize.SetMax(rdyInstCnt); -#endif +// stats::maxReadyListSize.SetMax(rdyInstCnt); +//#endif if (crntBrnchNum == 0 && SchedForRPOnly_) crntNode_->SetFoundInstWithUse(IsUseInRdyLst_()); diff --git a/lib/Scheduler/list_sched.cpp b/lib/Scheduler/list_sched.cpp index 9bf96951..f737def5 100644 --- a/lib/Scheduler/list_sched.cpp +++ b/lib/Scheduler/list_sched.cpp @@ -40,6 +40,8 @@ FUNC_RESULT ListScheduler::FindSchedule(InstSchedule *sched, SchedRegion *rgn) { while (!IsSchedComplete_()) { UpdtRdyLst_(crntCycleNum_, crntSlotNum_); rdyLst_->ResetIterator(); + rdyLst_->Print(Logger::GetLogStream()); + rdyLst_->ResetIterator(); iterCnt++; rdyLstSize = rdyLst_->GetInstCnt(); diff --git a/lib/Scheduler/ready_list.cpp b/lib/Scheduler/ready_list.cpp index 421e7034..0e4bcd0b 100644 --- a/lib/Scheduler/ready_list.cpp +++ b/lib/Scheduler/ready_list.cpp @@ -71,7 +71,7 @@ ReadyList::ReadyList(DataDepGraph *dataDepGraph, SchedPriorities prirts) { case LSH_MEM: ClusterBit = Utilities::clcltBitsNeededToHoldNum(1); - totKeyBits += ClusterBit + totKeyBits += ClusterBit; 
break; default: @@ -158,6 +158,7 @@ unsigned long ReadyList::CmputKey_(SchedInstruction *inst, bool isUpdate, int16_t keySize = 0; int i; int16_t oldLastUseCnt, newLastUseCnt; + unsigned long ValueForKey; changed = true; if (isUpdate) changed = false; @@ -207,7 +208,7 @@ unsigned long ReadyList::CmputKey_(SchedInstruction *inst, bool isUpdate, break; case LSH_MEM: - unsigned long ValueForKey = + ValueForKey = inst->GetClusterGroup() == SchedInstruction::GetActiveCluster() ? 1 : 0; AddPrirtyToKey_(key, keySize, ClusterBit, ValueForKey, 1); @@ -232,9 +233,9 @@ void ReadyList::AddLatestSubLists(LinkedList *lst1, void ReadyList::Print(std::ostream &out) { out << "Ready List: "; - for (const auto *crntInst = prirtyLst_->GetFrstElmnt(); crntInst != NULL; + for (auto *crntInst = prirtyLst_->GetFrstElmnt(); crntInst != NULL; crntInst = prirtyLst_->GetNxtElmnt()) { - out << " " << crntInst->GetNum(); + out << " " << crntInst->GetNum() << "(" << crntInst->GetClusterGroup() << ")"; } out << '\n'; diff --git a/lib/Wrapper/OptimizingScheduler.cpp b/lib/Wrapper/OptimizingScheduler.cpp index e8edbdee..a984469d 100644 --- a/lib/Wrapper/OptimizingScheduler.cpp +++ b/lib/Wrapper/OptimizingScheduler.cpp @@ -557,7 +557,7 @@ void ScheduleDAGOptSched::loadOptSchedConfig() { HeuristicPriorities = parseHeuristic(schedIni.GetString("HEURISTIC")); EnumPriorities = parseHeuristic(schedIni.GetString("ENUM_HEURISTIC")); SecondPassPriorities = - parseHeuristic(schedIni.GetString("SECOND_PASS_HEURISTIC")) + parseHeuristic(schedIni.GetString("SECOND_PASS_HEURISTIC")); SecondPassEnumPriorities = parseHeuristic(schedIni.GetString("SECOND_PASS_ENUM_HEURISTIC")); SCF = parseSpillCostFunc(); From 366405768ea81f02646c3f118836d4367ffa17e0 Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Fri, 20 Mar 2020 09:28:06 -0700 Subject: [PATCH 20/40] Extract more information about each cluster to be later used in lower bound estimation. 
--- include/opt-sched/Scheduler/data_dep.h | 9 +++++++++ lib/Scheduler/data_dep.cpp | 6 ++++++ lib/Wrapper/OptSchedDDGWrapperBasic.cpp | 23 ++++++++++++++++++----- lib/Wrapper/OptSchedDDGWrapperBasic.h | 2 +- 4 files changed, 34 insertions(+), 6 deletions(-) diff --git a/include/opt-sched/Scheduler/data_dep.h b/include/opt-sched/Scheduler/data_dep.h index 803574c9..40833f1c 100644 --- a/include/opt-sched/Scheduler/data_dep.h +++ b/include/opt-sched/Scheduler/data_dep.h @@ -13,6 +13,7 @@ Last Update: Mar. 2011 #include "opt-sched/Scheduler/buffers.h" #include "opt-sched/Scheduler/defines.h" #include "opt-sched/Scheduler/sched_basic_data.h" +#include "llvm/ADT/MapVector.h" #include "llvm/ADT/SmallVector.h" #include @@ -293,9 +294,17 @@ class DataDepGraph : public llvm::opt_sched::OptSchedDDGWrapperBase, int getMaxClusterCount() { return MaxClusterCount; } void setMaxClusterCount(int Max) { MaxClusterCount = Max; } + int getMaxInstructionsInAllClusters() { return MaxInstructionsInAllClusters; } + void setMaxInstructionsInAllClusters(int Max) { + MaxInstructionsInAllClusters = Max; + } + + int getMaxInstructionsInCluster(int Cluster); protected: int MaxClusterCount; + int MaxInstructionsInAllClusters; + MapVector MaxInstructionsInEachClusters; // TODO(max): Get rid of this. 
// Number of basic blocks diff --git a/lib/Scheduler/data_dep.cpp b/lib/Scheduler/data_dep.cpp index a6652c9f..c42bcafe 100644 --- a/lib/Scheduler/data_dep.cpp +++ b/lib/Scheduler/data_dep.cpp @@ -199,6 +199,7 @@ DataDepGraph::DataDepGraph(MachineModel *machMdl, LATENCY_PRECISION ltncyPrcsn) RegFiles = llvm::make_unique(machMdl_->GetRegTypeCnt()); MaxClusterCount = 0; + MaxInstructionsInClusters = 0; } DataDepGraph::~DataDepGraph() { @@ -213,6 +214,11 @@ DataDepGraph::~DataDepGraph() { delete[] instCntPerType_; } +int DataDepGraph::getMaxInstructionsInCluster(int Cluster) { + assert(Cluster > 0); + return MaxInstructionsInEachClusters[Cluster]; +} + FUNC_RESULT DataDepGraph::SetupForSchdulng(bool cmputTrnstvClsr) { assert(wasSetupForSchduling_ == false); diff --git a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp index 6d50fc8f..98f7c9d5 100644 --- a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp +++ b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp @@ -508,10 +508,11 @@ void OptSchedDDGWrapperBasic::countBoundaryLiveness( /// Partially copied from /// https://github.com/RadeonOpenCompute/llvm/blob/roc-ocl-2.4.0/lib/CodeGen/MachineScheduler.cpp#L1554 -void OptSchedDDGWrapperBasic::clusterNeighboringMemOps_( +int OptSchedDDGWrapperBasic::clusterNeighboringMemOps_( ArrayRef MemOps) { SmallVector MemOpRecords; bool ClusterPossible = false; + int TotalInstructionsPossible = 0; LLVM_DEBUG(dbgs() << "Processing possible clusters\n"); for (const SUnit *SU : MemOps) { @@ -524,7 +525,7 @@ void OptSchedDDGWrapperBasic::clusterNeighboringMemOps_( if (MemOpRecords.size() < 2) { LLVM_DEBUG(dbgs() << " Unable to cluster memop cluster of 1.\n"); - return; + return 0; } llvm::sort(MemOpRecords); @@ -548,8 +549,15 @@ void OptSchedDDGWrapperBasic::clusterNeighboringMemOps_( } // Tell the instructions what cluster number they are in - insts_[SUa->NodeNum]->SetMayCluster(ClusterCount); - insts_[SUb->NodeNum]->SetMayCluster(ClusterCount); + if 
(insts_[SUa->NodeNum]->GetClusterGroup() == 0) { + insts_[SUa->NodeNum]->SetMayCluster(ClusterCount); + TotalInstructionsPossible++; + } + + if (insts_[SUb->NodeNum]->GetClusterGroup() == 0) { + insts_[SUb->NodeNum]->SetMayCluster(ClusterCount); + TotalInstructionsPossible++; + } ++ClusterLength; } else @@ -565,6 +573,8 @@ void OptSchedDDGWrapperBasic::clusterNeighboringMemOps_( } LLVM_DEBUG(dbgs() << '\n'); #endif + MaxInstructionsInEachClusters.insert(ClusterCount, TotalInstructionsPossible); + return TotalInstructionsPossible; } /// Iterate through SUnits and find all possible clustering then transfer @@ -574,6 +584,7 @@ void OptSchedDDGWrapperBasic::findPossibleClusters() { // TODO: Add For-loop to also do store clusters. Currently only does load // clusters bool IsLoad = true; + int TotalInstructionsPossible = 0; LLVM_DEBUG(dbgs() << "Looking for load clusters\n"); DenseMap StoreChainIDs; @@ -614,8 +625,10 @@ void OptSchedDDGWrapperBasic::findPossibleClusters() { LLVM_DEBUG(dbgs() << SU1->NodeNum << " "); LLVM_DEBUG(dbgs() << '\n'); #endif - clusterNeighboringMemOps_(SCD); + TotalInstructionsPossible += clusterNeighboringMemOps_(SCD); } + + setMaxInstructionsInClusters(TotalInstructionsPossible); } LLVMRegTypeFilter::LLVMRegTypeFilter( diff --git a/lib/Wrapper/OptSchedDDGWrapperBasic.h b/lib/Wrapper/OptSchedDDGWrapperBasic.h index 76d5d7ea..4fd3937b 100644 --- a/lib/Wrapper/OptSchedDDGWrapperBasic.h +++ b/lib/Wrapper/OptSchedDDGWrapperBasic.h @@ -135,7 +135,7 @@ class OptSchedDDGWrapperBasic : public DataDepGraph { // Find liveness info generated by the region boundary. void discoverBoundaryLiveness(const llvm::MachineInstr *MI); - void clusterNeighboringMemOps_( + int clusterNeighboringMemOps_( ArrayRef MemOps); // Holds a register live range, mapping a producer to a set of consumers. 
From b8e4ac597da12d15591955d9bc19e470cc095e96 Mon Sep 17 00:00:00 2001 From: vang thao Date: Fri, 20 Mar 2020 09:34:44 -0700 Subject: [PATCH 21/40] Error fixes --- lib/Scheduler/data_dep.cpp | 2 +- lib/Wrapper/OptSchedDDGWrapperBasic.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/Scheduler/data_dep.cpp b/lib/Scheduler/data_dep.cpp index c42bcafe..005bec55 100644 --- a/lib/Scheduler/data_dep.cpp +++ b/lib/Scheduler/data_dep.cpp @@ -199,7 +199,7 @@ DataDepGraph::DataDepGraph(MachineModel *machMdl, LATENCY_PRECISION ltncyPrcsn) RegFiles = llvm::make_unique(machMdl_->GetRegTypeCnt()); MaxClusterCount = 0; - MaxInstructionsInClusters = 0; + MaxInstructionsInAllClusters = 0; } DataDepGraph::~DataDepGraph() { diff --git a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp index 98f7c9d5..875bd5a4 100644 --- a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp +++ b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp @@ -573,7 +573,7 @@ int OptSchedDDGWrapperBasic::clusterNeighboringMemOps_( } LLVM_DEBUG(dbgs() << '\n'); #endif - MaxInstructionsInEachClusters.insert(ClusterCount, TotalInstructionsPossible); + MaxInstructionsInEachClusters.insert(std::make_pair(ClusterCount, TotalInstructionsPossible)); return TotalInstructionsPossible; } @@ -628,7 +628,7 @@ void OptSchedDDGWrapperBasic::findPossibleClusters() { TotalInstructionsPossible += clusterNeighboringMemOps_(SCD); } - setMaxInstructionsInClusters(TotalInstructionsPossible); + setMaxInstructionsInAllClusters(TotalInstructionsPossible); } LLVMRegTypeFilter::LLVMRegTypeFilter( From b519e2532a615aefe0dc2bc7b442b9db53aeb392 Mon Sep 17 00:00:00 2001 From: vang thao Date: Fri, 20 Mar 2020 13:47:02 -0700 Subject: [PATCH 22/40] First implementation of cost function --- example/optsched-cfg/sched.ini | 2 + include/opt-sched/Scheduler/bb_spill.h | 7 ++ lib/Scheduler/bb_spill.cpp | 96 ++++++++++++++++++-------- lib/Scheduler/enumerator.cpp | 30 ++++---- lib/Scheduler/list_sched.cpp | 2 - 
lib/Wrapper/OptimizingScheduler.cpp | 10 +++ 6 files changed, 100 insertions(+), 47 deletions(-) diff --git a/example/optsched-cfg/sched.ini b/example/optsched-cfg/sched.ini index 8968dbdb..d9b45132 100644 --- a/example/optsched-cfg/sched.ini +++ b/example/optsched-cfg/sched.ini @@ -19,6 +19,8 @@ USE_TWO_PASS NO # NO CLUSTER_MEMORY_OPS NO +CLUSTER_WEIGHT 1000000 + # These 3 flags control which schedulers will be used. # Each one can be individually toggled. The heuristic # list scheduler or ACO must be run before the diff --git a/include/opt-sched/Scheduler/bb_spill.h b/include/opt-sched/Scheduler/bb_spill.h index 91c13f37..9ab1381f 100644 --- a/include/opt-sched/Scheduler/bb_spill.h +++ b/include/opt-sched/Scheduler/bb_spill.h @@ -13,6 +13,7 @@ Last Update: Apr. 2011 #include "opt-sched/Scheduler/OptSchedTarget.h" #include "opt-sched/Scheduler/defines.h" #include "opt-sched/Scheduler/sched_region.h" +#include "llvm/ADT/MapVector.h" #include "llvm/ADT/SmallVector.h" #include #include @@ -38,6 +39,11 @@ class BBWithSpill : public SchedRegion { /// Current cluster size unsigned int CurrentClusterSize; + MapVector InstructionsScheduledInEachCluster; + + int MaxClusterBlocks; + int CurrentClusterBlocks; + /// Current active cluster group int ActiveClusterGroup; @@ -49,6 +55,7 @@ class BBWithSpill : public SchedRegion { /// Experimental variables and values for cost adjustment int ClusteringWeight; int ClusterInitialCost; + int TotalInstructionsInClusters; /// Data struct to contain information about the previous clusters struct PastClusters { diff --git a/lib/Scheduler/bb_spill.cpp b/lib/Scheduler/bb_spill.cpp index 66b569b8..bf9c2a8c 100644 --- a/lib/Scheduler/bb_spill.cpp +++ b/lib/Scheduler/bb_spill.cpp @@ -72,13 +72,19 @@ BBWithSpill::BBWithSpill(const OptSchedTarget *OST_, DataDepGraph *dataDepGraph, CurrentClusterSize = 0; ActiveClusterGroup = 0; - ClusteringWeight = 1000; ClusterInitialCost = 1000000; PastClustersList.clear(); LastCluster = nullptr; + 
TotalInstructionsInClusters = 0; Config &schedIni = SchedulerOptions::getInstance(); ClusterMemoryOperations = schedIni.GetBool("CLUSTER_MEMORY_OPS"); + ClusteringWeight = schedIni.GetInt("CLUSTER_WEIGHT"); + MaxClusterBlocks = dataDepGraph_->getMaxClusterCount(); + CurrentClusterBlocks = MaxClusterBlocks; + for (int begin = 1; begin <= MaxClusterBlocks; begin++) { + InstructionsScheduledInEachCluster[begin] = 0; + } } /****************************************************************************/ @@ -316,6 +322,11 @@ InstCount BBWithSpill::CmputCostLwrBound() { InstCount staticLowerBound = schedLwrBound_ * schedCostFactor_ + spillCostLwrBound * SCW_; + + if (isSecondPass && ClusterMemoryOperations) { + staticLowerBound += MaxClusterBlocks * ClusteringWeight; + } + #if defined(IS_DEBUG_STATIC_LOWER_BOUND) Logger::Info( "DAG %s spillCostLB %d scFactor %d lengthLB %d lenFactor %d staticLB %d", @@ -336,6 +347,10 @@ void BBWithSpill::InitForSchdulng() { ClusterInitialCost = 1000000; PastClustersList.clear(); LastCluster.reset(); + CurrentClusterBlocks = MaxClusterBlocks; + for (int begin = 1; begin <= MaxClusterBlocks; begin++) { + InstructionsScheduledInEachCluster[begin] = 0; + } schduldEntryInstCnt_ = 0; schduldExitInstCnt_ = 0; @@ -383,12 +398,12 @@ InstCount BBWithSpill::CmputNormCost_(InstSchedule *sched, InstCount &execCost, bool trackCnflcts) { InstCount cost = CmputCost_(sched, compMode, execCost, trackCnflcts); - cost -= costLwrBound_; - execCost -= costLwrBound_; - // TODO: Implement cost function for clustering if (isSecondPass && ClusterMemoryOperations) - cost += ClusterInitialCost; + cost += CurrentClusterBlocks * ClusteringWeight; + + cost -= costLwrBound_; + execCost -= costLwrBound_; sched->SetCost(cost); sched->SetExecCost(execCost); @@ -471,17 +486,11 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, // Case 1: Currently clustering and this current instruction is part // of the cluster CurrentClusterSize++; - if 
(CurrentClusterSize > 2) { - // Only decrement the cost if we cluster at least 2 operations - // together (EXPERIMENTAL FOR NOW) - ClusterInitialCost -= ClusteringWeight; - Logger::Info("Currently clustering %d instructions together", - CurrentClusterSize); - } + InstructionsScheduledInEachCluster[ActiveClusterGroup]++; } else { - Logger::Info("Inst %d pushing cluster size %d onto the stack due to " - "cluster to cluster op", - inst->GetNum(), CurrentClusterSize); + //Logger::Info("Inst %d pushing cluster size %d onto the stack due to " + // "cluster to cluster op", + // inst->GetNum(), CurrentClusterSize); // The instruction is in another cluster that is not currently active. // Exit out of the currently active cluster into a new one. if (LastCluster) { @@ -491,21 +500,29 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, } else LastCluster = llvm::make_unique( ActiveClusterGroup, CurrentClusterSize, inst->GetNum()); + + // If this cluster did not finish then that means there have to be an extra cluster block to finish all of the instructions + // // in the cluster + if (InstructionsScheduledInEachCluster[ActiveClusterGroup] < dataDepGraph_->getMaxInstructionsInCluster(ActiveClusterGroup)) { + CurrentClusterBlocks++; + } ActiveClusterGroup = inst->GetClusterGroup(); inst->SetActiveCluster(ActiveClusterGroup); CurrentClusterSize = 1; + InstructionsScheduledInEachCluster[ActiveClusterGroup]++; } } else { // Case 3: Not currently clustering. 
Initialize clustering ActiveClusterGroup = inst->GetClusterGroup(); inst->SetActiveCluster(ActiveClusterGroup); CurrentClusterSize = 1; + InstructionsScheduledInEachCluster[ActiveClusterGroup]++; } } else if (CurrentClusterSize > 0) { // Case 2: Exiting out of an active cluster - Logger::Info("Inst %d pushing cluster size %d onto the stack", - inst->GetNum(), CurrentClusterSize); +// Logger::Info("Inst %d pushing cluster size %d onto the stack", + // inst->GetNum(), CurrentClusterSize); // Save the cluster to restore when backtracking. if (LastCluster) { @@ -520,12 +537,23 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, LastCluster = llvm::make_unique( ActiveClusterGroup, CurrentClusterSize, inst->GetNum()); + // If InstrScheduledInEachCluster != Max + // blocks++ + + // If this cluster did not finish then that means there have to be an extra cluster block to finish all of the instructions + // in the cluster + if (InstructionsScheduledInEachCluster[ActiveClusterGroup] < dataDepGraph_->getMaxInstructionsInCluster(ActiveClusterGroup)) { + CurrentClusterBlocks++; + } + + assert(InstructionsScheduledInEachCluster[ActiveClusterGroup] <= dataDepGraph_->getMaxInstructionsInCluster(ActiveClusterGroup)); + ActiveClusterGroup = 0; // Reset active cluster inst->SetActiveCluster(0); CurrentClusterSize = 0; // Set cluster size to 0 } } - Logger::Info("Currently active cluster %d", ActiveClusterGroup); +// Logger::Info("Currently active cluster %d", ActiveClusterGroup); // Potential Issues: // 1. Keeping track of the average clustering size when we aren't done // scheduling. @@ -744,12 +772,11 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst) { // backtracking. if (inst->GetMayCluster()) { // Case 1 and 3 - if (CurrentClusterSize > 2) { - ClusterInitialCost += ClusteringWeight; // Re-add the cost - } CurrentClusterSize--; - Logger::Info("Undoing an instruction from the cluster. 
Current size: %d", - CurrentClusterSize); + InstructionsScheduledInEachCluster[ActiveClusterGroup]--; + assert(InstructionsScheduledInEachCluster[ActiveClusterGroup] >= 0); + //Logger::Info("Undoing an instruction from the cluster. Current size: %d", + // CurrentClusterSize); // If there is no more member in the currently active cluster then disable // the cluster @@ -771,6 +798,10 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst) { LastCluster = std::move(PastClustersList.back()); PastClustersList.pop_back(); } + if (InstructionsScheduledInEachCluster[ActiveClusterGroup] != dataDepGraph_->getMaxInstructionsInCluster(ActiveClusterGroup)) { + CurrentClusterBlocks--; + assert(CurrentClusterBlocks >= MaxClusterBlocks); + } } } } @@ -789,12 +820,17 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst) { LastCluster = std::move(PastClustersList.back()); PastClustersList.pop_back(); } - Logger::Info("Inst %d popping cluster size %d off the stack", - inst->GetNum(), CurrentClusterSize); + //Logger::Info("Inst %d popping cluster size %d off the stacks", + // inst->GetNum(), CurrentClusterSize); + + if (InstructionsScheduledInEachCluster[ActiveClusterGroup] != dataDepGraph_->getMaxInstructionsInCluster(ActiveClusterGroup)) { + CurrentClusterBlocks--; + assert(CurrentClusterBlocks >= MaxClusterBlocks); + } } } } - Logger::Info("Currently active cluster %d", ActiveClusterGroup); +// Logger::Info("Currently active cluster %d", ActiveClusterGroup); defCnt = inst->GetDefs(defs); useCnt = inst->GetUses(uses); @@ -1095,12 +1131,12 @@ bool BBWithSpill::ChkCostFsblty(InstCount trgtLngth, EnumTreeNode *node) { } else { crntCost = crntSpillCost_ * SCW_ + trgtLngth * schedCostFactor_; } - crntCost -= costLwrBound_; - dynmcCostLwrBound = crntCost; - // TODO: Implement cost function for clustering if (isSecondPass && ClusterMemoryOperations) - crntCost += ClusterInitialCost; + crntCost += CurrentClusterBlocks * ClusteringWeight; + + crntCost -= 
costLwrBound_; + dynmcCostLwrBound = crntCost; // assert(cost >= 0); assert(dynmcCostLwrBound >= 0); diff --git a/lib/Scheduler/enumerator.cpp b/lib/Scheduler/enumerator.cpp index 728df961..115e03b6 100644 --- a/lib/Scheduler/enumerator.cpp +++ b/lib/Scheduler/enumerator.cpp @@ -971,13 +971,13 @@ bool Enumerator::FindNxtFsblBrnch_(EnumTreeNode *&newNode) { assert(crntNode_->IsLeaf() || (brnchCnt != rdyInstCnt) ? 1 : rdyInstCnt); // brnchCnt == rdyInstCnt == 0 ? 1 : rdyInstCnt); -//#ifdef IS_DEBUG_READY_LIST -// Logger::Info("Ready List Size is %d", rdyInstCnt); +#ifdef IS_DEBUG_READY_LIST + Logger::Info("Ready List Size is %d", rdyInstCnt); // Warning! That will reset the instruction iterator! - rdyLst_->Print(Logger::GetLogStream()); + // rdyLst_->Print(Logger::GetLogStream()); -// stats::maxReadyListSize.SetMax(rdyInstCnt); -//#endif + stats::maxReadyListSize.SetMax(rdyInstCnt); +#endif if (crntBrnchNum == 0 && SchedForRPOnly_) crntNode_->SetFoundInstWithUse(IsUseInRdyLst_()); @@ -1065,7 +1065,7 @@ bool Enumerator::ProbeBranch_(SchedInstruction *inst, EnumTreeNode *&newNode, if (inst != NULL) if (inst->GetPreFxdCycle() != INVALID_VALUE) if (inst->GetPreFxdCycle() != crntCycleNum_) { - Logger::Info("Pruned due to prefixed cycle"); + //Logger::Info("Pruned due to prefixed cycle"); return false; } @@ -1074,7 +1074,7 @@ bool Enumerator::ProbeBranch_(SchedInstruction *inst, EnumTreeNode *&newNode, #ifdef IS_DEBUG_INFSBLTY_TESTS stats::forwardLBInfeasibilityHits++; #endif - Logger::Info("Pruned due to forward lowerbound"); + //Logger::Info("Pruned due to forward lowerbound"); return false; } @@ -1082,7 +1082,7 @@ bool Enumerator::ProbeBranch_(SchedInstruction *inst, EnumTreeNode *&newNode, #ifdef IS_DEBUG_INFSBLTY_TESTS stats::backwardLBInfeasibilityHits++; #endif - Logger::Info("Pruned due to backward lowerbound"); + //Logger::Info("Pruned due to backward lowerbound"); return false; } } @@ -1103,7 +1103,7 @@ bool Enumerator::ProbeBranch_(SchedInstruction *inst, 
EnumTreeNode *&newNode, stats::nodeSuperiorityInfeasibilityHits++; #endif isNodeDmntd = true; - Logger::Info("Pruned due to node superiority"); + //Logger::Info("Pruned due to node superiority"); return false; } } @@ -1121,7 +1121,7 @@ bool Enumerator::ProbeBranch_(SchedInstruction *inst, EnumTreeNode *&newNode, #ifdef IS_DEBUG_INFSBLTY_TESTS stats::slotCountInfeasibilityHits++; #endif - Logger::Info("Pruned due to issue slot infeasibility"); + //Logger::Info("Pruned due to issue slot infeasibility"); return false; } @@ -1132,7 +1132,7 @@ bool Enumerator::ProbeBranch_(SchedInstruction *inst, EnumTreeNode *&newNode, #ifdef IS_DEBUG_INFSBLTY_TESTS stats::rangeTighteningInfeasibilityHits++; #endif - Logger::Info("Pruned due to range tightening infeasibility"); + //Logger::Info("Pruned due to range tightening infeasibility"); return false; } @@ -1150,7 +1150,7 @@ bool Enumerator::ProbeBranch_(SchedInstruction *inst, EnumTreeNode *&newNode, #ifdef IS_DEBUG_INFSBLTY_TESTS stats::historyDominationInfeasibilityHits++; #endif - Logger::Info("Pruned due to history domination"); + //Logger::Info("Pruned due to history domination"); return false; } } @@ -1165,7 +1165,7 @@ bool Enumerator::ProbeBranch_(SchedInstruction *inst, EnumTreeNode *&newNode, stats::relaxedSchedulingInfeasibilityHits++; #endif isRlxInfsbl = true; - Logger::Info("Pruned due to relaxed schedule"); + //Logger::Info("Pruned due to relaxed schedule"); return false; } } @@ -2078,7 +2078,7 @@ bool LengthCostEnumerator::ProbeBranch_(SchedInstruction *inst, isFsbl = ChkCostFsblty_(inst, newNode); if (isFsbl == false) { - Logger::Info("Pruned due to cost infeasibility"); + //Logger::Info("Pruned due to cost infeasibility"); return false; } @@ -2096,7 +2096,7 @@ bool LengthCostEnumerator::ProbeBranch_(SchedInstruction *inst, #endif rgn_->UnschdulInst(inst, crntCycleNum_, crntSlotNum_, parent); - Logger::Info("Pruned due to history domination"); + //Logger::Info("Pruned due to history domination"); return false; } } 
diff --git a/lib/Scheduler/list_sched.cpp b/lib/Scheduler/list_sched.cpp index f737def5..9bf96951 100644 --- a/lib/Scheduler/list_sched.cpp +++ b/lib/Scheduler/list_sched.cpp @@ -40,8 +40,6 @@ FUNC_RESULT ListScheduler::FindSchedule(InstSchedule *sched, SchedRegion *rgn) { while (!IsSchedComplete_()) { UpdtRdyLst_(crntCycleNum_, crntSlotNum_); rdyLst_->ResetIterator(); - rdyLst_->Print(Logger::GetLogStream()); - rdyLst_->ResetIterator(); iterCnt++; rdyLstSize = rdyLst_->GetInstCnt(); diff --git a/lib/Wrapper/OptimizingScheduler.cpp b/lib/Wrapper/OptimizingScheduler.cpp index a984469d..dba961a1 100644 --- a/lib/Wrapper/OptimizingScheduler.cpp +++ b/lib/Wrapper/OptimizingScheduler.cpp @@ -379,6 +379,16 @@ void ScheduleDAGOptSched::schedule() { DDG->convertSUnits(); DDG->convertRegFiles(); DDG->findPossibleClusters(); + if (SecondPass) { + auto DDG2 = static_cast(DDG.get()); + int end = DDG2->getMaxClusterCount(); + if (end > 0) { + Logger::Info("Total clusters in region: %d", end); + for (int begin = 1; begin <= end; begin++) { + Logger::Info("Cluster %d has total instructions %d", begin, DDG2->getMaxInstructionsInCluster(begin)); + } + } + } auto *BDDG = static_cast(DDG.get()); addGraphTransformations(BDDG); From 26a89c3b686e0e051646ece0ae2fc27e2c09407b Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Fri, 20 Mar 2020 14:00:34 -0700 Subject: [PATCH 23/40] Some code cleanup. No functional changes. 
--- include/opt-sched/Scheduler/bb_spill.h | 2 -- include/opt-sched/Scheduler/data_dep.h | 1 - lib/Scheduler/bb_spill.cpp | 2 -- lib/Scheduler/data_dep.cpp | 6 ++---- lib/Scheduler/enumerator.cpp | 11 +---------- lib/Wrapper/OptSchedDDGWrapperBasic.cpp | 7 +++---- lib/Wrapper/OptSchedDDGWrapperBasic.h | 3 +-- 7 files changed, 7 insertions(+), 25 deletions(-) diff --git a/include/opt-sched/Scheduler/bb_spill.h b/include/opt-sched/Scheduler/bb_spill.h index 9ab1381f..a2d6afa5 100644 --- a/include/opt-sched/Scheduler/bb_spill.h +++ b/include/opt-sched/Scheduler/bb_spill.h @@ -9,7 +9,6 @@ Last Update: Apr. 2011 #ifndef OPTSCHED_SPILL_BB_SPILL_H #define OPTSCHED_SPILL_BB_SPILL_H -#include "opt-sched/Scheduler/bit_vector.h" #include "opt-sched/Scheduler/OptSchedTarget.h" #include "opt-sched/Scheduler/defines.h" #include "opt-sched/Scheduler/sched_region.h" @@ -54,7 +53,6 @@ class BBWithSpill : public SchedRegion { // TODO: Implement cost function for clustering /// Experimental variables and values for cost adjustment int ClusteringWeight; - int ClusterInitialCost; int TotalInstructionsInClusters; /// Data struct to contain information about the previous clusters diff --git a/include/opt-sched/Scheduler/data_dep.h b/include/opt-sched/Scheduler/data_dep.h index 40833f1c..6857ec79 100644 --- a/include/opt-sched/Scheduler/data_dep.h +++ b/include/opt-sched/Scheduler/data_dep.h @@ -400,7 +400,6 @@ class DataDepGraph : public llvm::opt_sched::OptSchedDDGWrapperBase, SchedInstruction *CreateNode_(InstCount instNum, char const *const instName, InstType instType, char const *const opCode, - /* bool InstrMayLoad, bool InstrMayStore,*/ int nodeID, InstCount fileSchedOrder, InstCount fileSchedCycle, InstCount fileLB, InstCount fileUB, int blkNum); diff --git a/lib/Scheduler/bb_spill.cpp b/lib/Scheduler/bb_spill.cpp index bf9c2a8c..6717046f 100644 --- a/lib/Scheduler/bb_spill.cpp +++ b/lib/Scheduler/bb_spill.cpp @@ -72,11 +72,9 @@ BBWithSpill::BBWithSpill(const OptSchedTarget *OST_, 
DataDepGraph *dataDepGraph, CurrentClusterSize = 0; ActiveClusterGroup = 0; - ClusterInitialCost = 1000000; PastClustersList.clear(); LastCluster = nullptr; TotalInstructionsInClusters = 0; - Config &schedIni = SchedulerOptions::getInstance(); ClusterMemoryOperations = schedIni.GetBool("CLUSTER_MEMORY_OPS"); ClusteringWeight = schedIni.GetInt("CLUSTER_WEIGHT"); diff --git a/lib/Scheduler/data_dep.cpp b/lib/Scheduler/data_dep.cpp index 005bec55..4db8ace6 100644 --- a/lib/Scheduler/data_dep.cpp +++ b/lib/Scheduler/data_dep.cpp @@ -832,10 +832,8 @@ FUNC_RESULT DataDepGraph::SkipGraph(SpecsBuffer *buf, bool &endOfFileReached) { SchedInstruction *DataDepGraph::CreateNode_( InstCount instNum, char const *const instName, InstType instType, - char const *const opCode, - /* bool InstrMayLoad, bool InstrMayStore,*/ int nodeID, - InstCount fileSchedOrder, InstCount fileSchedCycle, InstCount fileLB, - InstCount fileUB, int blkNum) { + char const *const opCode, int nodeID, InstCount fileSchedOrder, + InstCount fileSchedCycle, InstCount fileLB, InstCount fileUB, int blkNum) { SchedInstruction *newInstPtr; newInstPtr = new SchedInstruction(instNum, instName, instType, opCode, diff --git a/lib/Scheduler/enumerator.cpp b/lib/Scheduler/enumerator.cpp index 115e03b6..d9c4e3b1 100644 --- a/lib/Scheduler/enumerator.cpp +++ b/lib/Scheduler/enumerator.cpp @@ -1065,7 +1065,6 @@ bool Enumerator::ProbeBranch_(SchedInstruction *inst, EnumTreeNode *&newNode, if (inst != NULL) if (inst->GetPreFxdCycle() != INVALID_VALUE) if (inst->GetPreFxdCycle() != crntCycleNum_) { - //Logger::Info("Pruned due to prefixed cycle"); return false; } @@ -1074,7 +1073,6 @@ bool Enumerator::ProbeBranch_(SchedInstruction *inst, EnumTreeNode *&newNode, #ifdef IS_DEBUG_INFSBLTY_TESTS stats::forwardLBInfeasibilityHits++; #endif - //Logger::Info("Pruned due to forward lowerbound"); return false; } @@ -1082,7 +1080,6 @@ bool Enumerator::ProbeBranch_(SchedInstruction *inst, EnumTreeNode *&newNode, #ifdef 
IS_DEBUG_INFSBLTY_TESTS stats::backwardLBInfeasibilityHits++; #endif - //Logger::Info("Pruned due to backward lowerbound"); return false; } } @@ -1103,7 +1100,6 @@ bool Enumerator::ProbeBranch_(SchedInstruction *inst, EnumTreeNode *&newNode, stats::nodeSuperiorityInfeasibilityHits++; #endif isNodeDmntd = true; - //Logger::Info("Pruned due to node superiority"); return false; } } @@ -1121,7 +1117,6 @@ bool Enumerator::ProbeBranch_(SchedInstruction *inst, EnumTreeNode *&newNode, #ifdef IS_DEBUG_INFSBLTY_TESTS stats::slotCountInfeasibilityHits++; #endif - //Logger::Info("Pruned due to issue slot infeasibility"); return false; } @@ -1132,7 +1127,6 @@ bool Enumerator::ProbeBranch_(SchedInstruction *inst, EnumTreeNode *&newNode, #ifdef IS_DEBUG_INFSBLTY_TESTS stats::rangeTighteningInfeasibilityHits++; #endif - //Logger::Info("Pruned due to range tightening infeasibility"); return false; } @@ -1150,7 +1144,6 @@ bool Enumerator::ProbeBranch_(SchedInstruction *inst, EnumTreeNode *&newNode, #ifdef IS_DEBUG_INFSBLTY_TESTS stats::historyDominationInfeasibilityHits++; #endif - //Logger::Info("Pruned due to history domination"); return false; } } @@ -1165,7 +1158,7 @@ bool Enumerator::ProbeBranch_(SchedInstruction *inst, EnumTreeNode *&newNode, stats::relaxedSchedulingInfeasibilityHits++; #endif isRlxInfsbl = true; - //Logger::Info("Pruned due to relaxed schedule"); + return false; } } @@ -2078,7 +2071,6 @@ bool LengthCostEnumerator::ProbeBranch_(SchedInstruction *inst, isFsbl = ChkCostFsblty_(inst, newNode); if (isFsbl == false) { - //Logger::Info("Pruned due to cost infeasibility"); return false; } @@ -2096,7 +2088,6 @@ bool LengthCostEnumerator::ProbeBranch_(SchedInstruction *inst, #endif rgn_->UnschdulInst(inst, crntCycleNum_, crntSlotNum_, parent); - //Logger::Info("Pruned due to history domination"); return false; } } diff --git a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp index 875bd5a4..461cde3d 100644 --- 
a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp +++ b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp @@ -5,7 +5,6 @@ //===----------------------------------------------------------------------===// #include "OptSchedDDGWrapperBasic.h" -#include "opt-sched/Scheduler/bit_vector.h" #include "opt-sched/Scheduler/config.h" #include "opt-sched/Scheduler/logger.h" #include "opt-sched/Scheduler/register.h" @@ -508,7 +507,7 @@ void OptSchedDDGWrapperBasic::countBoundaryLiveness( /// Partially copied from /// https://github.com/RadeonOpenCompute/llvm/blob/roc-ocl-2.4.0/lib/CodeGen/MachineScheduler.cpp#L1554 -int OptSchedDDGWrapperBasic::clusterNeighboringMemOps_( +int OptSchedDDGWrapperBasic::clusterNeighboringMemOps( ArrayRef MemOps) { SmallVector MemOpRecords; bool ClusterPossible = false; @@ -578,7 +577,7 @@ int OptSchedDDGWrapperBasic::clusterNeighboringMemOps_( } /// Iterate through SUnits and find all possible clustering then transfer -/// the information over to the SchedInstruction class as a bitvector. +/// the information so that our scheduler can access it. /// Partially copied from https://github.com/RadeonOpenCompute/llvm/blob/roc-ocl-2.4.0/lib/CodeGen/MachineScheduler.cpp#L1595 void OptSchedDDGWrapperBasic::findPossibleClusters() { // TODO: Add For-loop to also do store clusters. Currently only does load @@ -625,7 +624,7 @@ void OptSchedDDGWrapperBasic::findPossibleClusters() { LLVM_DEBUG(dbgs() << SU1->NodeNum << " "); LLVM_DEBUG(dbgs() << '\n'); #endif - TotalInstructionsPossible += clusterNeighboringMemOps_(SCD); + TotalInstructionsPossible += clusterNeighboringMemOps(SCD); } setMaxInstructionsInAllClusters(TotalInstructionsPossible); diff --git a/lib/Wrapper/OptSchedDDGWrapperBasic.h b/lib/Wrapper/OptSchedDDGWrapperBasic.h index 4fd3937b..eef47684 100644 --- a/lib/Wrapper/OptSchedDDGWrapperBasic.h +++ b/lib/Wrapper/OptSchedDDGWrapperBasic.h @@ -135,8 +135,7 @@ class OptSchedDDGWrapperBasic : public DataDepGraph { // Find liveness info generated by the region boundary. 
void discoverBoundaryLiveness(const llvm::MachineInstr *MI); - int clusterNeighboringMemOps_( - ArrayRef MemOps); + int clusterNeighboringMemOps(ArrayRef MemOps); // Holds a register live range, mapping a producer to a set of consumers. struct LiveRange { From ec8e0bd4f759d28f023d16d1e87609071901c949 Mon Sep 17 00:00:00 2001 From: vang thao Date: Fri, 20 Mar 2020 14:09:08 -0700 Subject: [PATCH 24/40] Missed variable to clean up --- lib/Scheduler/bb_spill.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/Scheduler/bb_spill.cpp b/lib/Scheduler/bb_spill.cpp index 6717046f..5fa4caea 100644 --- a/lib/Scheduler/bb_spill.cpp +++ b/lib/Scheduler/bb_spill.cpp @@ -342,7 +342,6 @@ void BBWithSpill::InitForSchdulng() { SchedInstruction::SetActiveCluster(0); CurrentClusterSize = 0; ActiveClusterGroup = 0; - ClusterInitialCost = 1000000; PastClustersList.clear(); LastCluster.reset(); CurrentClusterBlocks = MaxClusterBlocks; From f467f83a7aa4e24ff6a98a4b38d24665f0d1d37c Mon Sep 17 00:00:00 2001 From: vang thao Date: Thu, 26 Mar 2020 18:16:18 -0700 Subject: [PATCH 25/40] Fix issues with enumerator not updating priorities --- include/opt-sched/Scheduler/enumerator.h | 3 ++ include/opt-sched/Scheduler/lnkd_lst.h | 29 ++++++++++--------- .../opt-sched/Scheduler/sched_basic_data.h | 6 ++++ lib/Scheduler/bb_spill.cpp | 16 ++-------- lib/Scheduler/ready_list.cpp | 23 +++++++++++---- lib/Scheduler/sched_basic_data.cpp | 8 +++++ lib/Wrapper/OptimizingScheduler.cpp | 1 + 7 files changed, 53 insertions(+), 33 deletions(-) diff --git a/include/opt-sched/Scheduler/enumerator.h b/include/opt-sched/Scheduler/enumerator.h index 85dc5b18..145f24d4 100644 --- a/include/opt-sched/Scheduler/enumerator.h +++ b/include/opt-sched/Scheduler/enumerator.h @@ -926,6 +926,9 @@ inline void Enumerator::UpdtRdyLst_(InstCount cycleNum, int slotNum) { } rdyLst_->AddLatestSubLists(lst1, lst2); + + if (prirts_.isDynmc) + rdyLst_->UpdatePriorities(); } 
/*****************************************************************************/ diff --git a/include/opt-sched/Scheduler/lnkd_lst.h b/include/opt-sched/Scheduler/lnkd_lst.h index 3c311f9d..d6b696cb 100644 --- a/include/opt-sched/Scheduler/lnkd_lst.h +++ b/include/opt-sched/Scheduler/lnkd_lst.h @@ -177,7 +177,8 @@ class PriorityList : public LinkedList { T *GetNxtPriorityElmnt(); T *GetNxtPriorityElmnt(K &key); // Copies all the data from another list. The existing list must be empty. - void CopyList(PriorityList const *const otherLst); + void CopyList(PriorityList const *const otherLst, + KeyedEntry **keyedEntries_); protected: KeyedEntry *allocKeyEntries_; @@ -572,8 +573,8 @@ inline T *PriorityList::GetNxtPriorityElmnt(K &key) { } //(Vlad) added functionality to decrease priority -//used for decreasing priority of clusterable instrs -//when leaving a cluster +// used for decreasing priority of clusterable instrs +// when leaving a cluster template void PriorityList::BoostEntry(KeyedEntry *entry, K newKey) { KeyedEntry *crnt; @@ -582,7 +583,7 @@ void PriorityList::BoostEntry(KeyedEntry *entry, K newKey) { assert(LinkedList::topEntry_ != NULL); - if (entry->key < newKey) //behave normally + if (entry->key < newKey) // behave normally { entry->key = newKey; @@ -612,22 +613,19 @@ void PriorityList::BoostEntry(KeyedEntry *entry, K newKey) { assert(next != entry->GetNext()); LinkedList::RmvEntry_(entry, false); InsrtEntry_(entry, next); - } - else //move entry down on priority list + } else // move entry down on priority list { entry->key = newKey; - //if it is at the bottom or next entry still has a smaller key, - //then the entry is already in place + // if it is at the bottom or next entry still has a smaller key, + // then the entry is already in place if (entry == LinkedList::bottomEntry_ || next->key <= newKey) return; - for (crnt = entry->GetNext(); crnt != NULL; crnt = crnt->GetNext()) - { - if (crnt->key <= newKey) - { + for (crnt = entry->GetNext(); crnt != 
NULL; crnt = crnt->GetNext()) { + if (crnt->key <= newKey) { next = crnt; - break; + break; } } @@ -639,7 +637,9 @@ void PriorityList::BoostEntry(KeyedEntry *entry, K newKey) { } template -void PriorityList::CopyList(PriorityList const *const otherLst) { +void PriorityList::CopyList( + PriorityList const *const otherLst, + KeyedEntry **keyedEntries_) { assert(LinkedList::elmntCnt_ == 0); for (KeyedEntry *entry = (KeyedEntry *)otherLst->topEntry_; @@ -648,6 +648,7 @@ void PriorityList::CopyList(PriorityList const *const otherLst) { K key = entry->key; KeyedEntry *newEntry = AllocEntry_(elmnt, key); LinkedList::AppendEntry_(newEntry); + keyedEntries_[entry->element->GetNum()] = newEntry; if (entry == otherLst->rtrvEntry_) { LinkedList::rtrvEntry_ = newEntry; diff --git a/include/opt-sched/Scheduler/sched_basic_data.h b/include/opt-sched/Scheduler/sched_basic_data.h index cdfad226..c9d2c107 100644 --- a/include/opt-sched/Scheduler/sched_basic_data.h +++ b/include/opt-sched/Scheduler/sched_basic_data.h @@ -435,6 +435,8 @@ class SchedInstruction : public GraphNode { int GetClusterGroup() { return ClusterGroup; } static int GetActiveCluster() { return ActiveCluster; } static void SetActiveCluster(int Active) { ActiveCluster = Active; } + bool getWasActive() { return WasActive; } + bool computeWasActive(); friend class SchedRange; protected: @@ -442,6 +444,10 @@ class SchedInstruction : public GraphNode { string name_; // The mnemonic of this instruction, e.g. "add" or "jmp". string opCode_; + + bool WasActive; + + /// The cluster group that the current instruction is a part of. /// Default of 0 means that it is not part of any cluster. 
int ClusterGroup; diff --git a/lib/Scheduler/bb_spill.cpp b/lib/Scheduler/bb_spill.cpp index 5fa4caea..3dfa48b3 100644 --- a/lib/Scheduler/bb_spill.cpp +++ b/lib/Scheduler/bb_spill.cpp @@ -550,19 +550,7 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, CurrentClusterSize = 0; // Set cluster size to 0 } } -// Logger::Info("Currently active cluster %d", ActiveClusterGroup); - // Potential Issues: - // 1. Keeping track of the average clustering size when we aren't done - // scheduling. - // Cost function that was discussed during the meeting on Friday: - // (15 - averageClusteringSize) * ClusteringWeight - // We want to minimize this cost but there is an issue in the following - // example - // Ex: Partial schedule was able to cluster a block of 15. - // averageClusteringSize : 15, CostFnc: (15-15)*Weight = 0 - // Any cluster block below size 15 will decrease the average - // cluster size and increase the cost. This makes our B&B - // enumerator actually favor not doing clustering. 
+ // Logger::Info("schedule, Currently active cluster %d", ActiveClusterGroup); defCnt = inst->GetDefs(defs); useCnt = inst->GetUses(uses); @@ -827,7 +815,7 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst) { } } } -// Logger::Info("Currently active cluster %d", ActiveClusterGroup); +// Logger::Info("unschedule, Currently active cluster %d", ActiveClusterGroup); defCnt = inst->GetDefs(defs); useCnt = inst->GetUses(uses); diff --git a/lib/Scheduler/ready_list.cpp b/lib/Scheduler/ready_list.cpp index 0e4bcd0b..5a1be9d6 100644 --- a/lib/Scheduler/ready_list.cpp +++ b/lib/Scheduler/ready_list.cpp @@ -149,7 +149,7 @@ void ReadyList::CopyList(ReadyList *othrLst) { assert(prirtyLst_->GetElmntCnt() == 0); assert(latestSubLst_->GetElmntCnt() == 0); assert(othrLst != NULL); - prirtyLst_->CopyList(othrLst->prirtyLst_); + prirtyLst_->CopyList(othrLst->prirtyLst_, keyedEntries_); } unsigned long ReadyList::CmputKey_(SchedInstruction *inst, bool isUpdate, @@ -159,6 +159,7 @@ unsigned long ReadyList::CmputKey_(SchedInstruction *inst, bool isUpdate, int i; int16_t oldLastUseCnt, newLastUseCnt; unsigned long ValueForKey; + bool OldWasActive, NewWasActive; changed = true; if (isUpdate) changed = false; @@ -174,9 +175,10 @@ unsigned long ReadyList::CmputKey_(SchedInstruction *inst, bool isUpdate, case LSH_LUC: oldLastUseCnt = inst->GetLastUseCnt(); newLastUseCnt = inst->CmputLastUseCnt(); - assert(!isUpdate || newLastUseCnt >= oldLastUseCnt); - if (newLastUseCnt != oldLastUseCnt) + // assert(!isUpdate || newLastUseCnt >= oldLastUseCnt); + if (newLastUseCnt != oldLastUseCnt) { changed = true; + } AddPrirtyToKey_(key, keySize, useCntBits_, newLastUseCnt, maxUseCnt_); break; @@ -208,9 +210,19 @@ unsigned long ReadyList::CmputKey_(SchedInstruction *inst, bool isUpdate, break; case LSH_MEM: - ValueForKey = - inst->GetClusterGroup() == SchedInstruction::GetActiveCluster() ? 
1 + if (inst->GetClusterGroup() == 0) + ValueForKey = 0; + else { + OldWasActive = inst->getWasActive(); + NewWasActive = inst->computeWasActive(); + + if (OldWasActive != NewWasActive) { + changed = true; + } + ValueForKey = + inst->GetClusterGroup() == SchedInstruction::GetActiveCluster() ? 1 : 0; + } AddPrirtyToKey_(key, keySize, ClusterBit, ValueForKey, 1); break; @@ -298,6 +310,7 @@ void ReadyList::AddInst(SchedInstruction *inst) { assert(changed == true); KeyedEntry *entry = prirtyLst_->InsrtElmnt(inst, key, true); + InstCount instNum = inst->GetNum(); if (prirts_.isDynmc) keyedEntries_[instNum] = entry; diff --git a/lib/Scheduler/sched_basic_data.cpp b/lib/Scheduler/sched_basic_data.cpp index 2fa5f09d..c858be34 100644 --- a/lib/Scheduler/sched_basic_data.cpp +++ b/lib/Scheduler/sched_basic_data.cpp @@ -64,6 +64,7 @@ SchedInstruction::SchedInstruction(InstCount num, const string &name, mustBeInBBEntry_ = false; mustBeInBBExit_ = false; + WasActive = false; } SchedInstruction::~SchedInstruction() { @@ -72,6 +73,13 @@ SchedInstruction::~SchedInstruction() { delete crntRange_; } +bool SchedInstruction::computeWasActive() { + if (ClusterGroup == 0) return false; + + WasActive = GetActiveCluster() == GetClusterGroup(); + return WasActive; +} + void SchedInstruction::SetupForSchdulng(InstCount instCnt, bool isCP_FromScsr, bool isCP_FromPrdcsr) { if (memAllocd_) diff --git a/lib/Wrapper/OptimizingScheduler.cpp b/lib/Wrapper/OptimizingScheduler.cpp index dba961a1..295e9002 100644 --- a/lib/Wrapper/OptimizingScheduler.cpp +++ b/lib/Wrapper/OptimizingScheduler.cpp @@ -676,6 +676,7 @@ SchedPriorities ScheduleDAGOptSched::parseHeuristic(const std::string &Str) { Priorities.vctr[Priorities.cnt++] = LSH; switch (LSH) { // Is LUC still the only dynamic heuristic? 
+ case LSH_MEM: case LSH_LUC: Priorities.isDynmc = true; break; From 7fcb9a47928937acd48dad638b90456a6e555954 Mon Sep 17 00:00:00 2001 From: vang thao Date: Sat, 4 Apr 2020 14:14:06 -0700 Subject: [PATCH 26/40] Added store clustering and debugging statements --- .../Scheduler/OptSchedDDGWrapperBase.h | 2 +- include/opt-sched/Scheduler/bb_spill.h | 5 ++ lib/Scheduler/bb_spill.cpp | 60 ++++++++++++++++++- lib/Wrapper/AMDGPU/OptSchedGCNTarget.cpp | 4 ++ lib/Wrapper/OptSchedDDGWrapperBasic.cpp | 43 ++++--------- lib/Wrapper/OptSchedDDGWrapperBasic.h | 2 +- lib/Wrapper/OptimizingScheduler.cpp | 14 ++++- 7 files changed, 93 insertions(+), 37 deletions(-) diff --git a/include/opt-sched/Scheduler/OptSchedDDGWrapperBase.h b/include/opt-sched/Scheduler/OptSchedDDGWrapperBase.h index 4db4673c..b10c9248 100644 --- a/include/opt-sched/Scheduler/OptSchedDDGWrapperBase.h +++ b/include/opt-sched/Scheduler/OptSchedDDGWrapperBase.h @@ -18,7 +18,7 @@ class OptSchedDDGWrapperBase { virtual void convertRegFiles() = 0; - virtual void findPossibleClusters() = 0; + virtual int findPossibleClusters(bool IsLoad) = 0; }; } // namespace opt_sched diff --git a/include/opt-sched/Scheduler/bb_spill.h b/include/opt-sched/Scheduler/bb_spill.h index a2d6afa5..61461710 100644 --- a/include/opt-sched/Scheduler/bb_spill.h +++ b/include/opt-sched/Scheduler/bb_spill.h @@ -14,6 +14,7 @@ Last Update: Apr. 
2011 #include "opt-sched/Scheduler/sched_region.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" #include #include #include @@ -65,6 +66,8 @@ class BBWithSpill : public SchedRegion { /// Instruction number that ended this cluster int InstNum; + std::unique_ptr> InstrList; + /// Constructor for this struct PastClusters(int Cluster, int size, int num) : ClusterGroup(Cluster), ClusterSize(size), InstNum(num) {} @@ -73,6 +76,8 @@ class BBWithSpill : public SchedRegion { /// Vector containing the (n-1) past clusters llvm::SmallVector, 4> PastClustersList; + std::unique_ptr> InstrList; + /// Pointer to the last cluster. This is kept out of the vector to /// avoid having to fetch it every time we compare the current instruction /// number to the one that ended the cluster. diff --git a/lib/Scheduler/bb_spill.cpp b/lib/Scheduler/bb_spill.cpp index 3dfa48b3..f3b49c3e 100644 --- a/lib/Scheduler/bb_spill.cpp +++ b/lib/Scheduler/bb_spill.cpp @@ -349,6 +349,8 @@ void BBWithSpill::InitForSchdulng() { InstructionsScheduledInEachCluster[begin] = 0; } + InstrList.reset(); + schduldEntryInstCnt_ = 0; schduldExitInstCnt_ = 0; schduldInstCnt_ = 0; @@ -483,7 +485,10 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, // Case 1: Currently clustering and this current instruction is part // of the cluster CurrentClusterSize++; - InstructionsScheduledInEachCluster[ActiveClusterGroup]++; + InstructionsScheduledInEachCluster[ActiveClusterGroup]++; + + InstrList->push_back(inst->GetName()); + } else { //Logger::Info("Inst %d pushing cluster size %d onto the stack due to " // "cluster to cluster op", @@ -497,7 +502,9 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, } else LastCluster = llvm::make_unique( ActiveClusterGroup, CurrentClusterSize, inst->GetNum()); - + + LastCluster->InstrList = std::move(InstrList); + // If this cluster did not finish then that means there have to be an extra cluster 
block to finish all of the instructions // // in the cluster if (InstructionsScheduledInEachCluster[ActiveClusterGroup] < dataDepGraph_->getMaxInstructionsInCluster(ActiveClusterGroup)) { @@ -508,6 +515,10 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, inst->SetActiveCluster(ActiveClusterGroup); CurrentClusterSize = 1; InstructionsScheduledInEachCluster[ActiveClusterGroup]++; + + InstrList = llvm::make_unique>(); + InstrList->push_back(inst->GetName()); + } } else { // Case 3: Not currently clustering. Initialize clustering @@ -515,6 +526,10 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, inst->SetActiveCluster(ActiveClusterGroup); CurrentClusterSize = 1; InstructionsScheduledInEachCluster[ActiveClusterGroup]++; + + InstrList = llvm::make_unique>(); + InstrList->push_back(inst->GetName()); + } } else if (CurrentClusterSize > 0) { // Case 2: Exiting out of an active cluster @@ -534,6 +549,8 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, LastCluster = llvm::make_unique( ActiveClusterGroup, CurrentClusterSize, inst->GetNum()); + LastCluster->InstrList = std::move(InstrList); + // If InstrScheduledInEachCluster != Max // blocks++ @@ -760,6 +777,9 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst) { CurrentClusterSize--; InstructionsScheduledInEachCluster[ActiveClusterGroup]--; assert(InstructionsScheduledInEachCluster[ActiveClusterGroup] >= 0); + + InstrList->pop_back(); + //Logger::Info("Undoing an instruction from the cluster. 
Current size: %d", // CurrentClusterSize); @@ -776,6 +796,9 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst) { CurrentClusterSize = LastCluster->ClusterSize; ActiveClusterGroup = LastCluster->ClusterGroup; inst->SetActiveCluster(ActiveClusterGroup); + + InstrList = std::move(LastCluster->InstrList); + LastCluster.reset(); // Release current cluster pointer // Get previous cluster from vector list @@ -798,6 +821,9 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst) { CurrentClusterSize = LastCluster->ClusterSize; ActiveClusterGroup = LastCluster->ClusterGroup; inst->SetActiveCluster(ActiveClusterGroup); + + InstrList = std::move(LastCluster->InstrList); + LastCluster.reset(); // Release current cluster pointer // Get previous cluster from vector list @@ -1080,6 +1106,36 @@ InstCount BBWithSpill::UpdtOptmlSched(InstSchedule *crntSched, bestSchedLngth_ = crntSched->GetCrntLngth(); enumBestSched_->Copy(crntSched); bestSched_ = enumBestSched_; + + if (isSecondPass && ClusterMemoryOperations) { + dbgs() << "Printing clustered instructions:\n"; + int i = 1; + for (const auto &clusters : PastClustersList) { + dbgs() << "Printing cluster " << i << ": "; + for (const auto &instr : *clusters->InstrList) { + dbgs() << instr << " "; + } + i++; + dbgs() << '\n'; + } + + if (LastCluster) { + dbgs() << "Printing cluster " << i << ": "; + for (const auto &instr : *(LastCluster->InstrList)) { + dbgs() << instr << " "; + } + i++; + dbgs() << '\n'; + } + + if (InstrList && InstrList->size() > 0) { + dbgs() << "Printing cluster " << i << ": "; + for (const auto &instr : *InstrList) { + dbgs() << instr << " "; + } + dbgs() << '\n'; + } + } } return bestCost_; diff --git a/lib/Wrapper/AMDGPU/OptSchedGCNTarget.cpp b/lib/Wrapper/AMDGPU/OptSchedGCNTarget.cpp index a250408b..798dc122 100644 --- a/lib/Wrapper/AMDGPU/OptSchedGCNTarget.cpp +++ b/lib/Wrapper/AMDGPU/OptSchedGCNTarget.cpp @@ -212,6 +212,10 @@ bool OptSchedGCNTarget::shouldKeepSchedule() 
{ dbgs() << "Reverting Scheduling because of a decrease in occupancy from " << RegionStartingOccupancy << " to " << RegionEndingOccupancy << ".\n"); + Logger::Info( + "Reverting Scheduling because of a decrease in occupancy from %d to %d.", RegionStartingOccupancy, RegionEndingOccupancy +); + return false; } diff --git a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp index 461cde3d..6dfdb4a9 100644 --- a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp +++ b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp @@ -513,9 +513,7 @@ int OptSchedDDGWrapperBasic::clusterNeighboringMemOps( bool ClusterPossible = false; int TotalInstructionsPossible = 0; - LLVM_DEBUG(dbgs() << "Processing possible clusters\n"); for (const SUnit *SU : MemOps) { - LLVM_DEBUG(dbgs() << " " << SU->NodeNum << " is in the chain.\n"); MachineOperand *BaseOp; int64_t Offset; if (DAG->TII->getMemOperandWithOffset(*SU->getInstr(), BaseOp, Offset, DAG->TRI)) @@ -523,7 +521,7 @@ int OptSchedDDGWrapperBasic::clusterNeighboringMemOps( } if (MemOpRecords.size() < 2) { - LLVM_DEBUG(dbgs() << " Unable to cluster memop cluster of 1.\n"); + dbgs() << " Unable to cluster memop cluster of 1.\n"; return 0; } @@ -532,11 +530,11 @@ int OptSchedDDGWrapperBasic::clusterNeighboringMemOps( for (unsigned Idx = 0, End = MemOpRecords.size(); Idx < (End - 1); ++Idx) { const SUnit *SUa = MemOpRecords[Idx].SU; const SUnit *SUb = MemOpRecords[Idx + 1].SU; - LLVM_DEBUG(dbgs() << " Checking possible clustering of (" << SUa->NodeNum << ") and (" << SUb->NodeNum << ")\n"); + dbgs() << " Checking possible clustering of (" << SUa->NodeNum << ") and (" << SUb->NodeNum << ")\n"; if (DAG->TII->shouldClusterMemOps(*MemOpRecords[Idx].BaseOp, *MemOpRecords[Idx + 1].BaseOp, ClusterLength)) { - LLVM_DEBUG(dbgs() << " Cluster possible at SU(" << SUa->NodeNum << ")- SU(" << SUb->NodeNum << ")\n"); + dbgs() << " Cluster possible at SU(" << SUa->NodeNum << ")- SU(" << SUb->NodeNum << ")\n"; // If clustering was possible then 
increase the cluster count. This only // happens once every cluster @@ -544,7 +542,7 @@ int OptSchedDDGWrapperBasic::clusterNeighboringMemOps( ClusterPossible = true; ClusterCount++; setMaxClusterCount(ClusterCount); - Logger::Info("Setting max cluster count to %d", ClusterCount); + dbgs() << " Setting total cluster count to " << ClusterCount << "\n"; } // Tell the instructions what cluster number they are in @@ -562,16 +560,6 @@ int OptSchedDDGWrapperBasic::clusterNeighboringMemOps( } else ClusterLength = 1; } -#ifdef IS_DEBUG_MEMORY_CLUSTERING - LLVM_DEBUG(dbgs () << "Printing bit vector: "); - for (int i = ClusterVector->GetSize() - 1; i >= 0; i--) { - if (ClusterVector->GetBit(i)) - LLVM_DEBUG(dbgs() << "1"); - else - LLVM_DEBUG(dbgs() << "0"); - } - LLVM_DEBUG(dbgs() << '\n'); -#endif MaxInstructionsInEachClusters.insert(std::make_pair(ClusterCount, TotalInstructionsPossible)); return TotalInstructionsPossible; } @@ -579,13 +567,10 @@ int OptSchedDDGWrapperBasic::clusterNeighboringMemOps( /// Iterate through SUnits and find all possible clustering then transfer /// the information so that our scheduler can access it. /// Partially copied from https://github.com/RadeonOpenCompute/llvm/blob/roc-ocl-2.4.0/lib/CodeGen/MachineScheduler.cpp#L1595 -void OptSchedDDGWrapperBasic::findPossibleClusters() { +int OptSchedDDGWrapperBasic::findPossibleClusters(bool IsLoad) { // TODO: Add For-loop to also do store clusters. Currently only does load // clusters - bool IsLoad = true; int TotalInstructionsPossible = 0; - - LLVM_DEBUG(dbgs() << "Looking for load clusters\n"); DenseMap StoreChainIDs; // Map each store chain to a set of dependent MemOps. 
SmallVector, 32> StoreChainDependents; @@ -594,13 +579,12 @@ void OptSchedDDGWrapperBasic::findPossibleClusters() { (!IsLoad && !SU.getInstr()->mayStore())) continue; auto MI = SU.getInstr(); - LLVM_DEBUG(dbgs() << " Instruction (" << SU.NodeNum << ") " << DAG->TII->getName(MI->getOpcode()) << " may load.\n"); + + dbgs() << "Instruction (" << SU.NodeNum << ") " << DAG->TII->getName(MI->getOpcode()) << " may " << (IsLoad ? "load" : "store") << "\n"; unsigned ChainPredID = DAG->SUnits.size(); for (const SDep &Pred : SU.Preds) { if (Pred.isCtrl()) { - auto PredMI = Pred.getSUnit()->getInstr(); - LLVM_DEBUG(dbgs() << " Breaking chain at (" << Pred.getSUnit()->NodeNum << ") " << DAG->TII->getName(PredMI->getOpcode()) << '\n'); ChainPredID = Pred.getSUnit()->NodeNum; break; } @@ -608,7 +592,6 @@ void OptSchedDDGWrapperBasic::findPossibleClusters() { // Check if this chain-like pred has been seen // before. ChainPredID==MaxNodeID at the top of the schedule. unsigned NumChains = StoreChainDependents.size(); - LLVM_DEBUG(dbgs() << " ChainPredID " << ChainPredID << ", NumChains " << NumChains << '\n'); std::pair::iterator, bool> Result = StoreChainIDs.insert(std::make_pair(ChainPredID, NumChains)); if (Result.second) @@ -618,16 +601,14 @@ void OptSchedDDGWrapperBasic::findPossibleClusters() { // Iterate over the store chains. 
for (auto &SCD : StoreChainDependents) { -#ifdef IS_DEBUG_MEMORY_CLUSTERING - LLVM_DEBUG(dbgs() << " Printing the list before clustering: "); + dbgs() << "Printing the Node ID of the current chain: "; for (auto SU1 : SCD) - LLVM_DEBUG(dbgs() << SU1->NodeNum << " "); - LLVM_DEBUG(dbgs() << '\n'); -#endif + dbgs() << SU1->NodeNum << " "; + dbgs() << '\n'; TotalInstructionsPossible += clusterNeighboringMemOps(SCD); } - - setMaxInstructionsInAllClusters(TotalInstructionsPossible); + return TotalInstructionsPossible; +// setMaxInstructionsInAllClusters(TotalInstructionsPossible); } LLVMRegTypeFilter::LLVMRegTypeFilter( diff --git a/lib/Wrapper/OptSchedDDGWrapperBasic.h b/lib/Wrapper/OptSchedDDGWrapperBasic.h index eef47684..373ddc52 100644 --- a/lib/Wrapper/OptSchedDDGWrapperBasic.h +++ b/lib/Wrapper/OptSchedDDGWrapperBasic.h @@ -50,7 +50,7 @@ class OptSchedDDGWrapperBasic : public DataDepGraph { void convertSUnits() override; void convertRegFiles() override; - void findPossibleClusters() override; + int findPossibleClusters(bool IsLoad) override; protected: // A convenience machMdl_ pointer casted to OptSchedMachineModel*. 
diff --git a/lib/Wrapper/OptimizingScheduler.cpp b/lib/Wrapper/OptimizingScheduler.cpp index 295e9002..ee75b2e3 100644 --- a/lib/Wrapper/OptimizingScheduler.cpp +++ b/lib/Wrapper/OptimizingScheduler.cpp @@ -378,14 +378,24 @@ void ScheduleDAGOptSched::schedule() { OST->createDDGWrapper(C, this, MM.get(), LatencyPrecision, RegionName); DDG->convertSUnits(); DDG->convertRegFiles(); - DDG->findPossibleClusters(); if (SecondPass) { + dbgs() << "Finding load clusters.\n"; + int TotalLoadsInstructionsClusterable = DDG->findPossibleClusters(true); + if (TotalLoadsInstructionsClusterable == 0) + dbgs() << " No load clustering possible\n"; + dbgs() << "Finding store clusters.\n"; + int TotalStoreInstructionsClusterable = DDG->findPossibleClusters(false); + if (TotalStoreInstructionsClusterable == 0) + dbgs() << " No store clustering possible\n"; + auto DDG2 = static_cast(DDG.get()); + Logger::Info("Total clusterable instructions: %d loads, %d stores", TotalLoadsInstructionsClusterable, TotalStoreInstructionsClusterable); + DDG2->setMaxInstructionsInAllClusters(TotalLoadsInstructionsClusterable + TotalStoreInstructionsClusterable); int end = DDG2->getMaxClusterCount(); if (end > 0) { Logger::Info("Total clusters in region: %d", end); for (int begin = 1; begin <= end; begin++) { - Logger::Info("Cluster %d has total instructions %d", begin, DDG2->getMaxInstructionsInCluster(begin)); + Logger::Info(" Cluster %d has total instructions %d", begin, DDG2->getMaxInstructionsInCluster(begin)); } } } From cccccc31805f8422cfdbc57659093ec36960e838 Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Wed, 8 Apr 2020 19:11:43 -0700 Subject: [PATCH 27/40] Fix segmentation fault due to copying ready list when a dynamic heuristic is not used. 
--- include/opt-sched/Scheduler/lnkd_lst.h | 5 +++-- lib/Scheduler/ready_list.cpp | 16 ++++++++++------ 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/include/opt-sched/Scheduler/lnkd_lst.h b/include/opt-sched/Scheduler/lnkd_lst.h index d6b696cb..bcb770d2 100644 --- a/include/opt-sched/Scheduler/lnkd_lst.h +++ b/include/opt-sched/Scheduler/lnkd_lst.h @@ -178,7 +178,7 @@ class PriorityList : public LinkedList { T *GetNxtPriorityElmnt(K &key); // Copies all the data from another list. The existing list must be empty. void CopyList(PriorityList const *const otherLst, - KeyedEntry **keyedEntries_); + KeyedEntry **keyedEntries_ = nullptr); protected: KeyedEntry *allocKeyEntries_; @@ -648,7 +648,8 @@ void PriorityList::CopyList( K key = entry->key; KeyedEntry *newEntry = AllocEntry_(elmnt, key); LinkedList::AppendEntry_(newEntry); - keyedEntries_[entry->element->GetNum()] = newEntry; + if (keyedEntries_) + keyedEntries_[entry->element->GetNum()] = newEntry; if (entry == otherLst->rtrvEntry_) { LinkedList::rtrvEntry_ = newEntry; diff --git a/lib/Scheduler/ready_list.cpp b/lib/Scheduler/ready_list.cpp index 5a1be9d6..ad377e84 100644 --- a/lib/Scheduler/ready_list.cpp +++ b/lib/Scheduler/ready_list.cpp @@ -8,13 +8,18 @@ using namespace llvm::opt_sched; ReadyList::ReadyList(DataDepGraph *dataDepGraph, SchedPriorities prirts) { prirts_ = prirts; prirtyLst_ = NULL; - keyedEntries_ = NULL; int i; uint16_t totKeyBits = 0; useCntBits_ = crtclPathBits_ = scsrCntBits_ = ltncySumBits_ = nodeID_Bits_ = inptSchedOrderBits_ = 0; + if (prirts_.isDynmc) + keyedEntries_ = new KeyedEntry + *[dataDepGraph->GetInstCnt()]; + else + keyedEntries_ = nullptr; + // Calculate the number of bits needed to hold the maximum value of each // priority scheme for (i = 0; i < prirts.cnt; i++) { @@ -27,8 +32,6 @@ ReadyList::ReadyList(DataDepGraph *dataDepGraph, SchedPriorities prirts) { break; case LSH_LUC: - keyedEntries_ = new KeyedEntry - *[dataDepGraph->GetInstCnt()]; for (int j = 0; j 
< dataDepGraph->GetInstCnt(); j++) { keyedEntries_[j] = NULL; } @@ -211,7 +214,7 @@ unsigned long ReadyList::CmputKey_(SchedInstruction *inst, bool isUpdate, case LSH_MEM: if (inst->GetClusterGroup() == 0) - ValueForKey = 0; + ValueForKey = 0; else { OldWasActive = inst->getWasActive(); NewWasActive = inst->computeWasActive(); @@ -221,7 +224,7 @@ unsigned long ReadyList::CmputKey_(SchedInstruction *inst, bool isUpdate, } ValueForKey = inst->GetClusterGroup() == SchedInstruction::GetActiveCluster() ? 1 - : 0; + : 0; } AddPrirtyToKey_(key, keySize, ClusterBit, ValueForKey, 1); break; @@ -247,7 +250,8 @@ void ReadyList::Print(std::ostream &out) { out << "Ready List: "; for (auto *crntInst = prirtyLst_->GetFrstElmnt(); crntInst != NULL; crntInst = prirtyLst_->GetNxtElmnt()) { - out << " " << crntInst->GetNum() << "(" << crntInst->GetClusterGroup() << ")"; + out << " " << crntInst->GetNum() << "(" << crntInst->GetClusterGroup() + << ")"; } out << '\n'; From b4f55af8b4da528923405d93f06fd33d40ad60db Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Wed, 22 Apr 2020 20:52:25 -0700 Subject: [PATCH 28/40] Updated comments for easier review. 
--- example/optsched-cfg/sched.ini | 12 +- include/opt-sched/Scheduler/bb_spill.h | 32 ++- include/opt-sched/Scheduler/data_dep.h | 20 +- include/opt-sched/Scheduler/enumerator.h | 6 +- .../opt-sched/Scheduler/sched_basic_data.h | 5 +- lib/Scheduler/bb_spill.cpp | 221 ++++++++++-------- lib/Scheduler/data_dep.cpp | 6 +- lib/Scheduler/ready_list.cpp | 21 +- lib/Scheduler/sched_basic_data.cpp | 1 + lib/Scheduler/sched_region.cpp | 7 - lib/Wrapper/AMDGPU/OptSchedDDGWrapperGCN.cpp | 2 +- lib/Wrapper/OptSchedDDGWrapperBasic.cpp | 44 ++-- lib/Wrapper/OptimizingScheduler.cpp | 30 ++- 13 files changed, 228 insertions(+), 179 deletions(-) diff --git a/example/optsched-cfg/sched.ini b/example/optsched-cfg/sched.ini index d9b45132..11370b31 100644 --- a/example/optsched-cfg/sched.ini +++ b/example/optsched-cfg/sched.ini @@ -14,12 +14,14 @@ PRINT_SPILL_COUNTS YES # NO USE_TWO_PASS NO -# Cluster memory operations together in the second pass +# Allow enumerator to try to cluster memory operations together in the second pass. # YES # NO CLUSTER_MEMORY_OPS NO -CLUSTER_WEIGHT 1000000 +# The weight for clustering. This factor determines the importance of +# trying to find clusters when enumerating. +CLUSTER_WEIGHT 1000 # These 3 flags control which schedulers will be used. # Each one can be individually toggled. The heuristic @@ -86,16 +88,14 @@ TIMEOUT_PER INSTR # Example: LUC_CP_NID HEURISTIC LUC_CP_NID -# Same as HEURISTIC except with MEM_ prefix. -SECOND_PASS_HEURISTIC MEM_LUC_CP_NID - # The heuristic used for the enumerator. If the two pass scheduling # approach is enabled, then this value will be used for the first pass. # Same valid values as HEURISTIC. ENUM_HEURISTIC LUC_CP_NID # The heuuristic used for the enumerator in the second pass in the two-pass scheduling approach. -# Same valid values as HEURISTIC. +# Same valid values as HEURISTIC with an additional heuristic: +# Cluster: Favor instructions that are part of an active memory clustering group. 
SECOND_PASS_ENUM_HEURISTIC LUC_CP_NID # The spill cost function to be used. Valid values are: diff --git a/include/opt-sched/Scheduler/bb_spill.h b/include/opt-sched/Scheduler/bb_spill.h index 61461710..5a5b1ced 100644 --- a/include/opt-sched/Scheduler/bb_spill.h +++ b/include/opt-sched/Scheduler/bb_spill.h @@ -41,45 +41,53 @@ class BBWithSpill : public SchedRegion { MapVector InstructionsScheduledInEachCluster; - int MaxClusterBlocks; + /// The minimum amount of cluster blocks possible. + int MinClusterBlocks; + + /// The minimum amount of cluster blocks + the optimistic expected cluster + /// blocks remaining. int CurrentClusterBlocks; - /// Current active cluster group + /// Current active cluster group. int ActiveClusterGroup; - /// Flag to enable or disable clustering memory operations - /// in the ILP pass. + /// Flag to enable or disable clustering memory operations in the ILP pass. + /// Reads from the sched.ini file then set the flag accordingly. bool ClusterMemoryOperations; - // TODO: Implement cost function for clustering - /// Experimental variables and values for cost adjustment + /// The weight for memory ops clustering. int ClusteringWeight; - int TotalInstructionsInClusters; /// Data struct to contain information about the previous clusters struct PastClusters { + /// The cluster group int ClusterGroup; /// Size of the cluster when it was ended by an instruction not in the /// cluster int ClusterSize; - /// Instruction number that ended this cluster + /// Instruction number that ended this cluster. Used to check if we should + /// restore the cluster state when backtracking. int InstNum; + /// Contains the actual names of the instructions in the cluster. Only used + /// for printing and debugging purposes. 
std::unique_ptr> InstrList; /// Constructor for this struct - PastClusters(int Cluster, int size, int num) - : ClusterGroup(Cluster), ClusterSize(size), InstNum(num) {} + PastClusters(int Cluster, int Size, int Instructions) + : ClusterGroup(Cluster), ClusterSize(Size), InstNum(Instructions) {} }; /// Vector containing the (n-1) past clusters llvm::SmallVector, 4> PastClustersList; + /// Contains the actual names of the instructions in the current cluster. + /// Only used for printing and debugging purposes. std::unique_ptr> InstrList; - /// Pointer to the last cluster. This is kept out of the vector to - /// avoid having to fetch it every time we compare the current instruction + /// Pointer to the last cluster. This is kept out of the vector to avoid + /// having to fetch it every time we compare the current instruction /// number to the one that ended the cluster. std::unique_ptr LastCluster; diff --git a/include/opt-sched/Scheduler/data_dep.h b/include/opt-sched/Scheduler/data_dep.h index 6857ec79..c450f454 100644 --- a/include/opt-sched/Scheduler/data_dep.h +++ b/include/opt-sched/Scheduler/data_dep.h @@ -292,18 +292,20 @@ class DataDepGraph : public llvm::opt_sched::OptSchedDDGWrapperBase, RegisterFile *getRegFiles() { return RegFiles.get(); } - int getMaxClusterCount() { return MaxClusterCount; } - void setMaxClusterCount(int Max) { MaxClusterCount = Max; } - int getMaxInstructionsInAllClusters() { return MaxInstructionsInAllClusters; } - void setMaxInstructionsInAllClusters(int Max) { - MaxInstructionsInAllClusters = Max; + // Memory clustering helper functions + int getMinClusterCount() { return MinClusterCount; } + void setMinClusterCount(int Max) { MinClusterCount = Max; } + int getTotalInstructionsInAllClusters() { return TotalInstructionsInAllClusters; } + void setTotalInstructionsInAllClusters(int Max) { + TotalInstructionsInAllClusters = Max; } - - int getMaxInstructionsInCluster(int Cluster); + int getTotalInstructionsInCluster(int Cluster); 
protected: - int MaxClusterCount; - int MaxInstructionsInAllClusters; + int MinClusterCount; + int TotalInstructionsInAllClusters; + /// Map the cluster block to the total number of instructions found in the + /// block MapVector MaxInstructionsInEachClusters; // TODO(max): Get rid of this. diff --git a/include/opt-sched/Scheduler/enumerator.h b/include/opt-sched/Scheduler/enumerator.h index 145f24d4..534ec741 100644 --- a/include/opt-sched/Scheduler/enumerator.h +++ b/include/opt-sched/Scheduler/enumerator.h @@ -917,6 +917,9 @@ inline void Enumerator::UpdtRdyLst_(InstCount cycleNum, int slotNum) { LinkedList *lst1 = NULL; LinkedList *lst2 = frstRdyLstPerCycle_[cycleNum]; + if (prirts_.isDynmc) + rdyLst_->UpdatePriorities(); + if (slotNum == 0 && prevCycleNum >= 0) { // If at the begining of a new cycle other than the very first cycle, then // we also have to include the instructions that might have become ready in @@ -926,9 +929,6 @@ inline void Enumerator::UpdtRdyLst_(InstCount cycleNum, int slotNum) { } rdyLst_->AddLatestSubLists(lst1, lst2); - - if (prirts_.isDynmc) - rdyLst_->UpdatePriorities(); } /*****************************************************************************/ diff --git a/include/opt-sched/Scheduler/sched_basic_data.h b/include/opt-sched/Scheduler/sched_basic_data.h index c9d2c107..35868786 100644 --- a/include/opt-sched/Scheduler/sched_basic_data.h +++ b/include/opt-sched/Scheduler/sched_basic_data.h @@ -53,8 +53,9 @@ enum LISTSCHED_HEURISTIC { // LLVM list scheduler order LSH_LLVM = 8, - // Memory clustering - LSH_MEM = 9 + // Dynamic memory clustering heuristic, favor instructions that are part of + // an active cluster + LSH_CLUSTER = 9 }; #define MAX_SCHED_PRIRTS 10 diff --git a/lib/Scheduler/bb_spill.cpp b/lib/Scheduler/bb_spill.cpp index f3b49c3e..3632d648 100644 --- a/lib/Scheduler/bb_spill.cpp +++ b/lib/Scheduler/bb_spill.cpp @@ -70,17 +70,17 @@ BBWithSpill::BBWithSpill(const OptSchedTarget *OST_, DataDepGraph *dataDepGraph, 
schduldExitInstCnt_ = 0; schduldInstCnt_ = 0; + // Memory clustering variables initialization CurrentClusterSize = 0; ActiveClusterGroup = 0; PastClustersList.clear(); LastCluster = nullptr; - TotalInstructionsInClusters = 0; Config &schedIni = SchedulerOptions::getInstance(); ClusterMemoryOperations = schedIni.GetBool("CLUSTER_MEMORY_OPS"); ClusteringWeight = schedIni.GetInt("CLUSTER_WEIGHT"); - MaxClusterBlocks = dataDepGraph_->getMaxClusterCount(); - CurrentClusterBlocks = MaxClusterBlocks; - for (int begin = 1; begin <= MaxClusterBlocks; begin++) { + MinClusterBlocks = dataDepGraph_->getMinClusterCount(); + CurrentClusterBlocks = MinClusterBlocks; + for (int begin = 1; begin <= MinClusterBlocks; begin++) { InstructionsScheduledInEachCluster[begin] = 0; } } @@ -320,9 +320,9 @@ InstCount BBWithSpill::CmputCostLwrBound() { InstCount staticLowerBound = schedLwrBound_ * schedCostFactor_ + spillCostLwrBound * SCW_; - + // Add the minimum of the possible clusters to the lower bound if (isSecondPass && ClusterMemoryOperations) { - staticLowerBound += MaxClusterBlocks * ClusteringWeight; + staticLowerBound += MinClusterBlocks * ClusteringWeight; } #if defined(IS_DEBUG_STATIC_LOWER_BOUND) @@ -339,16 +339,6 @@ InstCount BBWithSpill::CmputCostLwrBound() { void BBWithSpill::InitForSchdulng() { InitForCostCmputtn_(); - SchedInstruction::SetActiveCluster(0); - CurrentClusterSize = 0; - ActiveClusterGroup = 0; - PastClustersList.clear(); - LastCluster.reset(); - CurrentClusterBlocks = MaxClusterBlocks; - for (int begin = 1; begin <= MaxClusterBlocks; begin++) { - InstructionsScheduledInEachCluster[begin] = 0; - } - InstrList.reset(); schduldEntryInstCnt_ = 0; @@ -358,6 +348,19 @@ void BBWithSpill::InitForSchdulng() { /*****************************************************************************/ void BBWithSpill::InitForCostCmputtn_() { + // Init/Reset memory clustering values if it is enabled + if (isSecondPass && ClusterMemoryOperations) { + 
SchedInstruction::SetActiveCluster(0); + CurrentClusterSize = 0; + ActiveClusterGroup = 0; + PastClustersList.clear(); + LastCluster.reset(); + CurrentClusterBlocks = MinClusterBlocks; + for (int begin = 1; begin <= MinClusterBlocks; begin++) { + InstructionsScheduledInEachCluster[begin] = 0; + } +} + int i; crntCycleNum_ = 0; @@ -397,10 +400,6 @@ InstCount BBWithSpill::CmputNormCost_(InstSchedule *sched, InstCount &execCost, bool trackCnflcts) { InstCount cost = CmputCost_(sched, compMode, execCost, trackCnflcts); - // TODO: Implement cost function for clustering - if (isSecondPass && ClusterMemoryOperations) - cost += CurrentClusterBlocks * ClusteringWeight; - cost -= costLwrBound_; execCost -= costLwrBound_; @@ -425,6 +424,9 @@ InstCount BBWithSpill::CmputCost_(InstSchedule *sched, COST_COMP_MODE compMode, InstCount cost = sched->GetCrntLngth() * schedCostFactor_; execCost = cost; cost += crntSpillCost_ * SCW_; + // Add the current clustering cost + if (isSecondPass && ClusterMemoryOperations) + cost += CurrentClusterBlocks * ClusteringWeight; sched->SetSpillCosts(spillCosts_); sched->SetPeakRegPressures(peakRegPressures_); sched->SetSpillCost(crntSpillCost_); @@ -467,58 +469,66 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, InstCount newSpillCost; // Scheduling cases for clustering project: - // 1.) Cluster -> Cluster - // Simple case, just increment 1 from cluster size - // 2.) Cluster -> Non-Cluster - // ?? End clustering + // 1.) Same Cluster -> Same Cluster + // 2.) Cluster -> Different Cluster // 3.) Non-Cluster -> Cluster - // Simple case, initialize clustering - + // 4.) 
Cluster -> Non-Cluster + // Possibly keep track of the current memory clustering size here // and in UpdateSpillInfoForUnSchdul_() if (isSecondPass && ClusterMemoryOperations) { + // Check if the current instruction is part of a cluster if (inst->GetMayCluster()) { - // If there is a current active cluster + // Check if there is a current active cluster if (CurrentClusterSize > 0) { - // The instruction is in the current active cluster + // Check if the instruction is in the same cluster group as the active + // cluster if (ActiveClusterGroup == inst->GetClusterGroup()) { - // Case 1: Currently clustering and this current instruction is part - // of the cluster + // Case 1: Simple case where the current instruction is part of an + // already active cluster. CurrentClusterSize++; InstructionsScheduledInEachCluster[ActiveClusterGroup]++; - InstrList->push_back(inst->GetName()); + InstrList->push_back(inst->GetName()); } else { - //Logger::Info("Inst %d pushing cluster size %d onto the stack due to " - // "cluster to cluster op", - // inst->GetNum(), CurrentClusterSize); - // The instruction is in another cluster that is not currently active. - // Exit out of the currently active cluster into a new one. + // Case 2: Else the instruction is part of different cluster that + // is not currently active. Store information of the old cluster + // group and start clustering for the new cluster. if (LastCluster) { + // Save previous clusters in a vector except the last cluster + // that we just exited out of. PastClustersList.push_back(std::move(LastCluster)); + + // Last cluster that we just exited out of, used for fast accessing + // to its contents LastCluster = llvm::make_unique( ActiveClusterGroup, CurrentClusterSize, inst->GetNum()); } else + // This is the first cluster block that we exited out of. 
LastCluster = llvm::make_unique( ActiveClusterGroup, CurrentClusterSize, inst->GetNum()); LastCluster->InstrList = std::move(InstrList); - // If this cluster did not finish then that means there have to be an extra cluster block to finish all of the instructions - // // in the cluster - if (InstructionsScheduledInEachCluster[ActiveClusterGroup] < dataDepGraph_->getMaxInstructionsInCluster(ActiveClusterGroup)) { - CurrentClusterBlocks++; - } + // If the old cluster did not finish clustering all possible + // instructions in its cluster then that means there have to be an + // extra cluster block to finish all of the instructions in the + // cluster. + if (InstructionsScheduledInEachCluster[ActiveClusterGroup] < + dataDepGraph_->getTotalInstructionsInCluster( + ActiveClusterGroup)) { + CurrentClusterBlocks++; + } + // Finish setting up the new cluster ActiveClusterGroup = inst->GetClusterGroup(); inst->SetActiveCluster(ActiveClusterGroup); CurrentClusterSize = 1; InstructionsScheduledInEachCluster[ActiveClusterGroup]++; - - InstrList = llvm::make_unique>(); - InstrList->push_back(inst->GetName()); - + InstrList = llvm::make_unique< + llvm::SmallVector>(); + InstrList->push_back(inst->GetName()); } } else { // Case 3: Not currently clustering. Initialize clustering @@ -526,48 +536,46 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, inst->SetActiveCluster(ActiveClusterGroup); CurrentClusterSize = 1; InstructionsScheduledInEachCluster[ActiveClusterGroup]++; - - InstrList = llvm::make_unique>(); - InstrList->push_back(inst->GetName()); - + InstrList = llvm::make_unique>(); + InstrList->push_back(inst->GetName()); } } else if (CurrentClusterSize > 0) { - // Case 2: Exiting out of an active cluster -// Logger::Info("Inst %d pushing cluster size %d onto the stack", - // inst->GetNum(), CurrentClusterSize); - + // Case 4: Exiting out of an active cluster // Save the cluster to restore when backtracking. 
if (LastCluster) { - // Save previous current cluster in a vector + // Save previous clusters in a vector except the last cluster + // that we just exited out of. PastClustersList.push_back(std::move(LastCluster)); - // Current cluster + // Last cluster that we just exited out of, used for fast accessing + // to its contents. LastCluster = llvm::make_unique( ActiveClusterGroup, CurrentClusterSize, inst->GetNum()); } else - // This is the first cluster that we are saving + // This is the first cluster block that we exited out of. LastCluster = llvm::make_unique( ActiveClusterGroup, CurrentClusterSize, inst->GetNum()); LastCluster->InstrList = std::move(InstrList); - // If InstrScheduledInEachCluster != Max - // blocks++ - - // If this cluster did not finish then that means there have to be an extra cluster block to finish all of the instructions - // in the cluster - if (InstructionsScheduledInEachCluster[ActiveClusterGroup] < dataDepGraph_->getMaxInstructionsInCluster(ActiveClusterGroup)) { + // If this cluster did not finish then that means there have to be an + // extra cluster block to finish all of the instructions in the cluster. 
+ if (InstructionsScheduledInEachCluster[ActiveClusterGroup] < + dataDepGraph_->getTotalInstructionsInCluster(ActiveClusterGroup)) { CurrentClusterBlocks++; } - assert(InstructionsScheduledInEachCluster[ActiveClusterGroup] <= dataDepGraph_->getMaxInstructionsInCluster(ActiveClusterGroup)); + // Assert that the total instructions accounted for doesn't exceed the + // expected total instructions in the cluster + assert(InstructionsScheduledInEachCluster[ActiveClusterGroup] <= + dataDepGraph_->getTotalInstructionsInCluster(ActiveClusterGroup)); - ActiveClusterGroup = 0; // Reset active cluster + // Reset active cluster + ActiveClusterGroup = 0; inst->SetActiveCluster(0); - CurrentClusterSize = 0; // Set cluster size to 0 + CurrentClusterSize = 0; } } - // Logger::Info("schedule, Currently active cluster %d", ActiveClusterGroup); defCnt = inst->GetDefs(defs); useCnt = inst->GetUses(uses); @@ -761,43 +769,39 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst) { #endif // Backtracking cases for clustering project: - // 1.) Cluster <- Cluster - // Simple case, just decrement 1 from cluster size - // 2.) Cluster <- Non-Cluster - // Have to restore state of Cluster and ?? - // Can/should we use a stack to restore state? - // 3.) Non-Cluster <- Cluster - // Simple case, just decrement 1 from cluster size - // If cluster size == 0, set ActiveClusterGroup = 0; + // 1.) Same Cluster <- Same Cluster + // 2.) Non-Cluster <- Cluster + // 3.) Different Cluster <- Cluster + // 4.) Cluster <- Non-cluster if (isSecondPass && ClusterMemoryOperations) { - // TODO: Check for different cluster to different cluster - // backtracking. 
+ // If the instruction we are backtracking from is part of a cluster if (inst->GetMayCluster()) { - // Case 1 and 3 + // Case 1, 2, and 3 + // Reduce the cluster size CurrentClusterSize--; + // Decrement instructions scheduled in this cluster InstructionsScheduledInEachCluster[ActiveClusterGroup]--; assert(InstructionsScheduledInEachCluster[ActiveClusterGroup] >= 0); + // Remove instruction's name from the list InstrList->pop_back(); - //Logger::Info("Undoing an instruction from the cluster. Current size: %d", - // CurrentClusterSize); - - // If there is no more member in the currently active cluster then disable - // the cluster + // Case 2: If there are no more instructions in the currently active + // cluster then it indicates that we backtracked out of a cluster. if (CurrentClusterSize == 0) { + // Set active cluster to none. ActiveClusterGroup = 0; inst->SetActiveCluster(0); - // If there was a previously active cluster, check last cluster to see - // if we need to restore the state + // Case 3: Check If this instruction ended another cluster if (LastCluster) { + // If so, then we need to restore the state of the previous cluster if (LastCluster->InstNum == inst->GetNum()) { CurrentClusterSize = LastCluster->ClusterSize; ActiveClusterGroup = LastCluster->ClusterGroup; inst->SetActiveCluster(ActiveClusterGroup); - InstrList = std::move(LastCluster->InstrList); + InstrList = std::move(LastCluster->InstrList); LastCluster.reset(); // Release current cluster pointer @@ -806,42 +810,52 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst) { LastCluster = std::move(PastClustersList.back()); PastClustersList.pop_back(); } - if (InstructionsScheduledInEachCluster[ActiveClusterGroup] != dataDepGraph_->getMaxInstructionsInCluster(ActiveClusterGroup)) { - CurrentClusterBlocks--; - assert(CurrentClusterBlocks >= MaxClusterBlocks); - } + + // If we backtracked into another cluster that has not yet + // scheduled all of its instructions in the cluster, 
then undo our + // remaining cluster block estimate. There is a possibility that it + // is able to cluster all of the instructions in its cluster block + // and does not need an extra block. + if (InstructionsScheduledInEachCluster[ActiveClusterGroup] != + dataDepGraph_->getTotalInstructionsInCluster( + ActiveClusterGroup)) { + CurrentClusterBlocks--; + assert(CurrentClusterBlocks >= MinClusterBlocks); + } } } } } else if (LastCluster) { if (LastCluster->InstNum == inst->GetNum()) { - // Case 2: If there was a previous cluster and - // this instruction ended the cluster then restore the previous - // cluster's state + // Case 4: If there was a previous cluster and this instruction + // ended the cluster then restore the previous cluster's state CurrentClusterSize = LastCluster->ClusterSize; ActiveClusterGroup = LastCluster->ClusterGroup; inst->SetActiveCluster(ActiveClusterGroup); - InstrList = std::move(LastCluster->InstrList); + InstrList = std::move(LastCluster->InstrList); - LastCluster.reset(); // Release current cluster pointer + LastCluster.reset(); // Get previous cluster from vector list if (!PastClustersList.empty()) { LastCluster = std::move(PastClustersList.back()); PastClustersList.pop_back(); } - //Logger::Info("Inst %d popping cluster size %d off the stacks", - // inst->GetNum(), CurrentClusterSize); - - if (InstructionsScheduledInEachCluster[ActiveClusterGroup] != dataDepGraph_->getMaxInstructionsInCluster(ActiveClusterGroup)) { - CurrentClusterBlocks--; - assert(CurrentClusterBlocks >= MaxClusterBlocks); - } + + // If we backtracked into another cluster that has not yet + // scheduled all of its instructions in the cluster, then undo our + // remaining cluster block estimate. There is a possibility that it is + // able to cluster all of the instructions in its cluster block and + // does not need an extra block. 
+ if (InstructionsScheduledInEachCluster[ActiveClusterGroup] != + dataDepGraph_->getTotalInstructionsInCluster(ActiveClusterGroup)) { + CurrentClusterBlocks--; + assert(CurrentClusterBlocks >= MinClusterBlocks); + } } } } -// Logger::Info("unschedule, Currently active cluster %d", ActiveClusterGroup); defCnt = inst->GetDefs(defs); useCnt = inst->GetUses(uses); @@ -1107,6 +1121,7 @@ InstCount BBWithSpill::UpdtOptmlSched(InstSchedule *crntSched, enumBestSched_->Copy(crntSched); bestSched_ = enumBestSched_; + // Print the instructions in the clusters after finding a schedule. if (isSecondPass && ClusterMemoryOperations) { dbgs() << "Printing clustered instructions:\n"; int i = 1; @@ -1172,7 +1187,7 @@ bool BBWithSpill::ChkCostFsblty(InstCount trgtLngth, EnumTreeNode *node) { } else { crntCost = crntSpillCost_ * SCW_ + trgtLngth * schedCostFactor_; } - // TODO: Implement cost function for clustering + // Add the cost of clustering if (isSecondPass && ClusterMemoryOperations) crntCost += CurrentClusterBlocks * ClusteringWeight; diff --git a/lib/Scheduler/data_dep.cpp b/lib/Scheduler/data_dep.cpp index 4db8ace6..24100b05 100644 --- a/lib/Scheduler/data_dep.cpp +++ b/lib/Scheduler/data_dep.cpp @@ -198,8 +198,8 @@ DataDepGraph::DataDepGraph(MachineModel *machMdl, LATENCY_PRECISION ltncyPrcsn) RegFiles = llvm::make_unique(machMdl_->GetRegTypeCnt()); - MaxClusterCount = 0; - MaxInstructionsInAllClusters = 0; + MinClusterCount = 0; + TotalInstructionsInAllClusters = 0; } DataDepGraph::~DataDepGraph() { @@ -214,7 +214,7 @@ DataDepGraph::~DataDepGraph() { delete[] instCntPerType_; } -int DataDepGraph::getMaxInstructionsInCluster(int Cluster) { +int DataDepGraph::getTotalInstructionsInCluster(int Cluster) { assert(Cluster > 0); return MaxInstructionsInEachClusters[Cluster]; } diff --git a/lib/Scheduler/ready_list.cpp b/lib/Scheduler/ready_list.cpp index ad377e84..65620e03 100644 --- a/lib/Scheduler/ready_list.cpp +++ b/lib/Scheduler/ready_list.cpp @@ -14,9 +14,13 @@ 
ReadyList::ReadyList(DataDepGraph *dataDepGraph, SchedPriorities prirts) { useCntBits_ = crtclPathBits_ = scsrCntBits_ = ltncySumBits_ = nodeID_Bits_ = inptSchedOrderBits_ = 0; - if (prirts_.isDynmc) + if (prirts_.isDynmc) { keyedEntries_ = new KeyedEntry *[dataDepGraph->GetInstCnt()]; + for (int j = 0; j < dataDepGraph->GetInstCnt(); j++) { + keyedEntries_[j] = nullptr; + } + } else keyedEntries_ = nullptr; @@ -32,9 +36,6 @@ ReadyList::ReadyList(DataDepGraph *dataDepGraph, SchedPriorities prirts) { break; case LSH_LUC: - for (int j = 0; j < dataDepGraph->GetInstCnt(); j++) { - keyedEntries_[j] = NULL; - } maxUseCnt_ = dataDepGraph->GetMaxUseCnt(); useCntBits_ = Utilities::clcltBitsNeededToHoldNum(maxUseCnt_); totKeyBits += useCntBits_; @@ -72,7 +73,10 @@ ReadyList::ReadyList(DataDepGraph *dataDepGraph, SchedPriorities prirts) { totKeyBits += ltncySumBits_; break; - case LSH_MEM: + case LSH_CLUSTER: + // Bits needed: 1 + // 0: Not part of an active cluster + // 1: Part of an active cluster ClusterBit = Utilities::clcltBitsNeededToHoldNum(1); totKeyBits += ClusterBit; break; @@ -123,7 +127,7 @@ ReadyList::ReadyList(DataDepGraph *dataDepGraph, SchedPriorities prirts) { maxLtncySum_); break; - case LSH_MEM: + case LSH_CLUSTER: AddPrirtyToKey_(maxPriority_, keySize, ClusterBit, 1, 1); break; @@ -212,7 +216,8 @@ unsigned long ReadyList::CmputKey_(SchedInstruction *inst, bool isUpdate, maxLtncySum_); break; - case LSH_MEM: + case LSH_CLUSTER: + // Partially copied how LUC is calculated to be updated. if (inst->GetClusterGroup() == 0) ValueForKey = 0; else { @@ -224,7 +229,7 @@ unsigned long ReadyList::CmputKey_(SchedInstruction *inst, bool isUpdate, } ValueForKey = inst->GetClusterGroup() == SchedInstruction::GetActiveCluster() ? 
1 - : 0; + : 0; } AddPrirtyToKey_(key, keySize, ClusterBit, ValueForKey, 1); break; diff --git a/lib/Scheduler/sched_basic_data.cpp b/lib/Scheduler/sched_basic_data.cpp index c858be34..c59f7e7c 100644 --- a/lib/Scheduler/sched_basic_data.cpp +++ b/lib/Scheduler/sched_basic_data.cpp @@ -4,6 +4,7 @@ using namespace llvm::opt_sched; +// Initially set the active clustering to 0 for none. int SchedInstruction::ActiveCluster = 0; SchedInstruction::SchedInstruction(InstCount num, const string &name, diff --git a/lib/Scheduler/sched_region.cpp b/lib/Scheduler/sched_region.cpp index 5ff6ae94..0954cbb9 100644 --- a/lib/Scheduler/sched_region.cpp +++ b/lib/Scheduler/sched_region.cpp @@ -601,13 +601,6 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( } #endif -#ifdef IS_DEBUG_MEMORY_CLUSTERING - if (isSecondPass) { - Logger::Info("Printing final schedule."); - bestSched->Print(Logger::GetLogStream(), "Best Sched"); - } -#endif - return rslt; } diff --git a/lib/Wrapper/AMDGPU/OptSchedDDGWrapperGCN.cpp b/lib/Wrapper/AMDGPU/OptSchedDDGWrapperGCN.cpp index 0aaf5bc4..57aa0713 100644 --- a/lib/Wrapper/AMDGPU/OptSchedDDGWrapperGCN.cpp +++ b/lib/Wrapper/AMDGPU/OptSchedDDGWrapperGCN.cpp @@ -182,7 +182,7 @@ void OptSchedDDGWrapperGCN::convertRegFiles() { } LLVM_DEBUG(DAG->dumpLLVMRegisters()); - //LLVM_DEBUG(dumpOptSchedRegisters()); + LLVM_DEBUG(dumpOptSchedRegisters()); } void OptSchedDDGWrapperGCN::addSubRegDefs(SchedInstruction *Instr, unsigned Reg, diff --git a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp index 6dfdb4a9..4a808010 100644 --- a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp +++ b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp @@ -210,7 +210,7 @@ void OptSchedDDGWrapperBasic::addDefsAndUses() { } LLVM_DEBUG(DAG->dumpLLVMRegisters()); - //LLVM_DEBUG(dumpOptSchedRegisters()); + LLVM_DEBUG(dumpOptSchedRegisters()); } void OptSchedDDGWrapperBasic::addUse(unsigned RegUnit, InstCount Index) { @@ -505,14 +505,18 @@ void 
OptSchedDDGWrapperBasic::countBoundaryLiveness( } } -/// Partially copied from -/// https://github.com/RadeonOpenCompute/llvm/blob/roc-ocl-2.4.0/lib/CodeGen/MachineScheduler.cpp#L1554 +// Iterate through all chains found by LLVm and verify that the instructions +// are actually able to be clustered together. +// Partially copied from +// https://github.com/RadeonOpenCompute/llvm/blob/roc-ocl-2.4.0/lib/CodeGen/MachineScheduler.cpp#L1554 int OptSchedDDGWrapperBasic::clusterNeighboringMemOps( ArrayRef MemOps) { - SmallVector MemOpRecords; + // Will be set to true if clustering was found to be possible in this chain. bool ClusterPossible = false; + // Keep track of the count of instructions that are able to be clustered + // and return the number. int TotalInstructionsPossible = 0; - + SmallVector MemOpRecords; for (const SUnit *SU : MemOps) { MachineOperand *BaseOp; int64_t Offset; @@ -537,15 +541,15 @@ int OptSchedDDGWrapperBasic::clusterNeighboringMemOps( dbgs() << " Cluster possible at SU(" << SUa->NodeNum << ")- SU(" << SUb->NodeNum << ")\n"; // If clustering was possible then increase the cluster count. This only - // happens once every cluster + // happens once every new cluster if (!ClusterPossible) { ClusterPossible = true; ClusterCount++; - setMaxClusterCount(ClusterCount); + setMinClusterCount(ClusterCount); dbgs() << " Setting total cluster count to " << ClusterCount << "\n"; } - // Tell the instructions what cluster number they are in + // Tell the instructions what cluster group they are in if (insts_[SUa->NodeNum]->GetClusterGroup() == 0) { insts_[SUa->NodeNum]->SetMayCluster(ClusterCount); TotalInstructionsPossible++; @@ -560,17 +564,23 @@ int OptSchedDDGWrapperBasic::clusterNeighboringMemOps( } else ClusterLength = 1; } + // Save the total instructions possible in this cluster. This number will be + // used in enumeration to estimate an optimistic cost on the remaining + // cluster blocks. 
MaxInstructionsInEachClusters.insert(std::make_pair(ClusterCount, TotalInstructionsPossible)); + + // Return the total number of instructions in this cluster block return TotalInstructionsPossible; } -/// Iterate through SUnits and find all possible clustering then transfer -/// the information so that our scheduler can access it. -/// Partially copied from https://github.com/RadeonOpenCompute/llvm/blob/roc-ocl-2.4.0/lib/CodeGen/MachineScheduler.cpp#L1595 +// Iterate through SUnits and find all possible clustering using LLVM/AMD's +// method for possible clustering detection then transfer the information to +// our scheduler so that our scheduler can access it during enumeration. +// Partially copied from https://github.com/RadeonOpenCompute/llvm/blob/roc-ocl-2.4.0/lib/CodeGen/MachineScheduler.cpp#L1595 int OptSchedDDGWrapperBasic::findPossibleClusters(bool IsLoad) { - // TODO: Add For-loop to also do store clusters. Currently only does load - // clusters + // The count of all of the instructions that are in a load/store cluster. int TotalInstructionsPossible = 0; + // Map DAG NodeNum to store chain ID. DenseMap StoreChainIDs; // Map each store chain to a set of dependent MemOps. SmallVector, 32> StoreChainDependents; @@ -580,7 +590,10 @@ int OptSchedDDGWrapperBasic::findPossibleClusters(bool IsLoad) { continue; auto MI = SU.getInstr(); - dbgs() << "Instruction (" << SU.NodeNum << ") " << DAG->TII->getName(MI->getOpcode()) << " may " << (IsLoad ? "load" : "store") << "\n"; + // Print which instruction may load or store. Used for debugging purposes. + dbgs() << "Instruction (" << SU.NodeNum << ") " << + DAG->TII->getName(MI->getOpcode()) << " may " << + (IsLoad ? "load" : "store") << "\n"; unsigned ChainPredID = DAG->SUnits.size(); for (const SDep &Pred : SU.Preds) { @@ -601,14 +614,15 @@ int OptSchedDDGWrapperBasic::findPossibleClusters(bool IsLoad) { // Iterate over the store chains. 
for (auto &SCD : StoreChainDependents) { + // Print the chain that LLVM has found dbgs() << "Printing the Node ID of the current chain: "; for (auto SU1 : SCD) dbgs() << SU1->NodeNum << " "; dbgs() << '\n'; + TotalInstructionsPossible += clusterNeighboringMemOps(SCD); } return TotalInstructionsPossible; -// setMaxInstructionsInAllClusters(TotalInstructionsPossible); } LLVMRegTypeFilter::LLVMRegTypeFilter( diff --git a/lib/Wrapper/OptimizingScheduler.cpp b/lib/Wrapper/OptimizingScheduler.cpp index ee75b2e3..528801fc 100644 --- a/lib/Wrapper/OptimizingScheduler.cpp +++ b/lib/Wrapper/OptimizingScheduler.cpp @@ -49,7 +49,7 @@ constexpr struct { } HeuristicNames[] = { {"CP", LSH_CP}, {"LUC", LSH_LUC}, {"UC", LSH_UC}, {"NID", LSH_NID}, {"CPR", LSH_CPR}, {"ISO", LSH_ISO}, {"SC", LSH_SC}, {"LS", LSH_LS}, - {"LLVM", LSH_LLVM}, {"MEM", LSH_MEM} + {"LLVM", LSH_LLVM}, {"CLUSTER", LSH_CLUSTER} }; // Default path to the the configuration directory for opt-sched. @@ -378,24 +378,37 @@ void ScheduleDAGOptSched::schedule() { OST->createDDGWrapper(C, this, MM.get(), LatencyPrecision, RegionName); DDG->convertSUnits(); DDG->convertRegFiles(); + + // Find all clusterable instructions for the second pass. 
if (SecondPass) { dbgs() << "Finding load clusters.\n"; int TotalLoadsInstructionsClusterable = DDG->findPossibleClusters(true); if (TotalLoadsInstructionsClusterable == 0) dbgs() << " No load clustering possible\n"; + dbgs() << "Finding store clusters.\n"; int TotalStoreInstructionsClusterable = DDG->findPossibleClusters(false); if (TotalStoreInstructionsClusterable == 0) dbgs() << " No store clustering possible\n"; - auto DDG2 = static_cast(DDG.get()); - Logger::Info("Total clusterable instructions: %d loads, %d stores", TotalLoadsInstructionsClusterable, TotalStoreInstructionsClusterable); - DDG2->setMaxInstructionsInAllClusters(TotalLoadsInstructionsClusterable + TotalStoreInstructionsClusterable); - int end = DDG2->getMaxClusterCount(); + Logger::Info("Total clusterable instructions: %d loads, %d stores", + TotalLoadsInstructionsClusterable, TotalStoreInstructionsClusterable); + + // Get the DDG instance so that we can set and get information that will be + // read later on during enumeration. + auto DataDepGraphInstance = static_cast(DDG.get()); + // Store total instructions in all clusters in the DDG instance. + DataDepGraphInstance->setTotalInstructionsInAllClusters( + TotalLoadsInstructionsClusterable + TotalStoreInstructionsClusterable); + int end = DataDepGraphInstance->getMinClusterCount(); + + // Iterate through all of the cluster blocks and print the total + // instructions in each block. 
if (end > 0) { Logger::Info("Total clusters in region: %d", end); for (int begin = 1; begin <= end; begin++) { - Logger::Info(" Cluster %d has total instructions %d", begin, DDG2->getMaxInstructionsInCluster(begin)); + Logger::Info(" Cluster %d has total instructions %d", begin, + DataDepGraphInstance->getTotalInstructionsInCluster(begin)); } } } @@ -576,8 +589,6 @@ void ScheduleDAGOptSched::loadOptSchedConfig() { LowerBoundAlgorithm = parseLowerBoundAlgorithm(); HeuristicPriorities = parseHeuristic(schedIni.GetString("HEURISTIC")); EnumPriorities = parseHeuristic(schedIni.GetString("ENUM_HEURISTIC")); - SecondPassPriorities = - parseHeuristic(schedIni.GetString("SECOND_PASS_HEURISTIC")); SecondPassEnumPriorities = parseHeuristic(schedIni.GetString("SECOND_PASS_ENUM_HEURISTIC")); SCF = parseSpillCostFunc(); @@ -686,7 +697,7 @@ SchedPriorities ScheduleDAGOptSched::parseHeuristic(const std::string &Str) { Priorities.vctr[Priorities.cnt++] = LSH; switch (LSH) { // Is LUC still the only dynamic heuristic? - case LSH_MEM: + case LSH_CLUSTER: case LSH_LUC: Priorities.isDynmc = true; break; @@ -841,7 +852,6 @@ void ScheduleDAGOptSched::scheduleOptSchedBalanced() { // Set the heuristic for the enumerator in the second pass. EnumPriorities = SecondPassEnumPriorities; - HeuristicPriorities = SecondPassPriorities; // Force the input to the balanced scheduler to be the sequential order of the // (hopefully) good register pressure schedule. We don’t want the list From 46b9542a77e7d38c49635e311832c3d3819d9d3a Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Thu, 30 Apr 2020 12:16:46 -0700 Subject: [PATCH 29/40] Fix not accounting for multiple clusters within the same store-chain. 
--- lib/Wrapper/OptSchedDDGWrapperBasic.cpp | 61 ++++++++++++++++--------- 1 file changed, 40 insertions(+), 21 deletions(-) diff --git a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp index 4a808010..a976a8c4 100644 --- a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp +++ b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp @@ -512,15 +512,17 @@ void OptSchedDDGWrapperBasic::countBoundaryLiveness( int OptSchedDDGWrapperBasic::clusterNeighboringMemOps( ArrayRef MemOps) { // Will be set to true if clustering was found to be possible in this chain. - bool ClusterPossible = false; + bool InitForNewCluster = true; // Keep track of the count of instructions that are able to be clustered // and return the number. int TotalInstructionsPossible = 0; + int InstructionsInEachCluster = 0; SmallVector MemOpRecords; for (const SUnit *SU : MemOps) { MachineOperand *BaseOp; int64_t Offset; - if (DAG->TII->getMemOperandWithOffset(*SU->getInstr(), BaseOp, Offset, DAG->TRI)) + if (DAG->TII->getMemOperandWithOffset(*SU->getInstr(), BaseOp, Offset, + DAG->TRI)) MemOpRecords.push_back(MemOpInfo(SU, BaseOp, Offset)); } @@ -534,16 +536,18 @@ int OptSchedDDGWrapperBasic::clusterNeighboringMemOps( for (unsigned Idx = 0, End = MemOpRecords.size(); Idx < (End - 1); ++Idx) { const SUnit *SUa = MemOpRecords[Idx].SU; const SUnit *SUb = MemOpRecords[Idx + 1].SU; - dbgs() << " Checking possible clustering of (" << SUa->NodeNum << ") and (" << SUb->NodeNum << ")\n"; + dbgs() << " Checking possible clustering of (" << SUa->NodeNum << ") and (" + << SUb->NodeNum << ")\n"; if (DAG->TII->shouldClusterMemOps(*MemOpRecords[Idx].BaseOp, - *MemOpRecords[Idx + 1].BaseOp, - ClusterLength)) { - dbgs() << " Cluster possible at SU(" << SUa->NodeNum << ")- SU(" << SUb->NodeNum << ")\n"; - - // If clustering was possible then increase the cluster count. 
This only + *MemOpRecords[Idx + 1].BaseOp, + ClusterLength)) { + dbgs() << " Cluster possible at SU(" << SUa->NodeNum << ")- SU(" + << SUb->NodeNum << ")\n"; + + // If clustering is possible then increase the cluster count. This only // happens once every new cluster - if (!ClusterPossible) { - ClusterPossible = true; + if (InitForNewCluster) { + InitForNewCluster = false; ClusterCount++; setMinClusterCount(ClusterCount); dbgs() << " Setting total cluster count to " << ClusterCount << "\n"; @@ -552,22 +556,36 @@ int OptSchedDDGWrapperBasic::clusterNeighboringMemOps( // Tell the instructions what cluster group they are in if (insts_[SUa->NodeNum]->GetClusterGroup() == 0) { insts_[SUa->NodeNum]->SetMayCluster(ClusterCount); - TotalInstructionsPossible++; + InstructionsInEachCluster++; } if (insts_[SUb->NodeNum]->GetClusterGroup() == 0) { insts_[SUb->NodeNum]->SetMayCluster(ClusterCount); - TotalInstructionsPossible++; + InstructionsInEachCluster++; } ++ClusterLength; - } else + } else { + if (!InitForNewCluster) { + // If a cluster was initialized and started then the information before + // starting a new one. + MaxInstructionsInEachClusters.insert( + std::make_pair(ClusterCount, InstructionsInEachCluster)); + TotalInstructionsPossible += InstructionsInEachCluster; + InitForNewCluster = true; + InstructionsInEachCluster = 0; + } ClusterLength = 1; + } } // Save the total instructions possible in this cluster. This number will be // used in enumeration to estimate an optimistic cost on the remaining - // cluster blocks. 
- MaxInstructionsInEachClusters.insert(std::make_pair(ClusterCount, TotalInstructionsPossible)); + // cluster blocks.i + if (!InitForNewCluster) { + MaxInstructionsInEachClusters.insert( + std::make_pair(ClusterCount, InstructionsInEachCluster)); + TotalInstructionsPossible += InstructionsInEachCluster; + } // Return the total number of instructions in this cluster block return TotalInstructionsPossible; @@ -576,7 +594,8 @@ int OptSchedDDGWrapperBasic::clusterNeighboringMemOps( // Iterate through SUnits and find all possible clustering using LLVM/AMD's // method for possible clustering detection then transfer the information to // our scheduler so that our scheduler can access it during enumeration. -// Partially copied from https://github.com/RadeonOpenCompute/llvm/blob/roc-ocl-2.4.0/lib/CodeGen/MachineScheduler.cpp#L1595 +// Partially copied from +// https://github.com/RadeonOpenCompute/llvm/blob/roc-ocl-2.4.0/lib/CodeGen/MachineScheduler.cpp#L1595 int OptSchedDDGWrapperBasic::findPossibleClusters(bool IsLoad) { // The count of all of the instructions that are in a load/store cluster. int TotalInstructionsPossible = 0; @@ -591,9 +610,9 @@ int OptSchedDDGWrapperBasic::findPossibleClusters(bool IsLoad) { auto MI = SU.getInstr(); // Print which instruction may load or store. Used for debugging purposes. - dbgs() << "Instruction (" << SU.NodeNum << ") " << - DAG->TII->getName(MI->getOpcode()) << " may " << - (IsLoad ? "load" : "store") << "\n"; + dbgs() << "Instruction (" << SU.NodeNum << ") " + << DAG->TII->getName(MI->getOpcode()) << " may " + << (IsLoad ? 
"load" : "store") << "\n"; unsigned ChainPredID = DAG->SUnits.size(); for (const SDep &Pred : SU.Preds) { @@ -617,12 +636,12 @@ int OptSchedDDGWrapperBasic::findPossibleClusters(bool IsLoad) { // Print the chain that LLVM has found dbgs() << "Printing the Node ID of the current chain: "; for (auto SU1 : SCD) - dbgs() << SU1->NodeNum << " "; + dbgs() << SU1->NodeNum << " "; dbgs() << '\n'; TotalInstructionsPossible += clusterNeighboringMemOps(SCD); } - return TotalInstructionsPossible; + return TotalInstructionsPossible; } LLVMRegTypeFilter::LLVMRegTypeFilter( From 19184f523f8c52a0754e3c6442ca26c15f5b5310 Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Wed, 3 Jun 2020 23:29:36 -0500 Subject: [PATCH 30/40] Working implementation of clustering using B&B. No history domination. Currently working implementation of clustering with B&B. No hard limits on cluster size when using AMD's shouldClusterMemOps() function but there is a hard limit of 15 during B&B. Currently still debugging history domination. 
--- include/opt-sched/Scheduler/bb_spill.h | 48 ++- include/opt-sched/Scheduler/enumerator.h | 67 ++- include/opt-sched/Scheduler/graph.h | 2 +- include/opt-sched/Scheduler/hist_table.h | 5 +- include/opt-sched/Scheduler/sched_region.h | 4 + lib/Scheduler/bb_spill.cpp | 462 ++++++++++++--------- lib/Scheduler/enumerator.cpp | 73 +++- lib/Scheduler/hist_table.cpp | 47 +++ lib/Scheduler/ready_list.cpp | 18 +- lib/Scheduler/sched_basic_data.cpp | 2 - lib/Scheduler/sched_region.cpp | 1 + lib/Wrapper/OptSchedDDGWrapperBasic.cpp | 7 +- 12 files changed, 478 insertions(+), 258 deletions(-) diff --git a/include/opt-sched/Scheduler/bb_spill.h b/include/opt-sched/Scheduler/bb_spill.h index 5a5b1ced..e0a55a8f 100644 --- a/include/opt-sched/Scheduler/bb_spill.h +++ b/include/opt-sched/Scheduler/bb_spill.h @@ -35,21 +35,53 @@ class BBWithSpill : public SchedRegion { InstCount crntSpillCost_; InstCount optmlSpillCost_; + int CurrentClusterCost; + + /// Used to calculate the dynamic lower bound for clustering. + llvm::SmallVector ClusterCount; + llvm::SmallVector ClusterInstrRemainderCount; + int ClusterGroupCount; + + /// Print the current clusters found so far in the schedule. + void printCurrentClustering(); + + void initForClustering(); + + /// Calculate the lower bound cost for memory operations clustering and + /// return the lower bound cost. Does not take into account the clustering + /// weight. + int calculateClusterStaticLB(); + + /// Helper function for clustering to save the state of the current cluster. + void saveCluster(SchedInstruction *inst); + + /// Helper function for clustering to start a new clustering. + void initCluster(SchedInstruction *inst); + + /// Reset the active cluster to 0 (none). + void resetActiveCluster(SchedInstruction *inst); + + /// Helper function to restore the previous cluster. 
+ void restorePreviousCluster(SchedInstruction *inst); + + bool isClusterFinished(); + + int calculateClusterDLB(); /// Current cluster size unsigned int CurrentClusterSize; - MapVector InstructionsScheduledInEachCluster; - /// The minimum amount of cluster blocks possible. int MinClusterBlocks; /// The minimum amount of cluster blocks + the optimistic expected cluster /// blocks remaining. - int CurrentClusterBlocks; + int DynamicClusterLowerBound; /// Current active cluster group. - int ActiveClusterGroup; + int ClusterActiveGroup; + + int StartCycle; /// Flag to enable or disable clustering memory operations in the ILP pass. /// Reads from the sched.ini file then set the flag accordingly. @@ -70,13 +102,15 @@ class BBWithSpill : public SchedRegion { /// restore the cluster state when backtracking. int InstNum; + int Start; + /// Contains the actual names of the instructions in the cluster. Only used /// for printing and debugging purposes. std::unique_ptr> InstrList; /// Constructor for this struct - PastClusters(int Cluster, int Size, int Instructions) - : ClusterGroup(Cluster), ClusterSize(Size), InstNum(Instructions) {} + PastClusters(int Cluster, int Size, int Instructions, int CycleStart) + : ClusterGroup(Cluster), ClusterSize(Size), InstNum(Instructions), Start(CycleStart) {} }; /// Vector containing the (n-1) past clusters @@ -161,7 +195,7 @@ class BBWithSpill : public SchedRegion { void InitForCostCmputtn_(); InstCount CmputDynmcCost_(); - void UpdateSpillInfoForSchdul_(SchedInstruction *inst, bool trackCnflcts); + void UpdateSpillInfoForSchdul_(SchedInstruction *inst, bool trackCnflcts, int Start); void UpdateSpillInfoForUnSchdul_(SchedInstruction *inst); void SetupPhysRegs_(); void CmputCrntSpillCost_(); diff --git a/include/opt-sched/Scheduler/enumerator.h b/include/opt-sched/Scheduler/enumerator.h index be2f376f..d165ddd0 100644 --- a/include/opt-sched/Scheduler/enumerator.h +++ b/include/opt-sched/Scheduler/enumerator.h @@ -153,6 +153,12 @@ class 
EnumTreeNode { InstCount peakSpillCost_; InstCount spillCostSum_; InstCount totalCost_ = -1; + int ClusterCost; + int ClusterActiveGroup; + int ClusterAbsorbCount; + int ClusterDLB; + int ClusterTotalCost = -1; + int ClusterBestCost; bool totalCostIsActualCost_ = false; ReserveSlot *rsrvSlots_; @@ -276,6 +282,18 @@ class EnumTreeNode { inline void SetSpillCostSum(InstCount cost); inline InstCount GetSpillCostSum(); + inline void setClusteringCost(int Cost); + inline int getClusteringCost(); + inline void setCurClusteringGroup(int Group); + inline int getCurClusteringGroup(); + inline void setClusterAbsorbCount(int Absorb); + inline int getClusterAbsorbCount(); + inline void setClusterLwrBound(int ClusterDynamicLowerBound); + inline int getClusterLwrBound(); + inline void setTotalClusterCost(int Cost); + inline int getTotalClusterCost(); + inline bool isClustering(); + bool ChkInstRdndncy(SchedInstruction *inst, int brnchNum); bool IsNxtSlotStall(); @@ -317,6 +335,9 @@ class Enumerator : public ConstrainedScheduler { friend class HistEnumTreeNode; friend class CostHistEnumTreeNode; + // Should we cluster memory operations + bool Clustering; + // TODO(max): Document. bool isCnstrctd_; @@ -508,7 +529,7 @@ class Enumerator : public ConstrainedScheduler { InstCount schedUprBound, int16_t sigHashSize, SchedPriorities prirts, Pruning PruningStrategy, bool SchedForRPOnly, bool enblStallEnum, Milliseconds timeout, - InstCount preFxdInstCnt = 0, + bool ClusteringEnabled, InstCount preFxdInstCnt = 0, SchedInstruction *preFxdInsts[] = NULL); virtual ~Enumerator(); virtual void Reset(); @@ -525,6 +546,8 @@ class Enumerator : public ConstrainedScheduler { // (Chris) inline bool IsSchedForRPOnly() const { return SchedForRPOnly_; } + inline bool isClustering() const { return Clustering; } + // Calculates the schedule and returns it in the passed argument. 
FUNC_RESULT FindSchedule(InstSchedule *sched, SchedRegion *rgn) { return RES_ERROR; @@ -586,6 +609,7 @@ class LengthCostEnumerator : public Enumerator { bool WasObjctvMet_(); bool BackTrack_(); InstCount GetBestCost_(); + int GetBestClusterCost_(); void CreateRootNode_(); // Check if branching from the current node by scheduling this instruction @@ -603,7 +627,7 @@ class LengthCostEnumerator : public Enumerator { SchedPriorities prirts, Pruning PruningStrategy, bool SchedForRPOnly, bool enblStallEnum, Milliseconds timeout, SPILL_COST_FUNCTION spillCostFunc, - InstCount preFxdInstCnt = 0, + bool ClusteringEnabled, InstCount preFxdInstCnt = 0, SchedInstruction *preFxdInsts[] = NULL); virtual ~LengthCostEnumerator(); void Reset(); @@ -616,6 +640,7 @@ class LengthCostEnumerator : public Enumerator { bool IsCostEnum(); SPILL_COST_FUNCTION GetSpillCostFunc() { return spillCostFunc_; } inline InstCount GetBestCost() { return GetBestCost_(); } + int getBestClusterCost() { return GetBestClusterCost_(); } }; /*****************************************************************************/ @@ -851,6 +876,44 @@ void EnumTreeNode::SetSpillCostSum(InstCount cost) { InstCount EnumTreeNode::GetSpillCostSum() { return spillCostSum_; } /*****************************************************************************/ +void EnumTreeNode::setClusteringCost(int Cost) { + assert(Cost >= 0); + ClusterCost = Cost; +} + +int EnumTreeNode::getClusteringCost() { return ClusterCost; } + +void EnumTreeNode::setCurClusteringGroup(int Group) { + assert(Group >= 0); + ClusterActiveGroup = Group; +} + +int EnumTreeNode::getCurClusteringGroup() { return ClusterActiveGroup; } + +void EnumTreeNode::setClusterAbsorbCount(int Absorb) { + assert(Absorb >= 0); + ClusterAbsorbCount = Absorb; +} + +int EnumTreeNode::getClusterAbsorbCount() { return ClusterAbsorbCount; } + +void EnumTreeNode::setClusterLwrBound(int ClusterDynamicLowerBound) { + assert(ClusterDynamicLowerBound >= 0); + ClusterDLB = 
ClusterDynamicLowerBound; +} + +int EnumTreeNode::getClusterLwrBound() { return ClusterDLB; } + +void EnumTreeNode::setTotalClusterCost(int Cost) { + assert(Cost >= 0); + ClusterTotalCost = Cost; +} + +int EnumTreeNode::getTotalClusterCost() { return ClusterTotalCost; } + +bool EnumTreeNode::isClustering() { return enumrtr_->isClustering(); } +/*****************************************************************************/ + bool EnumTreeNode::IsNxtCycleNew_() { if (enumrtr_->issuRate_ == 1) { return true; diff --git a/include/opt-sched/Scheduler/graph.h b/include/opt-sched/Scheduler/graph.h index af8ba8f2..790b7164 100644 --- a/include/opt-sched/Scheduler/graph.h +++ b/include/opt-sched/Scheduler/graph.h @@ -512,7 +512,7 @@ inline UDT_GEDGES GraphNode::GetRcrsvScsrCnt() const { } inline LinkedList *GraphNode::GetNghbrLst(DIRECTION dir) { - return dir == DIR_FRWRD ? scsrLst_ : prdcsrLst_; + return dir == DIR_FRWRD ? prdcsrLst_ : scsrLst_; } inline GraphEdge *GraphNode::GetFrstScsrEdge() { diff --git a/include/opt-sched/Scheduler/hist_table.h b/include/opt-sched/Scheduler/hist_table.h index 982c87a6..85f6592b 100644 --- a/include/opt-sched/Scheduler/hist_table.h +++ b/include/opt-sched/Scheduler/hist_table.h @@ -109,6 +109,10 @@ class CostHistEnumTreeNode : public HistEnumTreeNode { InstCount cost_; InstCount peakSpillCost_; InstCount spillCostSum_; + int ClusterCost; + int ClusterActiveGroup; + int ClusterAbsorbCount; + int ClusterTotalCost; // (Chris) InstCount totalCost_ = -1; @@ -119,7 +123,6 @@ class CostHistEnumTreeNode : public HistEnumTreeNode { #ifdef IS_DEBUG bool costInfoSet_; #endif - bool ChkCostDmntnForBBSpill_(EnumTreeNode *node, Enumerator *enumrtr); bool ChkCostDmntn_(EnumTreeNode *node, Enumerator *enumrtr, InstCount &maxShft); diff --git a/include/opt-sched/Scheduler/sched_region.h b/include/opt-sched/Scheduler/sched_region.h index de36f85b..553d73b8 100644 --- a/include/opt-sched/Scheduler/sched_region.h +++ 
b/include/opt-sched/Scheduler/sched_region.h @@ -58,6 +58,7 @@ class SchedRegion { inline int GetCostLwrBound() { return costLwrBound_; } // Returns the best cost found so far for this region. inline InstCount GetBestCost() { return bestCost_; } + inline int getBestClusterCost() { return BestClusterCost; } // Returns a pointer to the list scheduler heurisitcs. inline SchedPriorities GetHeuristicPriorities() { return hurstcPrirts_; } // Get the number of simulated spills code added for this block. @@ -132,6 +133,7 @@ class SchedRegion { // The best results found so far. InstCount bestCost_; + int BestClusterCost; InstCount bestSchedLngth_; // (Chris): The cost function. Defaults to PERP. @@ -180,6 +182,8 @@ class SchedRegion { void SetBestCost(InstCount bestCost) { bestCost_ = bestCost; } + void setBestClusterCost(int BestCost) { BestClusterCost = BestCost; } + void SetBestSchedLength(InstCount bestSchedLngth) { bestSchedLngth_ = bestSchedLngth; } const SchedPriorities& GetEnumPriorities() const { return enumPrirts_; } diff --git a/lib/Scheduler/bb_spill.cpp b/lib/Scheduler/bb_spill.cpp index 2b89f76b..6920eae8 100644 --- a/lib/Scheduler/bb_spill.cpp +++ b/lib/Scheduler/bb_spill.cpp @@ -27,6 +27,9 @@ using namespace llvm::opt_sched; // The denominator used when calculating cost weight. 
static const int COST_WGHT_BASE = 10; +// The max number of instructions in a cluster +static const unsigned MAX_INSTR_IN_CLUSTER = 15; + BBWithSpill::BBWithSpill(const OptSchedTarget *OST_, DataDepGraph *dataDepGraph, long rgnNum, int16_t sigHashSize, LB_ALG lbAlg, SchedPriorities hurstcPrirts, @@ -67,23 +70,37 @@ BBWithSpill::BBWithSpill(const OptSchedTarget *OST_, DataDepGraph *dataDepGraph, schduldEntryInstCnt_ = 0; schduldExitInstCnt_ = 0; schduldInstCnt_ = 0; - - // Memory clustering variables initialization - CurrentClusterSize = 0; - ActiveClusterGroup = 0; - PastClustersList.clear(); - LastCluster = nullptr; Config &schedIni = SchedulerOptions::getInstance(); ClusterMemoryOperations = schedIni.GetBool("CLUSTER_MEMORY_OPS"); ClusteringWeight = schedIni.GetInt("CLUSTER_WEIGHT"); - MinClusterBlocks = dataDepGraph_->getMinClusterCount(); - CurrentClusterBlocks = MinClusterBlocks; - for (int begin = 1; begin <= MinClusterBlocks; begin++) { - InstructionsScheduledInEachCluster[begin] = 0; + ClusterGroupCount = dataDepGraph_->getMinClusterCount(); + MinClusterBlocks = 0; + if (ClusterMemoryOperations && ClusterGroupCount > 0) { + ClusterCount.resize(ClusterGroupCount+1); + ClusterInstrRemainderCount.resize(ClusterGroupCount+1); + MinClusterBlocks = calculateClusterStaticLB(); + initForClustering(); } } /****************************************************************************/ +void BBWithSpill::initForClustering() { + // Memory clustering variables initialization + SchedInstruction::SetActiveCluster(0); + CurrentClusterSize = 0; + ClusterActiveGroup = 0; + CurrentClusterCost = 0; + PastClustersList.clear(); + LastCluster.reset(); + InstrList.reset(); + DynamicClusterLowerBound = 0; + + for (int begin = 1; begin <= ClusterGroupCount; begin++) { + ClusterCount[begin] = 0; + ClusterInstrRemainderCount[begin] = dataDepGraph_->getTotalInstructionsInCluster(begin); + } +} + BBWithSpill::~BBWithSpill() { if (enumrtr_ != NULL) { delete enumrtr_; @@ -96,6 +113,25 @@ 
BBWithSpill::~BBWithSpill() { } /*****************************************************************************/ +int BBWithSpill::calculateClusterStaticLB() { + // No cluster in this scheduling region + if (ClusterGroupCount == 0) + return 0; + + // Calculate the minimum cluster blocks that will be needed to cluster all of + // the instructions. The maximum amount in a cluster block is determined by + // the constant MAX_INSTR_IN_CLUSTER. + int ClusterCost = 0; + for (int begin = 1; begin <= ClusterGroupCount; begin++) { + int InstructionCount = dataDepGraph_->getTotalInstructionsInCluster(begin); + int CurrentClusterCost = std::ceil(double(InstructionCount) / MAX_INSTR_IN_CLUSTER); + Logger::Info("Cost for block %d is %d", begin, CurrentClusterCost); + ClusterCost += CurrentClusterCost; + } + + return ClusterCost; +} + bool BBWithSpill::EnableEnum_() { return true; /* @@ -338,8 +374,6 @@ InstCount BBWithSpill::CmputCostLwrBound() { void BBWithSpill::InitForSchdulng() { InitForCostCmputtn_(); - InstrList.reset(); - schduldEntryInstCnt_ = 0; schduldExitInstCnt_ = 0; schduldInstCnt_ = 0; @@ -347,18 +381,8 @@ void BBWithSpill::InitForSchdulng() { /*****************************************************************************/ void BBWithSpill::InitForCostCmputtn_() { - // Init/Reset memory clustering values if it is enabled - if (IsSecondPass() && ClusterMemoryOperations) { - SchedInstruction::SetActiveCluster(0); - CurrentClusterSize = 0; - ActiveClusterGroup = 0; - PastClustersList.clear(); - LastCluster.reset(); - CurrentClusterBlocks = MinClusterBlocks; - for (int begin = 1; begin <= MinClusterBlocks; begin++) { - InstructionsScheduledInEachCluster[begin] = 0; - } -} + if (IsSecondPass() && ClusterMemoryOperations) + initForClustering(); int i; @@ -424,8 +448,11 @@ InstCount BBWithSpill::CmputCost_(InstSchedule *sched, COST_COMP_MODE compMode, execCost = cost; cost += crntSpillCost_ * SCW_; // Add the current clustering cost - if (IsSecondPass() && 
ClusterMemoryOperations) - cost += CurrentClusterBlocks * ClusteringWeight; + if (IsSecondPass() && ClusterMemoryOperations) { + cost += CurrentClusterCost * ClusteringWeight; + assert(calculateClusterDLB() == CurrentClusterCost); + } + sched->SetSpillCosts(spillCosts_); sched->SetPeakRegPressures(peakRegPressures_); sched->SetSpillCost(crntSpillCost_); @@ -458,8 +485,85 @@ void BBWithSpill::CmputCrntSpillCost_() { } /*****************************************************************************/ +void BBWithSpill::saveCluster(SchedInstruction *inst) { + if (LastCluster) + // Save previous clusters in a vector except the last cluster + // that we just exited out of. + PastClustersList.push_back(std::move(LastCluster)); + + // Last cluster that we just exited out of, used for fast accessing + // to its contents. + LastCluster = llvm::make_unique( + ClusterActiveGroup, CurrentClusterSize, inst->GetNum(), StartCycle); + + LastCluster->InstrList = std::move(InstrList); +} + +void BBWithSpill::initCluster(SchedInstruction *inst) { + ClusterActiveGroup = inst->GetClusterGroup(); + inst->SetActiveCluster(ClusterActiveGroup); + CurrentClusterSize = 1; + ClusterInstrRemainderCount[ClusterActiveGroup]--; + InstrList = llvm::make_unique>(); + InstrList->push_back(inst->GetName()); + ClusterCount[ClusterActiveGroup]++; + CurrentClusterCost++; +} + +void BBWithSpill::resetActiveCluster(SchedInstruction *inst) { + ClusterActiveGroup = 0; + inst->SetActiveCluster(0); + CurrentClusterSize = 0; +} + +void BBWithSpill::restorePreviousCluster(SchedInstruction *inst) { + CurrentClusterSize = LastCluster->ClusterSize; + ClusterActiveGroup = LastCluster->ClusterGroup; + StartCycle = LastCluster->Start; + inst->SetActiveCluster(ClusterActiveGroup); + InstrList = std::move(LastCluster->InstrList); + LastCluster.reset(); // Release current cluster pointer + + // Get previous cluster from vector list + if (!PastClustersList.empty()) { + LastCluster = std::move(PastClustersList.back()); + 
PastClustersList.pop_back(); + } +} + +bool BBWithSpill::isClusterFinished() { + assert(ClusterActiveGroup != 0); + if (ClusterInstrRemainderCount[ClusterActiveGroup] == 0 || + CurrentClusterSize == MAX_INSTR_IN_CLUSTER) { + return true; + } + return false; +} + +int BBWithSpill::calculateClusterDLB() { + int OptimisticLowerBound = 0; + + for (int begin = 1; begin <= ClusterGroupCount; begin++) { + if (begin != ClusterActiveGroup) + OptimisticLowerBound += std::ceil(double(ClusterInstrRemainderCount[begin])/MAX_INSTR_IN_CLUSTER); + else { + // The amount of instructions remaining that the current open cluster can add + int AbsorbCount = MAX_INSTR_IN_CLUSTER - CurrentClusterSize; + // Assume the current open cluster can add the max amount of instructions + // that a cluster can contain. + int Remainder = ClusterInstrRemainderCount[begin] - AbsorbCount; + // If the remainder is negative then that indicates the open cluster can absorb all of the remaining instructions. + if (Remainder < 0) + Remainder = 0; + // Estimate the optimistic dynamic lower bound for the current cluster + OptimisticLowerBound += std::ceil(double(Remainder)/MAX_INSTR_IN_CLUSTER); + } + } + return CurrentClusterCost + OptimisticLowerBound; +} + void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, - bool trackCnflcts) { + bool trackCnflcts, int Start) { int16_t regType; int defCnt, useCnt, regNum, physRegNum; Register **defs, **uses; @@ -467,8 +571,17 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, int liveRegs; InstCount newSpillCost; +// Conditions for creating a cluster: +// 1.) If a block is ended before it reaches 15 && there are remaining instructions + +// Conditions for removing a cluster: +// 1.) If the block is not 15 && there are remaining instructions + // Scheduling cases for clustering project: // 1.) Same Cluster -> Same Cluster + // If size == MAX_INSTR_IN_CLUSTER + // Save cluster to restore + // Set active to 0 // 2.) 
Cluster -> Different Cluster // 3.) Non-Cluster -> Cluster // 4.) Cluster -> Non-Cluster @@ -479,100 +592,47 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, // Check if the current instruction is part of a cluster if (inst->GetMayCluster()) { // Check if there is a current active cluster - if (CurrentClusterSize > 0) { + // A ClusterActiveGroup == 0 indicates that there is no currently active clustering + // While ClusterActiveGroup != 0 indicates that there is active clustering + if (ClusterActiveGroup != 0) { // Check if the instruction is in the same cluster group as the active // cluster - if (ActiveClusterGroup == inst->GetClusterGroup()) { + if (ClusterActiveGroup == inst->GetClusterGroup()) { // Case 1: Simple case where the current instruction is part of an // already active cluster. CurrentClusterSize++; - InstructionsScheduledInEachCluster[ActiveClusterGroup]++; - + ClusterInstrRemainderCount[ClusterActiveGroup]--; InstrList->push_back(inst->GetName()); + // If we reach the max amount for this cluster then save the cluster + // and reset. + if (isClusterFinished()) + { + saveCluster(inst); + resetActiveCluster(inst); + } } else { // Case 2: Else the instruction is part of different cluster that // is not currently active. Store information of the old cluster // group and start clustering for the new cluster. - if (LastCluster) { - // Save previous clusters in a vector except the last cluster - // that we just exited out of. - PastClustersList.push_back(std::move(LastCluster)); - - // Last cluster that we just exited out of, used for fast accessing - // to its contents - LastCluster = llvm::make_unique( - ActiveClusterGroup, CurrentClusterSize, inst->GetNum()); - } else - // This is the first cluster block that we exited out of. 
- LastCluster = llvm::make_unique( - ActiveClusterGroup, CurrentClusterSize, inst->GetNum()); - - LastCluster->InstrList = std::move(InstrList); - - // If the old cluster did not finish clustering all possible - // instructions in its cluster then that means there have to be an - // extra cluster block to finish all of the instructions in the - // cluster. - if (InstructionsScheduledInEachCluster[ActiveClusterGroup] < - dataDepGraph_->getTotalInstructionsInCluster( - ActiveClusterGroup)) { - CurrentClusterBlocks++; - } + saveCluster(inst); // Finish setting up the new cluster - ActiveClusterGroup = inst->GetClusterGroup(); - inst->SetActiveCluster(ActiveClusterGroup); - CurrentClusterSize = 1; - InstructionsScheduledInEachCluster[ActiveClusterGroup]++; - InstrList = llvm::make_unique< - llvm::SmallVector>(); - InstrList->push_back(inst->GetName()); + initCluster(inst); + StartCycle = Start; } } else { // Case 3: Not currently clustering. Initialize clustering - ActiveClusterGroup = inst->GetClusterGroup(); - inst->SetActiveCluster(ActiveClusterGroup); - CurrentClusterSize = 1; - InstructionsScheduledInEachCluster[ActiveClusterGroup]++; - InstrList = llvm::make_unique>(); - InstrList->push_back(inst->GetName()); + initCluster(inst); + StartCycle = Start; } - } else if (CurrentClusterSize > 0) { + } else if (ClusterActiveGroup != 0) { // Case 4: Exiting out of an active cluster // Save the cluster to restore when backtracking. - if (LastCluster) { - // Save previous clusters in a vector except the last cluster - // that we just exited out of. - PastClustersList.push_back(std::move(LastCluster)); - - // Last cluster that we just exited out of, used for fast accessing - // to its contents. - LastCluster = llvm::make_unique( - ActiveClusterGroup, CurrentClusterSize, inst->GetNum()); - } else - // This is the first cluster block that we exited out of. 
- LastCluster = llvm::make_unique( - ActiveClusterGroup, CurrentClusterSize, inst->GetNum()); - - LastCluster->InstrList = std::move(InstrList); - - // If this cluster did not finish then that means there have to be an - // extra cluster block to finish all of the instructions in the cluster. - if (InstructionsScheduledInEachCluster[ActiveClusterGroup] < - dataDepGraph_->getTotalInstructionsInCluster(ActiveClusterGroup)) { - CurrentClusterBlocks++; - } - - // Assert that the total instructions accounted for doesn't exceed the - // expected total instructions in the cluster - assert(InstructionsScheduledInEachCluster[ActiveClusterGroup] <= - dataDepGraph_->getTotalInstructionsInCluster(ActiveClusterGroup)); + saveCluster(inst); // Reset active cluster - ActiveClusterGroup = 0; - inst->SetActiveCluster(0); - CurrentClusterSize = 0; + resetActiveCluster(inst); } } @@ -775,84 +835,45 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst) { if (IsSecondPass() && ClusterMemoryOperations) { // If the instruction we are backtracking from is part of a cluster if (inst->GetMayCluster()) { - // Case 1, 2, and 3 - // Reduce the cluster size - CurrentClusterSize--; - // Decrement instructions scheduled in this cluster - InstructionsScheduledInEachCluster[ActiveClusterGroup]--; - assert(InstructionsScheduledInEachCluster[ActiveClusterGroup] >= 0); - - // Remove instruction's name from the list - InstrList->pop_back(); - - // Case 2: If there are no more instructions in the currently active - // cluster then it indicates that we backtracked out of a cluster. - if (CurrentClusterSize == 0) { - // Set active cluster to none. 
- ActiveClusterGroup = 0; - inst->SetActiveCluster(0); - - // Case 3: Check If this instruction ended another cluster - if (LastCluster) { - // If so, then we need to restore the state of the previous cluster - if (LastCluster->InstNum == inst->GetNum()) { - CurrentClusterSize = LastCluster->ClusterSize; - ActiveClusterGroup = LastCluster->ClusterGroup; - inst->SetActiveCluster(ActiveClusterGroup); - - InstrList = std::move(LastCluster->InstrList); - - LastCluster.reset(); // Release current cluster pointer - - // Get previous cluster from vector list - if (!PastClustersList.empty()) { - LastCluster = std::move(PastClustersList.back()); - PastClustersList.pop_back(); - } - - // If we backtracked into another cluster that has not yet - // scheduled all of its instructions in the cluster, then undo our - // remaining cluster block estimate. There is a possibility that it - // is able to cluster all of the instructions in its cluster block - // and does not need an extra block. - if (InstructionsScheduledInEachCluster[ActiveClusterGroup] != - dataDepGraph_->getTotalInstructionsInCluster( - ActiveClusterGroup)) { - CurrentClusterBlocks--; - assert(CurrentClusterBlocks >= MinClusterBlocks); - } + if (CurrentClusterSize != 0) { + // Case 1, 2, and 3 + // Reduce the cluster size + CurrentClusterSize--; + ClusterInstrRemainderCount[ClusterActiveGroup]++; + // Remove instruction's name from the list + InstrList->pop_back(); + + // Case 2: If there are no more instructions in the currently active + // cluster then it indicates that we backtracked out of a cluster. + if (CurrentClusterSize == 0) { + ClusterCount[ClusterActiveGroup]--; + assert(ClusterCount[ClusterActiveGroup] >= 0); + CurrentClusterCost--; + // Set active cluster to none. 
+ resetActiveCluster(inst); + + // Case 3: Check If this instruction ended another cluster + if (LastCluster && LastCluster->InstNum == inst->GetNum()) { + // If so, then we need to restore the state of the previous cluster + restorePreviousCluster(inst); } } } - } else if (LastCluster) { - if (LastCluster->InstNum == inst->GetNum()) { - // Case 4: If there was a previous cluster and this instruction - // ended the cluster then restore the previous cluster's state - CurrentClusterSize = LastCluster->ClusterSize; - ActiveClusterGroup = LastCluster->ClusterGroup; - inst->SetActiveCluster(ActiveClusterGroup); - - InstrList = std::move(LastCluster->InstrList); - - LastCluster.reset(); - - // Get previous cluster from vector list - if (!PastClustersList.empty()) { - LastCluster = std::move(PastClustersList.back()); - PastClustersList.pop_back(); - } - - // If we backtracked into another cluster that has not yet - // scheduled all of its instructions in the cluster, then undo our - // remaining cluster block estimate. There is a possibility that it is - // able to cluster all of the instructions in its cluster block and - // does not need an extra block. 
- if (InstructionsScheduledInEachCluster[ActiveClusterGroup] != - dataDepGraph_->getTotalInstructionsInCluster(ActiveClusterGroup)) { - CurrentClusterBlocks--; - assert(CurrentClusterBlocks >= MinClusterBlocks); - } + // A cluster size of 0 while an instruction may cluster indicates that + // the current instruction is at the end of a finished cluster + else if (CurrentClusterSize == 0) { + assert(inst->GetNum() == LastCluster->InstNum); + restorePreviousCluster(inst); + + CurrentClusterSize--; + ClusterInstrRemainderCount[ClusterActiveGroup]++; + // Remove instruction's name from the list + InstrList->pop_back(); } + } else if (LastCluster && LastCluster->InstNum == inst->GetNum()) { + // Case 4: If there was a previous cluster and this instruction + // ended the cluster then restore the previous cluster's state + restorePreviousCluster(inst); } } @@ -963,7 +984,7 @@ void BBWithSpill::SchdulInst(SchedInstruction *inst, InstCount cycleNum, if (inst == NULL) return; assert(inst != NULL); - UpdateSpillInfoForSchdul_(inst, trackCnflcts); + UpdateSpillInfoForSchdul_(inst, trackCnflcts, crntCycleNum_); } /*****************************************************************************/ @@ -999,7 +1020,7 @@ void BBWithSpill::FinishHurstc_() { void BBWithSpill::FinishOptml_() { #ifdef IS_DEBUG_BBSPILL_COST - stats::traceOptimalCost.Record(bestCost_); + stats::traceOptimalCost.Record(GetBestCost()); stats::traceOptimalScheduleLength.Record(bestSchedLngth_); #endif } @@ -1007,6 +1028,7 @@ void BBWithSpill::FinishOptml_() { Enumerator *BBWithSpill::AllocEnumrtr_(Milliseconds timeout) { bool enblStallEnum = enblStallEnum_; + bool ClusteringEnabled = IsSecondPass() && ClusterMemoryOperations; /* if (!dataDepGraph_->IncludesUnpipelined()) { enblStallEnum = false; }*/ @@ -1014,7 +1036,7 @@ Enumerator *BBWithSpill::AllocEnumrtr_(Milliseconds timeout) { enumrtr_ = new LengthCostEnumerator( dataDepGraph_, machMdl_, schedUprBound_, GetSigHashSize(), GetEnumPriorities(), 
GetPruningStrategy(), SchedForRPOnly_, enblStallEnum, - timeout, GetSpillCostFunc(), 0, NULL); + timeout, GetSpillCostFunc(), ClusteringEnabled, 0, NULL); return enumrtr_; } @@ -1047,7 +1069,7 @@ FUNC_RESULT BBWithSpill::Enumerate_(Milliseconds startTime, timeout = true; HandlEnumrtrRslt_(rslt, trgtLngth); - if (bestCost_ == 0 || rslt == RES_ERROR || + if (GetBestCost() == 0 || rslt == RES_ERROR || (lngthDeadline == rgnDeadline && rslt == RES_TIMEOUT)) { //|| //(rslt == RES_SUCCESS && IsSecondPass())) { @@ -1116,44 +1138,49 @@ InstCount BBWithSpill::UpdtOptmlSched(InstSchedule *crntSched, Logger::Info("$$$ GOOD_HIT: Better spill cost for a longer schedule"); SetBestCost(crntCost); + if (IsSecondPass() && ClusterMemoryOperations) + setBestClusterCost(CurrentClusterCost); optmlSpillCost_ = crntSpillCost_; SetBestSchedLength(crntSched->GetCrntLngth()); enumBestSched_->Copy(crntSched); bestSched_ = enumBestSched_; + printCurrentClustering(); + } - // Print the instructions in the clusters after finding a schedule. - if (IsSecondPass() && ClusterMemoryOperations) { - dbgs() << "Printing clustered instructions:\n"; - int i = 1; - for (const auto &clusters : PastClustersList) { - dbgs() << "Printing cluster " << i << ": "; - for (const auto &instr : *clusters->InstrList) { - dbgs() << instr << " "; - } - i++; - dbgs() << '\n'; + return GetBestCost(); +} + +void BBWithSpill::printCurrentClustering() { + // Print the instructions in the clusters after finding a schedule. 
+ if (IsSecondPass() && ClusterMemoryOperations) { + dbgs() << "Printing clustered instructions:\n"; + int i = 1; + for (const auto &clusters : PastClustersList) { + dbgs() << "Printing cluster " << i << ", start cycle (" << clusters->Start << "): "; + for (const auto &instr : *clusters->InstrList) { + dbgs() << instr << " "; } + i++; + dbgs() << '\n'; + } - if (LastCluster) { - dbgs() << "Printing cluster " << i << ": "; - for (const auto &instr : *(LastCluster->InstrList)) { - dbgs() << instr << " "; - } - i++; - dbgs() << '\n'; + if (LastCluster) { + dbgs() << "Printing cluster " << i << ", start cycle (" << LastCluster->Start << "): "; + for (const auto &instr : *(LastCluster->InstrList)) { + dbgs() << instr << " "; } + i++; + dbgs() << '\n'; + } - if (InstrList && InstrList->size() > 0) { - dbgs() << "Printing cluster " << i << ": "; - for (const auto &instr : *InstrList) { - dbgs() << instr << " "; - } - dbgs() << '\n'; + if (InstrList && InstrList->size() > 0) { + dbgs() << "Printing cluster " << i << ", start cycle (" << StartCycle << "): "; + for (const auto &instr : *InstrList) { + dbgs() << instr << " "; } + dbgs() << '\n'; } } - - return GetBestCost(); } /*****************************************************************************/ @@ -1181,21 +1208,31 @@ void BBWithSpill::SetupForSchdulng_() { bool BBWithSpill::ChkCostFsblty(InstCount trgtLngth, EnumTreeNode *node) { bool fsbl = true; InstCount crntCost, dynmcCostLwrBound; + int ClusterDynamicLowerBound; if (GetSpillCostFunc() == SCF_SLIL) { crntCost = dynamicSlilLowerBound_ * SCW_ + trgtLngth * schedCostFactor_; } else { crntCost = crntSpillCost_ * SCW_ + trgtLngth * schedCostFactor_; } // Add the cost of clustering - if (IsSecondPass() && ClusterMemoryOperations) - crntCost += CurrentClusterBlocks * ClusteringWeight; + if (IsSecondPass() && ClusterMemoryOperations) { + ClusterDynamicLowerBound = calculateClusterDLB(); + crntCost += ClusterDynamicLowerBound * ClusteringWeight; + } - crntCost -= 
costLwrBound_; + crntCost -= GetCostLwrBound(); dynmcCostLwrBound = crntCost; // assert(cost >= 0); assert(dynmcCostLwrBound >= 0); +/* + if (IsSecondPass() && ClusterMemoryOperations) { + dbgs() << "Current cycle: " << node->GetTime() <<", current cost is: " << dynmcCostLwrBound << ". Current best is: " << GetBestCost() << '\n'; + printCurrentClustering(); + } +*/ + fsbl = dynmcCostLwrBound < GetBestCost(); // FIXME: RP tracking should be limited to the current SCF. We need RP @@ -1205,6 +1242,17 @@ bool BBWithSpill::ChkCostFsblty(InstCount trgtLngth, EnumTreeNode *node) { node->SetCostLwrBound(dynmcCostLwrBound); node->SetPeakSpillCost(peakSpillCost_); node->SetSpillCostSum(totSpillCost_); + if (IsSecondPass() && ClusterMemoryOperations) { + node->setClusteringCost(CurrentClusterCost); + node->setCurClusteringGroup(ClusterActiveGroup); + node->setClusterLwrBound(ClusterDynamicLowerBound); + if (ClusterActiveGroup != 0) { + node->setClusterAbsorbCount(15 - CurrentClusterSize); + } + else { + node->setClusterAbsorbCount(0); + } + } } return fsbl; } diff --git a/lib/Scheduler/enumerator.cpp b/lib/Scheduler/enumerator.cpp index d9c4e3b1..43bf6ed6 100644 --- a/lib/Scheduler/enumerator.cpp +++ b/lib/Scheduler/enumerator.cpp @@ -64,6 +64,12 @@ void EnumTreeNode::Init_() { isLeaf_ = false; cost_ = INVALID_VALUE; costLwrBound_ = INVALID_VALUE; + ClusterCost = INVALID_VALUE; + ClusterActiveGroup = INVALID_VALUE; + ClusterAbsorbCount = INVALID_VALUE; + ClusterDLB = INVALID_VALUE; + ClusterTotalCost = -1; + ClusterBestCost = 99999999; crntCycleBlkd_ = false; rsrvSlots_ = NULL; totalCostIsActualCost_ = false; @@ -434,8 +440,8 @@ Enumerator::Enumerator(DataDepGraph *dataDepGraph, MachineModel *machMdl, InstCount schedUprBound, int16_t sigHashSize, SchedPriorities prirts, Pruning PruningStrategy, bool SchedForRPOnly, bool enblStallEnum, - Milliseconds timeout, InstCount preFxdInstCnt, - SchedInstruction *preFxdInsts[]) + Milliseconds timeout, bool ClusteringEnabled, + InstCount 
preFxdInstCnt, SchedInstruction *preFxdInsts[]) : ConstrainedScheduler(dataDepGraph, machMdl, schedUprBound) { memAllocBlkSize_ = (int)timeout / TIMEOUT_TO_MEMBLOCK_RATIO; assert(preFxdInstCnt >= 0); @@ -454,6 +460,7 @@ Enumerator::Enumerator(DataDepGraph *dataDepGraph, MachineModel *machMdl, prune_ = PruningStrategy; SchedForRPOnly_ = SchedForRPOnly; enblStallEnum_ = enblStallEnum; + Clustering = ClusteringEnabled; isEarlySubProbDom_ = true; @@ -1316,17 +1323,27 @@ void SetTotalCostsAndSuffixes(EnumTreeNode *const currentNode, Logger::Info("Leaf node total cost %d", currentNode->GetCost()); #endif currentNode->SetTotalCost(currentNode->GetCost()); + if (currentNode->isClustering()) + currentNode->setTotalClusterCost(currentNode->getClusteringCost()); currentNode->SetTotalCostIsActualCost(true); } else { - if (!currentNode->GetTotalCostIsActualCost() && - (currentNode->GetTotalCost() == -1 || - currentNode->GetCostLwrBound() < currentNode->GetTotalCost())) { -#if defined(IS_DEBUG_ARCHIVE) - Logger::Info("Inner node doesn't have a real cost yet. Setting total " - "cost to dynamic lower bound %d", - currentNode->GetCostLwrBound()); -#endif - currentNode->SetTotalCost(currentNode->GetCostLwrBound()); + if (!currentNode->GetTotalCostIsActualCost()) { + // Set overall weighted sum cost + if (currentNode->GetTotalCost() == -1 || + currentNode->GetCostLwrBound() < currentNode->GetTotalCost()) { + #if defined(IS_DEBUG_ARCHIVE) + Logger::Info("Inner node doesn't have a real cost yet. 
Setting total " + "cost to dynamic lower bound %d", + currentNode->GetCostLwrBound()); + #endif + currentNode->SetTotalCost(currentNode->GetCostLwrBound()); + } + + // Set clustering cost + if ((currentNode->isClustering() && currentNode->getTotalClusterCost() == -1) || + (currentNode->getClusterLwrBound() < currentNode->getTotalClusterCost())) { + currentNode->setTotalClusterCost(currentNode->getClusterLwrBound()); + } } } @@ -1359,16 +1376,25 @@ void SetTotalCostsAndSuffixes(EnumTreeNode *const currentNode, currentNode->GetTotalCost()); #endif parentNode->SetTotalCost(currentNode->GetTotalCost()); + if (currentNode->isClustering()) + parentNode->setTotalClusterCost(currentNode->getTotalClusterCost()); parentNode->SetTotalCostIsActualCost(true); parentNode->SetSuffix(std::move(parentSuffix)); - } else if (currentNode->GetTotalCost() < parentNode->GetTotalCost()) { -#if defined(IS_DEBUG_ARCHIVE) - Logger::Info( - "Current node has a real cost (%d), and so does parent. (%d)", - currentNode->GetTotalCost(), parentNode->GetTotalCost()); -#endif - parentNode->SetTotalCost(currentNode->GetTotalCost()); - parentNode->SetSuffix(std::move(parentSuffix)); + } else { + if (currentNode->GetTotalCost() < parentNode->GetTotalCost()) { + #if defined(IS_DEBUG_ARCHIVE) + Logger::Info( + "Current node has a real cost (%d), and so does parent. 
(%d)", + currentNode->GetTotalCost(), parentNode->GetTotalCost()); + #endif + parentNode->SetTotalCost(currentNode->GetTotalCost()); + parentNode->SetSuffix(std::move(parentSuffix)); + } + + // Set clustering cost + if (currentNode->isClustering() && currentNode->getTotalClusterCost() < parentNode->getTotalClusterCost()) { + parentNode->setTotalClusterCost(currentNode->getTotalClusterCost()); + } } } } @@ -1856,7 +1882,7 @@ LengthEnumerator::LengthEnumerator( bool SchedForRPOnly, bool enblStallEnum, Milliseconds timeout, InstCount preFxdInstCnt, SchedInstruction *preFxdInsts[]) : Enumerator(dataDepGraph, machMdl, schedUprBound, sigHashSize, prirts, - PruningStrategy, SchedForRPOnly, enblStallEnum, timeout, + PruningStrategy, SchedForRPOnly, enblStallEnum, timeout, false, preFxdInstCnt, preFxdInsts) { SetupAllocators_(); tmpHstryNode_ = new HistEnumTreeNode; @@ -1941,11 +1967,11 @@ LengthCostEnumerator::LengthCostEnumerator( DataDepGraph *dataDepGraph, MachineModel *machMdl, InstCount schedUprBound, int16_t sigHashSize, SchedPriorities prirts, Pruning PruningStrategy, bool SchedForRPOnly, bool enblStallEnum, Milliseconds timeout, - SPILL_COST_FUNCTION spillCostFunc, InstCount preFxdInstCnt, - SchedInstruction *preFxdInsts[]) + SPILL_COST_FUNCTION spillCostFunc, bool ClusteringEnabled, + InstCount preFxdInstCnt, SchedInstruction *preFxdInsts[]) : Enumerator(dataDepGraph, machMdl, schedUprBound, sigHashSize, prirts, PruningStrategy, SchedForRPOnly, enblStallEnum, timeout, - preFxdInstCnt, preFxdInsts) { + ClusteringEnabled, preFxdInstCnt, preFxdInsts) { SetupAllocators_(); costChkCnt_ = 0; @@ -2141,6 +2167,7 @@ bool LengthCostEnumerator::BackTrack_() { /*****************************************************************************/ InstCount LengthCostEnumerator::GetBestCost_() { return rgn_->GetBestCost(); } +int LengthCostEnumerator::GetBestClusterCost_() { return rgn_->getBestClusterCost(); } 
/*****************************************************************************/ void LengthCostEnumerator::CreateRootNode_() { diff --git a/lib/Scheduler/hist_table.cpp b/lib/Scheduler/hist_table.cpp index a4c1cae7..8a9ff356 100644 --- a/lib/Scheduler/hist_table.cpp +++ b/lib/Scheduler/hist_table.cpp @@ -400,6 +400,10 @@ void CostHistEnumTreeNode::Init_() { costInfoSet_ = false; #endif cost_ = 0; + ClusterCost = 9999999; + ClusterTotalCost = 9999999; + ClusterActiveGroup = 0; + ClusterAbsorbCount = 0; } bool CostHistEnumTreeNode::DoesDominate(EnumTreeNode *node, @@ -467,6 +471,41 @@ static bool doesHistoryPeakCostDominate(InstCount OtherPrefixCost, return LCE->GetBestCost() <= OtherPrefixCost; } +static bool doesClusterCostDominate(EnumTreeNode *CurEnumNode, + int ClusterActiveGroup, int ClusterCost, + int ClusterAbsorbCount, int ClusterTotalCost, + int ClusterBest) { + // Correct but too restrictive + if (CurEnumNode->getCurClusteringGroup() != ClusterActiveGroup) + return false; + + // Count the instructions only if there is an instruction in the ready list that belongs + // to the open cluster. If there is none, you can't add any instructions. If there are no instructions + // on the ready list that belong to the open cluster, we can set the cluster absorb count to 0. + if (CurEnumNode->getClusteringCost() >= ClusterCost && + CurEnumNode->getClusterAbsorbCount() <= ClusterAbsorbCount) + return true; + + // More room in the open cluster can reduce the number clusters by at most one + if (CurEnumNode->getClusteringCost() >= ClusterCost + 1) + return true; + + int improvement = ClusterCost - CurEnumNode->getClusteringCost(); + + // If the current node has a better absorb count then we optimistically assume it may + // improve the number of clusters by 1 + if (CurEnumNode->getClusterAbsorbCount() < ClusterAbsorbCount) + improvement++; + + // Two cases for a history node, + // 1.) One without a full schedule below it. Look at DLB. + // 2.) 
One with a full schedule below it. Look at the best found below the history node. + if (ClusterBest != INVALID_VALUE && improvement <= ClusterTotalCost - ClusterBest) + return true; + + return false; +} + // Should we prune the other node based on RP cost. bool CostHistEnumTreeNode::ChkCostDmntnForBBSpill_(EnumTreeNode *Node, Enumerator *E) { @@ -502,6 +541,10 @@ bool CostHistEnumTreeNode::ChkCostDmntnForBBSpill_(EnumTreeNode *Node, ShouldPrune = spillCostSum_ % instCnt >= Node->GetSpillCostSum() % instCnt; } + if (!ShouldPrune && LCE->isClustering()) { + int ClusterBest = LCE->getBestClusterCost(); + ShouldPrune = doesClusterCostDominate(Node, ClusterActiveGroup, ClusterCost, ClusterAbsorbCount, ClusterTotalCost, ClusterBest); + } } return ShouldPrune; } @@ -511,6 +554,10 @@ void CostHistEnumTreeNode::SetCostInfo(EnumTreeNode *node, bool, Enumerator *) { peakSpillCost_ = node->GetPeakSpillCost(); spillCostSum_ = node->GetSpillCostSum(); isLngthFsbl_ = node->IsLngthFsbl(); + ClusterCost = node->getClusteringCost(); + ClusterActiveGroup = node->getCurClusteringGroup(); + ClusterAbsorbCount = node->getClusterAbsorbCount(); + ClusterTotalCost = node->getTotalClusterCost(); // (Chris) partialCost_ = node->GetCostLwrBound(); diff --git a/lib/Scheduler/ready_list.cpp b/lib/Scheduler/ready_list.cpp index f04f467f..6bee513b 100644 --- a/lib/Scheduler/ready_list.cpp +++ b/lib/Scheduler/ready_list.cpp @@ -78,9 +78,6 @@ ReadyList::ReadyList(DataDepGraph *dataDepGraph, SchedPriorities prirts) { ClusterBit = Utilities::clcltBitsNeededToHoldNum(1); totKeyBits += ClusterBit; break; - - default: - break; } // end switch } // end for @@ -128,9 +125,6 @@ ReadyList::ReadyList(DataDepGraph *dataDepGraph, SchedPriorities prirts) { case LSH_CLUSTER: AddPrirtyToKey_(maxPriority_, keySize, ClusterBit, 1, 1); break; - - default: - break; } } } @@ -186,7 +180,6 @@ unsigned long ReadyList::CmputKey_(SchedInstruction *inst, bool isUpdate, newLastUseCnt = inst->CmputLastUseCnt(); if 
(newLastUseCnt != oldLastUseCnt) changed = true; - } AddPrirtyToKey_(key, keySize, useCntBits_, newLastUseCnt, maxUseCnt_); break; @@ -234,9 +227,6 @@ unsigned long ReadyList::CmputKey_(SchedInstruction *inst, bool isUpdate, } AddPrirtyToKey_(key, keySize, ClusterBit, ValueForKey, 1); break; - - default: - break; } } return key; @@ -253,15 +243,17 @@ void ReadyList::AddLatestSubLists(LinkedList *lst1, } void ReadyList::Print(std::ostream &out) { + PriorityList *OutList = new PriorityList; + OutList->CopyList(prirtyLst_, nullptr); out << "Ready List: "; - for (auto *crntInst = prirtyLst_->GetFrstElmnt(); crntInst != NULL; - crntInst = prirtyLst_->GetNxtElmnt()) { + for (auto *crntInst = OutList->GetFrstElmnt(); crntInst != NULL; + crntInst = OutList->GetNxtElmnt()) { out << " " << crntInst->GetNum() << "(" << crntInst->GetClusterGroup() << ")"; } out << '\n'; - prirtyLst_->ResetIterator(); + delete OutList; } void ReadyList::AddLatestSubList_(LinkedList *lst) { diff --git a/lib/Scheduler/sched_basic_data.cpp b/lib/Scheduler/sched_basic_data.cpp index a529e530..e301893e 100644 --- a/lib/Scheduler/sched_basic_data.cpp +++ b/lib/Scheduler/sched_basic_data.cpp @@ -75,8 +75,6 @@ SchedInstruction::~SchedInstruction() { } bool SchedInstruction::computeWasActive() { - if (ClusterGroup == 0) return false; - WasActive = GetActiveCluster() == GetClusterGroup(); return WasActive; } diff --git a/lib/Scheduler/sched_region.cpp b/lib/Scheduler/sched_region.cpp index a885145f..9f0f5535 100644 --- a/lib/Scheduler/sched_region.cpp +++ b/lib/Scheduler/sched_region.cpp @@ -39,6 +39,7 @@ SchedRegion::SchedRegion(MachineModel *machMdl, DataDepGraph *dataDepGraph, totalSimSpills_ = INVALID_VALUE; bestCost_ = INVALID_VALUE; + BestClusterCost = INVALID_VALUE; bestSchedLngth_ = INVALID_VALUE; hurstcCost_ = INVALID_VALUE; enumCrntSched_ = NULL; diff --git a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp index a976a8c4..2c9f55d7 100644 --- 
a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp +++ b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp @@ -538,9 +538,12 @@ int OptSchedDDGWrapperBasic::clusterNeighboringMemOps( const SUnit *SUb = MemOpRecords[Idx + 1].SU; dbgs() << " Checking possible clustering of (" << SUa->NodeNum << ") and (" << SUb->NodeNum << ")\n"; + + // Pass constant of 1 to AMD's function to determine clustering to remove + // the limit of 15. Our enumerator can determine when it has reached the + // limit instead of depending on AMD. if (DAG->TII->shouldClusterMemOps(*MemOpRecords[Idx].BaseOp, - *MemOpRecords[Idx + 1].BaseOp, - ClusterLength)) { + *MemOpRecords[Idx + 1].BaseOp, 1u)) { dbgs() << " Cluster possible at SU(" << SUa->NodeNum << ")- SU(" << SUb->NodeNum << ")\n"; From 4bfbc61a79c57416a7e6b0293e0b948c993502d2 Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Tue, 9 Jun 2020 22:43:23 -0500 Subject: [PATCH 31/40] Copy in dag mutation fix. --- .../Scheduler/OptSchedDDGWrapperBase.h | 3 ++- lib/Scheduler/sched_region.cpp | 15 ++++++++++++ lib/Wrapper/OptSchedDDGWrapperBasic.cpp | 23 +++++++++++++++---- lib/Wrapper/OptSchedDDGWrapperBasic.h | 20 ++++++++-------- lib/Wrapper/OptimizingScheduler.cpp | 14 +++++++---- 5 files changed, 57 insertions(+), 18 deletions(-) diff --git a/include/opt-sched/Scheduler/OptSchedDDGWrapperBase.h b/include/opt-sched/Scheduler/OptSchedDDGWrapperBase.h index b10c9248..6180e344 100644 --- a/include/opt-sched/Scheduler/OptSchedDDGWrapperBase.h +++ b/include/opt-sched/Scheduler/OptSchedDDGWrapperBase.h @@ -14,7 +14,8 @@ class OptSchedDDGWrapperBase { public: virtual ~OptSchedDDGWrapperBase() = default; - virtual void convertSUnits() = 0; + virtual void convertSUnits(bool IgnoreRealEdges, + bool IgnoreArtificialEdges) = 0; virtual void convertRegFiles() = 0; diff --git a/lib/Scheduler/sched_region.cpp b/lib/Scheduler/sched_region.cpp index 9f0f5535..762b2625 100644 --- a/lib/Scheduler/sched_region.cpp +++ b/lib/Scheduler/sched_region.cpp @@ -2,6 +2,7 @@ #include #include 
+#include "Wrapper/OptSchedDDGWrapperBasic.h" #include "opt-sched/Scheduler/aco.h" #include "opt-sched/Scheduler/bb_spill.h" #include "opt-sched/Scheduler/config.h" @@ -245,6 +246,19 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( #endif } + // After the sequential scheduler in the second pass, add the artificial edges + // to the DDG. Some mutations were adding artificial edges which caused a + // conflict with the sequential scheduler. Therefore, wait until the + // sequential scheduler is done before adding artificial edges. + if (IsSecondPass()) { + static_cast(dataDepGraph_)->addArtificialEdges(); + rslt = dataDepGraph_->UpdateSetupForSchdulng(needTransitiveClosure); + if (rslt != RES_SUCCESS) { + Logger::Info("Invalid DAG after adding artificial cluster edges"); + return rslt; + } + } + // Step #2: Use ACO to find a schedule if enabled and no optimal schedule is // yet to be found. if (AcoBeforeEnum && !isLstOptml) { @@ -649,6 +663,7 @@ FUNC_RESULT SchedRegion::Optimize_(Milliseconds startTime, } stats::unsolvedProblemSize.Record(dataDepGraph_->GetInstCnt()); } + return rslt; } diff --git a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp index 2c9f55d7..e2ccd8b5 100644 --- a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp +++ b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp @@ -78,7 +78,8 @@ OptSchedDDGWrapperBasic::OptSchedDDGWrapperBasic( ClusterCount = 0; } -void OptSchedDDGWrapperBasic::convertSUnits() { +void OptSchedDDGWrapperBasic::convertSUnits(bool IgnoreRealEdges, + bool IgnoreArtificialEdges) { LLVM_DEBUG(dbgs() << "Building opt_sched DAG\n"); // The extra 2 are for the artifical root and leaf nodes. instCnt_ = nodeCnt_ = DAG->SUnits.size() + 2; @@ -94,7 +95,7 @@ void OptSchedDDGWrapperBasic::convertSUnits() { // Create edges. for (const auto &SU : DAG->SUnits) { - convertEdges(SU); + convertEdges(SU, IgnoreRealEdges, IgnoreArtificialEdges); } // Add artificial root and leaf nodes and edges. 
@@ -412,13 +413,27 @@ inline void OptSchedDDGWrapperBasic::setupLeaf() { CreateEdge_(i, LeafNum, 0, DEP_OTHER); } -void OptSchedDDGWrapperBasic::convertEdges(const SUnit &SU) { +void OptSchedDDGWrapperBasic::addArtificialEdges() { + for (const auto &SU : DAG->SUnits) { + convertEdges(SU, true, false); + } +} + +void OptSchedDDGWrapperBasic::convertEdges(const SUnit &SU, + bool IgnoreRealEdges, + bool IgnoreArtificialEdges) { const MachineInstr *instr = SU.getInstr(); SUnit::const_succ_iterator I, E; for (I = SU.Succs.begin(), E = SU.Succs.end(); I != E; ++I) { if (I->getSUnit()->isBoundaryNode()) continue; + bool IsArtificial = I->isArtificial() || I->isCluster(); + if (IgnoreArtificialEdges && IsArtificial) + continue; + else if (IgnoreRealEdges && !IsArtificial) + continue; + DependenceType DepType; switch (I->getKind()) { case SDep::Data: @@ -538,7 +553,7 @@ int OptSchedDDGWrapperBasic::clusterNeighboringMemOps( const SUnit *SUb = MemOpRecords[Idx + 1].SU; dbgs() << " Checking possible clustering of (" << SUa->NodeNum << ") and (" << SUb->NodeNum << ")\n"; - + // Pass constant of 1 to AMD's function to determine clustering to remove // the limit of 15. Our enumerator can determine when it has reached the // limit instead of depending on AMD. diff --git a/lib/Wrapper/OptSchedDDGWrapperBasic.h b/lib/Wrapper/OptSchedDDGWrapperBasic.h index 373ddc52..0679e2b8 100644 --- a/lib/Wrapper/OptSchedDDGWrapperBasic.h +++ b/lib/Wrapper/OptSchedDDGWrapperBasic.h @@ -13,8 +13,8 @@ #include "opt-sched/Scheduler/graph_trans.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineScheduler.h" -#include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" #include #include #include @@ -48,7 +48,8 @@ class OptSchedDDGWrapperBasic : public DataDepGraph { /// Dump Optsched register def/use information for the region. 
void dumpOptSchedRegisters() const; - void convertSUnits() override; + void convertSUnits(bool IgnoreRealEdges, bool IgnoreArtificialEdges) override; + void addArtificialEdges(); void convertRegFiles() override; int findPossibleClusters(bool IsLoad) override; @@ -125,7 +126,8 @@ class OptSchedDDGWrapperBasic : public DataDepGraph { void convertSUnit(const llvm::SUnit &SU); // Create edges between optsched graph nodes using SUnit successors. - void convertEdges(const llvm::SUnit &SU); + void convertEdges(const llvm::SUnit &SU, bool IgnoreRealEdges, + bool IgnoreArtificialEdges); // Count number or registers defined by the region boundary. void countBoundaryLiveness(std::vector &RegDefCounts, @@ -145,11 +147,11 @@ class OptSchedDDGWrapperBasic : public DataDepGraph { std::vector consumers; }; - /// Count of the total clusters possible + /// Count of the total clusters possible int ClusterCount; -// Copied from -// https://github.com/RadeonOpenCompute/llvm/blob/roc-ocl-2.4.0/lib/CodeGen/MachineScheduler.cpp#L1467 + // Copied from + // https://github.com/RadeonOpenCompute/llvm/blob/roc-ocl-2.4.0/lib/CodeGen/MachineScheduler.cpp#L1467 struct MemOpInfo { const SUnit *SU; MachineOperand *BaseOp; @@ -191,9 +193,9 @@ class OptSchedDDGWrapperBasic : public DataDepGraph { }; // Exclude certain registers from being visible to the scheduler. Use LLVM's -// register pressure tracker to find the MAX register pressure for each register -// type (pressure set). If the MAX pressure is below a certain threshold don't -// track that register. +// register pressure tracker to find the MAX register pressure for each +// register type (pressure set). If the MAX pressure is below a certain +// threshold don't track that register. 
class LLVMRegTypeFilter { private: const MachineModel *MM; diff --git a/lib/Wrapper/OptimizingScheduler.cpp b/lib/Wrapper/OptimizingScheduler.cpp index 528801fc..d12e8294 100644 --- a/lib/Wrapper/OptimizingScheduler.cpp +++ b/lib/Wrapper/OptimizingScheduler.cpp @@ -126,7 +126,8 @@ nextIfDebug(MachineBasicBlock::iterator I, return I; } -static bool scheduleSpecificRegion(const StringRef RegionName, const Config &SchedIni) { +static bool scheduleSpecificRegion(const StringRef RegionName, + const Config &SchedIni) { const bool ScheduleSpecificRegions = SchedIni.GetBool("SCHEDULE_SPECIFIC_REGIONS"); @@ -376,11 +377,13 @@ void ScheduleDAGOptSched::schedule() { // Convert graph auto DDG = OST->createDDGWrapper(C, this, MM.get(), LatencyPrecision, RegionName); - DDG->convertSUnits(); - DDG->convertRegFiles(); // Find all clusterable instructions for the second pass. if (SecondPass) { + // In the second pass, ignore artificial edges before running the sequential + // heuristic list scheduler. + DDG->convertSUnits(false, true); + dbgs() << "Finding load clusters.\n"; int TotalLoadsInstructionsClusterable = DDG->findPossibleClusters(true); if (TotalLoadsInstructionsClusterable == 0) @@ -411,7 +414,10 @@ void ScheduleDAGOptSched::schedule() { DataDepGraphInstance->getTotalInstructionsInCluster(begin)); } } - } + } else + DDG->convertSUnits(false, false); + + DDG->convertRegFiles(); auto *BDDG = static_cast(DDG.get()); addGraphTransformations(BDDG); From 0d80260cea45033520b30dcec25a5524c12d5c59 Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Wed, 10 Jun 2020 08:49:22 -0500 Subject: [PATCH 32/40] Copy verify schedule bugfix patch for dag mutation fix. 
--- include/opt-sched/Scheduler/data_dep.h | 6 ++++-- include/opt-sched/Scheduler/graph.h | 8 ++++++-- include/opt-sched/Scheduler/sched_basic_data.h | 12 +++++------- lib/Scheduler/data_dep.cpp | 14 ++++++++++---- lib/Scheduler/sched_basic_data.cpp | 12 +++++++++--- lib/Wrapper/OptSchedDDGWrapperBasic.cpp | 3 ++- 6 files changed, 36 insertions(+), 19 deletions(-) diff --git a/include/opt-sched/Scheduler/data_dep.h b/include/opt-sched/Scheduler/data_dep.h index 5dd5c1e8..2fcd19be 100644 --- a/include/opt-sched/Scheduler/data_dep.h +++ b/include/opt-sched/Scheduler/data_dep.h @@ -295,7 +295,9 @@ class DataDepGraph : public llvm::opt_sched::OptSchedDDGWrapperBase, // Memory clustering helper functions int getMinClusterCount() { return MinClusterCount; } void setMinClusterCount(int Max) { MinClusterCount = Max; } - int getTotalInstructionsInAllClusters() { return TotalInstructionsInAllClusters; } + int getTotalInstructionsInAllClusters() { + return TotalInstructionsInAllClusters; + } void setTotalInstructionsInAllClusters(int Max) { TotalInstructionsInAllClusters = Max; } @@ -407,7 +409,7 @@ class DataDepGraph : public llvm::opt_sched::OptSchedDDGWrapperBase, InstCount fileUB, int blkNum); FUNC_RESULT FinishNode_(InstCount nodeNum, InstCount edgeCnt = -1); void CreateEdge_(InstCount frmInstNum, InstCount toInstNum, int ltncy, - DependenceType depType); + DependenceType depType, bool IsArtificial = false); FUNC_RESULT Finish_(); diff --git a/include/opt-sched/Scheduler/graph.h b/include/opt-sched/Scheduler/graph.h index 790b7164..fea0576f 100644 --- a/include/opt-sched/Scheduler/graph.h +++ b/include/opt-sched/Scheduler/graph.h @@ -49,11 +49,15 @@ struct GraphEdge { UDT_GEDGES predOrder; // The second node's order in the first node's successor list. UDT_GEDGES succOrder; + // Whether or not the edge is an artificial dependency meaning it isn't + // required to be correct + bool IsArtificial; // Creates an edge between two nodes with labels label and label2. 
GraphEdge(GraphNode *from, GraphNode *to, UDT_GLABEL label, - UDT_GLABEL label2 = 0) - : from(from), to(to), label(label), label2(label2) {} + UDT_GLABEL label2 = 0, bool IsArtificial = false) + : from(from), to(to), label(label), label2(label2), + IsArtificial(IsArtificial) {} // Returns the node on the other side of the edge from the provided node. // Assumes that the argument is one of the nodes on the sides of the edge. diff --git a/include/opt-sched/Scheduler/sched_basic_data.h b/include/opt-sched/Scheduler/sched_basic_data.h index d2c3518a..46117e9e 100644 --- a/include/opt-sched/Scheduler/sched_basic_data.h +++ b/include/opt-sched/Scheduler/sched_basic_data.h @@ -8,14 +8,11 @@ Last Update: Sept. 2013 #ifndef OPTSCHED_BASIC_SCHED_BASIC_DATA_H #define OPTSCHED_BASIC_SCHED_BASIC_DATA_H -// For class string. -#include -// For class ostream. #include "opt-sched/Scheduler/defines.h" #include "opt-sched/Scheduler/graph.h" #include "opt-sched/Scheduler/hash_table.h" #include "opt-sched/Scheduler/machine_model.h" -#include +#include namespace llvm { namespace opt_sched { @@ -208,12 +205,14 @@ class SchedInstruction : public GraphNode { // depType: the type of dependence between this node and the successor. SchedInstruction *GetFrstScsr(InstCount *prdcsrNum = NULL, UDT_GLABEL *ltncy = NULL, - DependenceType *depType = NULL); + DependenceType *depType = NULL, + bool *IsArtificial = nullptr); // Returns the next successor of this instruction node and moves the // successor iterator forward. Fills parameters as above. SchedInstruction *GetNxtScsr(InstCount *prdcsrNum = NULL, UDT_GLABEL *ltncy = NULL, - DependenceType *depType = NULL); + DependenceType *depType = NULL, + bool *IsArtificial = nullptr); // Returns the last successor of this instruction node and moves the // successor iterator to the end of the list. 
If prdcsrNum is provided, this @@ -436,7 +435,6 @@ class SchedInstruction : public GraphNode { string opCode_; bool WasActive; - /// The cluster group that the current instruction is a part of. /// Default of 0 means that it is not part of any cluster. diff --git a/lib/Scheduler/data_dep.cpp b/lib/Scheduler/data_dep.cpp index c58b4d92..513e8d9e 100644 --- a/lib/Scheduler/data_dep.cpp +++ b/lib/Scheduler/data_dep.cpp @@ -214,7 +214,7 @@ DataDepGraph::~DataDepGraph() { delete[] instCntPerType_; } -int DataDepGraph::getTotalInstructionsInCluster(int Cluster) { +int DataDepGraph::getTotalInstructionsInCluster(int Cluster) { assert(Cluster > 0); return MaxInstructionsInEachClusters[Cluster]; } @@ -2980,8 +2980,15 @@ bool InstSchedule::VerifyDataDeps_(DataDepGraph *dataDepGraph) { UDT_GLABEL ltncy; DependenceType depType; - for (SchedInstruction *scsr = inst->GetFrstScsr(NULL, <ncy, &depType); - scsr != NULL; scsr = inst->GetNxtScsr(NULL, <ncy, &depType)) { + bool IsArtificial; + for (SchedInstruction *scsr = + inst->GetFrstScsr(NULL, <ncy, &depType, &IsArtificial); + scsr != NULL; + scsr = inst->GetNxtScsr(NULL, <ncy, &depType, &IsArtificial)) { + // Artificial nodes are not required for the schedule to be correct + if (IsArtificial) + continue; + InstCount scsrCycle = GetSchedCycle(scsr); if (scsrCycle < (instCycle + ltncy)) { Logger::Error("Invalid schedule: Latency from %d to %d not satisfied", @@ -3213,7 +3220,6 @@ bool DataDepGraph::DoesFeedUser(SchedInstruction *inst) { // If there is a successor instruction that decreases live intervals // or one that does not increase live intervals, then return true. return true; - } // Return false if there is no recursive successor of inst // that uses a live register. 
diff --git a/lib/Scheduler/sched_basic_data.cpp b/lib/Scheduler/sched_basic_data.cpp index e301893e..4aec6ec6 100644 --- a/lib/Scheduler/sched_basic_data.cpp +++ b/lib/Scheduler/sched_basic_data.cpp @@ -19,7 +19,7 @@ SchedInstruction::SchedInstruction(InstCount num, const string &name, opCode_ = opCode; instType_ = instType; ClusterGroup = 0; - MayCluster = false; + MayCluster = false; frwrdLwrBound_ = INVALID_VALUE; bkwrdLwrBound_ = INVALID_VALUE; @@ -384,7 +384,8 @@ SchedInstruction *SchedInstruction::GetNxtPrdcsr(InstCount *scsrNum, SchedInstruction *SchedInstruction::GetFrstScsr(InstCount *prdcsrNum, UDT_GLABEL *ltncy, - DependenceType *depType) { + DependenceType *depType, + bool *IsArtificial) { GraphEdge *edge = GetFrstScsrEdge(); if (!edge) return NULL; @@ -394,12 +395,15 @@ SchedInstruction *SchedInstruction::GetFrstScsr(InstCount *prdcsrNum, *ltncy = edge->label; if (depType) *depType = (DependenceType)edge->label2; + if (IsArtificial) + *IsArtificial = edge->IsArtificial; return (SchedInstruction *)(edge->to); } SchedInstruction *SchedInstruction::GetNxtScsr(InstCount *prdcsrNum, UDT_GLABEL *ltncy, - DependenceType *depType) { + DependenceType *depType, + bool *IsArtificial) { GraphEdge *edge = GetNxtScsrEdge(); if (!edge) return NULL; @@ -409,6 +413,8 @@ SchedInstruction *SchedInstruction::GetNxtScsr(InstCount *prdcsrNum, *ltncy = edge->label; if (depType) *depType = (DependenceType)edge->label2; + if (IsArtificial) + *IsArtificial = edge->IsArtificial; return (SchedInstruction *)(edge->to); } diff --git a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp index e2ccd8b5..94126a51 100644 --- a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp +++ b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp @@ -460,7 +460,8 @@ void OptSchedDDGWrapperBasic::convertEdges(const SUnit &SU, else Latency = 1; // unit latency = ignore ilp - CreateEdge_(SU.NodeNum, I->getSUnit()->NodeNum, Latency, DepType); + CreateEdge_(SU.NodeNum, I->getSUnit()->NodeNum, 
Latency, DepType, + IsArtificial); } } From 58978df53231779f72c6450b6234fcbeabde47e6 Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Wed, 10 Jun 2020 08:54:03 -0500 Subject: [PATCH 33/40] Missed a file to copy over. --- lib/Scheduler/data_dep.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/Scheduler/data_dep.cpp b/lib/Scheduler/data_dep.cpp index 513e8d9e..14a38ee7 100644 --- a/lib/Scheduler/data_dep.cpp +++ b/lib/Scheduler/data_dep.cpp @@ -907,7 +907,8 @@ void DataDepGraph::CreateEdge(SchedInstruction *frmNode, } void DataDepGraph::CreateEdge_(InstCount frmNodeNum, InstCount toNodeNum, - int ltncy, DependenceType depType) { + int ltncy, DependenceType depType, + bool IsArtificial) { GraphEdge *edge; assert(frmNodeNum < instCnt_); @@ -936,7 +937,7 @@ void DataDepGraph::CreateEdge_(InstCount frmNodeNum, InstCount toNodeNum, Logger::Info("Creating edge from %d to %d of type %d and latency %d", frmNodeNum, toNodeNum, depType, ltncy); #endif - edge = new GraphEdge(frmNode, toNode, ltncy, depType); + edge = new GraphEdge(frmNode, toNode, ltncy, depType, IsArtificial); frmNode->AddScsr(edge); toNode->AddPrdcsr(edge); From ee1d32f9fb2d6b39f3aab9e9e4420cf7d80d52fd Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Fri, 12 Jun 2020 06:37:09 -0500 Subject: [PATCH 34/40] Ignore artificial edges for potential clustering and display clusters after sequential scheduler. 
--- include/opt-sched/Scheduler/bb_spill.h | 14 ++-- include/opt-sched/Scheduler/sched_region.h | 9 ++- lib/Scheduler/bb_spill.cpp | 91 ++++++++++++---------- lib/Scheduler/sched_region.cpp | 3 + lib/Wrapper/OptSchedDDGWrapperBasic.cpp | 2 +- lib/Wrapper/OptimizingScheduler.cpp | 18 +++-- 6 files changed, 79 insertions(+), 58 deletions(-) diff --git a/include/opt-sched/Scheduler/bb_spill.h b/include/opt-sched/Scheduler/bb_spill.h index e0a55a8f..5d7bd0c8 100644 --- a/include/opt-sched/Scheduler/bb_spill.h +++ b/include/opt-sched/Scheduler/bb_spill.h @@ -43,10 +43,10 @@ class BBWithSpill : public SchedRegion { int ClusterGroupCount; /// Print the current clusters found so far in the schedule. - void printCurrentClustering(); + void printCurrentClustering() override; void initForClustering(); - + /// Calculate the lower bound cost for memory operations clustering and /// return the lower bound cost. Does not take into account the clustering /// weight. @@ -69,7 +69,7 @@ class BBWithSpill : public SchedRegion { int calculateClusterDLB(); /// Current cluster size - unsigned int CurrentClusterSize; + unsigned int CurrentClusterSize; /// The minimum amount of cluster blocks possible. int MinClusterBlocks; @@ -100,7 +100,7 @@ class BBWithSpill : public SchedRegion { /// Instruction number that ended this cluster. Used to check if we should /// restore the cluster state when backtracking. 
- int InstNum; + int InstNum; int Start; @@ -110,7 +110,8 @@ class BBWithSpill : public SchedRegion { /// Constructor for this struct PastClusters(int Cluster, int Size, int Instructions, int CycleStart) - : ClusterGroup(Cluster), ClusterSize(Size), InstNum(Instructions), Start(CycleStart) {} + : ClusterGroup(Cluster), ClusterSize(Size), InstNum(Instructions), + Start(CycleStart) {} }; /// Vector containing the (n-1) past clusters @@ -195,7 +196,8 @@ class BBWithSpill : public SchedRegion { void InitForCostCmputtn_(); InstCount CmputDynmcCost_(); - void UpdateSpillInfoForSchdul_(SchedInstruction *inst, bool trackCnflcts, int Start); + void UpdateSpillInfoForSchdul_(SchedInstruction *inst, bool trackCnflcts, + int Start); void UpdateSpillInfoForUnSchdul_(SchedInstruction *inst); void SetupPhysRegs_(); void CmputCrntSpillCost_(); diff --git a/include/opt-sched/Scheduler/sched_region.h b/include/opt-sched/Scheduler/sched_region.h index 553d73b8..d5e6a9e2 100644 --- a/include/opt-sched/Scheduler/sched_region.h +++ b/include/opt-sched/Scheduler/sched_region.h @@ -52,6 +52,7 @@ class SchedRegion { // Destroys the region. Must be overriden by child classes. virtual ~SchedRegion() {} + virtual void printCurrentClustering() = 0; // Returns the dependence graph of this region. inline DataDepGraph *GetDepGraph() { return dataDepGraph_; } // Returns the lower bound on the cost of this region. @@ -130,7 +131,7 @@ class SchedRegion { // The absolute cost lower bound to be used as a ref for normalized costs. InstCount costLwrBound_ = 0; - + // The best results found so far. 
InstCount bestCost_; int BestClusterCost; @@ -184,9 +185,11 @@ class SchedRegion { void setBestClusterCost(int BestCost) { BestClusterCost = BestCost; } - void SetBestSchedLength(InstCount bestSchedLngth) { bestSchedLngth_ = bestSchedLngth; } + void SetBestSchedLength(InstCount bestSchedLngth) { + bestSchedLngth_ = bestSchedLngth; + } - const SchedPriorities& GetEnumPriorities() const { return enumPrirts_; } + const SchedPriorities &GetEnumPriorities() const { return enumPrirts_; } int16_t GetSigHashSize() const { return sigHashSize_; } diff --git a/lib/Scheduler/bb_spill.cpp b/lib/Scheduler/bb_spill.cpp index 6920eae8..f62326da 100644 --- a/lib/Scheduler/bb_spill.cpp +++ b/lib/Scheduler/bb_spill.cpp @@ -76,8 +76,8 @@ BBWithSpill::BBWithSpill(const OptSchedTarget *OST_, DataDepGraph *dataDepGraph, ClusterGroupCount = dataDepGraph_->getMinClusterCount(); MinClusterBlocks = 0; if (ClusterMemoryOperations && ClusterGroupCount > 0) { - ClusterCount.resize(ClusterGroupCount+1); - ClusterInstrRemainderCount.resize(ClusterGroupCount+1); + ClusterCount.resize(ClusterGroupCount + 1); + ClusterInstrRemainderCount.resize(ClusterGroupCount + 1); MinClusterBlocks = calculateClusterStaticLB(); initForClustering(); } @@ -97,7 +97,8 @@ void BBWithSpill::initForClustering() { for (int begin = 1; begin <= ClusterGroupCount; begin++) { ClusterCount[begin] = 0; - ClusterInstrRemainderCount[begin] = dataDepGraph_->getTotalInstructionsInCluster(begin); + ClusterInstrRemainderCount[begin] = + dataDepGraph_->getTotalInstructionsInCluster(begin); } } @@ -124,11 +125,12 @@ int BBWithSpill::calculateClusterStaticLB() { int ClusterCost = 0; for (int begin = 1; begin <= ClusterGroupCount; begin++) { int InstructionCount = dataDepGraph_->getTotalInstructionsInCluster(begin); - int CurrentClusterCost = std::ceil(double(InstructionCount) / MAX_INSTR_IN_CLUSTER); + int CurrentClusterCost = + std::ceil(double(InstructionCount) / MAX_INSTR_IN_CLUSTER); Logger::Info("Cost for block %d is %d", begin, 
CurrentClusterCost); ClusterCost += CurrentClusterCost; } - + return ClusterCost; } @@ -357,7 +359,7 @@ InstCount BBWithSpill::CmputCostLwrBound() { // Add the minimum of the possible clusters to the lower bound if (IsSecondPass() && ClusterMemoryOperations) { - staticLowerBound += MinClusterBlocks * ClusteringWeight; + staticLowerBound += MinClusterBlocks * ClusteringWeight; } #if defined(IS_DEBUG_STATIC_LOWER_BOUND) @@ -452,7 +454,7 @@ InstCount BBWithSpill::CmputCost_(InstSchedule *sched, COST_COMP_MODE compMode, cost += CurrentClusterCost * ClusteringWeight; assert(calculateClusterDLB() == CurrentClusterCost); } - + sched->SetSpillCosts(spillCosts_); sched->SetPeakRegPressures(peakRegPressures_); sched->SetSpillCost(crntSpillCost_); @@ -495,7 +497,7 @@ void BBWithSpill::saveCluster(SchedInstruction *inst) { // to its contents. LastCluster = llvm::make_unique( ClusterActiveGroup, CurrentClusterSize, inst->GetNum(), StartCycle); - + LastCluster->InstrList = std::move(InstrList); } @@ -535,28 +537,32 @@ bool BBWithSpill::isClusterFinished() { assert(ClusterActiveGroup != 0); if (ClusterInstrRemainderCount[ClusterActiveGroup] == 0 || CurrentClusterSize == MAX_INSTR_IN_CLUSTER) { - return true; + return true; } return false; } int BBWithSpill::calculateClusterDLB() { int OptimisticLowerBound = 0; - + for (int begin = 1; begin <= ClusterGroupCount; begin++) { if (begin != ClusterActiveGroup) - OptimisticLowerBound += std::ceil(double(ClusterInstrRemainderCount[begin])/MAX_INSTR_IN_CLUSTER); + OptimisticLowerBound += std::ceil( + double(ClusterInstrRemainderCount[begin]) / MAX_INSTR_IN_CLUSTER); else { - // The amount of instructions remaining that the current open cluster can add + // The amount of instructions remaining that the current open cluster can + // add int AbsorbCount = MAX_INSTR_IN_CLUSTER - CurrentClusterSize; // Assume the current open cluster can add the max amount of instructions // that a cluster can contain. 
int Remainder = ClusterInstrRemainderCount[begin] - AbsorbCount; - // If the remainder is negative then that indicates the open cluster can absorb all of the remaining instructions. + // If the remainder is negative then that indicates the open cluster can + // absorb all of the remaining instructions. if (Remainder < 0) Remainder = 0; // Estimate the optimistic dynamic lower bound for the current cluster - OptimisticLowerBound += std::ceil(double(Remainder)/MAX_INSTR_IN_CLUSTER); + OptimisticLowerBound += + std::ceil(double(Remainder) / MAX_INSTR_IN_CLUSTER); } } return CurrentClusterCost + OptimisticLowerBound; @@ -571,11 +577,12 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, int liveRegs; InstCount newSpillCost; -// Conditions for creating a cluster: -// 1.) If a block is ended before it reaches 15 && there are remaining instructions + // Conditions for creating a cluster: + // 1.) If a block is ended before it reaches 15 && there are remaining + // instructions -// Conditions for removing a cluster: -// 1.) If the block is not 15 && there are remaining instructions + // Conditions for removing a cluster: + // 1.) If the block is not 15 && there are remaining instructions // Scheduling cases for clustering project: // 1.) 
Same Cluster -> Same Cluster @@ -592,8 +599,9 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, // Check if the current instruction is part of a cluster if (inst->GetMayCluster()) { // Check if there is a current active cluster - // A ClusterActiveGroup == 0 indicates that there is no currently active clustering - // While ClusterActiveGroup != 0 indicates that there is active clustering + // A ClusterActiveGroup == 0 indicates that there is no currently active + // clustering While ClusterActiveGroup != 0 indicates that there is active + // clustering if (ClusterActiveGroup != 0) { // Check if the instruction is in the same cluster group as the active // cluster @@ -602,12 +610,11 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, // already active cluster. CurrentClusterSize++; ClusterInstrRemainderCount[ClusterActiveGroup]--; - InstrList->push_back(inst->GetName()); + InstrList->push_back(inst->GetName()); // If we reach the max amount for this cluster then save the cluster // and reset. - if (isClusterFinished()) - { + if (isClusterFinished()) { saveCluster(inst); resetActiveCluster(inst); } @@ -835,7 +842,7 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst) { if (IsSecondPass() && ClusterMemoryOperations) { // If the instruction we are backtracking from is part of a cluster if (inst->GetMayCluster()) { - if (CurrentClusterSize != 0) { + if (CurrentClusterSize != 0) { // Case 1, 2, and 3 // Reduce the cluster size CurrentClusterSize--; @@ -1071,7 +1078,7 @@ FUNC_RESULT BBWithSpill::Enumerate_(Milliseconds startTime, if (GetBestCost() == 0 || rslt == RES_ERROR || (lngthDeadline == rgnDeadline && rslt == RES_TIMEOUT)) { //|| - //(rslt == RES_SUCCESS && IsSecondPass())) { + //(rslt == RES_SUCCESS && IsSecondPass())) { // If doing two pass optsched and on the second pass then terminate if a // schedule is found with the same min-RP found in first pass. 
@@ -1080,7 +1087,8 @@ FUNC_RESULT BBWithSpill::Enumerate_(Milliseconds startTime, Logger::Info("Schedule found in second pass, terminating BB loop."); if (trgtLngth < schedUprBound_) - Logger::Info("Schedule found with length %d is shorter than current schedule with length %d.", trgtLngth, schedUprBound_); + Logger::Info("Schedule found with length %d is shorter than current + schedule with length %d.", trgtLngth, schedUprBound_); }*/ break; @@ -1156,29 +1164,32 @@ void BBWithSpill::printCurrentClustering() { dbgs() << "Printing clustered instructions:\n"; int i = 1; for (const auto &clusters : PastClustersList) { - dbgs() << "Printing cluster " << i << ", start cycle (" << clusters->Start << "): "; + dbgs() << "Printing cluster " << i << ", start cycle (" << clusters->Start + << "): "; for (const auto &instr : *clusters->InstrList) { dbgs() << instr << " "; } i++; - dbgs() << '\n'; + dbgs() << '\n'; } if (LastCluster) { - dbgs() << "Printing cluster " << i << ", start cycle (" << LastCluster->Start << "): "; + dbgs() << "Printing cluster " << i << ", start cycle (" + << LastCluster->Start << "): "; for (const auto &instr : *(LastCluster->InstrList)) { dbgs() << instr << " "; } i++; - dbgs() << '\n'; + dbgs() << '\n'; } if (InstrList && InstrList->size() > 0) { - dbgs() << "Printing cluster " << i << ", start cycle (" << StartCycle << "): "; + dbgs() << "Printing cluster " << i << ", start cycle (" << StartCycle + << "): "; for (const auto &instr : *InstrList) { dbgs() << instr << " "; } - dbgs() << '\n'; + dbgs() << '\n'; } } } @@ -1226,12 +1237,13 @@ bool BBWithSpill::ChkCostFsblty(InstCount trgtLngth, EnumTreeNode *node) { // assert(cost >= 0); assert(dynmcCostLwrBound >= 0); -/* - if (IsSecondPass() && ClusterMemoryOperations) { - dbgs() << "Current cycle: " << node->GetTime() <<", current cost is: " << dynmcCostLwrBound << ". 
Current best is: " << GetBestCost() << '\n'; - printCurrentClustering(); - } -*/ + /* + if (IsSecondPass() && ClusterMemoryOperations) { + dbgs() << "Current cycle: " << node->GetTime() <<", current cost is: " << + dynmcCostLwrBound << ". Current best is: " << GetBestCost() << '\n'; + printCurrentClustering(); + } + */ fsbl = dynmcCostLwrBound < GetBestCost(); @@ -1248,8 +1260,7 @@ bool BBWithSpill::ChkCostFsblty(InstCount trgtLngth, EnumTreeNode *node) { node->setClusterLwrBound(ClusterDynamicLowerBound); if (ClusterActiveGroup != 0) { node->setClusterAbsorbCount(15 - CurrentClusterSize); - } - else { + } else { node->setClusterAbsorbCount(0); } } diff --git a/lib/Scheduler/sched_region.cpp b/lib/Scheduler/sched_region.cpp index 762b2625..8d52bb76 100644 --- a/lib/Scheduler/sched_region.cpp +++ b/lib/Scheduler/sched_region.cpp @@ -210,6 +210,9 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( hurstcTime = Utilities::GetProcessorTime() - hurstcStart; stats::heuristicTime.Record(hurstcTime); + if (IsSecondPass()) + printCurrentClustering(); + if (hurstcTime > 0) Logger::Info("Heuristic_Time %d", hurstcTime); diff --git a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp index 94126a51..62305fa3 100644 --- a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp +++ b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp @@ -635,7 +635,7 @@ int OptSchedDDGWrapperBasic::findPossibleClusters(bool IsLoad) { unsigned ChainPredID = DAG->SUnits.size(); for (const SDep &Pred : SU.Preds) { - if (Pred.isCtrl()) { + if (Pred.isCtrl() && !(Pred.isArtificial() || Pred.isCluster())) { ChainPredID = Pred.getSUnit()->NodeNum; break; } diff --git a/lib/Wrapper/OptimizingScheduler.cpp b/lib/Wrapper/OptimizingScheduler.cpp index d12e8294..8d6e1d77 100644 --- a/lib/Wrapper/OptimizingScheduler.cpp +++ b/lib/Wrapper/OptimizingScheduler.cpp @@ -44,13 +44,13 @@ bool OPTSCHED_gPrintSpills; // An array of possible OptSched heuristic names constexpr struct { - const char* Name; + const char 
*Name; LISTSCHED_HEURISTIC HID; -} HeuristicNames[] = { - {"CP", LSH_CP}, {"LUC", LSH_LUC}, {"UC", LSH_UC}, {"NID", LSH_NID}, - {"CPR", LSH_CPR}, {"ISO", LSH_ISO}, {"SC", LSH_SC}, {"LS", LSH_LS}, - {"LLVM", LSH_LLVM}, {"CLUSTER", LSH_CLUSTER} -}; +} HeuristicNames[] = {{"CP", LSH_CP}, {"LUC", LSH_LUC}, + {"UC", LSH_UC}, {"NID", LSH_NID}, + {"CPR", LSH_CPR}, {"ISO", LSH_ISO}, + {"SC", LSH_SC}, {"LS", LSH_LS}, + {"LLVM", LSH_LLVM}, {"CLUSTER", LSH_CLUSTER}}; // Default path to the the configuration directory for opt-sched. static constexpr const char *DEFAULT_CFG_DIR = "~/.optsched-cfg/"; @@ -395,7 +395,8 @@ void ScheduleDAGOptSched::schedule() { dbgs() << " No store clustering possible\n"; Logger::Info("Total clusterable instructions: %d loads, %d stores", - TotalLoadsInstructionsClusterable, TotalStoreInstructionsClusterable); + TotalLoadsInstructionsClusterable, + TotalStoreInstructionsClusterable); // Get the DDG instance so that we can set and get information that will be // read later on during enumeration. @@ -410,7 +411,8 @@ void ScheduleDAGOptSched::schedule() { if (end > 0) { Logger::Info("Total clusters in region: %d", end); for (int begin = 1; begin <= end; begin++) { - Logger::Info(" Cluster %d has total instructions %d", begin, + Logger::Info( + " Cluster %d has total instructions %d", begin, DataDepGraphInstance->getTotalInstructionsInCluster(begin)); } } From decb49f889d3ff7f3a7631e20a055d2ec434798f Mon Sep 17 00:00:00 2001 From: vang thao Date: Sat, 18 Jul 2020 10:09:09 -0700 Subject: [PATCH 35/40] Add option to print cluster information after scheduling and revert changes to upper bound calculation. 
--- example/optsched-cfg/sched.ini | 5 ++ include/opt-sched/Scheduler/bb_spill.h | 9 +-- include/opt-sched/Scheduler/data_dep.h | 5 ++ include/opt-sched/Scheduler/sched_region.h | 9 +++ lib/Scheduler/bb_spill.cpp | 47 +++++++------ lib/Scheduler/data_dep.cpp | 4 ++ lib/Scheduler/sched_region.cpp | 78 +++++++++++++--------- 7 files changed, 97 insertions(+), 60 deletions(-) diff --git a/example/optsched-cfg/sched.ini b/example/optsched-cfg/sched.ini index 11370b31..89a79d2b 100644 --- a/example/optsched-cfg/sched.ini +++ b/example/optsched-cfg/sched.ini @@ -8,6 +8,11 @@ USE_OPT_SCHED YES # Same options as use optimal scheduling. PRINT_SPILL_COUNTS YES +# Print clustering information +# YES +# NO +PRINT_CLUSTER YES + # Use two pass scheduling approach. # First pass minimizes RP and second pass tries to balances RP and ILP. # YES diff --git a/include/opt-sched/Scheduler/bb_spill.h b/include/opt-sched/Scheduler/bb_spill.h index 5d7bd0c8..ef536b85 100644 --- a/include/opt-sched/Scheduler/bb_spill.h +++ b/include/opt-sched/Scheduler/bb_spill.h @@ -42,6 +42,8 @@ class BBWithSpill : public SchedRegion { llvm::SmallVector ClusterInstrRemainderCount; int ClusterGroupCount; + void computeAndPrintClustering(InstSchedule *Sched) override; + /// Print the current clusters found so far in the schedule. void printCurrentClustering() override; @@ -83,13 +85,6 @@ class BBWithSpill : public SchedRegion { int StartCycle; - /// Flag to enable or disable clustering memory operations in the ILP pass. - /// Reads from the sched.ini file then set the flag accordingly. - bool ClusterMemoryOperations; - - /// The weight for memory ops clustering. 
- int ClusteringWeight; - /// Data struct to contain information about the previous clusters struct PastClusters { /// The cluster group diff --git a/include/opt-sched/Scheduler/data_dep.h b/include/opt-sched/Scheduler/data_dep.h index 2fcd19be..4a1494ed 100644 --- a/include/opt-sched/Scheduler/data_dep.h +++ b/include/opt-sched/Scheduler/data_dep.h @@ -647,6 +647,9 @@ class InstSchedule { // The schedule's spill cost according to the cost function used InstCount spillCost_; + // The number of clusters + int ClusterSize; + // An array of peak reg pressures for all reg types in the schedule InstCount *peakRegPressures_; @@ -694,6 +697,8 @@ class InstSchedule { InstCount GetExecCost() const; void SetSpillCost(InstCount cost); InstCount GetSpillCost() const; + void setClusterSize(int size); + int getClusterSize() const; void ResetInstIter(); InstCount GetFrstInst(InstCount &cycleNum, InstCount &slotNum); diff --git a/include/opt-sched/Scheduler/sched_region.h b/include/opt-sched/Scheduler/sched_region.h index d5e6a9e2..0cc0a40e 100644 --- a/include/opt-sched/Scheduler/sched_region.h +++ b/include/opt-sched/Scheduler/sched_region.h @@ -52,6 +52,10 @@ class SchedRegion { // Destroys the region. Must be overriden by child classes. virtual ~SchedRegion() {} + bool PrintClustering; + + virtual void computeAndPrintClustering(InstSchedule *Sched) = 0; + virtual void printCurrentClustering() = 0; // Returns the dependence graph of this region. inline DataDepGraph *GetDepGraph() { return dataDepGraph_; } @@ -163,6 +167,11 @@ class SchedRegion { InstSchedule *enumBestSched_; // The best schedule found so far (may be heuristic or enumerator generated) InstSchedule *bestSched_; + /// Flag to enable or disable clustering memory operations in the ILP pass. + /// Reads from the sched.ini file then set the flag accordingly. + bool ClusterMemoryOperations; + /// The weight for memory ops clustering. + int ClusteringWeight; // TODO(max): Document. 
InstCount schedLwrBound_; diff --git a/lib/Scheduler/bb_spill.cpp b/lib/Scheduler/bb_spill.cpp index f62326da..92d93fe2 100644 --- a/lib/Scheduler/bb_spill.cpp +++ b/lib/Scheduler/bb_spill.cpp @@ -25,7 +25,7 @@ extern bool OPTSCHED_gPrintSpills; using namespace llvm::opt_sched; // The denominator used when calculating cost weight. -static const int COST_WGHT_BASE = 10; +static const int COST_WGHT_BASE = 100; // The max number of instructions in a cluster static const unsigned MAX_INSTR_IN_CLUSTER = 15; @@ -70,12 +70,10 @@ BBWithSpill::BBWithSpill(const OptSchedTarget *OST_, DataDepGraph *dataDepGraph, schduldEntryInstCnt_ = 0; schduldExitInstCnt_ = 0; schduldInstCnt_ = 0; - Config &schedIni = SchedulerOptions::getInstance(); - ClusterMemoryOperations = schedIni.GetBool("CLUSTER_MEMORY_OPS"); - ClusteringWeight = schedIni.GetInt("CLUSTER_WEIGHT"); ClusterGroupCount = dataDepGraph_->getMinClusterCount(); MinClusterBlocks = 0; - if (ClusterMemoryOperations && ClusterGroupCount > 0) { +// if (ClusterMemoryOperations && ClusterGroupCount > 0) { + if (ClusterGroupCount > 0) { ClusterCount.resize(ClusterGroupCount + 1); ClusterInstrRemainderCount.resize(ClusterGroupCount + 1); MinClusterBlocks = calculateClusterStaticLB(); @@ -453,6 +451,7 @@ InstCount BBWithSpill::CmputCost_(InstSchedule *sched, COST_COMP_MODE compMode, if (IsSecondPass() && ClusterMemoryOperations) { cost += CurrentClusterCost * ClusteringWeight; assert(calculateClusterDLB() == CurrentClusterCost); + sched->setClusterSize(CurrentClusterCost); } sched->SetSpillCosts(spillCosts_); @@ -487,6 +486,25 @@ void BBWithSpill::CmputCrntSpillCost_() { } /*****************************************************************************/ +void BBWithSpill::computeAndPrintClustering(InstSchedule *Sched) { + InstCount instNum; + InstCount cycleNum; + InstCount slotNum; + SchedInstruction *inst; + bool temp = ClusterMemoryOperations; + + ClusterMemoryOperations = true; + InitForCostCmputtn_(); + for (instNum = 
Sched->GetFrstInst(cycleNum, slotNum); + instNum != INVALID_VALUE; + instNum = Sched->GetNxtInst(cycleNum, slotNum)) { + inst = dataDepGraph_->GetInstByIndx(instNum); + SchdulInst(inst, cycleNum, slotNum, false); + } + printCurrentClustering(); + ClusterMemoryOperations = temp; +} + void BBWithSpill::saveCluster(SchedInstruction *inst) { if (LastCluster) // Save previous clusters in a vector except the last cluster @@ -1077,28 +1095,14 @@ FUNC_RESULT BBWithSpill::Enumerate_(Milliseconds startTime, HandlEnumrtrRslt_(rslt, trgtLngth); if (GetBestCost() == 0 || rslt == RES_ERROR || - (lngthDeadline == rgnDeadline && rslt == RES_TIMEOUT)) { //|| - //(rslt == RES_SUCCESS && IsSecondPass())) { - - // If doing two pass optsched and on the second pass then terminate if a - // schedule is found with the same min-RP found in first pass. - /* - if (rslt == RES_SUCCESS && IsSecondPass()) { - Logger::Info("Schedule found in second pass, terminating BB loop."); - - if (trgtLngth < schedUprBound_) - Logger::Info("Schedule found with length %d is shorter than current - schedule with length %d.", trgtLngth, schedUprBound_); - }*/ - + (lngthDeadline == rgnDeadline && rslt == RES_TIMEOUT)) { break; } enumrtr_->Reset(); enumCrntSched_->Reset(); - if (!IsSecondPass()) - CmputSchedUprBound_(); + CmputSchedUprBound_(); iterCnt++; costLwrBound += 1; @@ -1152,7 +1156,6 @@ InstCount BBWithSpill::UpdtOptmlSched(InstSchedule *crntSched, SetBestSchedLength(crntSched->GetCrntLngth()); enumBestSched_->Copy(crntSched); bestSched_ = enumBestSched_; - printCurrentClustering(); } return GetBestCost(); diff --git a/lib/Scheduler/data_dep.cpp b/lib/Scheduler/data_dep.cpp index 14a38ee7..7e6b3502 100644 --- a/lib/Scheduler/data_dep.cpp +++ b/lib/Scheduler/data_dep.cpp @@ -3059,6 +3059,10 @@ void InstSchedule::SetSpillCost(InstCount cost) { spillCost_ = cost; } InstCount InstSchedule::GetSpillCost() const { return spillCost_; } +void InstSchedule::setClusterSize(int size) { ClusterSize = size; } + +int 
InstSchedule::getClusterSize() const { return ClusterSize; } + /******************************************************************************* * Previously inlined functions ******************************************************************************/ diff --git a/lib/Scheduler/sched_region.cpp b/lib/Scheduler/sched_region.cpp index 8d52bb76..d40cf81d 100644 --- a/lib/Scheduler/sched_region.cpp +++ b/lib/Scheduler/sched_region.cpp @@ -49,6 +49,7 @@ SchedRegion::SchedRegion(MachineModel *machMdl, DataDepGraph *dataDepGraph, schedUprBound_ = INVALID_VALUE; spillCostFunc_ = spillCostFunc; + PrintClustering = false; } void SchedRegion::UseFileBounds_() { @@ -124,6 +125,9 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( // heuristic scheduler or ACO before the branch & bound enumerator must be // enabled. Config &schedIni = SchedulerOptions::getInstance(); + PrintClustering = schedIni.GetBool("PRINT_CLUSTER"); + ClusterMemoryOperations = schedIni.GetBool("CLUSTER_MEMORY_OPS"); + ClusteringWeight = schedIni.GetInt("CLUSTER_WEIGHT"); bool HeuristicSchedulerEnabled = schedIni.GetBool("HEUR_ENABLED"); bool AcoSchedulerEnabled = schedIni.GetBool("ACO_ENABLED"); bool BbSchedulerEnabled = isBbEnabled(schedIni, rgnTimeout); @@ -178,17 +182,6 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( CmputAbslutUprBound_(); schedLwrBound_ = dataDepGraph_->GetSchedLwrBound(); - // We can calculate lower bounds here since it is only dependent - // on schedLwrBound_ - if (!BbSchedulerEnabled) - costLwrBound_ = CmputCostLwrBound(); - else - CmputLwrBounds_(false); - - // Log the lower bound on the cost, allowing tools reading the log to compare - // absolute rather than relative costs. - Logger::Info("Lower bound of cost before scheduling: %d", costLwrBound_); - // Step #1: Find the heuristic schedule if enabled. 
// Note: Heuristic scheduler is required for the two-pass scheduler // to use the sequential list scheduler which inserts stalls into @@ -210,12 +203,37 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( hurstcTime = Utilities::GetProcessorTime() - hurstcStart; stats::heuristicTime.Record(hurstcTime); - if (IsSecondPass()) - printCurrentClustering(); if (hurstcTime > 0) Logger::Info("Heuristic_Time %d", hurstcTime); + } + // After the sequential scheduler in the second pass, add the artificial edges + // to the DDG. Some mutations were adding artificial edges which caused a + // conflict with the sequential scheduler. Therefore, wait until the + // sequential scheduler is done before adding artificial edges. + if (IsSecondPass()) { + static_cast(dataDepGraph_)->addArtificialEdges(); + rslt = dataDepGraph_->UpdateSetupForSchdulng(needTransitiveClosure); + if (rslt != RES_SUCCESS) { + Logger::Info("Invalid DAG after adding artificial cluster edges"); + return rslt; + } + } + + // This must be done after SetupForSchdulng() or UpdateSetupForSchdulng() to + // avoid resetting lower bound values. + if (!BbSchedulerEnabled) + costLwrBound_ = CmputCostLwrBound(); + else + CmputLwrBounds_(false); + + // Log the lower bound on the cost, allowing tools reading the log to compare + // absolute rather than relative costs. + Logger::Info("Lower bound of cost before scheduling: %d", costLwrBound_); + + // Cost calculation must be below lower bounds calculation + if (HeuristicSchedulerEnabled || IsSecondPass()) { heuristicScheduleLength = lstSched->GetCrntLngth(); InstCount hurstcExecCost; // Compute cost for Heuristic list scheduler, this must be called before @@ -223,6 +241,9 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( CmputNormCost_(lstSched, CCM_DYNMC, hurstcExecCost, true); hurstcCost_ = lstSched->GetCost(); + if (IsSecondPass() && PrintClustering) + computeAndPrintClustering(lstSched); + // This schedule is optimal so ACO will not be run // so set bestSched here. 
if (hurstcCost_ == 0) { @@ -230,6 +251,8 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( bestSched = bestSched_ = lstSched; bestSchedLngth_ = heuristicScheduleLength; bestCost_ = hurstcCost_; + if (IsSecondPass() && ClusterMemoryOperations) + bestSched->setClusterSize(lstSched->getClusterSize()); } FinishHurstc_(); @@ -249,19 +272,6 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( #endif } - // After the sequential scheduler in the second pass, add the artificial edges - // to the DDG. Some mutations were adding artificial edges which caused a - // conflict with the sequential scheduler. Therefore, wait until the - // sequential scheduler is done before adding artificial edges. - if (IsSecondPass()) { - static_cast(dataDepGraph_)->addArtificialEdges(); - rslt = dataDepGraph_->UpdateSetupForSchdulng(needTransitiveClosure); - if (rslt != RES_SUCCESS) { - Logger::Info("Invalid DAG after adding artificial cluster edges"); - return rslt; - } - } - // Step #2: Use ACO to find a schedule if enabled and no optimal schedule is // yet to be found. if (AcoBeforeEnum && !isLstOptml) { @@ -297,6 +307,8 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( bestSched = bestSched_ = AcoSchedule; bestSchedLngth_ = AcoScheduleLength_; bestCost_ = AcoScheduleCost_; + if (IsSecondPass() && ClusterMemoryOperations) + bestSched->setClusterSize(AcoSchedule->getClusterSize()); } } @@ -312,6 +324,8 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( bestSched = bestSched_ = lstSched; bestSchedLngth_ = heuristicScheduleLength; bestCost_ = hurstcCost_; + if (IsSecondPass() && ClusterMemoryOperations) + bestSched->setClusterSize(lstSched->getClusterSize()); } // B) Heuristic was never run. In that case, just use ACO and run with its // results, into B&B. 
@@ -319,6 +333,8 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( bestSched = bestSched_ = AcoSchedule; bestSchedLngth_ = AcoScheduleLength_; bestCost_ = AcoScheduleCost_; + if (IsSecondPass() && ClusterMemoryOperations) + bestSched->setClusterSize(AcoSchedule->getClusterSize()); // C) Neither scheduler was optimal. In that case, compare the two // schedules and use the one that's better as the input (initialSched) for // B&B. @@ -327,6 +343,8 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( bestSched = bestSched_; bestSchedLngth_ = bestSched_->GetCrntLngth(); bestCost_ = bestSched_->GetCost(); + if (IsSecondPass() && ClusterMemoryOperations) + bestSched->setClusterSize(bestSched_->getClusterSize()); } } // Step #3: Compute the cost upper bound. @@ -453,6 +471,9 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( enumTime = Utilities::GetProcessorTime() - enumStart; stats::enumerationTime.Record(enumTime); + + if (IsSecondPass() && PrintClustering && enumBestSched_ != NULL) + computeAndPrintClustering(enumBestSched_); } // Step 5: Run ACO if schedule from enumerator is not optimal @@ -727,11 +748,6 @@ bool SchedRegion::CmputUprBounds_(InstSchedule *schedule, bool useFileBounds) { // If the heuristic schedule is optimal, we are done! schedUprBound_ = bestSchedLngth_; return true; - } else if (IsSecondPass()) { - // In the second pass, the upper bound is the length of the min-RP schedule - // that was found in the first pass with stalls inserted. 
- schedUprBound_ = schedule->GetCrntLngth(); - return false; } else { CmputSchedUprBound_(); return false; From 913f83d150c24bc053948338e6ec568e6df99fc5 Mon Sep 17 00:00:00 2001 From: vang thao Date: Sat, 15 Aug 2020 17:27:26 -0700 Subject: [PATCH 36/40] Added 2nd ILP pass with lower target occupancy --- include/opt-sched/Scheduler/data_dep.h | 1 + include/opt-sched/Scheduler/sched_region.h | 7 ++- lib/CMakeLists.txt | 1 + lib/Scheduler/bb_spill.cpp | 27 ++++++-- lib/Scheduler/data_dep.cpp | 39 ++++++++++++ lib/Scheduler/sched_region.cpp | 15 +++-- lib/Wrapper/AMDGPU/GCNOptSched.cpp | 45 ++++++++++++- lib/Wrapper/AMDGPU/GCNOptSched.h | 10 ++- lib/Wrapper/AMDGPU/OptSchedGCNTarget.cpp | 71 +++++---------------- lib/Wrapper/AMDGPU/OptSchedGCNTarget.h | 73 ++++++++++++++++++++++ lib/Wrapper/OptSchedDDGWrapperBasic.cpp | 16 ++--- lib/Wrapper/OptimizingScheduler.cpp | 51 ++++++++++----- lib/Wrapper/OptimizingScheduler.h | 13 +++- 13 files changed, 274 insertions(+), 95 deletions(-) create mode 100644 lib/Wrapper/AMDGPU/OptSchedGCNTarget.h diff --git a/include/opt-sched/Scheduler/data_dep.h b/include/opt-sched/Scheduler/data_dep.h index 4a1494ed..5b021145 100644 --- a/include/opt-sched/Scheduler/data_dep.h +++ b/include/opt-sched/Scheduler/data_dep.h @@ -722,6 +722,7 @@ class InstSchedule { void Print(std::ostream &out, char const *const title); void PrintInstList(FILE *file, DataDepGraph *dataDepGraph, const char *title) const; + void Print(std::ostream &out, char const *const title, DataDepGraph *ddg); void PrintRegPressures() const; bool Verify(MachineModel *machMdl, DataDepGraph *dataDepGraph); void PrintClassData(); diff --git a/include/opt-sched/Scheduler/sched_region.h b/include/opt-sched/Scheduler/sched_region.h index 0cc0a40e..2685b7d0 100644 --- a/include/opt-sched/Scheduler/sched_region.h +++ b/include/opt-sched/Scheduler/sched_region.h @@ -53,7 +53,7 @@ class SchedRegion { virtual ~SchedRegion() {} bool PrintClustering; - + bool TwoPassEnabled; virtual void 
computeAndPrintClustering(InstSchedule *Sched) = 0; virtual void printCurrentClustering() = 0; @@ -113,6 +113,9 @@ class SchedRegion { // Initialie variables for the second pass of the two-pass-optsched void InitSecondPass(); + bool enumFoundSchedule() { return EnumFoundSchedule; } + void setEnumFoundSchedule() { EnumFoundSchedule = true; } + private: // The algorithm to use for calculated lower bounds. LB_ALG lbAlg_; @@ -133,6 +136,8 @@ class SchedRegion { // Used for two-pass-optsched to enable second pass functionalies. bool isSecondPass_; + bool EnumFoundSchedule; + // The absolute cost lower bound to be used as a ref for normalized costs. InstCount costLwrBound_ = 0; diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index 847cc6e5..9102bf94 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -5,3 +5,4 @@ ELSE() ENDIF() add_dependencies(OptSched ${OPT_SCHED_TARGET_DEPS}) +target_link_libraries(OptSched -L/home/vang/src/ROCm-2.4/opencl/build/lib/ libamdocl64.so) diff --git a/lib/Scheduler/bb_spill.cpp b/lib/Scheduler/bb_spill.cpp index 92d93fe2..ee817ec7 100644 --- a/lib/Scheduler/bb_spill.cpp +++ b/lib/Scheduler/bb_spill.cpp @@ -381,7 +381,7 @@ void BBWithSpill::InitForSchdulng() { /*****************************************************************************/ void BBWithSpill::InitForCostCmputtn_() { - if (IsSecondPass() && ClusterMemoryOperations) + if (ClusterMemoryOperations && (IsSecondPass() || !TwoPassEnabled)) initForClustering(); int i; @@ -434,8 +434,23 @@ InstCount BBWithSpill::CmputNormCost_(InstSchedule *sched, InstCount BBWithSpill::CmputCost_(InstSchedule *sched, COST_COMP_MODE compMode, InstCount &execCost, bool trackCnflcts) { + + InstCount instNum; + InstCount cycleNum; + InstCount slotNum; + SchedInstruction *inst; + if (compMode == CCM_STTC) { - if (GetSpillCostFunc() == SCF_SPILLS) { + if (GetSpillCostFunc() != SCF_SPILLS) { + InitForCostCmputtn_(); + + for (instNum = sched->GetFrstInst(cycleNum, slotNum); + instNum != 
INVALID_VALUE; + instNum = sched->GetNxtInst(cycleNum, slotNum)) { + inst = dataDepGraph_->GetInstByIndx(instNum); + SchdulInst(inst, cycleNum, slotNum, trackCnflcts); + } + } else { LocalRegAlloc regAlloc(sched, dataDepGraph_); regAlloc.SetupForRegAlloc(); regAlloc.AllocRegs(); @@ -613,7 +628,7 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, // Possibly keep track of the current memory clustering size here // and in UpdateSpillInfoForUnSchdul_() - if (IsSecondPass() && ClusterMemoryOperations) { + if (ClusterMemoryOperations && (IsSecondPass() || !TwoPassEnabled)) { // Check if the current instruction is part of a cluster if (inst->GetMayCluster()) { // Check if there is a current active cluster @@ -857,7 +872,7 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst) { // 2.) Non-Cluster <- Cluster // 3.) Different Cluster <- Cluster // 4.) Cluster <- Non-cluster - if (IsSecondPass() && ClusterMemoryOperations) { + if (ClusterMemoryOperations && (IsSecondPass() || !TwoPassEnabled)) { // If the instruction we are backtracking from is part of a cluster if (inst->GetMayCluster()) { if (CurrentClusterSize != 0) { @@ -1156,6 +1171,8 @@ InstCount BBWithSpill::UpdtOptmlSched(InstSchedule *crntSched, SetBestSchedLength(crntSched->GetCrntLngth()); enumBestSched_->Copy(crntSched); bestSched_ = enumBestSched_; + if (!enumFoundSchedule()) + setEnumFoundSchedule(); } return GetBestCost(); @@ -1163,7 +1180,7 @@ InstCount BBWithSpill::UpdtOptmlSched(InstSchedule *crntSched, void BBWithSpill::printCurrentClustering() { // Print the instructions in the clusters after finding a schedule. 
- if (IsSecondPass() && ClusterMemoryOperations) { + if (ClusterMemoryOperations && (IsSecondPass() || !TwoPassEnabled)) { dbgs() << "Printing clustered instructions:\n"; int i = 1; for (const auto &clusters : PastClustersList) { diff --git a/lib/Scheduler/data_dep.cpp b/lib/Scheduler/data_dep.cpp index 7e6b3502..ef6e2cda 100644 --- a/lib/Scheduler/data_dep.cpp +++ b/lib/Scheduler/data_dep.cpp @@ -2762,6 +2762,7 @@ void InstSchedule::Copy(InstSchedule *src) { SetSpillCosts(src->spillCosts_); SetPeakRegPressures(src->peakRegPressures_); + setClusterSize(src->getClusterSize()); cost_ = src->cost_; execCost_ = src->execCost_; spillCost_ = src->spillCost_; @@ -2836,6 +2837,44 @@ void InstSchedule::Print(std::ostream &out, char const *const label) { } } + + void InstSchedule::Print(std::ostream &out, char const *const title, + DataDepGraph *ddg) { + InstCount slotInCycle = 0; + InstCount cycleNum = 0; + InstCount i; + + // out << '\n' << label << " Schedule"; + Logger::Info("Printing Schedule"); + + for (i = 0; i < crntSlotNum_; i++) { + if (slotInCycle == 0) { + if (instInSlot_[i] != SCHD_STALL) { + InstCount instNum = instInSlot_[i]; + SchedInstruction *inst = ddg->GetInstByIndx(instNum); + Logger::Info("Cycle# %d : %d - %s", cycleNum, instInSlot_[i], inst->GetName()); + } else + Logger::Info("Cycle# %d : %d -", cycleNum, instInSlot_[i]); + } + /* + out << "\nCycle# " << cycleNum << ": "; + + if (instInSlot_[i] == SCHD_STALL) { + out << "X "; + } else { + out << instInSlot_[i] << ' '; + } + */ + + slotInCycle++; + + if (slotInCycle == issuRate_) { + slotInCycle = 0; + cycleNum++; + } + } + } + #if defined(IS_DEBUG_PEAK_PRESSURE) || defined(IS_DEBUG_OPTSCHED_PRESSURES) void InstSchedule::PrintRegPressures() const { Logger::Info("OptSched max reg pressures:"); diff --git a/lib/Scheduler/sched_region.cpp b/lib/Scheduler/sched_region.cpp index d40cf81d..cb545bc7 100644 --- a/lib/Scheduler/sched_region.cpp +++ b/lib/Scheduler/sched_region.cpp @@ -50,6 +50,7 @@ 
SchedRegion::SchedRegion(MachineModel *machMdl, DataDepGraph *dataDepGraph, spillCostFunc_ = spillCostFunc; PrintClustering = false; + EnumFoundSchedule = false; } void SchedRegion::UseFileBounds_() { @@ -126,6 +127,7 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( // enabled. Config &schedIni = SchedulerOptions::getInstance(); PrintClustering = schedIni.GetBool("PRINT_CLUSTER"); + TwoPassEnabled = schedIni.GetBool("USE_TWO_PASS"); ClusterMemoryOperations = schedIni.GetBool("CLUSTER_MEMORY_OPS"); ClusteringWeight = schedIni.GetInt("CLUSTER_WEIGHT"); bool HeuristicSchedulerEnabled = schedIni.GetBool("HEUR_ENABLED"); @@ -241,9 +243,6 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( CmputNormCost_(lstSched, CCM_DYNMC, hurstcExecCost, true); hurstcCost_ = lstSched->GetCost(); - if (IsSecondPass() && PrintClustering) - computeAndPrintClustering(lstSched); - // This schedule is optimal so ACO will not be run // so set bestSched here. if (hurstcCost_ == 0) { @@ -471,9 +470,6 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( enumTime = Utilities::GetProcessorTime() - enumStart; stats::enumerationTime.Record(enumTime); - - if (IsSecondPass() && PrintClustering && enumBestSched_ != NULL) - computeAndPrintClustering(enumBestSched_); } // Step 5: Run ACO if schedule from enumerator is not optimal @@ -646,6 +642,13 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( } #endif + if (PrintClustering && bestSched != NULL && (IsSecondPass() || !TwoPassEnabled)) { + computeAndPrintClustering(bestSched); + } + + //if (bestSched != NULL) + //bestSched->Print(Logger::GetLogStream(), "FinalSched", dataDepGraph_); + return rslt; } diff --git a/lib/Wrapper/AMDGPU/GCNOptSched.cpp b/lib/Wrapper/AMDGPU/GCNOptSched.cpp index 46d6c1a3..8b987e3e 100644 --- a/lib/Wrapper/AMDGPU/GCNOptSched.cpp +++ b/lib/Wrapper/AMDGPU/GCNOptSched.cpp @@ -8,7 +8,10 @@ #include "AMDGPUMacroFusion.h" #include "GCNSchedStrategy.h" #include "SIMachineFunctionInfo.h" +#include "OptSchedGCNTarget.h" +//#include 
"llvm/CodeGen/OptSequential.h" #include "llvm/Support/Debug.h" +#include #define DEBUG_TYPE "optsched" @@ -60,6 +63,7 @@ void ScheduleDAGOptSchedGCN::initSchedulers() { SchedPasses.push_back(OptSchedMaxOcc); // Second SchedPasses.push_back(OptSchedBalanced); + SchedPasses.push_back(OptSchedReschedule); } // Execute scheduling passes. @@ -67,15 +71,42 @@ void ScheduleDAGOptSchedGCN::initSchedulers() { void ScheduleDAGOptSchedGCN::finalizeSchedule() { if (TwoPassEnabled && OptSchedEnabled) { initSchedulers(); + RescheduleRegions.resize(Regions.size()); + RescheduleRegions.set(); LLVM_DEBUG(dbgs() << "Starting two pass scheduling approach\n"); TwoPassSchedulingStarted = true; for (const SchedPassStrategy &S : SchedPasses) { MachineBasicBlock *MBB = nullptr; // Reset - RegionNumber = ~0u; + RegionIdx = 0; + if (S == OptSchedReschedule) { + if (RescheduleRegions.none()) { + dbgs() << "No regions to reschedule.\n"; + continue; + } else { + auto GCNOST = static_cast(OST.get()); + unsigned TargetOccupancy = GCNOST->getTargetOcc(); + if (TargetOccupancy == 1u) { + dbgs() << "Cannot lower occupancy to below 1.\n"; + continue; + } + + dbgs() << "Beginning rescheduling of regions.\n"; + unsigned NewTarget = TargetOccupancy - 1u; + dbgs() << "Decreasing current target occupancy " << TargetOccupancy + << " to new target " << NewTarget << '\n'; + GCNOST->limitOccupancy(NewTarget); + } + } for (auto &Region : Regions) { + /*if (S == OptSchedReschedule && !RescheduleRegions[RegionIdx]) { + dbgs() << "Region " << RegionIdx << " does not need to be rescheduled.\n"; + ++RegionIdx; + continue; + }*/ + RegionBegin = Region.first; RegionEnd = Region.second; @@ -98,6 +129,7 @@ void ScheduleDAGOptSchedGCN::finalizeSchedule() { LLVM_DEBUG(getRealRegionPressure(RegionBegin, RegionEnd, LIS, "After")); Region = std::make_pair(RegionBegin, RegionEnd); exitRegion(); + ++RegionIdx; } finishBlock(); } @@ -114,6 +146,7 @@ void ScheduleDAGOptSchedGCN::finalizeSchedule() { } void 
ScheduleDAGOptSchedGCN::runSchedPass(SchedPassStrategy S) { + RescheduleRegions[RegionIdx] = false; switch (S) { case GCNMaxOcc: scheduleGCNMaxOcc(); @@ -124,6 +157,9 @@ void ScheduleDAGOptSchedGCN::runSchedPass(SchedPassStrategy S) { case OptSchedBalanced: scheduleOptSchedBalanced(); break; + case OptSchedReschedule: + scheduleOptSchedReschedule(); + break; } } @@ -144,3 +180,10 @@ void ScheduleDAGOptSchedGCN::scheduleOptSchedMaxOcc() { void ScheduleDAGOptSchedGCN::scheduleOptSchedBalanced() { ScheduleDAGOptSched::scheduleOptSchedBalanced(); } + +void ScheduleDAGOptSchedGCN::scheduleOptSchedReschedule() { + IsThirdPass = true; + ScheduleDAGOptSched::scheduleOptSchedBalanced(); + Logger::Info("End of third pass through\n"); +} + diff --git a/lib/Wrapper/AMDGPU/GCNOptSched.h b/lib/Wrapper/AMDGPU/GCNOptSched.h index f08056aa..3d2646af 100644 --- a/lib/Wrapper/AMDGPU/GCNOptSched.h +++ b/lib/Wrapper/AMDGPU/GCNOptSched.h @@ -9,13 +9,19 @@ #include "../OptimizingScheduler.h" #include "GCNRegPressure.h" +#include "OptSchedGCNTarget.h" namespace llvm { namespace opt_sched { class ScheduleDAGOptSchedGCN : public ScheduleDAGOptSched { private: - enum SchedPassStrategy { GCNMaxOcc, OptSchedMaxOcc, OptSchedBalanced }; + enum SchedPassStrategy { + GCNMaxOcc, + OptSchedMaxOcc, + OptSchedBalanced, + OptSchedReschedule + }; // Vector of scheduling passes to execute. SmallVector SchedPasses; @@ -45,6 +51,8 @@ class ScheduleDAGOptSchedGCN : public ScheduleDAGOptSched { // Run OptSched in ILP/RP balanced mode. void scheduleOptSchedBalanced() override; + + void scheduleOptSchedReschedule(); }; } // namespace opt_sched diff --git a/lib/Wrapper/AMDGPU/OptSchedGCNTarget.cpp b/lib/Wrapper/AMDGPU/OptSchedGCNTarget.cpp index 21faf51e..9f63a720 100644 --- a/lib/Wrapper/AMDGPU/OptSchedGCNTarget.cpp +++ b/lib/Wrapper/AMDGPU/OptSchedGCNTarget.cpp @@ -3,6 +3,7 @@ // AMDGCN OptSched target. 
// //===----------------------------------------------------------------------===// +#include "OptSchedGCNTarget.h" #include "OptSchedDDGWrapperGCN.h" #include "SIMachineFunctionInfo.h" #include "Wrapper/OptSchedMachineWrapper.h" @@ -22,7 +23,7 @@ using namespace llvm::opt_sched; // This is necessary because we cannot perfectly predict the number of registers // of each type that will be allocated. -static const unsigned GPRErrorMargin = 3; +static const unsigned GPRErrorMargin = 0; #ifndef NDEBUG static unsigned getOccupancyWeight(unsigned Occupancy) { @@ -62,56 +63,6 @@ static unsigned getAdjustedOccupancy(const GCNSubtarget *ST, unsigned VGPRCount, namespace { -class OptSchedGCNTarget : public OptSchedTarget { -public: - std::unique_ptr - createMachineModel(const char *ConfigPath) override { - return llvm::make_unique(ConfigPath); - } - - std::unique_ptr - createDDGWrapper(llvm::MachineSchedContext *Context, ScheduleDAGOptSched *DAG, - OptSchedMachineModel *MM, LATENCY_PRECISION LatencyPrecision, - const std::string &RegionID) override { - return llvm::make_unique(Context, DAG, MM, - LatencyPrecision, RegionID); - } - - void initRegion(llvm::ScheduleDAGInstrs *DAG, MachineModel *MM_) override; - - void finalizeRegion(const InstSchedule *Schedule) override; - - // Returns occupancy cost with number of VGPRs and SGPRs from PRP for - // a partial or complete schedule. - InstCount getCost(const llvm::SmallVectorImpl &PRP) const override; - - void dumpOccupancyInfo(const InstSchedule *Schedule) const; - - // Revert scheduing if we decrease occupancy. 
- bool shouldKeepSchedule() override; - -private: - const llvm::MachineFunction *MF; - SIMachineFunctionInfo *MFI; - ScheduleDAGOptSched *DAG; - const GCNSubtarget *ST; - - unsigned RegionStartingOccupancy; - unsigned RegionEndingOccupancy; - unsigned TargetOccupancy; - - // Max occupancy with local memory size; - unsigned MaxOccLDS; - - // In RP only (max occupancy) scheduling mode we should try to find - // a min-RP schedule without considering perf hints which suggest limiting - // occupancy. Returns true if we should consider perf hints. - bool shouldLimitWaves() const; - - // Find occupancy with spill cost. - unsigned getOccupancyWithCost(const InstCount Cost) const; -}; - std::unique_ptr createOptSchedGCNTarget() { return llvm::make_unique(); } @@ -161,9 +112,9 @@ void OptSchedGCNTarget::initRegion(llvm::ScheduleDAGInstrs *DAG_, TargetOccupancy = shouldLimitWaves() ? MFI->getMinAllowedOccupancy() : MFI->getOccupancy(); - LLVM_DEBUG(dbgs() << "Region starting occupancy is " + dbgs() << "Region starting occupancy is " << RegionStartingOccupancy << "\n" - << "Target occupancy is " << TargetOccupancy << "\n"); + << "Target occupancy is " << TargetOccupancy << "\n"; } bool OptSchedGCNTarget::shouldLimitWaves() const { @@ -173,6 +124,16 @@ bool OptSchedGCNTarget::shouldLimitWaves() const { return false; } +void OptSchedGCNTarget::setTargetOcc(unsigned Target) { + dbgs() << "Setting target occupancy to " << Target << '\n'; + TargetOccupancy = Target; +} +void OptSchedGCNTarget::limitOccupancy(unsigned Limit) { + dbgs() << "Limiting occupancy to " << Limit << '\n'; + MFI->limitOccupancy(Limit); + TargetOccupancy = MFI->getOccupancy(); +} + unsigned OptSchedGCNTarget::getOccupancyWithCost(const InstCount Cost) const { return TargetOccupancy - Cost; } @@ -184,9 +145,9 @@ void OptSchedGCNTarget::finalizeRegion(const InstSchedule *Schedule) { // If we decrease occupancy we may revert scheduling. 
unsigned RegionOccupancy = std::max(RegionStartingOccupancy, RegionEndingOccupancy); - LLVM_DEBUG(if (RegionOccupancy < MFI->getOccupancy()) dbgs() + if (RegionOccupancy < MFI->getOccupancy()) dbgs() << "Limiting occupancy to " << RegionEndingOccupancy - << " waves.\n"); + << " waves.\n"; MFI->limitOccupancy(RegionOccupancy); } diff --git a/lib/Wrapper/AMDGPU/OptSchedGCNTarget.h b/lib/Wrapper/AMDGPU/OptSchedGCNTarget.h new file mode 100644 index 00000000..996caaff --- /dev/null +++ b/lib/Wrapper/AMDGPU/OptSchedGCNTarget.h @@ -0,0 +1,73 @@ +#ifndef LLVM_GCN_OPT_SCHED_TARGET_H +#define LLVM_GCN_OPT_SCHED_TARGET_H + +#include "OptSchedDDGWrapperGCN.h" +#include "SIMachineFunctionInfo.h" +#include "Wrapper/OptSchedMachineWrapper.h" +#include "opt-sched/Scheduler/OptSchedTarget.h" +#include "opt-sched/Scheduler/data_dep.h" +#include "opt-sched/Scheduler/defines.h" +#include "opt-sched/Scheduler/machine_model.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/CodeGen/ScheduleDAGInstrs.h" +#include +#include + +using namespace llvm; +using namespace llvm::opt_sched; + +class OptSchedGCNTarget : public OptSchedTarget { +public: + std::unique_ptr + createMachineModel(const char *ConfigPath) override { + return llvm::make_unique(ConfigPath); + } + + std::unique_ptr + createDDGWrapper(llvm::MachineSchedContext *Context, ScheduleDAGOptSched *DAG, + OptSchedMachineModel *MM, LATENCY_PRECISION LatencyPrecision, + const std::string &RegionID) override { + return llvm::make_unique(Context, DAG, MM, + LatencyPrecision, RegionID); + } + + void initRegion(llvm::ScheduleDAGInstrs *DAG, MachineModel *MM_) override; + + void finalizeRegion(const InstSchedule *Schedule) override; + + // Returns occupancy cost with number of VGPRs and SGPRs from PRP for + // a partial or complete schedule. + InstCount getCost(const llvm::SmallVectorImpl &PRP) const override; + + void dumpOccupancyInfo(const InstSchedule *Schedule) const; + + // Revert scheduing if we decrease occupancy. 
+ bool shouldKeepSchedule() override; + + void limitOccupancy(unsigned Limit); + unsigned getTargetOcc() { return TargetOccupancy; } + void setTargetOcc(unsigned Target); + +private: + const llvm::MachineFunction *MF; + SIMachineFunctionInfo *MFI; + ScheduleDAGOptSched *DAG; + const GCNSubtarget *ST; + + unsigned RegionStartingOccupancy; + unsigned RegionEndingOccupancy; + unsigned TargetOccupancy; + + // Max occupancy with local memory size; + unsigned MaxOccLDS; + + // In RP only (max occupancy) scheduling mode we should try to find + // a min-RP schedule without considering perf hints which suggest limiting + // occupancy. Returns true if we should consider perf hints. + bool shouldLimitWaves() const; + + // Find occupancy with spill cost. + unsigned getOccupancyWithCost(const InstCount Cost) const; +}; + +#endif diff --git a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp index 62305fa3..f5b03fe7 100644 --- a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp +++ b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp @@ -543,7 +543,7 @@ int OptSchedDDGWrapperBasic::clusterNeighboringMemOps( } if (MemOpRecords.size() < 2) { - dbgs() << " Unable to cluster memop cluster of 1.\n"; + LLVM_DEBUG(dbgs() << " Unable to cluster memop cluster of 1.\n"); return 0; } @@ -552,16 +552,16 @@ int OptSchedDDGWrapperBasic::clusterNeighboringMemOps( for (unsigned Idx = 0, End = MemOpRecords.size(); Idx < (End - 1); ++Idx) { const SUnit *SUa = MemOpRecords[Idx].SU; const SUnit *SUb = MemOpRecords[Idx + 1].SU; - dbgs() << " Checking possible clustering of (" << SUa->NodeNum << ") and (" - << SUb->NodeNum << ")\n"; + LLVM_DEBUG(dbgs() << " Checking possible clustering of (" << SUa->NodeNum << ") and (" + << SUb->NodeNum << ")\n"); // Pass constant of 1 to AMD's function to determine clustering to remove // the limit of 15. Our enumerator can determine when it has reached the // limit instead of depending on AMD. 
if (DAG->TII->shouldClusterMemOps(*MemOpRecords[Idx].BaseOp, *MemOpRecords[Idx + 1].BaseOp, 1u)) { - dbgs() << " Cluster possible at SU(" << SUa->NodeNum << ")- SU(" - << SUb->NodeNum << ")\n"; + LLVM_DEBUG(dbgs() << " Cluster possible at SU(" << SUa->NodeNum << ")- SU(" + << SUb->NodeNum << ")\n"); // If clustering is possible then increase the cluster count. This only // happens once every new cluster @@ -653,10 +653,10 @@ int OptSchedDDGWrapperBasic::findPossibleClusters(bool IsLoad) { // Iterate over the store chains. for (auto &SCD : StoreChainDependents) { // Print the chain that LLVM has found - dbgs() << "Printing the Node ID of the current chain: "; + LLVM_DEBUG(dbgs() << "Printing the Node ID of the current chain: "); for (auto SU1 : SCD) - dbgs() << SU1->NodeNum << " "; - dbgs() << '\n'; + LLVM_DEBUG(dbgs() << SU1->NodeNum << " "); + LLVM_DEBUG(dbgs() << '\n'); TotalInstructionsPossible += clusterNeighboringMemOps(SCD); } diff --git a/lib/Wrapper/OptimizingScheduler.cpp b/lib/Wrapper/OptimizingScheduler.cpp index 8d6e1d77..ab09f7c2 100644 --- a/lib/Wrapper/OptimizingScheduler.cpp +++ b/lib/Wrapper/OptimizingScheduler.cpp @@ -16,10 +16,14 @@ #include "opt-sched/Scheduler/register.h" #include "opt-sched/Scheduler/sched_region.h" #include "opt-sched/Scheduler/utilities.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/MachineScheduler.h" +/*#include "llvm/CodeGen/OptSequential.h"*/ #include "llvm/CodeGen/RegisterClassInfo.h" #include "llvm/CodeGen/RegisterPressure.h" #include "llvm/CodeGen/ScheduleDAG.h" @@ -34,11 +38,15 @@ #include #include #include +#include "AMDGPU/OptSchedGCNTarget.h" #define DEBUG_TYPE "optsched" using namespace llvm::opt_sched; +llvm::SmallVector UniqueRegionNames; +llvm::DenseMap RegionCounter; + // hack to print spills bool 
OPTSCHED_gPrintSpills; @@ -71,8 +79,8 @@ static ScheduleDAGInstrs *createOptSched(MachineSchedContext *C) { // Register the machine scheduler. static MachineSchedRegistry OptSchedMIRegistry("optsched", - "Use the OptSched scheduler.", - createOptSched); + "Use the OptSched scheduler.", + createOptSched); // Command line options for opt-sched. static cl::opt OptSchedCfg( @@ -258,10 +266,9 @@ void ScheduleDAGOptSched::schedule() { ShouldTrackLaneMasks = true; Config &schedIni = SchedulerOptions::getInstance(); - ++RegionNumber; const std::string RegionName = C->MF->getFunction().getName().data() + std::string(":") + - std::to_string(RegionNumber); + std::to_string(RegionIdx); // If two pass scheduling is enabled then // first just record the scheduling region. @@ -374,16 +381,21 @@ void ScheduleDAGOptSched::schedule() { // Build LLVM DAG SetupLLVMDag(); OST->initRegion(this, MM.get()); + // Convert graph auto DDG = OST->createDDGWrapper(C, this, MM.get(), LatencyPrecision, RegionName); - // Find all clusterable instructions for the second pass. - if (SecondPass) { - // In the second pass, ignore artificial edges before running the sequential - // heuristic list scheduler. - DDG->convertSUnits(false, true); + // In the second pass, ignore artificial edges before running the sequential + // heuristic list scheduler. + if (SecondPass) + DDG->convertSUnits(/*IgnoreRealEdges=*/false, + /*IgnoreArtificialEdges=*/true); + else + DDG->convertSUnits(false, false); + // Find all clusterable instructions for the second pass. 
+ if (SecondPass || (!TwoPassEnabled && schedIni.GetBool("PRINT_CLUSTER"))) { dbgs() << "Finding load clusters.\n"; int TotalLoadsInstructionsClusterable = DDG->findPossibleClusters(true); if (TotalLoadsInstructionsClusterable == 0) @@ -416,8 +428,7 @@ void ScheduleDAGOptSched::schedule() { DataDepGraphInstance->getTotalInstructionsInCluster(begin)); } } - } else - DDG->convertSUnits(false, false); + } DDG->convertRegFiles(); @@ -469,10 +480,18 @@ void ScheduleDAGOptSched::schedule() { return; } + // BB Enumerator did not find a schedule. + // Add the region to the list to be rescheduled. + if (SecondPass && !region->enumFoundSchedule() && !IsEasy && !IsThirdPass) + RescheduleRegions[RegionIdx] = true; + LLVM_DEBUG(Logger::Info("OptSched succeeded.")); - OST->finalizeRegion(Sched); - if (!OST->shouldKeepSchedule()) - return; + + if (!IsThirdPass) { + OST->finalizeRegion(Sched); + if (!OST->shouldKeepSchedule()) + return; + } // Count simulated spills. if (isSimRegAllocEnabled()) { @@ -570,6 +589,7 @@ void ScheduleDAGOptSched::loadOptSchedConfig() { TwoPassEnabled = isTwoPassEnabled(); TwoPassSchedulingStarted = false; SecondPass = false; + IsThirdPass = false; LatencyPrecision = fetchLatencyPrecision(); TreatOrderAsDataDeps = schedIni.GetBool("TREAT_ORDER_DEPS_AS_DATA_DEPS"); @@ -784,13 +804,14 @@ bool ScheduleDAGOptSched::rpMismatch(InstSchedule *sched) { void ScheduleDAGOptSched::finalizeSchedule() { if (TwoPassEnabled && OptSchedEnabled) { initSchedulers(); + RescheduleRegions.resize(Regions.size()); LLVM_DEBUG(dbgs() << "Starting two pass scheduling approach\n"); TwoPassSchedulingStarted = true; for (const SchedPassStrategy &S : SchedPasses) { MachineBasicBlock *MBB = nullptr; // Reset - RegionNumber = ~0u; + RegionIdx = 0; for (auto &Region : Regions) { RegionBegin = Region.first; diff --git a/lib/Wrapper/OptimizingScheduler.h b/lib/Wrapper/OptimizingScheduler.h index 784c0681..aadca182 100644 --- a/lib/Wrapper/OptimizingScheduler.h +++ 
b/lib/Wrapper/OptimizingScheduler.h @@ -14,6 +14,7 @@ #include "opt-sched/Scheduler/data_dep.h" #include "opt-sched/Scheduler/graph_trans.h" #include "opt-sched/Scheduler/sched_region.h" +#include "llvm/ADT/BitVector.h" #include "llvm/ADT/SmallString.h" #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/Support/Debug.h" @@ -59,13 +60,19 @@ class ScheduleDAGOptSched : public ScheduleDAGMILive { // pass. Used for the two pass scheduling approach. bool SecondPass; + bool IsThirdPass; + // Region number uniquely identifies DAGs. - unsigned RegionNumber = ~0u; + size_t RegionIdx; + + // Records if a region is not yet scheduled, or schedule has been reverted, + // or we generally desire to reschedule it. + llvm::BitVector RescheduleRegions; MachineSchedContext *C; // The OptSched target machine. - std::unique_ptr OST; + std::shared_ptr OST; // into the OptSched machine model std::unique_ptr MM; @@ -251,7 +258,7 @@ class ScheduleDAGOptSched : public ScheduleDAGMILive { void dumpLLVMRegisters() const; // Getter for region number - int getRegionNum() const { return RegionNumber; } + int getRegionNum() const { return RegionIdx; } // Return the boundary instruction for this region if it is not a sentinel // value. From b01eeff892f23af377febdf3ce637d91b984869c Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Mon, 17 Aug 2020 15:51:41 -0500 Subject: [PATCH 37/40] Add two conditions for re-scheduling ILP pass; Minimum occupancy and minimum ILP improvements. 
--- example/optsched-cfg/sched.ini | 14 +++- lib/Wrapper/AMDGPU/GCNOptSched.cpp | 116 +++++++++++++++++++++------- lib/Wrapper/AMDGPU/GCNOptSched.h | 21 ++++- lib/Wrapper/OptimizingScheduler.cpp | 50 ++++++++---- lib/Wrapper/OptimizingScheduler.h | 18 ++++- 5 files changed, 170 insertions(+), 49 deletions(-) diff --git a/example/optsched-cfg/sched.ini b/example/optsched-cfg/sched.ini index 89a79d2b..d1c88a18 100644 --- a/example/optsched-cfg/sched.ini +++ b/example/optsched-cfg/sched.ini @@ -19,7 +19,19 @@ PRINT_CLUSTER YES # NO USE_TWO_PASS NO -# Allow enumerator to try to cluster memory operations together in the second pass. +# Sets a limit for occupancy in the second ILP pass. We will not go below this +# occupancy when attempting rescheduling. +# Valid values: 1-10 (whole integers) +MIN_OCCUPANCY_FOR_RESCHEDULE 3 + +# Sets the required schedule length improvement percentage for the second ILP +# pass. If we do not meet this minimum improvement then we do not keep the +# lower occupancy schedules. +# Valid values: 0-100 (whole integers) +MIN_ILP_IMPROVEMENT 10 + +# Allow enumerator to try to cluster memory operations together in the second +# pass. 
# YES # NO CLUSTER_MEMORY_OPS NO diff --git a/lib/Wrapper/AMDGPU/GCNOptSched.cpp b/lib/Wrapper/AMDGPU/GCNOptSched.cpp index 8b987e3e..c2f65463 100644 --- a/lib/Wrapper/AMDGPU/GCNOptSched.cpp +++ b/lib/Wrapper/AMDGPU/GCNOptSched.cpp @@ -7,8 +7,8 @@ #include "GCNOptSched.h" #include "AMDGPUMacroFusion.h" #include "GCNSchedStrategy.h" -#include "SIMachineFunctionInfo.h" #include "OptSchedGCNTarget.h" +#include "SIMachineFunctionInfo.h" //#include "llvm/CodeGen/OptSequential.h" #include "llvm/Support/Debug.h" #include @@ -46,7 +46,29 @@ static void getRealRegionPressure(MachineBasicBlock::const_iterator Begin, ScheduleDAGOptSchedGCN::ScheduleDAGOptSchedGCN( llvm::MachineSchedContext *C, std::unique_ptr S) - : ScheduleDAGOptSched(C, std::move(S)) {} + : ScheduleDAGOptSched(C, std::move(S)) { + MinOcc = getMinOcc(); +} + +unsigned ScheduleDAGOptSchedGCN::getMinOcc() { + SchedulerOptions &schedIni = SchedulerOptions::getInstance(); + int MinOcc = schedIni.GetInt("MIN_OCCUPANCY_FOR_RESCHEDULE"); + if (MinOcc <= 10 || MinOcc >= 1) + return MinOcc; + + llvm_unreachable( + "Unrecognized option for MIN_OCCUPANCY_FOR_RESCHEDULE setting.") +} + +int ScheduleDAGOptSchedGCN::getMinILPImprovement() { + SchedulerOptions &schedIni = SchedulerOptions::getInstance(); + int MinIlpImprovement = schedIni.GetInt("MIN_ILP_IMPROVEMENT"); + if (MinIlpImprovement <= 100 || MinIlpImprovement >= 1) + return MinIlpImprovement; + + llvm_unreachable( + "Unrecognized option for MIN_OCCUPANCY_FOR_RESCHEDULE setting.") +} void ScheduleDAGOptSchedGCN::initSchedulers() { // Add DAG mutations that apply to both GCN and OptSched DAG's @@ -61,10 +83,11 @@ void ScheduleDAGOptSchedGCN::initSchedulers() { // First SchedPasses.push_back(OptSchedMaxOcc); - // Second + // Second ILP passes SchedPasses.push_back(OptSchedBalanced); - SchedPasses.push_back(OptSchedReschedule); -} + SchedPasses.push_back(OptSchedLowerOccAnalysis); + SchedPasses.push_back(OptSchedCommitLowerOcc); +} // Execute scheduling passes. 
// Partially copied GCNScheduleDAGMILive::finalizeSchedule @@ -72,6 +95,8 @@ void ScheduleDAGOptSchedGCN::finalizeSchedule() { if (TwoPassEnabled && OptSchedEnabled) { initSchedulers(); RescheduleRegions.resize(Regions.size()); + ILPAnalysis.resize(Regions.size()); + CostAnalysis.resize(Regions.size()); RescheduleRegions.set(); LLVM_DEBUG(dbgs() << "Starting two pass scheduling approach\n"); @@ -80,32 +105,37 @@ void ScheduleDAGOptSchedGCN::finalizeSchedule() { MachineBasicBlock *MBB = nullptr; // Reset RegionIdx = 0; - if (S == OptSchedReschedule) { + if (S == OptSchedLowerOccAnalysis) { if (RescheduleRegions.none()) { - dbgs() << "No regions to reschedule.\n"; - continue; - } else { + dbgs() << "No regions to reschedule.\n"; + break; + } else { auto GCNOST = static_cast(OST.get()); unsigned TargetOccupancy = GCNOST->getTargetOcc(); - if (TargetOccupancy == 1u) { - dbgs() << "Cannot lower occupancy to below 1.\n"; - continue; - } + if (TargetOccupancy <= MinOcc) { + dbgs() << "Cannot lower occupancy to below minimum occupancy of " + << MinOCc << '\n'; + break; + } dbgs() << "Beginning rescheduling of regions.\n"; - unsigned NewTarget = TargetOccupancy - 1u; - dbgs() << "Decreasing current target occupancy " << TargetOccupancy + unsigned NewTarget = TargetOccupancy - 1u; + dbgs() << "Decreasing current target occupancy " << TargetOccupancy << " to new target " << NewTarget << '\n'; - GCNOST->limitOccupancy(NewTarget); - } + GCNOST->limitOccupancy(NewTarget); + } + } else if (S == OptSchedCommitLowerOcc) { + if (!shouldCommitLowerOccSched()) + break; } for (auto &Region : Regions) { - /*if (S == OptSchedReschedule && !RescheduleRegions[RegionIdx]) { - dbgs() << "Region " << RegionIdx << " does not need to be rescheduled.\n"; - ++RegionIdx; - continue; - }*/ + /*if (S == OptSchedLowerOccAnalysis && !RescheduleRegions[RegionIdx]) { + dbgs() << "Region " << RegionIdx << " does not need to be + rescheduled.\n"; + ++RegionIdx; + continue; + }*/ RegionBegin = 
Region.first; RegionEnd = Region.second; @@ -124,7 +154,8 @@ void ScheduleDAGOptSchedGCN::finalizeSchedule() { exitRegion(); continue; } - LLVM_DEBUG(getRealRegionPressure(RegionBegin, RegionEnd, LIS, "Before")); + LLVM_DEBUG( + getRealRegionPressure(RegionBegin, RegionEnd, LIS, "Before")); runSchedPass(S); LLVM_DEBUG(getRealRegionPressure(RegionBegin, RegionEnd, LIS, "After")); Region = std::make_pair(RegionBegin, RegionEnd); @@ -153,12 +184,19 @@ void ScheduleDAGOptSchedGCN::runSchedPass(SchedPassStrategy S) { break; case OptSchedMaxOcc: scheduleOptSchedMaxOcc(); + Logger::Info("End of first pass through"); break; case OptSchedBalanced: scheduleOptSchedBalanced(); + Logger::Info("End of second pass through"); + break; + case OptSchedLowerOccAnalysis: + scheduleOptSchedLowerOccAnalysis(); + Logger::Info("End of third pass through"); break; - case OptSchedReschedule: - scheduleOptSchedReschedule(); + case OptSchedCommitLowerOcc: + scheduleCommitLowerOcc(); + Logger::Info("End of fourth pass through"); break; } } @@ -181,9 +219,33 @@ void ScheduleDAGOptSchedGCN::scheduleOptSchedBalanced() { ScheduleDAGOptSched::scheduleOptSchedBalanced(); } -void ScheduleDAGOptSchedGCN::scheduleOptSchedReschedule() { +void ScheduleDAGOptSchedGCN::scheduleOptSchedLowerOccAnalysis() { IsThirdPass = true; ScheduleDAGOptSched::scheduleOptSchedBalanced(); - Logger::Info("End of third pass through\n"); + IsThirdPass = false; } +void ScheduleDAGOptSchedGCN::scheduleCommitLowerOcc() { + IsFourthPass = true; + ScheduleDAGOptSched::scheduleOptSchedBalanced(); + IsFourthPass = false; +} + +bool ScheduleDAGOptSchedGCN::shouldCommitLowerOccSched() { + // First analyze ILP improvements + int FirstPassILP = 0; + int SecondPassILP = 0; + int MinILPImprovement = getMinILPImprovement(); + for (std::pair &RegionLength : ILPAnalysis) { + FirstPassILP += RegionLength.first; + SecondPassILP += RegionLength.second; + } + double ILPImprovement = + ((FirstPassILP - SecondPassILP) / (double)FirstPassILP) * 
100.0; + dbgs() << "ILPImprovement from second ILP pass is " << ILPImprovement + << ", min improvement is: " << MinILPImprovement << '\n'; + if (ILPImprovement >= MinILPImprovement) + return true; + + return false; +} \ No newline at end of file diff --git a/lib/Wrapper/AMDGPU/GCNOptSched.h b/lib/Wrapper/AMDGPU/GCNOptSched.h index 3d2646af..0a8df221 100644 --- a/lib/Wrapper/AMDGPU/GCNOptSched.h +++ b/lib/Wrapper/AMDGPU/GCNOptSched.h @@ -20,12 +20,24 @@ class ScheduleDAGOptSchedGCN : public ScheduleDAGOptSched { GCNMaxOcc, OptSchedMaxOcc, OptSchedBalanced, - OptSchedReschedule + OptSchedLowerOccAnalysis, + OptSchedCommitLowerOcc }; + /// Get the minimum occupancy value from the sched.ini settings file. Check + /// if the value is between 1-10 and gives an error if it is not between the + /// valid range. + unsigned getMinOcc(); + + /// Analyze the possible improvements from lowering the target occupancy + /// and decide if we should keep the schedules. + bool shouldCommitLowerOccSched(); + // Vector of scheduling passes to execute. SmallVector SchedPasses; + unsigned MinOcc; + public: ScheduleDAGOptSchedGCN(llvm::MachineSchedContext *C, std::unique_ptr S); @@ -52,7 +64,12 @@ class ScheduleDAGOptSchedGCN : public ScheduleDAGOptSched { // Run OptSched in ILP/RP balanced mode. void scheduleOptSchedBalanced() override; - void scheduleOptSchedReschedule(); + // Lower occupancy and run OptSched in ILP/RP balanced mode for analysis. + void scheduleOptSchedLowerOccAnalysis(); + + // Lower occupancy and run OptSched in ILP/RP balanced mode to commit + // scheduling in analysis pass. 
+ void scheduleCommitLowerOcc(); }; } // namespace opt_sched diff --git a/lib/Wrapper/OptimizingScheduler.cpp b/lib/Wrapper/OptimizingScheduler.cpp index ab09f7c2..356e5c32 100644 --- a/lib/Wrapper/OptimizingScheduler.cpp +++ b/lib/Wrapper/OptimizingScheduler.cpp @@ -24,6 +24,7 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/MachineScheduler.h" /*#include "llvm/CodeGen/OptSequential.h"*/ +#include "AMDGPU/OptSchedGCNTarget.h" #include "llvm/CodeGen/RegisterClassInfo.h" #include "llvm/CodeGen/RegisterPressure.h" #include "llvm/CodeGen/ScheduleDAG.h" @@ -38,7 +39,6 @@ #include #include #include -#include "AMDGPU/OptSchedGCNTarget.h" #define DEBUG_TYPE "optsched" @@ -79,8 +79,8 @@ static ScheduleDAGInstrs *createOptSched(MachineSchedContext *C) { // Register the machine scheduler. static MachineSchedRegistry OptSchedMIRegistry("optsched", - "Use the OptSched scheduler.", - createOptSched); + "Use the OptSched scheduler.", + createOptSched); // Command line options for opt-sched. static cl::opt OptSchedCfg( @@ -267,8 +267,7 @@ void ScheduleDAGOptSched::schedule() { Config &schedIni = SchedulerOptions::getInstance(); const std::string RegionName = C->MF->getFunction().getName().data() + - std::string(":") + - std::to_string(RegionIdx); + std::string(":") + std::to_string(RegionIdx); // If two pass scheduling is enabled then // first just record the scheduling region. @@ -388,14 +387,14 @@ void ScheduleDAGOptSched::schedule() { // In the second pass, ignore artificial edges before running the sequential // heuristic list scheduler. - if (SecondPass) + if (IsSecondPass) DDG->convertSUnits(/*IgnoreRealEdges=*/false, /*IgnoreArtificialEdges=*/true); else DDG->convertSUnits(false, false); // Find all clusterable instructions for the second pass. 
- if (SecondPass || (!TwoPassEnabled && schedIni.GetBool("PRINT_CLUSTER"))) { + if (IsSecondPass || (!TwoPassEnabled && schedIni.GetBool("PRINT_CLUSTER"))) { dbgs() << "Finding load clusters.\n"; int TotalLoadsInstructionsClusterable = DDG->findPossibleClusters(true); if (TotalLoadsInstructionsClusterable == 0) @@ -460,7 +459,7 @@ void ScheduleDAGOptSched::schedule() { } // Used for two-pass-optsched to alter upper bound value. - if (SecondPass) + if (IsSecondPass) region->InitSecondPass(); // Setup time before scheduling @@ -482,15 +481,29 @@ void ScheduleDAGOptSched::schedule() { // BB Enumerator did not find a schedule. // Add the region to the list to be rescheduled. - if (SecondPass && !region->enumFoundSchedule() && !IsEasy && !IsThirdPass) + if (IsSecondPass && !region->enumFoundSchedule() && !IsEasy && !IsThirdPass && + !IsFourthPass) RescheduleRegions[RegionIdx] = true; LLVM_DEBUG(Logger::Info("OptSched succeeded.")); - if (!IsThirdPass) { - OST->finalizeRegion(Sched); - if (!OST->shouldKeepSchedule()) + OST->finalizeRegion(Sched); + + if (IsFirstPass || IsSecondPass) + if (!OST->shouldKeepSchedule()) { + if (IsSecondPass) { + // We do not keep the schedule so the results of the sequential + // heuristic scheduler is the final result for the second pass. + ILPAnalysis[RegionIdx].first = HurstcSchedLngth; + } return; + } + + if (IsSecondPass) + ILPAnalysis[RegionIdx].first = BestSchedLngth; + else if (IsThirdPass) { + ILPAnalysis[RegionIdx].second = BestSchedLngth; + return; } // Count simulated spills. 
@@ -588,8 +601,10 @@ void ScheduleDAGOptSched::loadOptSchedConfig() { OptSchedEnabled = isOptSchedEnabled(); TwoPassEnabled = isTwoPassEnabled(); TwoPassSchedulingStarted = false; - SecondPass = false; + IsFirstPass = false; + IsSecondPass = false; IsThirdPass = false; + IsFourthPass = false; LatencyPrecision = fetchLatencyPrecision(); TreatOrderAsDataDeps = schedIni.GetBool("TREAT_ORDER_DEPS_AS_DATA_DEPS"); @@ -853,14 +868,17 @@ void ScheduleDAGOptSched::runSchedPass(SchedPassStrategy S) { switch (S) { case OptSchedMinRP: scheduleOptSchedMinRP(); + Logger::Info("End of first pass through"); break; case OptSchedBalanced: scheduleOptSchedBalanced(); + Logger::Info("End of second pass through"); break; } } void ScheduleDAGOptSched::scheduleOptSchedMinRP() { + IsFirstPass = true; LatencyPrecision = LTP_UNITY; // Set times for the first pass RegionTimeout = FirstPassRegionTimeout; @@ -868,11 +886,11 @@ void ScheduleDAGOptSched::scheduleOptSchedMinRP() { HeurSchedType = SCHED_LIST; schedule(); - Logger::Info("End of first pass through\n"); + IsFirstPass = false; } void ScheduleDAGOptSched::scheduleOptSchedBalanced() { - SecondPass = true; + IsSecondPass = true; LatencyPrecision = LTP_ROUGH; // Set times for the second pass @@ -899,7 +917,7 @@ void ScheduleDAGOptSched::scheduleOptSchedBalanced() { MultiPassStaticNodeSup = false; schedule(); - Logger::Info("End of second pass through"); + IsSecondPass = false; } bool ScheduleDAGOptSched::isSimRegAllocEnabled() const { diff --git a/lib/Wrapper/OptimizingScheduler.h b/lib/Wrapper/OptimizingScheduler.h index aadca182..872b3f81 100644 --- a/lib/Wrapper/OptimizingScheduler.h +++ b/lib/Wrapper/OptimizingScheduler.h @@ -37,8 +37,15 @@ class ScheduleDAGOptSched : public ScheduleDAGMILive { // Vector of scheduling passes to execute. SmallVector SchedPasses; -protected: + /// Contains the results of the first ILP pass and second analysis ILP pass. 
+ /// Used to calculate if we should keep the lower target occupancy schedules + /// in the second ILP pass. First element is the first ILP pass and second + /// element is the second analysis ILP pass. + SmallVector, 32> ILPAnalysis; + /// TODO: Same as above for cost analysis. + SmallVector, 32> CostAnalysis; +protected: // Vector of regions recorded for later rescheduling SmallVector< std::pair, 32> @@ -56,12 +63,16 @@ class ScheduleDAGOptSched : public ScheduleDAGMILive { // Path to the machine model specification file for opt-sched. SmallString<128> PathCfgMM; + bool IsFirstPass; + // Bool value indicating that the scheduler is in the second // pass. Used for the two pass scheduling approach. - bool SecondPass; + bool IsSecondPass; bool IsThirdPass; + bool isFourthPass; + // Region number uniquely identifies DAGs. size_t RegionIdx; @@ -167,7 +178,8 @@ class ScheduleDAGOptSched : public ScheduleDAGMILive { SchedPriorities SecondPassPriorities; - // The heuristic used for the second pass enumerator in the two-pass scheduling approach. + // The heuristic used for the second pass enumerator in the two-pass + // scheduling approach. SchedPriorities SecondPassEnumPriorities; // Static node superiority RP only graph transformation. 
From 9bbb91d4d5bfe0bc597724b5677f0823134b2ef1 Mon Sep 17 00:00:00 2001 From: vang thao Date: Wed, 19 Aug 2020 09:11:15 -0700 Subject: [PATCH 38/40] Fix ILP Improvement calculation bugs --- lib/CMakeLists.txt | 1 - lib/Scheduler/sched_region.cpp | 5 ++- lib/Wrapper/AMDGPU/GCNOptSched.cpp | 47 +++++++++++++++++++---------- lib/Wrapper/AMDGPU/GCNOptSched.h | 2 ++ lib/Wrapper/OptimizingScheduler.cpp | 8 +++-- lib/Wrapper/OptimizingScheduler.h | 15 ++++----- 6 files changed, 51 insertions(+), 27 deletions(-) diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index 9102bf94..847cc6e5 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -5,4 +5,3 @@ ELSE() ENDIF() add_dependencies(OptSched ${OPT_SCHED_TARGET_DEPS}) -target_link_libraries(OptSched -L/home/vang/src/ROCm-2.4/opencl/build/lib/ libamdocl64.so) diff --git a/lib/Scheduler/sched_region.cpp b/lib/Scheduler/sched_region.cpp index cb545bc7..64e4bc56 100644 --- a/lib/Scheduler/sched_region.cpp +++ b/lib/Scheduler/sched_region.cpp @@ -411,6 +411,9 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( InitialSchedule = bestSched_; InitialScheduleCost = bestCost_; InitialScheduleLength = bestSchedLngth_; + /*Logger::Info("Printing Initiial schedule"); + InitialSchedule->Print(Logger::GetLogStream(), "InitialSched", dataDepGraph_); + Logger::Info("Finish printing initial schedule");*/ // Step #4: Find the optimal schedule if the heuristc and ACO was not optimal. 
if (BbSchedulerEnabled) { @@ -647,7 +650,7 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( } //if (bestSched != NULL) - //bestSched->Print(Logger::GetLogStream(), "FinalSched", dataDepGraph_); + // bestSched->Print(Logger::GetLogStream(), "FinalSched", dataDepGraph_); return rslt; } diff --git a/lib/Wrapper/AMDGPU/GCNOptSched.cpp b/lib/Wrapper/AMDGPU/GCNOptSched.cpp index c2f65463..0a9434af 100644 --- a/lib/Wrapper/AMDGPU/GCNOptSched.cpp +++ b/lib/Wrapper/AMDGPU/GCNOptSched.cpp @@ -53,21 +53,23 @@ ScheduleDAGOptSchedGCN::ScheduleDAGOptSchedGCN( unsigned ScheduleDAGOptSchedGCN::getMinOcc() { SchedulerOptions &schedIni = SchedulerOptions::getInstance(); int MinOcc = schedIni.GetInt("MIN_OCCUPANCY_FOR_RESCHEDULE"); - if (MinOcc <= 10 || MinOcc >= 1) + if (MinOcc <= 10 && MinOcc >= 1) return MinOcc; - llvm_unreachable( - "Unrecognized option for MIN_OCCUPANCY_FOR_RESCHEDULE setting.") + Logger::Fatal( + "Unrecognized option for MIN_OCCUPANCY_FOR_RESCHEDULE setting: %d", + MinOcc); } int ScheduleDAGOptSchedGCN::getMinILPImprovement() { SchedulerOptions &schedIni = SchedulerOptions::getInstance(); int MinIlpImprovement = schedIni.GetInt("MIN_ILP_IMPROVEMENT"); - if (MinIlpImprovement <= 100 || MinIlpImprovement >= 1) + if (MinIlpImprovement <= 100 && MinIlpImprovement >= 0) return MinIlpImprovement; - llvm_unreachable( - "Unrecognized option for MIN_OCCUPANCY_FOR_RESCHEDULE setting.") + Logger::Fatal( + "Unrecognized option for MIN_OCCUPANCY_FOR_RESCHEDULE setting: %d", + MinIlpImprovement); } void ScheduleDAGOptSchedGCN::initSchedulers() { @@ -114,7 +116,7 @@ void ScheduleDAGOptSchedGCN::finalizeSchedule() { unsigned TargetOccupancy = GCNOST->getTargetOcc(); if (TargetOccupancy <= MinOcc) { dbgs() << "Cannot lower occupancy to below minimum occupancy of " - << MinOCc << '\n'; + << MinOcc << '\n'; break; } @@ -125,8 +127,14 @@ void ScheduleDAGOptSchedGCN::finalizeSchedule() { GCNOST->limitOccupancy(NewTarget); } } else if (S == OptSchedCommitLowerOcc) { - if 
(!shouldCommitLowerOccSched()) + dbgs() + << "Analyzing if we should commit the lower occupancy schedule\n"; + if (!shouldCommitLowerOccSched()) { + dbgs() + << "Lower occupancy schedule did not meet minimum improvement.\n"; break; + } + dbgs() << "Lower occupancy met minimum improvement requirement!\n"; } for (auto &Region : Regions) { @@ -233,19 +241,26 @@ void ScheduleDAGOptSchedGCN::scheduleCommitLowerOcc() { bool ScheduleDAGOptSchedGCN::shouldCommitLowerOccSched() { // First analyze ILP improvements - int FirstPassILP = 0; - int SecondPassILP = 0; + int FirstPassLengthSum = 0; + int SecondPassLengthSum = 0; int MinILPImprovement = getMinILPImprovement(); for (std::pair &RegionLength : ILPAnalysis) { - FirstPassILP += RegionLength.first; - SecondPassILP += RegionLength.second; + dbgs() << "First length -- " << RegionLength.first << ", Second length -- " + << RegionLength.second << '\n'; + FirstPassLengthSum += RegionLength.first; + SecondPassLengthSum += RegionLength.second; } - double ILPImprovement = - ((FirstPassILP - SecondPassILP) / (double)FirstPassILP) * 100.0; + dbgs() << "First pass length sum: " << FirstPassLengthSum << '\n'; + dbgs() << "Second pass length sum: " << SecondPassLengthSum << '\n'; + double FirstPassAverageLength = (double)FirstPassLengthSum / Regions.size(); + double SecondPassAverageLength = (double)SecondPassLengthSum / Regions.size(); + double ILPImprovement = ((FirstPassAverageLength - SecondPassAverageLength) / + FirstPassAverageLength) * + 100.0; dbgs() << "ILPImprovement from second ILP pass is " << ILPImprovement << ", min improvement is: " << MinILPImprovement << '\n'; - if (ILPImprovement >= MinILPImprovement) + if (ILPImprovement - MinILPImprovement >= 0) return true; return false; -} \ No newline at end of file +} diff --git a/lib/Wrapper/AMDGPU/GCNOptSched.h b/lib/Wrapper/AMDGPU/GCNOptSched.h index 0a8df221..c24c93c1 100644 --- a/lib/Wrapper/AMDGPU/GCNOptSched.h +++ b/lib/Wrapper/AMDGPU/GCNOptSched.h @@ -29,6 +29,8 @@ class 
ScheduleDAGOptSchedGCN : public ScheduleDAGOptSched { /// valid range. unsigned getMinOcc(); + int getMinILPImprovement(); + /// Analyze the possible improvements from lowering the target occupancy /// and decide if we should keep the schedules. bool shouldCommitLowerOccSched(); diff --git a/lib/Wrapper/OptimizingScheduler.cpp b/lib/Wrapper/OptimizingScheduler.cpp index 356e5c32..3e9a03b7 100644 --- a/lib/Wrapper/OptimizingScheduler.cpp +++ b/lib/Wrapper/OptimizingScheduler.cpp @@ -380,6 +380,10 @@ void ScheduleDAGOptSched::schedule() { // Build LLVM DAG SetupLLVMDag(); OST->initRegion(this, MM.get()); + /*if (IsSecondPass && !IsThirdPass && !IsFourthPass) { + auto GCNOST = static_cast(OST.get()); + GCNOST->setTargetOcc(5); + }*/ // Convert graph auto DDG = @@ -489,7 +493,7 @@ void ScheduleDAGOptSched::schedule() { OST->finalizeRegion(Sched); - if (IsFirstPass || IsSecondPass) + if (!IsThirdPass && !IsFourthPass && (IsFirstPass || IsSecondPass)) if (!OST->shouldKeepSchedule()) { if (IsSecondPass) { // We do not keep the schedule so the results of the sequential @@ -499,7 +503,7 @@ void ScheduleDAGOptSched::schedule() { return; } - if (IsSecondPass) + if (IsSecondPass && !IsThirdPass && !IsFourthPass) ILPAnalysis[RegionIdx].first = BestSchedLngth; else if (IsThirdPass) { ILPAnalysis[RegionIdx].second = BestSchedLngth; diff --git a/lib/Wrapper/OptimizingScheduler.h b/lib/Wrapper/OptimizingScheduler.h index 872b3f81..502a7cd2 100644 --- a/lib/Wrapper/OptimizingScheduler.h +++ b/lib/Wrapper/OptimizingScheduler.h @@ -37,6 +37,13 @@ class ScheduleDAGOptSched : public ScheduleDAGMILive { // Vector of scheduling passes to execute. SmallVector SchedPasses; + +protected: + // Vector of regions recorded for later rescheduling + SmallVector< + std::pair, 32> + Regions; + /// Contains the results of the first ILP pass and second analysis ILP pass. /// Used to calculate if we should keep the lower target occupancy schedules /// in the second ILP pass. 
First element is the first ILP pass and second @@ -45,12 +52,6 @@ class ScheduleDAGOptSched : public ScheduleDAGMILive { /// TODO: Same as above for cost analysis. SmallVector, 32> CostAnalysis; -protected: - // Vector of regions recorded for later rescheduling - SmallVector< - std::pair, 32> - Regions; - // Path to opt-sched config options directory. SmallString<128> PathCfg; @@ -71,7 +72,7 @@ class ScheduleDAGOptSched : public ScheduleDAGMILive { bool IsThirdPass; - bool isFourthPass; + bool IsFourthPass; // Region number uniquely identifies DAGs. size_t RegionIdx; From 6b28d0d9ec8709711325d511dd2d899cdae6d4a1 Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Thu, 20 Aug 2020 21:06:31 -0500 Subject: [PATCH 39/40] Disable heuristic scheduler and B&B enumerator in 3rd ILP pass. --- lib/Wrapper/AMDGPU/GCNOptSched.cpp | 47 +++++++++-------------------- lib/Wrapper/OptimizingScheduler.cpp | 36 +++++++++++++--------- lib/Wrapper/OptimizingScheduler.h | 3 +- 3 files changed, 38 insertions(+), 48 deletions(-) diff --git a/lib/Wrapper/AMDGPU/GCNOptSched.cpp b/lib/Wrapper/AMDGPU/GCNOptSched.cpp index 0a9434af..55067d32 100644 --- a/lib/Wrapper/AMDGPU/GCNOptSched.cpp +++ b/lib/Wrapper/AMDGPU/GCNOptSched.cpp @@ -11,7 +11,9 @@ #include "SIMachineFunctionInfo.h" //#include "llvm/CodeGen/OptSequential.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" #include +#include #define DEBUG_TYPE "optsched" @@ -56,9 +58,9 @@ unsigned ScheduleDAGOptSchedGCN::getMinOcc() { if (MinOcc <= 10 && MinOcc >= 1) return MinOcc; - Logger::Fatal( + llvm::report_fatal_error( "Unrecognized option for MIN_OCCUPANCY_FOR_RESCHEDULE setting: %d", - MinOcc); + std::to_string(MinOcc), false); } int ScheduleDAGOptSchedGCN::getMinILPImprovement() { @@ -67,9 +69,9 @@ int ScheduleDAGOptSchedGCN::getMinILPImprovement() { if (MinIlpImprovement <= 100 && MinIlpImprovement >= 0) return MinIlpImprovement; - Logger::Fatal( + llvm::report_fatal_error( "Unrecognized option for 
MIN_OCCUPANCY_FOR_RESCHEDULE setting: %d", - MinIlpImprovement); + std::to_string(MinIlpImprovement), false); } void ScheduleDAGOptSchedGCN::initSchedulers() { @@ -99,6 +101,7 @@ void ScheduleDAGOptSchedGCN::finalizeSchedule() { RescheduleRegions.resize(Regions.size()); ILPAnalysis.resize(Regions.size()); CostAnalysis.resize(Regions.size()); + LowerOccScheds.resize(Regions.size()); RescheduleRegions.set(); LLVM_DEBUG(dbgs() << "Starting two pass scheduling approach\n"); @@ -107,28 +110,24 @@ void ScheduleDAGOptSchedGCN::finalizeSchedule() { MachineBasicBlock *MBB = nullptr; // Reset RegionIdx = 0; + if (S == OptSchedLowerOccAnalysis) { - if (RescheduleRegions.none()) { - dbgs() << "No regions to reschedule.\n"; + if (RescheduleRegions.none()) break; - } else { + else { auto GCNOST = static_cast(OST.get()); unsigned TargetOccupancy = GCNOST->getTargetOcc(); - if (TargetOccupancy <= MinOcc) { - dbgs() << "Cannot lower occupancy to below minimum occupancy of " - << MinOcc << '\n'; + if (TargetOccupancy <= MinOcc) break; - } - dbgs() << "Beginning rescheduling of regions.\n"; unsigned NewTarget = TargetOccupancy - 1u; dbgs() << "Decreasing current target occupancy " << TargetOccupancy << " to new target " << NewTarget << '\n'; GCNOST->limitOccupancy(NewTarget); } - } else if (S == OptSchedCommitLowerOcc) { - dbgs() - << "Analyzing if we should commit the lower occupancy schedule\n"; + } + + if (S == OptSchedCommitLowerOcc) { if (!shouldCommitLowerOccSched()) { dbgs() << "Lower occupancy schedule did not meet minimum improvement.\n"; @@ -138,13 +137,6 @@ void ScheduleDAGOptSchedGCN::finalizeSchedule() { } for (auto &Region : Regions) { - /*if (S == OptSchedLowerOccAnalysis && !RescheduleRegions[RegionIdx]) { - dbgs() << "Region " << RegionIdx << " does not need to be - rescheduled.\n"; - ++RegionIdx; - continue; - }*/ - RegionBegin = Region.first; RegionEnd = Region.second; @@ -175,13 +167,6 @@ void ScheduleDAGOptSchedGCN::finalizeSchedule() { } 
ScheduleDAGMILive::finalizeSchedule(); - - LLVM_DEBUG(if (isSimRegAllocEnabled()) { - dbgs() << "*************************************\n"; - dbgs() << "Function: " << MF.getName() - << "\nTotal Simulated Spills: " << SimulatedSpills << "\n"; - dbgs() << "*************************************\n"; - }); } void ScheduleDAGOptSchedGCN::runSchedPass(SchedPassStrategy S) { @@ -245,13 +230,9 @@ bool ScheduleDAGOptSchedGCN::shouldCommitLowerOccSched() { int SecondPassLengthSum = 0; int MinILPImprovement = getMinILPImprovement(); for (std::pair &RegionLength : ILPAnalysis) { - dbgs() << "First length -- " << RegionLength.first << ", Second length -- " - << RegionLength.second << '\n'; FirstPassLengthSum += RegionLength.first; SecondPassLengthSum += RegionLength.second; } - dbgs() << "First pass length sum: " << FirstPassLengthSum << '\n'; - dbgs() << "Second pass length sum: " << SecondPassLengthSum << '\n'; double FirstPassAverageLength = (double)FirstPassLengthSum / Regions.size(); double SecondPassAverageLength = (double)SecondPassLengthSum / Regions.size(); double ILPImprovement = ((FirstPassAverageLength - SecondPassAverageLength) / diff --git a/lib/Wrapper/OptimizingScheduler.cpp b/lib/Wrapper/OptimizingScheduler.cpp index 3e9a03b7..a3ad4e1c 100644 --- a/lib/Wrapper/OptimizingScheduler.cpp +++ b/lib/Wrapper/OptimizingScheduler.cpp @@ -413,8 +413,8 @@ void ScheduleDAGOptSched::schedule() { TotalLoadsInstructionsClusterable, TotalStoreInstructionsClusterable); - // Get the DDG instance so that we can set and get information that will be - // read later on during enumeration. + // Get the DDG instance so that we can set and get information that will + // be read later on during enumeration. auto DataDepGraphInstance = static_cast(DDG.get()); // Store total instructions in all clusters in the DDG instance. 
DataDepGraphInstance->setTotalInstructionsInAllClusters( @@ -469,18 +469,25 @@ void ScheduleDAGOptSched::schedule() { // Setup time before scheduling Utilities::startTime = std::chrono::high_resolution_clock::now(); // Schedule region. - Rslt = region->FindOptimalSchedule(CurrentRegionTimeout, CurrentLengthTimeout, - IsEasy, NormBestCost, BestSchedLngth, - NormHurstcCost, HurstcSchedLngth, Sched, - FilterByPerp, blocksToKeep(schedIni)); - - if ((!(Rslt == RES_SUCCESS || Rslt == RES_TIMEOUT) || Sched == NULL)) { - LLVM_DEBUG( - Logger::Info("OptSched run failed: rslt=%d, sched=%p. Falling back.", - Rslt, (void *)Sched)); - // Scheduling with opt-sched failed. - // fallbackScheduler(); - return; + if (!IsFourthPass) { + Rslt = region->FindOptimalSchedule( + CurrentRegionTimeout, CurrentLengthTimeout, IsEasy, NormBestCost, + BestSchedLngth, NormHurstcCost, HurstcSchedLngth, Sched, FilterByPerp, + blocksToKeep(schedIni)); + + if ((!(Rslt == RES_SUCCESS || Rslt == RES_TIMEOUT) || Sched == NULL)) { + LLVM_DEBUG( + Logger::Info("OptSched run failed: rslt=%d, sched=%p. Falling back.", + Rslt, (void *)Sched)); + // Scheduling with opt-sched failed. + // fallbackScheduler(); + return; + } + } else { + dbgs() << "Processing DAG " << RegionName << '\n'; + dbgs() << "Restoring schedule from second ILP pass: \n"; + Sched = LowerOccScheds[RegionIdx]; + dbgs() << "Applying lower occupancy schedule\n"; } // BB Enumerator did not find a schedule. @@ -507,6 +514,7 @@ void ScheduleDAGOptSched::schedule() { ILPAnalysis[RegionIdx].first = BestSchedLngth; else if (IsThirdPass) { ILPAnalysis[RegionIdx].second = BestSchedLngth; + LowerOccScheds[RegionIdx] = Sched; return; } diff --git a/lib/Wrapper/OptimizingScheduler.h b/lib/Wrapper/OptimizingScheduler.h index 502a7cd2..72191801 100644 --- a/lib/Wrapper/OptimizingScheduler.h +++ b/lib/Wrapper/OptimizingScheduler.h @@ -37,7 +37,6 @@ class ScheduleDAGOptSched : public ScheduleDAGMILive { // Vector of scheduling passes to execute. 
SmallVector SchedPasses; - protected: // Vector of regions recorded for later rescheduling SmallVector< @@ -51,6 +50,8 @@ class ScheduleDAGOptSched : public ScheduleDAGMILive { SmallVector, 32> ILPAnalysis; /// TODO: Same as above for cost analysis. SmallVector, 32> CostAnalysis; + /// Store the lower occupancy schedules from the second ILP pass. + SmallVector LowerOccScheds; // Path to opt-sched config options directory. SmallString<128> PathCfg; From 527d08f60bc5ea561ffc8f8e60c82a598502c1f1 Mon Sep 17 00:00:00 2001 From: vang thao Date: Sun, 23 Aug 2020 19:15:17 -0700 Subject: [PATCH 40/40] Fix incorrect statement --- lib/Wrapper/AMDGPU/GCNOptSched.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/Wrapper/AMDGPU/GCNOptSched.cpp b/lib/Wrapper/AMDGPU/GCNOptSched.cpp index 55067d32..915f4e6b 100644 --- a/lib/Wrapper/AMDGPU/GCNOptSched.cpp +++ b/lib/Wrapper/AMDGPU/GCNOptSched.cpp @@ -59,7 +59,7 @@ unsigned ScheduleDAGOptSchedGCN::getMinOcc() { return MinOcc; llvm::report_fatal_error( - "Unrecognized option for MIN_OCCUPANCY_FOR_RESCHEDULE setting: %d", + "Unrecognized option for MIN_OCCUPANCY_FOR_RESCHEDULE setting: " + std::to_string(MinOcc), false); } @@ -70,7 +70,7 @@ int ScheduleDAGOptSchedGCN::getMinILPImprovement() { return MinIlpImprovement; llvm::report_fatal_error( - "Unrecognized option for MIN_OCCUPANCY_FOR_RESCHEDULE setting: %d", + "Unrecognized option for MIN_ILP_IMPROVEMENT setting: " + std::to_string(MinIlpImprovement), false); }