diff --git a/CMakeLists.txt b/CMakeLists.txt index dce8385ea08..eb66bea9c4c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -226,7 +226,6 @@ list(APPEND NVFUSER_SRCS ${NVFUSER_SRCS_DIR}/scheduler/normalization_outer.cpp ${NVFUSER_SRCS_DIR}/scheduler/normalization_utils.cpp ${NVFUSER_SRCS_DIR}/scheduler/pointwise.cpp - ${NVFUSER_SRCS_DIR}/scheduler/pointwise_utils.cpp ${NVFUSER_SRCS_DIR}/scheduler/reduction.cpp ${NVFUSER_SRCS_DIR}/scheduler/reduction_utils.cpp ${NVFUSER_SRCS_DIR}/scheduler/registry.cpp @@ -234,6 +233,7 @@ list(APPEND NVFUSER_SRCS ${NVFUSER_SRCS_DIR}/scheduler/resize.cpp ${NVFUSER_SRCS_DIR}/scheduler/runtime_info.cpp ${NVFUSER_SRCS_DIR}/scheduler/scheduler_types.cpp + ${NVFUSER_SRCS_DIR}/scheduler/tools/domain_map.cpp ${NVFUSER_SRCS_DIR}/scheduler/tools/inlining.cpp ${NVFUSER_SRCS_DIR}/scheduler/tools/loop_domain_scheduler.cpp ${NVFUSER_SRCS_DIR}/scheduler/tools/maxinfo_propagator.cpp diff --git a/csrc/scheduler/compile_time_info.h b/csrc/scheduler/compile_time_info.h index f7ec9d4a97f..3436bcd70eb 100644 --- a/csrc/scheduler/compile_time_info.h +++ b/csrc/scheduler/compile_time_info.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -54,7 +55,7 @@ enum class CompileTimeEntryType { //! stores the domain map of a fusion. class DomainMap { public: - using DataType = pointwise_utils::DomainMap; + using DataType = scheduler_tools::DomainMap; static const CompileTimeEntryType EntryType = CompileTimeEntryType::DOMAIN_MAP; }; @@ -63,7 +64,7 @@ class DomainMap { //! stores the domain map of a fusion, used by transpose scheduler. 
class TransposeDomainMap { public: - using DataType = pointwise_utils::DomainMap; + using DataType = scheduler_tools::DomainMap; static const CompileTimeEntryType EntryType = CompileTimeEntryType::TRANSPOSE_DOMAIN_MAP; }; diff --git a/csrc/scheduler/pointwise.cpp b/csrc/scheduler/pointwise.cpp index bc7a0fb32c6..840ec90e69a 100644 --- a/csrc/scheduler/pointwise.cpp +++ b/csrc/scheduler/pointwise.cpp @@ -29,37 +29,6 @@ namespace { // Unused at the moment, commenting for clang tidy constexpr int64_t kThreadX = 128; -class DomainMap : public pointwise_utils::DomainMap { - public: - using pointwise_utils::DomainMap::DomainMap; - - // The pointwise scheduler heuristics requires a minimum number of axes. - // The output reference tensor should respect this requirement. - TensorView* findReferenceTensorView(int64_t minimum_num_axes = 0) const { - TensorView* result = nullptr; - int64_t max_dims = -1; - for (auto output_tv : - ir_utils::filterByType(fusion_->outputs())) { - if (isValidReference(output_tv) && - hasMinimumSize(output_tv, minimum_num_axes) && - !output_tv->isFusionInput()) { - int64_t n_dims = pointwise_utils::nRootDims(output_tv); - if (n_dims > max_dims) { - result = output_tv; - max_dims = n_dims; - } - } - } - return result; - } - - private: - bool hasMinimumSize(TensorView* tv, int64_t num_axes) const { - NVF_ERROR(tv != nullptr); - return (num_axes == 0 || (int64_t)tv->getLogicalDomain().size() > num_axes); - } -}; - } // namespace std::unique_ptr getPointwiseHeuristics( @@ -79,9 +48,11 @@ std::unique_ptr getPointwiseHeuristics( auto domain_map_entry = HeuristicDataCacheEntry( - data_cache, - [fusion]() { return std::make_unique(fusion); }); - const auto& domain_map = dynamic_cast(domain_map_entry.get()); + data_cache, [fusion]() { + return std::make_unique(fusion); + }); + const auto& domain_map = + dynamic_cast(domain_map_entry.get()); auto largest_out_entry = HeuristicDataCacheEntry( @@ -435,7 +406,7 @@ std::unique_ptr getPointwiseHeuristics( // 
Return reference tensor view. TensorView* getReferenceTensorView(Fusion* fusion) { FusionGuard fg(fusion); - DomainMap domain_map(fusion); + pointwise_utils::DomainMap domain_map(fusion); auto reference_tv = domain_map.findReferenceTensorView(); return reference_tv; } diff --git a/csrc/scheduler/pointwise_utils.h b/csrc/scheduler/pointwise_utils.h index 56db0ee0806..5812b9de19c 100644 --- a/csrc/scheduler/pointwise_utils.h +++ b/csrc/scheduler/pointwise_utils.h @@ -11,56 +11,12 @@ #include #include #include +#include #include namespace nvfuser { namespace pointwise_utils { -// DomainMap uses the ComputeAtMap to find a reference TensorView -// that maps to all IterDomains in the fusion. -class DomainMap { - public: - DomainMap(Fusion* fusion); - virtual ~DomainMap() = default; - - const ComputeAtMap& getComputeAtMap() const { - return ca_map_; - } - - // Determine if a TensorView is a valid reference tensor for this fusion. - // The reference tensor must map to all the iterDomains in each input. 
-  bool isValidReference(TensorView* tv) const;
-
- protected:
-  // Determine if all IterDomains are mapped between input and the given tvs
-  bool areAllInputIdsMappedTo(TensorView* input_tv, TensorView* output_tv)
-      const;
-
-  virtual IterDomain* getMappedInputConcreteID(
-      const std::unordered_set& in_concrete_ids,
-      IterDomain* out_id) const;
-
-  // Erase input concrete ID if it is mapped to output ID
-  bool eraseIfMapped(
-      std::unordered_set& in_concrete_ids,
-      IterDomain* out_id) const;
-
-  // Check if in_ids are mapped to ids through any root domain as
-  // well as indirectly accessed domains with ops like torchGather
-  void eraseifInputMappedThroughRootDomainAndIndexing(
-      std::unordered_set& in_ids,
-      const std::vector& ids) const;
-
-  // Find any id in domain that maps with target id
-  IterDomain* anyMapped(
-      const std::vector& domain,
-      IterDomain* target) const;
-
-  Fusion* fusion_ = nullptr;
-  ComputeAtMap ca_map_;
-  std::vector tvs_with_rfactor_;
-};
-
 // Returns number of non-reduction/non-broadcas/non-device dims in logical
 // domain
 inline int64_t nRootDims(const TensorView* tv) {
@@ -74,5 +30,47 @@ inline int64_t nRootDims(const TensorView* tv) {
   return tv_n_dims;
 }
 
+// Pointwise-scheduler extension of scheduler_tools::DomainMap that adds
+// the selection of a reference output tensor.
+class DomainMap : public scheduler_tools::DomainMap {
+ public:
+  using scheduler_tools::DomainMap::DomainMap;
+
+  // The pointwise scheduler heuristics requires a minimum number of axes.
+  // The output reference tensor should respect this requirement.
+  //
+  // Scans the fusion outputs and returns the one with the largest
+  // nRootDims() among those that pass isValidReference(), pass
+  // hasMinimumSize(minimum_num_axes) and are not also fusion inputs. On a
+  // tie the output seen first in iteration order is kept. Returns nullptr
+  // if no output qualifies.
+  TensorView* findReferenceTensorView(int64_t minimum_num_axes = 0) const {
+    TensorView* result = nullptr;
+    int64_t max_dims = -1;
+    for (auto output_tv :
+         ir_utils::filterByType(fusion_->outputs())) {
+      if (isValidReference(output_tv) &&
+          hasMinimumSize(output_tv, minimum_num_axes) &&
+          !output_tv->isFusionInput()) {
+        int64_t n_dims = pointwise_utils::nRootDims(output_tv);
+        if (n_dims > max_dims) {
+          result = output_tv;
+          max_dims = n_dims;
+        }
+      }
+    }
+    return result;
+  }
+
+ private:
+  // Returns true if num_axes is 0, or if the logical domain of tv has
+  // strictly more than num_axes entries. tv must not be null.
+  bool hasMinimumSize(TensorView* tv, int64_t num_axes) const {
+    NVF_ERROR(tv != nullptr);
+    return num_axes == 0 ||
+        static_cast<int64_t>(tv->getLogicalDomain().size()) > num_axes;
+  }
+};
+
 } // namespace pointwise_utils
 } // namespace nvfuser
diff --git a/csrc/scheduler/resize.cpp b/csrc/scheduler/resize.cpp
index cd07f572872..08f2aa04afd 100644
--- a/csrc/scheduler/resize.cpp
+++ b/csrc/scheduler/resize.cpp
@@ -95,269 +95,8 @@ std::unique_ptr ResizeScheduler::computeHeuristics(
 
 namespace {
 
-std::vector>>
-getReferenceTensors(Fusion* fusion) {
-  std::vector ref_candidates;
-
-  std::cerr << "getReferenceTensors\n";
-  fusion->printMath();
-
-  const auto all_tvs = fusion->allTvs();
-
-  DisjointSets disjoint_val_sets;
-
-  std::vector resize_ops =
-      ir_utils::getOpsOfType(fusion);
-
-  // Group all tvs that are dependent on resize op outputs
-  for (Expr* resize_op : resize_ops) {
-    auto ref_tv = resize_op->output(0)->as();
-
-    auto dep_vals = DependencyCheck::getAllValsBetween(
-        {fusion->inputs().begin(), fusion->inputs().end()}, {ref_tv});
-
-    for (auto dep_tv : ir_utils::filterByType(dep_vals)) {
-      // Don't add inputs. Inputs are not replicated nor scheduled.
- if (dep_tv->isFusionInput()) { - continue; - } - std::cerr << "Mapping " << ref_tv->toString() << " and " - << dep_tv->toString() << "\n"; - disjoint_val_sets.mapEntries(ref_tv, dep_tv); - } - } - - // TODO: Reuse - IdModel id_model(fusion, /*build_graphs=*/false); - const auto& broadcast_graph = id_model.buildBroadcastGraph(); - - for (const auto i : c10::irange(resize_ops.size() - 1)) { - for (const auto j : c10::irange(i + 1, resize_ops.size())) { - auto out_tv_i = resize_ops.at(i)->output(0)->as(); - auto out_tv_j = resize_ops.at(j)->output(0)->as(); - if (disjoint_val_sets.strictAreMapped(out_tv_i, out_tv_j)) { - continue; - } - - const auto out_tv_i_loop_groups = - broadcast_graph.toGroups(out_tv_i->getLoopDomain()); - const auto out_tv_j_loop_groups = - broadcast_graph.toGroups(out_tv_j->getLoopDomain()); - - bool same_loop_domain = - broadcast_graph.toGroups(out_tv_i->getLoopDomain()).set() == - broadcast_graph.toGroups(out_tv_j->getLoopDomain()).set(); - std::cerr << "Comparing " << out_tv_i->toString() << " and " - << out_tv_j->toString() << ": " << same_loop_domain << "\n"; - if (!same_loop_domain) { - auto [path_from_i_to_j, all_visited] = - ValGraphBFS::getExprGroupsBetween( - broadcast_graph, - out_tv_i_loop_groups, - out_tv_j_loop_groups, - /*require_all_to_visited=*/false); - if (!all_visited) { - // There are some unreachable loop groups - continue; - } - - // If there's any resize node, don't merge them - if (std::any_of( - path_from_i_to_j.begin(), - path_from_i_to_j.end(), - [](const auto& path_component) { - return path_component.first->front()->template isA(); - })) { - continue; - } - } - - std::cerr << "Same loop domain: " << out_tv_i->toString() << " and " - << out_tv_j->toString() << "\n"; - disjoint_val_sets.mapEntries(out_tv_i, out_tv_j); - } - } - - const auto num_disjoint_resize_groups = disjoint_val_sets.size(); - - std::cerr << "Number of disjoint resize groups: " - << num_disjoint_resize_groups << "\n"; - - std::cerr << "Initial 
disjoint grouping of tensors\n"; - for (const auto& set : disjoint_val_sets.disjointSets()) { - std::cerr << "\t{"; - for (auto tv : *set) { - std::cerr << " T" << tv->name(); - } - std::cerr << "}\n"; - } - - // Include outputs - for (Expr* resize_op : resize_ops) { - auto resize_out = resize_op->output(0)->as(); - auto output_dep_vals = - DependencyCheck::getAllValsBetween({resize_out}, fusion->outputs()); - for (auto tv : ir_utils::filterByType(output_dep_vals)) { - disjoint_val_sets.mapEntries(resize_out, tv); - } - } - - // Output dep vals should also be disjointly grouped, so the number - // of groups should not change - NVF_ERROR( - num_disjoint_resize_groups == disjoint_val_sets.size(), - "Expected number of groups: ", - num_disjoint_resize_groups, - ". Actual: ", - disjoint_val_sets.size()); - - // There can still be tensors that are not producers nor consumers - // of resize ops. They should be fine with any of the groups. - // All of them should now be privatized. - - auto first_group_tv = resize_ops.at(0)->output(0)->as(); - - for (auto tv : all_tvs) { - if (tv->isFusionInput() || disjoint_val_sets.mappingExists(tv)) { - continue; - } - - std::cerr << "Remaining tv: " << tv->toString() - << ". 
Put into the group of " << first_group_tv->toString() - << "\n"; - - auto dep_outputs = DependencyCheck::getAllOutputsOf({tv}); - NVF_ERROR(!dep_outputs.empty()); - - TensorView* first_dep_output = (*(dep_outputs.begin()))->as(); - bool added_to_group = false; - for (const auto& disjoint_set : disjoint_val_sets.disjointSets()) { - if (!disjoint_set->has(first_dep_output)) { - continue; - } - - // Make sure all outputs are in the same set - for (const auto dep_output : dep_outputs) { - NVF_ERROR(disjoint_set->has(dep_output->as())); - } - - disjoint_val_sets.mapEntries(tv, disjoint_set->front()); - added_to_group = true; - break; - } - - // Could not find any group to join - NVF_ERROR( - added_to_group, "Could not find any group to add ", tv->toString()); - } - - NVF_ERROR( - num_disjoint_resize_groups == disjoint_val_sets.size(), - "Expected number of groups: ", - num_disjoint_resize_groups, - ". Actual: ", - disjoint_val_sets.size()); - - std::cerr << "TV disjoint groups: " << disjoint_val_sets.size() << "\n"; - - std::vector>> ref_list; - - // Pick a reference in each disjoint set - for (const auto& disjoint_set : disjoint_val_sets.disjointSets()) { - TensorView* ref_tv = nullptr; - // TensorView* input_tv = nullptr; - std::unordered_set resize_op_outputs; -#if 0 - for (TensorView* tv : *disjoint_set) { - // All of the slice/pad/cat output tensors should have the same - // loop domain. Any of them can be equally used as the reference - // for this group. - // Update: But propagation could still fail due to the resize - // cyclic mapping. Don't use resize outputs as reference for - // now. 
- - if (auto def = tv->definition(); - def != nullptr && def->isOneOf()) { - ref_tv = def->output(0)->as(); - break; - } - - if (auto def = tv->definition(); std::any_of( - def->inputs().begin(), def->inputs().end(), [](Val* input) { - return input->isA() && input->isFusionInput(); - })) { - if (input_tv == nullptr || - (input_tv->domain()->noBroadcasts().size() < - tv->domain()->noBroadcasts().size())) { - input_tv = tv; - } - } - } -#endif - - for (TensorView* tv : *disjoint_set) { - if (auto def = tv->definition(); - def != nullptr && def->isOneOf()) { - resize_op_outputs.insert(def->output(0)->as()); - } - } - - for (TensorView* tv : *disjoint_set) { - if (!tv->isFusionOutput()) { - continue; - } - - // Ref if all resize_outputs have a dependency with this output - auto all_dep_vals = DependencyCheck::getAllValsBetween( - {fusion->inputs().begin(), fusion->inputs().end()}, {tv}); - bool all_resize_out_dependent = true; - for (auto resize_out : resize_op_outputs) { - auto it = - std::find(all_dep_vals.begin(), all_dep_vals.end(), resize_out); - if (it == all_dep_vals.end()) { - std::cerr << "Not a dependency: " << resize_out->toString() << " of " - << tv->toString() << "\n"; - all_resize_out_dependent = false; - break; - } - } - - if (!all_resize_out_dependent) { - continue; - } - - ref_tv = tv; - } - - if (ref_tv) { - std::cerr << "Reference: " << ref_tv->toString() << "\n"; - - ref_list.emplace_back(ref_tv, std::vector{}); - auto& member_list = ref_list.back().second; - for (auto tv : all_tvs) { - if (disjoint_set->has(tv)) { - member_list.push_back(tv); - } - } - - continue; - } - - NVF_THROW( - "No reference found for ", toDelimitedString(disjoint_set->vector())); - } - - std::cerr << "Disjoint grouping of tensors with representatives:\n"; - for (const auto& [ref, set] : ref_list) { - std::cerr << "\tRepresentative: " << ref->toString() << "\n" - << "\t{"; - for (auto tv : set) { - std::cerr << " T" << tv->name(); - } - std::cerr << "}\n"; - } - - return 
ref_list;
+TensorView* getReferenceTensor(Fusion* fusion) {
+  return nullptr;
 }
 
 } // namespace
@@ -387,25 +126,24 @@ void ResizeScheduler::schedule(Fusion* fusion, const HeuristicParams* params) {
     scheduler_tools::propagateResizeToInputs(expr);
   }
 
-  const auto ref_tensors = getReferenceTensors(fusion);
+  auto ref_tv = getReferenceTensor(fusion);
+  // getReferenceTensor() can return nullptr (the current stub always does).
+  // Fail with a clear error here instead of dereferencing a null pointer.
+  NVF_ERROR(ref_tv != nullptr, "No reference tensor found");
 
-  for (const auto& [ref_tv, tvs_to_schedule] : ref_tensors) {
-    std::cerr << "Reference: " << ref_tv->toString() << "\n";
-    std::cerr << "Tvs to schedule: " << toDelimitedString(tvs_to_schedule)
-              << "\n";
+  std::cerr << "Reference: " << ref_tv->toString() << "\n";
 
-    ref_tv->flatten();
-    ref_tv->split(0, 128);
-    ref_tv->split(0, 1 << 14);
-    ref_tv->axis(-1)->parallelize(ParallelType::TIDx);
-    ref_tv->axis(-2)->parallelize(ParallelType::BIDx);
+  ref_tv->flatten();
+  ref_tv->split(0, 128);
+  ref_tv->split(0, 1 << 14);
+  ref_tv->axis(-1)->parallelize(ParallelType::TIDx);
+  ref_tv->axis(-2)->parallelize(ParallelType::BIDx);
 
-    std::cerr << "Scheduled reference:\n";
-    ref_tv->printTransforms();
+  std::cerr << "Scheduled reference:\n";
+  ref_tv->printTransforms();
 
-    scheduler_tools::scheduleLoopDomainsLike(
-        tvs_to_schedule, ref_tv->getLoopDomain());
-  }
+  scheduler_tools::scheduleLoopDomainsLike(
+      fusion->allTvs(), ref_tv->getLoopDomain());
 
   {
     std::cerr << "All done\n";
diff --git a/csrc/scheduler/pointwise_utils.cpp b/csrc/scheduler/tools/domain_map.cpp
similarity index 97%
rename from csrc/scheduler/pointwise_utils.cpp
rename to csrc/scheduler/tools/domain_map.cpp
index 2f4f119fc46..0a713d346a0 100644
--- a/csrc/scheduler/pointwise_utils.cpp
+++ b/csrc/scheduler/tools/domain_map.cpp
@@ -5,54 +5,13 @@
  * SPDX-License-Identifier: BSD-3-Clause
  */
 // clang-format on
-#include
-#include
-#include
-#include
-
-#include
+#include
+#include
 
 namespace nvfuser {
-namespace pointwise_utils {
+namespace scheduler_tools {
 
 namespace {
-
-// Grab all exact set mappings from consumer to producer domains of
-// indexed accesses, e.g.,
index_select -std::unordered_multimap< - std::shared_ptr>, - std::shared_ptr>> -getIndexedConsumerToProducerMap(Fusion* fusion, const ComputeAtMap& ca_map) { - std::unordered_multimap< - std::shared_ptr>, - std::shared_ptr>> - indexed_id_map; - - for (auto expr : fusion->exprs()) { - if (auto gather = dynamic_cast(expr)) { - auto p_id = gather->getIndexedID(); - auto c_id = gather->getConsumerOfIndexedID(); - indexed_id_map.emplace( - ca_map.disjointSetOf(c_id, IdMappingMode::EXACT), - ca_map.disjointSetOf(p_id, IdMappingMode::EXACT)); - } else if (auto index_select = dynamic_cast(expr)) { - auto p_id = index_select->getIndexedID(); - auto c_id = index_select->getConsumerOfIndexedID(); - indexed_id_map.emplace( - ca_map.disjointSetOf(c_id, IdMappingMode::EXACT), - ca_map.disjointSetOf(p_id, IdMappingMode::EXACT)); - } else { - // Note there's no consumer ID for select. This means we can't - // just propagate from consumers to indexed producers. It seems - // it's necessary to schedule producers and consumers separately - // in those cases. - continue; - } - } - - return indexed_id_map; -} - // Check if a root ID of a fusion input tensor that is indirectly // accessed by ops such as torchGather needs to be mapped with // a reference tensor. 
Select has a similar effect as squeeze as the
@@ -108,6 +67,42 @@ bool canIgnoreIndexedInputDomainID(
   return true;
 }
 
+// Grab all exact set mappings from consumer to producer domains of
+// indexed accesses, e.g., index_select. Key: consumer set; value: producer set.
+std::unordered_multimap<
+    std::shared_ptr>,
+    std::shared_ptr>>
+getIndexedConsumerToProducerMap(Fusion* fusion, const ComputeAtMap& ca_map) {
+  std::unordered_multimap<
+      std::shared_ptr>,
+      std::shared_ptr>>
+      indexed_id_map; // multimap: the same consumer set may be inserted repeatedly
+
+  for (auto expr : fusion->exprs()) { // visit every expr returned by the fusion
+    if (auto gather = dynamic_cast(expr)) {
+      auto p_id = gather->getIndexedID();
+      auto c_id = gather->getConsumerOfIndexedID();
+      indexed_id_map.emplace( // EXACT disjoint set of c_id -> that of p_id
+          ca_map.disjointSetOf(c_id, IdMappingMode::EXACT),
+          ca_map.disjointSetOf(p_id, IdMappingMode::EXACT));
+    } else if (auto index_select = dynamic_cast(expr)) {
+      auto p_id = index_select->getIndexedID();
+      auto c_id = index_select->getConsumerOfIndexedID();
+      indexed_id_map.emplace( // same mapping direction as the branch above
+          ca_map.disjointSetOf(c_id, IdMappingMode::EXACT),
+          ca_map.disjointSetOf(p_id, IdMappingMode::EXACT));
+    } else {
+      // Note there's no consumer ID for select. This means we can't
+      // just propagate from consumers to indexed producers. It seems
+      // it's necessary to schedule producers and consumers separately
+      // in those cases.
+      continue; // every other expr contributes no entry to the map
+    }
+  }
+
+  return indexed_id_map;
+}
+
 } // namespace
 
 DomainMap::DomainMap(Fusion* fusion) : fusion_(fusion), ca_map_(fusion) {
@@ -248,5 +243,5 @@ bool DomainMap::isValidReference(TensorView* tv) const {
   return true;
 }
 
-} // namespace pointwise_utils
+} // namespace scheduler_tools
 } // namespace nvfuser
diff --git a/csrc/scheduler/tools/domain_map.h b/csrc/scheduler/tools/domain_map.h
new file mode 100644
index 00000000000..88dadcba721
--- /dev/null
+++ b/csrc/scheduler/tools/domain_map.h
@@ -0,0 +1,69 @@
+// clang-format off
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
+ * All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+// clang-format on
+#pragma once
+
+#include
+
+#include
+#include
+
+namespace nvfuser {
+
+class Fusion;
+class TensorView;
+class IterDomain;
+
+namespace scheduler_tools {
+
+// DomainMap uses the ComputeAtMap to find a reference TensorView
+// that maps to all IterDomains in the fusion.
+class DomainMap {
+ public:
+  DomainMap(Fusion* fusion); // initializes fusion_ and ca_map_ from fusion
+  virtual ~DomainMap() = default; // virtual: the class has a virtual member and is subclassed
+
+  const ComputeAtMap& getComputeAtMap() const { // read-only access to ca_map_
+    return ca_map_;
+  }
+
+  // Determine if a TensorView is a valid reference tensor for this fusion.
+  // The reference tensor must map to all the iterDomains in each input.
+  bool isValidReference(TensorView* tv) const;
+
+ protected:
+  // Determine if all IterDomains are mapped between input and the given tvs
+  bool areAllInputIdsMappedTo(TensorView* input_tv, TensorView* output_tv)
+      const;
+
+  virtual IterDomain* getMappedInputConcreteID( // virtual: derived maps may override
+      const std::unordered_set& in_concrete_ids,
+      IterDomain* out_id) const;
+
+  // Erase input concrete ID if it is mapped to output ID. NOTE(review): the bool presumably reports whether an ID was erased - confirm in domain_map.cpp
+  bool eraseIfMapped(
+      std::unordered_set& in_concrete_ids,
+      IterDomain* out_id) const;
+
+  // Check if in_ids are mapped to ids through any root domain as
+  // well as indirectly accessed domains with ops like torchGather
+  void eraseifInputMappedThroughRootDomainAndIndexing( // NOTE(review): lowercase "if" differs from eraseIfMapped
+      std::unordered_set& in_ids,
+      const std::vector& ids) const;
+
+  // Find any id in domain that maps with target id. NOTE(review): presumably nullptr when nothing maps - confirm in domain_map.cpp
+  IterDomain* anyMapped(
+      const std::vector& domain,
+      IterDomain* target) const;
+
+  Fusion* fusion_ = nullptr; // raw pointer; the defaulted destructor does not delete it
+  ComputeAtMap ca_map_; // constructed from the fusion; exposed via getComputeAtMap()
+  std::vector tvs_with_rfactor_; // not filled in this header; NOTE(review): see domain_map.cpp for how it is populated
+};
+
+} // namespace scheduler_tools
+} // namespace nvfuser
diff --git a/csrc/scheduler/transpose.cpp b/csrc/scheduler/transpose.cpp
index 7e320f99a91..259f3cb4c89 100644
--- a/csrc/scheduler/transpose.cpp
+++ b/csrc/scheduler/transpose.cpp
@@ -155,9 +155,9 @@ bool hasSmallTransposeDimensions(
 
 // DomainMap uses the ComputeAtMap to find a reference TensorView
 //
that maps to all iterDomains in the fusion. -class DomainMap : public pointwise_utils::DomainMap { +class DomainMap : public scheduler_tools::DomainMap { public: - using pointwise_utils::DomainMap::DomainMap; + using scheduler_tools::DomainMap::DomainMap; // Note that this may not be able to find any reference if any // tensor in the group is only connected with an input through