Merge branch 'main' into translate_no_reduction_matmul
naoyam committed Feb 4, 2025
2 parents a4d6060 + 212ac38 commit 3021324
Showing 37 changed files with 1,842 additions and 211 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -868,6 +868,7 @@ list(APPEND NVFUSER_RUNTIME_FILES
${NVFUSER_ROOT}/runtime/mbarrier.cu
${NVFUSER_ROOT}/runtime/memory.cu
${NVFUSER_ROOT}/runtime/random_numbers.cu
${NVFUSER_ROOT}/runtime/tensor_memory.cu
${NVFUSER_ROOT}/runtime/tensor.cu
${NVFUSER_ROOT}/runtime/tuple.cu
${NVFUSER_ROOT}/runtime/type_traits.cu
12 changes: 10 additions & 2 deletions csrc/codegen.cpp
@@ -684,7 +684,10 @@ class CudaKernelGenerator : private kir::ConstIrVisitor {
}

if (ti->view()->getMemoryType() == MemoryType::Tensor) {
code_ << genInline(ti->index());
// Generate code like:
// (uint32_t)(T2 + Array<uint16_t, 2, 1>{0, 0})
code_ << "(uint32_t)(" << genVariableName(ti->view()) << " + "
<< genInline(ti->index()) << ")";
return;
}

@@ -3197,7 +3200,12 @@ class CudaKernelGenerator : private kir::ConstIrVisitor {
break;
}
case MemoryType::Tensor: {
// Do nothing for now. This behavior will change soon.
// Generate code like:
// TMemTensor T2(T5[0], 0, 0);
indent() << "TMemTensor " << genVariableName(tv) << "("
<< genInline(alloc->address()) << ", "
<< genInline(alloc->laneOffset()) << ", "
<< genInline(alloc->colOffset()) << ");\n";
break;
}
default:
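The generated code above relies on a TMemTensor helper from the new runtime/tensor_memory.cu file (added to CMakeLists.txt earlier in this commit), which is not shown in this diff. The following is only a hypothetical sketch of the shape such a wrapper could take, assuming the 32-bit TMem address encodes the lane index in the upper 16 bits and the column index in the lower 16 bits; LaneColOffset stands in for the runtime Array<uint16_t, 2, 1> used in the generated expression.

#include <cstdint>

// Hypothetical sketch only; not the actual runtime/tensor_memory.cu.
struct LaneColOffset {
  uint16_t lane;
  uint16_t col;
};

struct TMemTensor {
  uint32_t base;
  // Mirrors the generated "TMemTensor T2(T5[0], lane_offset, col_offset);"
  TMemTensor(uint32_t address, uint16_t lane_offset, uint16_t col_offset)
      : base(address + (uint32_t(lane_offset) << 16) + col_offset) {}
  // Mirrors "(uint32_t)(T2 + Array<uint16_t, 2, 1>{i, j})": adding a per-access
  // {lane, column} offset yields the final 32-bit TMem address.
  uint32_t operator+(LaneColOffset offset) const {
    return base + (uint32_t(offset.lane) << 16) + offset.col;
  }
};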
79 changes: 74 additions & 5 deletions csrc/device_lower/analysis/tensor_memory.cpp
@@ -9,18 +9,87 @@
#include <device_lower/analysis/tensor_memory.h>
#include <fusion.h>
#include <ir/all_nodes.h>
#include <type.h>

namespace nvfuser {

// See note [Tensor Memory Allocation] for the overall design.
TensorMemoryInfo computeTMemInfo(Fusion* fusion) {
bool found = false;
TensorMemoryInfo result;

// Step 1: partition the tensors. Each partition of tensors will become a
// region, so we use the terms partition and region interchangeably. The user
// may have provided full or partial partitioning information. For the
// TensorViews whose region the user has already specified, we use that
// information. Each of the remaining tensors is assigned to its own separate
// region.
using Partition = std::vector<std::vector<TensorView*>>;
Partition partitions;
if (fusion->hasManaged("tmem_regions")) {
partitions = fusion->getManaged<Partition>("tmem_regions");
} else {
partitions = {};
}

// Verify that there is no overlap between user specified partitions
std::unordered_set<TensorView*> tensors;
for (auto& partition : partitions) {
NVF_ERROR(!partition.empty(), "Empty partition");
for (auto tv : partition) {
NVF_ERROR(
tv->getMemoryType() == MemoryType::Tensor, "Invalid memory type");
NVF_ERROR(
tensors.insert(tv).second, "Tensors cannot be in multiple regions");
}
}

// For all TensorViews whose partition is not specified, assign them to a
// separate region.
for (auto tv : fusion->allTvs()) {
if (tv->getMemoryType() == MemoryType::Tensor) {
NVF_ERROR(!found, "Only one tensor on TMem is supported");
found = true;
if (tv->getMemoryType() != MemoryType::Tensor) {
continue;
}
if (tensors.count(tv) == 0) {
partitions.push_back({tv});
}
}
return {};

// Step 2: Compute the allocation information for tensor memory. That is, for
// each partition, we create a Region object and fill in the necessary
// information.
using Region = TMemAlllocationInfo::Region;
std::vector<Region>& regions = result.allocation.regions;
for (const auto& partition : partitions) {
regions.emplace_back();
auto& region = regions.back();

// tcgen05.alloc stores the allocated address in shared memory. So we use a
// TensorView with MemoryType::Shared to store this address.
region.address = TensorViewBuilder()
.shape(std::vector<Val*>{})
.dtype(DataType::UInt32)
.build();
region.address->setMemoryType(MemoryType::Shared);

// Assign each tensor in the region the full 128 lanes and N columns.
region.num_columns = region.address->fusion()->zeroVal(DataType::UInt16);
for (auto tv : partition) {
// TODO: right now we hardcode the number of columns of each tensor to
// be 32. This is definitely not correct.
Val* num_columns = IrBuilder::create<Val>(32, DataType::UInt16);
region.covered_tensors.emplace_back();
auto& covered_tensor = region.covered_tensors.back();
covered_tensor.tensor = tv;
covered_tensor.lane_offset = tv->fusion()->zeroVal(DataType::UInt16);
covered_tensor.column_offset = region.num_columns;
region.num_columns =
SimplifyingIrBuilder::addExpr(region.num_columns, num_columns);
}
region.num_columns =
IrBuilder::maybeCastExpr(DataType::UInt32, region.num_columns);
}

return result;
}

} // namespace nvfuser
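As noted in the comments above, a schedule can pre-partition TMem TensorViews by setting the managed variable "tmem_regions" before lowering. Below is a minimal sketch, assuming Fusion exposes a manage() setter that pairs with the hasManaged()/getManaged() calls used in computeTMemInfo; tv0 and tv1 are hypothetical TensorViews already placed on MemoryType::Tensor.

// Sketch only: coalesce tv0 and tv1 into a single region (one tcgen05.alloc).
std::vector<std::vector<TensorView*>> tmem_regions = {{tv0, tv1}};
fusion->manage("tmem_regions", tmem_regions);
// Any TMem TensorView not listed here is assigned its own single-tensor
// region in Step 1 of computeTMemInfo.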
128 changes: 111 additions & 17 deletions csrc/device_lower/analysis/tensor_memory.h
@@ -7,15 +7,15 @@
// clang-format on
#pragma once

#include <vector>

namespace nvfuser {

class Val;
class TensorView;
class Fusion;

// Information used to lower tensor memory. So far, there is no information
// needed, the computeTMemInfo just check that there is only one tensor on TMem
// in the fusion. This limitation is described in the note below, and it is only
// for incremental development. This limitation will be removed soon in the
// future.
// Information used to lower tensor memory. So far, it is just about allocation.
struct TensorMemoryInfo;
TensorMemoryInfo computeTMemInfo(Fusion* fusion);

@@ -48,18 +48,112 @@ TensorMemoryInfo computeTMemInfo(Fusion* fusion);
// relinquishes the right to allocate, the next CTA that is blocked will be
// unblocked and can acquire the mutex to allocate TMem.
//
// Currently, the TMem allocation is not supported in nvFuser. We currently only
// allow one TensorView to be on TMem, and because we never relinquish the right
// to allocate TMem, CTA will be serialized on SM. A new CTA can be scheduled on
// an SM only after the previous CTA on that SM has completely finished
// executing. Thanks to this serialization, we can just skip allocating and
// think that our only TMem TensorView own the entire TMem, because we are sure
// that there will not be another CTA using that address. As a result, we could
// just provide address 0 to our instructions that access TMem. In principle, it
// is clearly wrong to write to an address that is not allocated, but because we
// are sure that it will in practice work for the specific unit test that we are
// targeting, we just do it so we have incremental development.
// The tcgen05.alloc instruction is like the following:
// tcgen05.alloc [dest], nCols
//
// There are three important things to note about this instruction:
//
// 1. The instruction writes its output, the address of the allocated memory,
// to shared memory.
// 2. The unit of allocation is 32 whole columns of tensor memory, and nCols
// must be a power of two.
// 3. The right to allocate is like a mutex and will serialize CTA scheduling.
// tcgen05.alloc blocks when there is no space to allocate.
//
// Point 1 above is not much trouble for us; we just need to make sure the
// address tensor is allocated in shared memory before allocating the tensor
// memory. Points 2 and 3, however, can be a big challenge. There are basically
// two things to worry about when allocating tensor memory:
//
// 1. Fragmentation. When a tensor does not occupy all lanes, or its number of
// columns is not a power of two or is smaller than 32, naively allocating all
// lanes with 32 or a higher power-of-two number of columns could waste space.
// In a perfect world, it would be nice to have a 2D allocator that is capable
// of merging the allocation of multiple tensors into a single tcgen05.alloc.
// For example, if tv0 and tv1 both have 64 rows and 32 columns, we can allocate
// tv0 on the first 64 lanes and tv1 on the next 64 lanes. As another example,
// if tv0 has 128 rows and 31 columns, and tv1 has 128 rows and 33 columns, we
// can pack the two tensors into a single tcgen05.alloc of 64 columns.
//
// 2. Latency. We should relinquish the right to allocate as soon as we are done
// allocating, so that other CTAs can grab the "right to allocate" mutex.
// We should also deallocate the tensor memory as soon as we are done using
// it, so that other CTAs' tcgen05.allocs can get unblocked. In a perfect world,
// it would be nice to be able to break one TensorView into multiple
// deallocations. For example, suppose tv0 has 128 rows and 256 columns, and we
// read these 256 columns sequentially one by one. Instead of waiting for the
// entire 256-iteration loop to finish, it would be nice to deallocate the first
// 128 columns once we are done reading them, so that other CTAs have a chance
// to allocate their memory in the freed space.
//
// From the above analysis, it is important to realize that TensorView
// allocations and tensor memory allocations do not correspond one to one. A
// TensorView could be allocated by multiple tcgen05.allocs, and a tcgen05.alloc
// could be used to allocate multiple TensorViews. For now, we limit ourselves
// to the case where a TensorView cannot span multiple tcgen05.allocs, and we
// call a piece of TMem that is allocated by a single tcgen05.alloc and may hold
// multiple TensorViews a "region". This design gives a
// TMem -> region -> TensorView hierarchy.
//
// In practice, it is very difficult to optimize both fragmentation and latency
// perfectly. Although tensor memory was originally designed for matmul, because
// it is a large and fast memory, it would be nice to use it for other purposes,
// such as persistent buffers. This could make it even more difficult to
// allocate tensor memory optimally. Considering the complexity of the problem,
// the development of a tensor memory allocator is likely an incremental
// process. With this in mind, we design the allocation of tensor memory in
// nvFuser to be hackable.
//
// There are three main components in the design:
// 1. A data structure, TMemAllocationInfo, that describes how we allocate
// tensor memory.
// 2. A heuristic, executed as part of computeTMemInfo, that generates the
// allocation information as an instance of TMemAlllocationInfo.
// 3. A pass, executed as part of insertAllocations, that generates the actual
// IR nodes based on the TMemAlllocationInfo.
//
// The TMemAllocationInfo data structure and the insertAllocations pass support
// a wider range of allocation strategies than the heuristic in computeTMemInfo.
// This provides some flexibility for prototyping and experimentation by
// manually specifying TMemAllocationInfo. To manually specify the allocation
// strategy, the user can set a managed variable "tmem_regions" in the
// fusion. The type of this managed variable is vector<vector<TensorView*>>,
// which specifies which TensorViews should be coalesced into the same region.

// The data structure that describes how we allocate tensor memory. It is
// assumed that:
// 1. TMem allocations are split into regions, with each region described by a
// Region. Each region spans a full 128 lanes and N columns of tensor memory.
// The number of columns must be a power of two and at least 32. Each region
// is allocated by a single tcgen05.alloc and deallocated by a matching
// tcgen05.dealloc.
// 2. Each kernel can have multiple regions.
// 3. Each region can cover multiple TensorViews, but each TensorView cannot
// span multiple regions.
struct TMemAlllocationInfo {
// Each entry describes a region of 128 rows x N columns of tensor memory
// allocated by a single tcgen05.alloc.
struct Region {
// tcgen05.alloc stores the allocated address in shared memory. So we use a
// TensorView with MemoryType::Shared to store this address.
TensorView* address;
// The number of columns to allocate. Must be >= 32 and a power of two.
Val* num_columns;
// The TMem TensorViews covered by this region. Each region can be used to
// store multiple TensorViews. The (lane_offset, column_offset) specifies
// the starting offset of each TensorView in this region.
struct TVInfo {
TensorView* tensor;
Val* lane_offset;
Val* column_offset;
};
std::vector<TVInfo> covered_tensors;
};
std::vector<Region> regions;
};

struct TensorMemoryInfo {};
// The actual definition of TensorMemoryInfo.
struct TensorMemoryInfo {
TMemAlllocationInfo allocation;
};

} // namespace nvfuser
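To make the hierarchy concrete, here is a plain-value mirror of the structure above (illustrative only; the real struct stores TensorView* and Val*). It shows what the current heuristic in computeTMemInfo would produce for two hypothetical TMem TensorViews tv0 and tv1 coalesced into one region, with each tensor hardcoded to 32 columns.

#include <cstdint>
#include <string>
#include <vector>

// Illustrative mirror of TMemAlllocationInfo using plain values.
struct ExampleTVInfo {
  std::string tensor;
  uint16_t lane_offset;    // always 0 in the current heuristic (full 128 lanes)
  uint16_t column_offset;  // running sum of the preceding tensors' columns
};
struct ExampleRegion {
  uint32_t num_columns;  // must be a power of two and at least 32
  std::vector<ExampleTVInfo> covered_tensors;
};

int main() {
  // One region of 32 + 32 = 64 columns covering tv0 and tv1.
  std::vector<ExampleRegion> regions = {{64, {{"tv0", 0, 0}, {"tv1", 0, 32}}}};
  (void)regions;
  return 0;
}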
97 changes: 93 additions & 4 deletions csrc/device_lower/pass/allocation.cpp
@@ -473,12 +473,39 @@ class AllocationInserter : public kir::ExprMutator {
}

// Create the allocation node
return IrBuilder::create<kir::Allocate>(
auto alloc_expr = IrBuilder::create<kir::Allocate>(
info.buffer, info.buffer->getMemoryType(), alloc_dims);

// Fill in the base address, lane offset, and column offset for tensor
// memory allocations
if (memory_type == MemoryType::Tensor) {
const auto& regions = GpuLower::current()->tmemInfo().allocation.regions;
for (const auto& region : regions) {
auto tv_info_it = std::find_if(
region.covered_tensors.begin(),
region.covered_tensors.end(),
[&](const auto& tv_info) { return tv_info.tensor == info.buffer; });
if (tv_info_it != region.covered_tensors.end()) {
auto address_ti = IrBuilder::create<kir::TensorIndex>(
region.address, region.address->fusion()->zeroVal());
alloc_expr->setAddress(address_ti);
alloc_expr->setLaneOffset(tv_info_it->lane_offset);
alloc_expr->setColOffset(tv_info_it->column_offset);
break;
}
}
NVF_ERROR(
alloc_expr->address() != nullptr,
"Could not find region for tensor memory allocation of ",
info.buffer);
}

return alloc_expr;
}

void dispatch(Expr* expr) override {
if (!ir_utils::isTvOp(expr) || expr->isA<kir::Allocate>()) {
if (!ir_utils::isTvOp(expr) || expr->isA<kir::Allocate>() ||
expr->isA<kir::AllocTMem>()) {
ExprMutator::dispatch(expr);
return;
}
@@ -601,7 +628,7 @@ class AllocationInserter : public kir::ExprMutator {
// generic-async proxy fence and wgmma fence before each mma
// instruction. For this case, we need to insert these fences
// after the initialization of the accumulator, so that the
// inilization is visible to the async proxy.
// initialization is visible to the async proxy.
// When all inputs are guarded by mbarrier, we will insert these
// fences before each mma instruction, so there is no need to
// insert them after the initialization of the accumulator here.
@@ -813,11 +840,73 @@ class AllocationInserter : public kir::ExprMutator {
}
};

// Insert IR nodes that allocate and deallocate TMem regions.
// See note [Tensor Memory Allocation] for the overall design.
// We insert the tcgen05.allocs of each region and the relinquishing of the
// right to allocate at the beginning of the top-level scope of the kernel. We
// do not insert tcgen05.dealloc for now. The allocation of each TMem TensorView
// within each region is inserted by AllocationInserter::insert and therefore
// not handled here.
std::vector<Expr*> insertTMemRegionAllocsAndDeallocs(
const std::vector<Expr*>& exprs) {
// Expressions to be inserted at the beginning of the top-level scope.
std::list<Expr*> prologue;
{
const auto& regions = GpuLower::current()->tmemInfo().allocation.regions;
// For each TMem region, allocate its address in shared memory, and insert
// the tcgen05.alloc for tensor memory allocation.
for (const auto& region : regions) {
// kir::Allocate for the address tensor on shared memory
auto address_alloc_expr =
IrBuilder::create<kir::Allocate>(region.address, MemoryType::Shared);
prologue.push_back(address_alloc_expr);
// the tcgen05.alloc instruction
auto alloc_expr =
IrBuilder::create<kir::AllocTMem>(region.address, region.num_columns);
prologue.push_back(alloc_expr);
}

if (!regions.empty()) {
// Relinquish the right to allocate after all regions have been allocated
auto tcgen05_relinquish_expr = IrBuilder::create<kir::Asm>(
"tcgen05.relinquish_alloc_permit.cta_group::1.sync.aligned",
std::vector<Val*>{},
std::vector<Val*>{},
kir::Asm::Options{/*volatile=*/true});
prologue.push_back(tcgen05_relinquish_expr);

// Block sync that makes allocation visible to all threads
auto block_sync = IrBuilder::create<kir::BlockSync>();
prologue.push_back(block_sync);
}
}

// Combine prologue and exprs
std::vector<Expr*> result;
result.reserve(prologue.size() + exprs.size());
result.insert(result.end(), prologue.begin(), prologue.end());
result.insert(result.end(), exprs.begin(), exprs.end());
return result;
}

} // namespace

std::vector<Expr*> insertAllocations(const std::vector<Expr*>& exprs) {
FUSER_PERF_SCOPE("GpuLower::Lower::insertAllocations");
return AllocationInserter::insert(exprs);
// If the fusion uses tensor memory, insert the following into the fusion:
// - A tcgen05.alloc for each tensor memory region
// - A kir::Allocate for a shared memory TensorView for each tensor memory
// region for storing addresses of these regions. Because tcgen05.alloc
// writes the address of allocated memory to the shared memory, there must
// be shared memory TensorViews to store these addresses. These address
// TensorViews are not part of the fusion math, and not handled by
// AllocationInserter::insert. Note that these address TensorViews are not
// the tensor memory TensorViews in fusion math.
// - A tcgen05.relinquish_alloc_permit after all tcgen05.allocs
auto result = insertTMemRegionAllocsAndDeallocs(exprs);
// Insert kir::Allocate for each Val, including the kir::Allocate for tensor
// memory TensorViews, in fusion math.
return AllocationInserter::insert(result);
}

} // namespace nvfuser
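For reference, the prologue that insertTMemRegionAllocsAndDeallocs builds for a single region corresponds roughly to the CUDA sketch below. This is not code generated by this commit: the tcgen05.alloc PTX spelling and operand forms are assumptions based on the PTX ISA, and the name T5 and the 64-column size are made up for illustration.

__global__ void tmem_prologue_sketch() {
  // kir::Allocate of region.address: a scalar uint32 tensor in shared memory.
  __shared__ uint32_t T5[1];
  // kir::AllocTMem: tcgen05.alloc writes the allocated TMem base address to T5.
  asm volatile(
      "tcgen05.alloc.cta_group::1.sync.aligned.shared::cta.b32 [%0], %1;" ::"r"(
          static_cast<uint32_t>(__cvta_generic_to_shared(T5))),
      "r"(64u));
  // tcgen05.relinquish_alloc_permit: give up the right to allocate so other
  // CTAs' tcgen05.allocs can proceed.
  asm volatile("tcgen05.relinquish_alloc_permit.cta_group::1.sync.aligned;");
  // kir::BlockSync: make the allocated address visible to all threads.
  __syncthreads();
  // ... the rest of the kernel reads the region base address from T5[0] ...
}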