Simplify getting a host buffer from tensor to avoid direct calls to command queue (#18533)

### Ticket

### Problem description
Currently there are a few places where, in order to get a host buffer from a tensor, we duplicate manual logic that calls the command queue directly; this isn't supported on MeshDevice.

### What's changed
Moved the repeated code into a single helper function that uses Tensor::cpu() to convert the device tensor to a host (CPU) tensor. A condensed before/after sketch follows.
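For context, here is the duplicated pattern and its replacement, condensed into two hypothetical wrappers (`read_old`/`read_new` are illustrative names only, not code from this commit; the exact removed code appears verbatim in the hunks below, and the sketch assumes the same tt-metal headers as the surrounding code):

```cpp
// Old pattern, duplicated at every call site: branch on the dispatch mode and
// read the device buffer through the command queue by hand.
template <typename T>
tt::tt_metal::owned_buffer::Buffer<T> read_old(const Tensor& input_tensor) {
    auto device_buffer = input_tensor.device_buffer();
    std::vector<T> data_vec;
    if (std::getenv("TT_METAL_SLOW_DISPATCH_MODE") == nullptr) {
        data_vec.resize(device_buffer->size() / sizeof(T));
        tt::tt_metal::tensor_impl::read_data_from_device_buffer<T>(
            input_tensor.device()->command_queue(), device_buffer, data_vec.data(), true);
    } else {
        tt::tt_metal::tensor_impl::read_data_from_device_buffer<T>(device_buffer, data_vec);
    }
    return tt::tt_metal::owned_buffer::create<T>(std::move(data_vec));
}

// New pattern: Tensor::cpu() performs the device-to-host read (and works on
// MeshDevice); the helper extracts the owned buffer from the resulting host tensor.
template <typename T>
tt::tt_metal::owned_buffer::Buffer<T> read_new(const Tensor& input_tensor) {
    return ttnn::detail::to_host_buffer<T>(input_tensor);
}
```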

### Checklist
- [x] [All post commit CI passes](https://github.com/tenstorrent/tt-metal/actions/runs/13613038021)
- [x] New/Existing tests provide coverage for changes
sminakov-tt authored Mar 4, 2025 · 1 parent 741aeb8 · commit 23791be
Showing 2 changed files with 24 additions and 60 deletions.
@@ -29,18 +29,7 @@ static Tensor manual_insertion(
"Required shape volume ({}) must match old shape volume ({})",
logical_shape.volume(),
input_tensor.get_logical_volume());
- auto device_buffer = input_tensor.device_buffer();
- uint32_t size_in_bytes = device_buffer->size();
- std::vector<uint16_t> data_vec;
- const char* TT_METAL_SLOW_DISPATCH_MODE = std::getenv("TT_METAL_SLOW_DISPATCH_MODE");
- if (TT_METAL_SLOW_DISPATCH_MODE == nullptr) {
-     data_vec.resize(size_in_bytes / sizeof(uint16_t));
-     tt::tt_metal::tensor_impl::read_data_from_device_buffer<uint16_t>(
-         input_tensor.device()->command_queue(), device_buffer, data_vec.data(), true);
- } else {
-     tt::tt_metal::tensor_impl::read_data_from_device_buffer<uint16_t>(device_buffer, data_vec);
- }
- auto owned_buffer = tt::tt_metal::owned_buffer::create<uint16_t>(std::move(data_vec));
+ auto owned_buffer = ttnn::detail::to_host_buffer<uint16_t>(input_tensor);
auto output =
Tensor(
OwnedStorage{owned_buffer},
ttnn/cpp/ttnn/operations/functions.hpp · 71 changes: 23 additions & 48 deletions
@@ -5,6 +5,7 @@
#pragma once

#include <tt-metalium/math.hpp>
+ #include <tt-metalium/overloaded.hpp>
#include <optional>
#include <random>
#include <ttnn/tensor/host_buffer/functions.hpp>
@@ -28,6 +29,24 @@ using tt::tt_metal::Tensor;
using tt::tt_metal::TensorLayout;
using tt::tt_metal::TensorMemoryLayout;

+ namespace detail {
+ template <typename T>
+ tt::tt_metal::owned_buffer::Buffer<T> to_host_buffer(const Tensor& tensor) {
+     auto cpu_tensor = tensor.cpu();
+     auto& storage = cpu_tensor.storage();
+     tt::tt_metal::OwnedBuffer buffer = std::visit(
+         tt::stl::overloaded{
+             [](const tt::tt_metal::OwnedStorage& storage) { return storage.get_buffer(); },
+             [](const tt::tt_metal::MultiDeviceHostStorage& storage) {
+                 TT_FATAL(storage.num_buffers() == 1, "Can't get a single buffer from multi device host storage");
+                 return storage.get_buffer(0);
+             },
+             [](const auto&) -> tt::tt_metal::OwnedBuffer { TT_THROW("Not supported storage type"); }},
+         storage);
+     return std::get<tt::tt_metal::owned_buffer::Buffer<T>>(buffer);
+ }
+ } // namespace detail

template <typename T, bool IS_UPPER>
static Tensor index_trilu(
const ttnn::Shape& logical_shape,
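The new helper above dispatches on the storage variant using the overloaded-visitor idiom; `tt::stl::overloaded` plays the role of the `overloaded` utility below. A minimal, self-contained sketch of the same pattern, with hypothetical `OwnedStorage`/`MultiDeviceHostStorage` stand-ins (illustration only, not the real types):

```cpp
#include <cstdio>
#include <variant>

// Hypothetical stand-ins for the real storage types.
struct OwnedStorage {
    int buffer = 42;
};
struct MultiDeviceHostStorage {
    int buffers[2] = {1, 2};
};

// The overloaded-visitor idiom: inherit operator() from every lambda passed in,
// so one object can visit every alternative of the variant.
template <typename... Ts>
struct overloaded : Ts... {
    using Ts::operator()...;
};
template <typename... Ts>
overloaded(Ts...) -> overloaded<Ts...>;

int main() {
    std::variant<OwnedStorage, MultiDeviceHostStorage> storage = OwnedStorage{};
    int buffer = std::visit(
        overloaded{
            [](const OwnedStorage& s) { return s.buffer; },
            [](const MultiDeviceHostStorage& s) { return s.buffers[0]; },
        },
        storage);
    std::printf("buffer = %d\n", buffer);  // prints "buffer = 42"
    return 0;
}
```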
@@ -247,18 +266,7 @@ static Tensor fill_first_val_into_tensor(
.memory_layout = tt::tt_metal::TensorMemoryLayout::INTERLEAVED}) {
auto physical_volume = input_tensor.volume();
auto owned_buffer = tt::tt_metal::owned_buffer::create<T>(physical_volume); // output
- auto device_buffer = input_tensor.device_buffer();
- uint32_t size_in_bytes = device_buffer->size();
- std::vector<T> data_vec;
- const char* TT_METAL_SLOW_DISPATCH_MODE = std::getenv("TT_METAL_SLOW_DISPATCH_MODE");
- if (TT_METAL_SLOW_DISPATCH_MODE == nullptr) {
-     data_vec.resize(size_in_bytes / sizeof(T));
-     tt::tt_metal::tensor_impl::read_data_from_device_buffer<T>(
-         input_tensor.device()->command_queue(), device_buffer, data_vec.data(), true);
- } else {
-     tt::tt_metal::tensor_impl::read_data_from_device_buffer<T>(device_buffer, data_vec);
- }
- auto input_buffer = tt::tt_metal::owned_buffer::create<T>(std::move(data_vec));
+ auto input_buffer = detail::to_host_buffer<T>(input_tensor);
const ttnn::Shape input_tensor_strides = input_tensor.strides();
for (uint32_t i = 0; i < physical_volume; i++) {
owned_buffer[i] = input_buffer[0];
@@ -290,18 +298,7 @@ static Tensor prod_result_computation_GS(
.memory_layout = tt::tt_metal::TensorMemoryLayout::INTERLEAVED}) {
const ttnn::Shape& s_a = input_tensor.get_padded_shape();
auto owned_buffer = tt::tt_metal::owned_buffer::create<T>(input_tensor.volume()); // output
- auto device_buffer = input_tensor.device_buffer();
- uint32_t size_in_bytes = device_buffer->size();
- std::vector<T> data_vec;
- const char* TT_METAL_SLOW_DISPATCH_MODE = std::getenv("TT_METAL_SLOW_DISPATCH_MODE");
- if (TT_METAL_SLOW_DISPATCH_MODE == nullptr) {
-     data_vec.resize(size_in_bytes / sizeof(T));
-     tt::tt_metal::tensor_impl::read_data_from_device_buffer<T>(
-         input_tensor.device()->command_queue(), device_buffer, data_vec.data(), true);
- } else {
-     tt::tt_metal::tensor_impl::read_data_from_device_buffer<T>(device_buffer, data_vec);
- }
- auto input_buffer = tt::tt_metal::owned_buffer::create<T>(std::move(data_vec));
+ auto input_buffer = detail::to_host_buffer<T>(input_tensor);
const ttnn::Shape input_tensor_strides = input_tensor.strides();
auto result = static_cast<T>(1.0f);
for (uint32_t i = s_a[0] - 1; i < s_a[0]; i++) {
@@ -349,18 +346,7 @@ static Tensor prod_result_computation_WH_B0(
.memory_layout = tt::tt_metal::TensorMemoryLayout::INTERLEAVED}) {
const auto& s_a = input_tensor.get_padded_shape();
auto owned_buffer = tt::tt_metal::owned_buffer::create<T>(s_a.volume()); // output
- auto device_buffer = input_tensor.device_buffer();
- uint32_t size_in_bytes = device_buffer->size();
- std::vector<T> data_vec;
- const char* TT_METAL_SLOW_DISPATCH_MODE = std::getenv("TT_METAL_SLOW_DISPATCH_MODE");
- if (TT_METAL_SLOW_DISPATCH_MODE == nullptr) {
-     data_vec.resize(size_in_bytes / sizeof(T));
-     tt::tt_metal::tensor_impl::read_data_from_device_buffer<T>(
-         input_tensor.device()->command_queue(), device_buffer, data_vec.data(), true);
- } else {
-     tt::tt_metal::tensor_impl::read_data_from_device_buffer<T>(device_buffer, data_vec);
- }
- auto input_buffer = tt::tt_metal::owned_buffer::create<T>(std::move(data_vec));
+ auto input_buffer = detail::to_host_buffer<T>(input_tensor);
const ttnn::Shape input_tensor_strides = input_tensor.strides();
auto result = static_cast<T>(1.0f);
// need to access the last 4 rows and alternating columns of index 17, 19, 21, 23, 25, 27, 29, 31
@@ -499,18 +485,7 @@ static Tensor manual_insertion(
TT_ASSERT(
padded_shape[0] * padded_shape[1] * padded_shape[2] * padded_shape[3] == input_tensor.volume(),
"Required shape volume must match old shape volume");
- auto device_buffer = input_tensor.device_buffer();
- uint32_t size_in_bytes = device_buffer->size();
- std::vector<T> data_vec;
- const char* TT_METAL_SLOW_DISPATCH_MODE = std::getenv("TT_METAL_SLOW_DISPATCH_MODE");
- if (TT_METAL_SLOW_DISPATCH_MODE == nullptr) {
-     data_vec.resize(size_in_bytes / sizeof(T));
-     tt::tt_metal::tensor_impl::read_data_from_device_buffer<T>(
-         input_tensor.device()->command_queue(), device_buffer, data_vec.data(), true);
- } else {
-     tt::tt_metal::tensor_impl::read_data_from_device_buffer<T>(device_buffer, data_vec);
- }
- auto owned_buffer = tt::tt_metal::owned_buffer::create<T>(std::move(data_vec));
+ auto owned_buffer = detail::to_host_buffer<T>(input_tensor);
auto output = Tensor(
OwnedStorage{owned_buffer},
TensorSpec(
