From d7058e936b7299be15c4b29b956db87a203d877a Mon Sep 17 00:00:00 2001 From: Samarth Agarwal Date: Wed, 29 Jan 2025 22:06:47 +0000 Subject: [PATCH 01/24] Saving work --- ...queueWriteBuffer_and_EnqueueReadBuffer.cpp | 9 +++- tt_metal/impl/buffers/dispatch.cpp | 53 +++++++++++++++---- 2 files changed, 51 insertions(+), 11 deletions(-) diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp index 5e9ffe9576e..ee8701b680d 100644 --- a/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp @@ -3,6 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 #include +#include #include #include "buffer.hpp" @@ -252,6 +253,12 @@ void test_EnqueueWriteBuffer_and_EnqueueReadBuffer(IDevice* device, CommandQueue detail::ReadFromBuffer(*bufa, result); } + for (uint32_t i = 0; i < result.size(); i++) { + if (result[i] != i) { + std::cout << "i: " + std::to_string(i) + " result[i]: " + std::to_string(result[i]) << std::endl; + } + } + EXPECT_EQ(src, result); } } @@ -525,7 +532,7 @@ TEST_F(CommandQueueSingleCardBufferFixture, TestPageLargerThanMaxPrefetchCommand CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(device->id()); const uint32_t max_prefetch_command_size = DispatchMemMap::get(dispatch_core_type).max_prefetch_command_size(); TestBufferConfig config = { - .num_pages = 1, .page_size = max_prefetch_command_size + 2048, .buftype = BufferType::DRAM}; + .num_pages = 9, .page_size = max_prefetch_command_size + 2048, .buftype = BufferType::DRAM}; local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(device, device->command_queue(), config); } } diff --git a/tt_metal/impl/buffers/dispatch.cpp b/tt_metal/impl/buffers/dispatch.cpp index cd20da802c1..dd791946780 100644 --- a/tt_metal/impl/buffers/dispatch.cpp +++ b/tt_metal/impl/buffers/dispatch.cpp @@ -116,10 +116,9 @@ InterleavedBufferWriteDispatchParams initialize_interleaved_buf_dispatch_params( dispatch_params.padded_buffer_size = num_pages * padded_page_size; if (dispatch_params.write_partial_pages) { - TT_FATAL(num_pages == 1, "TODO: add support for multi-paged buffer with page size > 64KB"); uint32_t partial_size = DispatchSettings::BASE_PARTIAL_PAGE_SIZE; uint32_t pcie_alignment = hal.get_alignment(HalMemType::HOST); - while (dispatch_params.padded_buffer_size % partial_size != 0) { + while (padded_page_size % partial_size != 0) { partial_size += pcie_alignment; } dispatch_params.page_size_to_write = partial_size; @@ -131,10 +130,10 @@ InterleavedBufferWriteDispatchParams initialize_interleaved_buf_dispatch_params( const uint32_t num_partial_pages_per_page = padded_page_size / dispatch_params.page_size_to_write; const uint32_t num_partials_round_robined = num_partial_pages_per_page * num_pages_round_robined; - dispatch_params.max_num_pages_to_write = - (dispatch_params.write_partial_pages) - ? (num_pages_round_robined > 0 ? (num_banks * num_partials_round_robined) : num_banks_with_residual_pages) - : dispatch_params.total_pages_to_write; + // dispatch_params.max_num_pages_to_write = + // (dispatch_params.write_partial_pages) + // ? (num_pages_round_robined > 0 ? (num_banks * num_partials_round_robined) : + // num_banks_with_residual_pages) : dispatch_params.total_pages_to_write; dispatch_params.address = buffer.address(); dispatch_params.device = buffer.device(); dispatch_params.cq_id = cq_id; @@ -171,7 +170,7 @@ void populate_interleaved_buffer_write_dispatch_cmds( // TODO: Consolidate if (write_partial_pages) { uint32_t padding = full_page_size - buffer.page_size(); - uint32_t src_address_offset = dispatch_params.address - buffer.address(); + uint32_t src_address_offset = dispatch_params.total_pages_written * dispatch_params.page_size_to_write; for (uint32_t sysmem_address_offset = 0; sysmem_address_offset < data_size_bytes; sysmem_address_offset += dispatch_params.page_size_to_write) { uint32_t page_size_to_copy = dispatch_params.page_size_to_write; @@ -305,6 +304,7 @@ void write_interleaved_buffer_to_device( SystemMemoryManager& sysmem_manager = dispatch_params.device->sysmem_manager(); uint32_t data_offsetB = hal.get_alignment(HalMemType::HOST); // data appended after CQ_PREFETCH_CMD_RELAY_INLINE // + CQ_DISPATCH_CMD_WRITE_PAGED + const uint32_t num_banks = buffer.device()->num_banks(buffer.buffer_type()); const uint32_t starting_dst_page_index = dispatch_params.dst_page_index; while (dispatch_params.total_pages_to_write > 0) { dispatch_params.issue_wait = @@ -328,16 +328,20 @@ void write_interleaved_buffer_to_device( dispatch_params.pages_per_txn = std::min( {(uint32_t)num_pages_available, - dispatch_params.max_num_pages_to_write, + // dispatch_params.max_num_pages_to_write, dispatch_params.total_pages_to_write}); + if (dispatch_params.write_partial_pages) { + dispatch_params.pages_per_txn = std::min(dispatch_params.pages_per_txn, (uint32_t)1); + // dispatch_params.address += dispatch_params.page_size_to_write; + } + // Page offset in CQ_DISPATCH_CMD_WRITE_PAGED is uint16_t // To handle larger page offsets move bank base address up and update page offset to be relative to the new // bank address if (dispatch_params.dst_page_index > 0xFFFF or (dispatch_params.pages_per_txn == dispatch_params.max_num_pages_to_write and dispatch_params.write_partial_pages)) { - uint32_t num_banks = buffer.device()->allocator()->get_num_banks(buffer.buffer_type()); uint32_t num_banks_to_use = dispatch_params.write_partial_pages ? dispatch_params.max_num_pages_to_write : num_banks; uint32_t residual = dispatch_params.dst_page_index % num_banks_to_use; @@ -346,12 +350,41 @@ void write_interleaved_buffer_to_device( dispatch_params.dst_page_index = residual; } + if (dispatch_params.write_partial_pages) { + ; + } + tt::log_debug(tt::LogDispatch, "EnqueueWriteBuffer for command queue {}", dispatch_params.cq_id); issue_buffer_dispatch_command_sequence(src, buffer, dispatch_params, sub_device_ids, dispatch_core_type); dispatch_params.total_pages_written += dispatch_params.pages_per_txn; dispatch_params.total_pages_to_write -= dispatch_params.pages_per_txn; - dispatch_params.dst_page_index += dispatch_params.pages_per_txn; + // dispatch_params.dst_page_index += dispatch_params.pages_per_txn; + if (dispatch_params.write_partial_pages) { + const uint32_t num_partial_pages_per_full_page = + buffer.aligned_page_size() / dispatch_params.page_size_to_write; + dispatch_params.address += (dispatch_params.pages_per_txn * dispatch_params.page_size_to_write); + const bool has_full_page_been_written = + dispatch_params.total_pages_written > 0 && + dispatch_params.total_pages_written % num_partial_pages_per_full_page == 0; + if (has_full_page_been_written) { + // might need to be modified when page is being round robined + dispatch_params.dst_page_index += dispatch_params.pages_per_txn; + const bool will_next_page_be_round_robined = + (dispatch_params.dst_page_index / num_banks) != + ((dispatch_params.dst_page_index - dispatch_params.pages_per_txn) / num_banks); + if (!will_next_page_be_round_robined) { + dispatch_params.address -= (num_partial_pages_per_full_page * dispatch_params.page_size_to_write); + } else { + dispatch_params.dst_page_index = 0; + } + } + // else { + // dispatch_params.address += (dispatch_params.pages_per_txn * dispatch_params.page_size_to_write); + // } + } else { + dispatch_params.dst_page_index += dispatch_params.pages_per_txn; + } } } From 679e7b9c4d4cd77fff8515985e642c239a0dab09 Mon Sep 17 00:00:00 2001 From: Samarth Agarwal Date: Fri, 31 Jan 2025 17:48:57 +0000 Subject: [PATCH 02/24] Saving work --- ...queueWriteBuffer_and_EnqueueReadBuffer.cpp | 84 +++++++++++++++---- tt_metal/impl/buffers/dispatch.cpp | 62 ++++++-------- 2 files changed, 96 insertions(+), 50 deletions(-) diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp index ee8701b680d..10a5ab003d8 100644 --- a/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp @@ -526,19 +526,50 @@ TEST_F(CommandQueueSingleCardBufferFixture, TestPageLargerThanAndUnalignedToTran } } -TEST_F(CommandQueueSingleCardBufferFixture, TestPageLargerThanMaxPrefetchCommandSize) { - constexpr uint32_t num_round_robins = 1; +TEST_F(CommandQueueSingleCardBufferFixture, TestSinglePageLargerThanMaxPrefetchCommandSize) { for (IDevice* device : devices_) { CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(device->id()); const uint32_t max_prefetch_command_size = DispatchMemMap::get(dispatch_core_type).max_prefetch_command_size(); TestBufferConfig config = { - .num_pages = 9, .page_size = max_prefetch_command_size + 2048, .buftype = BufferType::DRAM}; + .num_pages = 1, .page_size = max_prefetch_command_size + 2048, .buftype = BufferType::DRAM}; local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(device, device->command_queue(), config); } } -TEST_F(CommandQueueSingleCardBufferFixture, TestUnalignedPageLargerThanMaxPrefetchCommandSize) { - constexpr uint32_t num_round_robins = 1; +TEST_F(CommandQueueSingleCardBufferFixture, TestMultiplePagesLargerThanMaxPrefetchCommandSize) { + for (IDevice* device : devices_) { + CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(device->id()); + const uint32_t max_prefetch_command_size = + dispatch_constants::get(dispatch_core_type).max_prefetch_command_size(); + TestBufferConfig config = { + .num_pages = 1024, .page_size = max_prefetch_command_size + 2048, .buftype = BufferType::DRAM}; + local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(device, device->command_queue(), config); + } +} + +TEST_F(CommandQueueSingleCardBufferFixture, TestMultiplePagesLargerThanMaxPrefetchCommandSizeSubBuffer) { + for (IDevice* device : devices_) { + tt::log_info("Running On Device {}", device->id()); + CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(device->id()); + + const uint32_t max_prefetch_command_size = + dispatch_constants::get(dispatch_core_type).max_prefetch_command_size(); + const uint32_t page_size = max_prefetch_command_size + 2048; + const uint32_t buffer_size = 40 * page_size; + const uint32_t region_size = 5 * page_size; + const uint32_t region_offset = 30 * page_size; + + const BufferRegion region(region_offset, region_size); + auto buffer = Buffer::create(device, buffer_size, page_size, BufferType::DRAM); + auto src = local_test_functions::generate_arange_vector(region.size); + EnqueueWriteSubBuffer(device->command_queue(), *buffer, src, region, false); + vector result; + EnqueueReadSubBuffer(device->command_queue(), *buffer, result, region, true); + EXPECT_EQ(src, result); + } +} + +TEST_F(CommandQueueSingleCardBufferFixture, TestSingleUnalignedPageLargerThanMaxPrefetchCommandSize) { for (IDevice* device : devices_) { CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(device->id()); const uint32_t max_prefetch_command_size = DispatchMemMap::get(dispatch_core_type).max_prefetch_command_size(); @@ -548,6 +579,39 @@ TEST_F(CommandQueueSingleCardBufferFixture, TestUnalignedPageLargerThanMaxPrefet } } +TEST_F(CommandQueueSingleCardBufferFixture, TestMultipleUnalignedPagesLargerThanMaxPrefetchCommandSize) { + for (IDevice* device : devices_) { + CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(device->id()); + const uint32_t max_prefetch_command_size = + dispatch_constants::get(dispatch_core_type).max_prefetch_command_size(); + TestBufferConfig config = { + .num_pages = 1024, .page_size = max_prefetch_command_size + 4, .buftype = BufferType::DRAM}; + local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(device, device->command_queue(), config); + } +} + +TEST_F(CommandQueueSingleCardBufferFixture, TestMultipleUnalignedPagesLargerThanMaxPrefetchCommandSizeSubBuffer) { + for (IDevice* device : devices_) { + tt::log_info("Running On Device {}", device->id()); + CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(device->id()); + + const uint32_t max_prefetch_command_size = + dispatch_constants::get(dispatch_core_type).max_prefetch_command_size(); + const uint32_t page_size = max_prefetch_command_size + 4; + const uint32_t buffer_size = 40 * page_size; + const uint32_t region_size = 5 * page_size; + const uint32_t region_offset = 30 * page_size; + + const BufferRegion region(region_offset, region_size); + auto buffer = Buffer::create(device, buffer_size, page_size, BufferType::DRAM); + auto src = local_test_functions::generate_arange_vector(region.size); + EnqueueWriteSubBuffer(device->command_queue(), *buffer, src, region, false); + vector result; + EnqueueReadSubBuffer(device->command_queue(), *buffer, result, region, true); + EXPECT_EQ(src, result); + } +} + TEST_F(CommandQueueSingleCardBufferFixture, TestNon32BAlignedPageSizeForDram) { TestBufferConfig config = {.num_pages = 1250, .page_size = 200, .buftype = BufferType::DRAM}; @@ -565,16 +629,6 @@ TEST_F(CommandQueueSingleCardBufferFixture, TestNon32BAlignedPageSizeForDram2) { } } -TEST_F(CommandQueueSingleCardBufferFixture, TestPageSizeTooLarge) { - // Should throw a host error due to the page size not fitting in the consumer CB - TestBufferConfig config = {.num_pages = 1024, .page_size = 250880 * 2, .buftype = BufferType::DRAM}; - - for (IDevice* device : devices_) { - EXPECT_ANY_THROW((local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer( - device, device->command_queue(), config))); - } -} - // Requires enqueue write buffer TEST_F(CommandQueueSingleCardBufferFixture, TestWrapHostHugepageOnEnqueueReadBuffer) { for (IDevice* device : this->devices_) { diff --git a/tt_metal/impl/buffers/dispatch.cpp b/tt_metal/impl/buffers/dispatch.cpp index dd791946780..6ef0ff18e36 100644 --- a/tt_metal/impl/buffers/dispatch.cpp +++ b/tt_metal/impl/buffers/dispatch.cpp @@ -43,7 +43,6 @@ struct BufferWriteDispatchParams { struct InterleavedBufferWriteDispatchParams : BufferWriteDispatchParams { uint32_t write_partial_pages = 0; uint32_t padded_buffer_size = 0; - uint32_t max_num_pages_to_write = 0; }; // Parameters specific to sharded buffers @@ -124,20 +123,18 @@ InterleavedBufferWriteDispatchParams initialize_interleaved_buf_dispatch_params( dispatch_params.page_size_to_write = partial_size; dispatch_params.total_pages_to_write = dispatch_params.padded_buffer_size / dispatch_params.page_size_to_write; } - const uint32_t num_banks = buffer.device()->allocator()->get_num_banks(buffer.buffer_type()); - const uint32_t num_pages_round_robined = num_pages / num_banks; - const uint32_t num_banks_with_residual_pages = num_pages % num_banks; - const uint32_t num_partial_pages_per_page = padded_page_size / dispatch_params.page_size_to_write; - const uint32_t num_partials_round_robined = num_partial_pages_per_page * num_pages_round_robined; - - // dispatch_params.max_num_pages_to_write = - // (dispatch_params.write_partial_pages) - // ? (num_pages_round_robined > 0 ? (num_banks * num_partials_round_robined) : - // num_banks_with_residual_pages) : dispatch_params.total_pages_to_write; + dispatch_params.address = buffer.address(); dispatch_params.device = buffer.device(); dispatch_params.cq_id = cq_id; dispatch_params.expected_num_workers_completed = expected_num_workers_completed; + + if (dispatch_params.write_partial_pages) { + const uint32_t num_banks = buffer.device()->allocator()->get_num_banks(buffer.buffer_type()); + dispatch_params.address += (dispatch_params.dst_page_index / num_banks) * buffer.aligned_page_size(); + dispatch_params.dst_page_index %= num_banks; + } + return dispatch_params; } @@ -165,16 +162,22 @@ void populate_interleaved_buffer_write_dispatch_cmds( uint32_t full_page_size = buffer.aligned_page_size(); // dispatch_params.page_size_to_write could be a partial // page if buffer page size > MAX_PREFETCH_CMD_SIZE bool write_partial_pages = dispatch_params.page_size_to_write < full_page_size; - const uint32_t num_banks = buffer.device()->allocator()->get_num_banks(buffer.buffer_type()); // TODO: Consolidate if (write_partial_pages) { - uint32_t padding = full_page_size - buffer.page_size(); - uint32_t src_address_offset = dispatch_params.total_pages_written * dispatch_params.page_size_to_write; + const uint32_t padding = full_page_size - buffer.page_size(); + const uint32_t num_partial_pages_per_full_page = + buffer.aligned_page_size() / dispatch_params.page_size_to_write; + const uint32_t num_full_pages_written = dispatch_params.total_pages_written / num_partial_pages_per_full_page; + const bool is_partial_page_start_of_full_page = + dispatch_params.total_pages_written % num_partial_pages_per_full_page == 0; + uint32_t src_address_offset = + dispatch_params.total_pages_written * dispatch_params.page_size_to_write - num_full_pages_written * padding; for (uint32_t sysmem_address_offset = 0; sysmem_address_offset < data_size_bytes; sysmem_address_offset += dispatch_params.page_size_to_write) { uint32_t page_size_to_copy = dispatch_params.page_size_to_write; - if (src_address_offset + dispatch_params.page_size_to_write > buffer.page_size()) { + if (src_address_offset + dispatch_params.page_size_to_write > + (num_full_pages_written + 1) * buffer.page_size()) { // last partial page being copied from unpadded src buffer page_size_to_copy -= padding; } @@ -304,7 +307,7 @@ void write_interleaved_buffer_to_device( SystemMemoryManager& sysmem_manager = dispatch_params.device->sysmem_manager(); uint32_t data_offsetB = hal.get_alignment(HalMemType::HOST); // data appended after CQ_PREFETCH_CMD_RELAY_INLINE // + CQ_DISPATCH_CMD_WRITE_PAGED - const uint32_t num_banks = buffer.device()->num_banks(buffer.buffer_type()); + const uint32_t num_banks = buffer.device()->allocator()->get_num_banks(buffer.buffer_type()); const uint32_t starting_dst_page_index = dispatch_params.dst_page_index; while (dispatch_params.total_pages_to_write > 0) { dispatch_params.issue_wait = @@ -339,27 +342,20 @@ void write_interleaved_buffer_to_device( // Page offset in CQ_DISPATCH_CMD_WRITE_PAGED is uint16_t // To handle larger page offsets move bank base address up and update page offset to be relative to the new // bank address - if (dispatch_params.dst_page_index > 0xFFFF or - (dispatch_params.pages_per_txn == dispatch_params.max_num_pages_to_write and - dispatch_params.write_partial_pages)) { - uint32_t num_banks_to_use = - dispatch_params.write_partial_pages ? dispatch_params.max_num_pages_to_write : num_banks; - uint32_t residual = dispatch_params.dst_page_index % num_banks_to_use; - uint32_t num_pages_written_per_bank = dispatch_params.dst_page_index / num_banks_to_use; + if (dispatch_params.dst_page_index > 0xFFFF) { + TT_ASSERT(!dispatch_params.write_partial_pages); + uint32_t residual = dispatch_params.dst_page_index % num_banks; + uint32_t num_pages_written_per_bank = dispatch_params.dst_page_index / num_banks; dispatch_params.address += num_pages_written_per_bank * dispatch_params.page_size_to_write; dispatch_params.dst_page_index = residual; } - if (dispatch_params.write_partial_pages) { - ; - } - tt::log_debug(tt::LogDispatch, "EnqueueWriteBuffer for command queue {}", dispatch_params.cq_id); issue_buffer_dispatch_command_sequence(src, buffer, dispatch_params, sub_device_ids, dispatch_core_type); dispatch_params.total_pages_written += dispatch_params.pages_per_txn; + dispatch_params.total_pages_to_write -= dispatch_params.pages_per_txn; - // dispatch_params.dst_page_index += dispatch_params.pages_per_txn; if (dispatch_params.write_partial_pages) { const uint32_t num_partial_pages_per_full_page = buffer.aligned_page_size() / dispatch_params.page_size_to_write; @@ -368,20 +364,16 @@ void write_interleaved_buffer_to_device( dispatch_params.total_pages_written > 0 && dispatch_params.total_pages_written % num_partial_pages_per_full_page == 0; if (has_full_page_been_written) { - // might need to be modified when page is being round robined dispatch_params.dst_page_index += dispatch_params.pages_per_txn; const bool will_next_page_be_round_robined = (dispatch_params.dst_page_index / num_banks) != ((dispatch_params.dst_page_index - dispatch_params.pages_per_txn) / num_banks); - if (!will_next_page_be_round_robined) { - dispatch_params.address -= (num_partial_pages_per_full_page * dispatch_params.page_size_to_write); - } else { + if (will_next_page_be_round_robined) { dispatch_params.dst_page_index = 0; + } else { + dispatch_params.address -= buffer.aligned_page_size(); } } - // else { - // dispatch_params.address += (dispatch_params.pages_per_txn * dispatch_params.page_size_to_write); - // } } else { dispatch_params.dst_page_index += dispatch_params.pages_per_txn; } From d04e687574a4f11cf1588c5e01be4bcf2d794abf Mon Sep 17 00:00:00 2001 From: Samarth Agarwal Date: Fri, 31 Jan 2025 18:10:12 +0000 Subject: [PATCH 03/24] Fixing merge conflicts --- ...test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp index 10a5ab003d8..997a0c9c7de 100644 --- a/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp @@ -539,8 +539,7 @@ TEST_F(CommandQueueSingleCardBufferFixture, TestSinglePageLargerThanMaxPrefetchC TEST_F(CommandQueueSingleCardBufferFixture, TestMultiplePagesLargerThanMaxPrefetchCommandSize) { for (IDevice* device : devices_) { CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(device->id()); - const uint32_t max_prefetch_command_size = - dispatch_constants::get(dispatch_core_type).max_prefetch_command_size(); + const uint32_t max_prefetch_command_size = DispatchMemMap::get(dispatch_core_type).max_prefetch_command_size(); TestBufferConfig config = { .num_pages = 1024, .page_size = max_prefetch_command_size + 2048, .buftype = BufferType::DRAM}; local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(device, device->command_queue(), config); @@ -552,8 +551,7 @@ TEST_F(CommandQueueSingleCardBufferFixture, TestMultiplePagesLargerThanMaxPrefet tt::log_info("Running On Device {}", device->id()); CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(device->id()); - const uint32_t max_prefetch_command_size = - dispatch_constants::get(dispatch_core_type).max_prefetch_command_size(); + const uint32_t max_prefetch_command_size = DispatchMemMap::get(dispatch_core_type).max_prefetch_command_size(); const uint32_t page_size = max_prefetch_command_size + 2048; const uint32_t buffer_size = 40 * page_size; const uint32_t region_size = 5 * page_size; @@ -582,8 +580,7 @@ TEST_F(CommandQueueSingleCardBufferFixture, TestSingleUnalignedPageLargerThanMax TEST_F(CommandQueueSingleCardBufferFixture, TestMultipleUnalignedPagesLargerThanMaxPrefetchCommandSize) { for (IDevice* device : devices_) { CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(device->id()); - const uint32_t max_prefetch_command_size = - dispatch_constants::get(dispatch_core_type).max_prefetch_command_size(); + const uint32_t max_prefetch_command_size = DispatchMemMap::get(dispatch_core_type).max_prefetch_command_size(); TestBufferConfig config = { .num_pages = 1024, .page_size = max_prefetch_command_size + 4, .buftype = BufferType::DRAM}; local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(device, device->command_queue(), config); @@ -595,8 +592,7 @@ TEST_F(CommandQueueSingleCardBufferFixture, TestMultipleUnalignedPagesLargerThan tt::log_info("Running On Device {}", device->id()); CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(device->id()); - const uint32_t max_prefetch_command_size = - dispatch_constants::get(dispatch_core_type).max_prefetch_command_size(); + const uint32_t max_prefetch_command_size = DispatchMemMap::get(dispatch_core_type).max_prefetch_command_size(); const uint32_t page_size = max_prefetch_command_size + 4; const uint32_t buffer_size = 40 * page_size; const uint32_t region_size = 5 * page_size; From 792dd755bdd9c7abad333751b85a57254011a668 Mon Sep 17 00:00:00 2001 From: Samarth Agarwal Date: Fri, 31 Jan 2025 18:19:22 +0000 Subject: [PATCH 04/24] Modifying condition for issuing wait --- tt_metal/impl/buffers/dispatch.cpp | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/tt_metal/impl/buffers/dispatch.cpp b/tt_metal/impl/buffers/dispatch.cpp index 6ef0ff18e36..f4f63d5d9c7 100644 --- a/tt_metal/impl/buffers/dispatch.cpp +++ b/tt_metal/impl/buffers/dispatch.cpp @@ -308,11 +308,9 @@ void write_interleaved_buffer_to_device( uint32_t data_offsetB = hal.get_alignment(HalMemType::HOST); // data appended after CQ_PREFETCH_CMD_RELAY_INLINE // + CQ_DISPATCH_CMD_WRITE_PAGED const uint32_t num_banks = buffer.device()->allocator()->get_num_banks(buffer.buffer_type()); - const uint32_t starting_dst_page_index = dispatch_params.dst_page_index; while (dispatch_params.total_pages_to_write > 0) { dispatch_params.issue_wait = - (dispatch_params.dst_page_index == starting_dst_page_index and - dispatch_params.address == buffer.address()); // only stall for the first write of the buffer + dispatch_params.total_pages_written == 0; // only stall for the first write of the buffer if (dispatch_params.issue_wait) { data_offsetB *= 2; // commands prefixed with CQ_PREFETCH_CMD_RELAY_INLINE + CQ_DISPATCH_CMD_WAIT } @@ -329,14 +327,10 @@ void write_interleaved_buffer_to_device( continue; } - dispatch_params.pages_per_txn = std::min( - {(uint32_t)num_pages_available, - // dispatch_params.max_num_pages_to_write, - dispatch_params.total_pages_to_write}); + dispatch_params.pages_per_txn = std::min({(uint32_t)num_pages_available, dispatch_params.total_pages_to_write}); if (dispatch_params.write_partial_pages) { dispatch_params.pages_per_txn = std::min(dispatch_params.pages_per_txn, (uint32_t)1); - // dispatch_params.address += dispatch_params.page_size_to_write; } // Page offset in CQ_DISPATCH_CMD_WRITE_PAGED is uint16_t From ed6f2d2b772b0618725597851c456e2a75a5f02d Mon Sep 17 00:00:00 2001 From: Samarth Agarwal Date: Fri, 31 Jan 2025 19:57:15 +0000 Subject: [PATCH 05/24] Deleted test --- ...st_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp index 997a0c9c7de..ba9773d3d60 100644 --- a/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp @@ -1039,20 +1039,6 @@ TEST_F(MultiCommandQueueSingleDeviceBufferFixture, TestNon32BAlignedPageSizeForD local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config)); } -TEST_F(MultiCommandQueueSingleDeviceBufferFixture, TestPageSizeTooLarge) { - if (this->arch_ == tt::ARCH::WORMHOLE_B0) { - GTEST_SKIP(); // This test hanging on wormhole b0 - } - // Should throw a host error due to the page size not fitting in the consumer CB - TestBufferConfig config = {.num_pages = 1024, .page_size = 250880 * 2, .buftype = BufferType::DRAM}; - - CommandQueue& a = this->device_->command_queue(0); - CommandQueue& b = this->device_->command_queue(1); - vector> cqs = {a, b}; - EXPECT_ANY_THROW( - local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config)); -} - TEST_F(MultiCommandQueueSingleDeviceBufferFixture, TestIssueMultipleReadWriteCommandsForOneBuffer) { uint32_t page_size = 2048; uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(this->device_->id()); From 2854706bc2d2c48e94a45c11cfec5f6aa0775687 Mon Sep 17 00:00:00 2001 From: Samarth Agarwal Date: Thu, 6 Feb 2025 02:13:34 +0000 Subject: [PATCH 06/24] Saving work --- ...queueWriteBuffer_and_EnqueueReadBuffer.cpp | 2 +- tt_metal/api/tt-metalium/cq_commands.hpp | 2 + tt_metal/impl/buffers/dispatch.cpp | 366 +++++++++++++----- 3 files changed, 264 insertions(+), 106 deletions(-) diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp index ba9773d3d60..0dc1ef1fb87 100644 --- a/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp @@ -555,7 +555,7 @@ TEST_F(CommandQueueSingleCardBufferFixture, TestMultiplePagesLargerThanMaxPrefet const uint32_t page_size = max_prefetch_command_size + 2048; const uint32_t buffer_size = 40 * page_size; const uint32_t region_size = 5 * page_size; - const uint32_t region_offset = 30 * page_size; + const uint32_t region_offset = 3 * page_size; const BufferRegion region(region_offset, region_size); auto buffer = Buffer::create(device, buffer_size, page_size, BufferType::DRAM); diff --git a/tt_metal/api/tt-metalium/cq_commands.hpp b/tt_metal/api/tt-metalium/cq_commands.hpp index 48600a2c32a..c9c79313fd4 100644 --- a/tt_metal/api/tt-metalium/cq_commands.hpp +++ b/tt_metal/api/tt-metalium/cq_commands.hpp @@ -171,6 +171,8 @@ struct CQDispatchWriteHostCmd { uint32_t length; } __attribute__((packed)); +constexpr uint16_t CQ_DISPATCH_CMD_PAGED_WRITE_MAX_PAGE_INDEX = 0xFFFF; + struct CQDispatchWritePagedCmd { uint8_t is_dram; // one flag, false=l1 uint16_t start_page; diff --git a/tt_metal/impl/buffers/dispatch.cpp b/tt_metal/impl/buffers/dispatch.cpp index f4f63d5d9c7..abeaec95618 100644 --- a/tt_metal/impl/buffers/dispatch.cpp +++ b/tt_metal/impl/buffers/dispatch.cpp @@ -41,8 +41,126 @@ struct BufferWriteDispatchParams { // Parameters specific to interleaved buffers struct InterleavedBufferWriteDispatchParams : BufferWriteDispatchParams { - uint32_t write_partial_pages = 0; - uint32_t padded_buffer_size = 0; + uint32_t num_banks = 0; + const Buffer& buffer; + + InterleavedBufferWriteDispatchParams( + const Buffer& buffer, + uint32_t dst_page_index, + uint32_t total_pages_to_write, + uint32_t cq_id, + tt::stl::Span expected_num_workers_completed) : + buffer(buffer) { + this->num_banks = buffer.device()->allocator()->get_num_banks(buffer.buffer_type()); + this->address = buffer.address(); + this->dst_page_index = dst_page_index; + this->page_size_to_write = buffer.aligned_page_size(); + this->total_pages_to_write = total_pages_to_write; + this->device = buffer.device(); + this->cq_id = cq_id; + this->expected_num_workers_completed = expected_num_workers_completed; + } + virtual ~InterleavedBufferWriteDispatchParams() = default; + + void calculate_issue_wait() { + this->issue_wait = this->total_pages_written == 0; // only stall for the first write of the buffer + } + + virtual void calculate_num_pages_for_write_transaction(uint32_t num_pages_available_in_cq) { + this->pages_per_txn = std::min(this->total_pages_to_write, num_pages_available_in_cq); + } + + virtual bool is_page_offset_out_of_bounds() const { + return this->dst_page_index > CQ_DISPATCH_CMD_PAGED_WRITE_MAX_PAGE_INDEX; + } + + // Page offset in CQ_DISPATCH_CMD_WRITE_PAGED is uint16_t + // To handle larger page offsets move bank base address up and update page offset to be relative to the new + // bank address + virtual void update_params_to_be_within_bounds() { + const uint32_t num_pages_written_per_bank = this->dst_page_index / this->num_banks; + this->address += num_pages_written_per_bank * this->page_size_to_write; + this->dst_page_index %= this->num_banks; + } + + virtual void update_params_after_write_transaction() { + this->total_pages_to_write -= this->pages_per_txn; + this->total_pages_written += this->pages_per_txn; + this->dst_page_index += this->pages_per_txn; + this->address += this->page_size_to_write; + } + + virtual bool write_large_pages() const { return false; } + + virtual bool are_num_pages_available_in_cq_enough_for_transaction(uint32_t num_pages) const { + return num_pages > 0; + } + + virtual uint32_t num_full_pages_written() const { return this->total_pages_written; } + virtual uint32_t num_partial_pages_per_full_page() const { return 1; } +}; + +struct InterleavedBufferWriteLargePageDispatchParams : InterleavedBufferWriteDispatchParams { + InterleavedBufferWriteLargePageDispatchParams( + const Buffer& buffer, + uint32_t dst_page_index, + uint32_t page_size_to_write, + uint32_t total_pages_to_write, + uint32_t num_full_pages, + uint32_t cq_id, + tt::stl::Span expected_num_workers_completed) : + InterleavedBufferWriteDispatchParams( + buffer, dst_page_index, total_pages_to_write, cq_id, expected_num_workers_completed) { + this->page_size_to_write = page_size_to_write; + this->full_pages_to_write = num_full_pages; + this->full_page_size = buffer.aligned_page_size(); + this->num_partial_pages_in_single_full_page = full_page_size / page_size_to_write; + } + + void calculate_num_pages_for_write_transaction(uint32_t num_pages_available_in_cq) override { + this->pages_per_txn = std::min({this->full_pages_to_write, this->num_banks, num_pages_available_in_cq}); + } + + bool is_page_offset_out_of_bounds() const override { return this->dst_page_index >= this->num_banks; } + + void update_params_to_be_within_bounds() override { + const uint32_t num_pages_written_per_bank = this->dst_page_index / this->num_banks; + this->address += num_pages_written_per_bank * this->full_page_size; + this->dst_page_index %= this->num_banks; + } + + void update_params_after_write_transaction() override { + this->total_pages_to_write -= this->pages_per_txn; + this->total_pages_written += this->pages_per_txn; + this->address += this->page_size_to_write; + if (this->were_full_pages_written_in_last_write_transaction()) { + this->full_pages_to_write -= this->pages_per_txn; + this->full_pages_written += this->pages_per_txn; + this->dst_page_index += this->pages_per_txn; + this->dst_page_index %= this->num_banks; + } + } + + bool write_large_pages() const override { return true; } + + bool are_num_pages_available_in_cq_enough_for_transaction(uint32_t num_pages) const override { + return num_pages >= std::min(this->num_banks, this->full_pages_to_write); + } + + uint32_t num_full_pages_written() const override { return this->full_pages_written; } + + uint32_t num_partial_pages_per_full_page() const override { return this->num_partial_pages_in_single_full_page; } + +private: + uint32_t num_partial_pages_in_single_full_page = 0; + uint32_t full_page_size = 0; + uint32_t full_pages_written = 0; + uint32_t full_pages_to_write = 0; + + bool were_full_pages_written_in_last_write_transaction() const { + const uint32_t page_size = this->address - this->buffer.address(); + return page_size > 0 && page_size % this->full_page_size == 0; + } }; // Parameters specific to sharded buffers @@ -55,6 +173,25 @@ struct ShardedBufferWriteDispatchParams : BufferWriteDispatchParams { CoreCoord core; }; +void update_byte_offset_in_cq(uint32_t& byte_offset, bool issue_wait) { + if (issue_wait) { + byte_offset *= 2; // commands prefixed with CQ_PREFETCH_CMD_RELAY_INLINE + CQ_DISPATCH_CMD_WAIT + } +} + +int32_t calculate_num_pages_available_in_cq( + const InterleavedBufferWriteDispatchParams& dispatch_params, + const BufferDispatchConstants& dispatch_constants, + uint32_t byte_offset_in_cq) { + SystemMemoryManager& sysmem_manager = dispatch_params.device->sysmem_manager(); + uint32_t space_availableB = std::min( + dispatch_constants.issue_queue_cmd_limit - sysmem_manager.get_issue_queue_write_ptr(dispatch_params.cq_id), + dispatch_constants.max_prefetch_cmd_size); + int32_t num_pages_available = + (int32_t(space_availableB) - int32_t(byte_offset_in_cq)) / int32_t(dispatch_params.page_size_to_write); + return num_pages_available; +} + // Generate dispatch constants BufferDispatchConstants generate_buffer_dispatch_constants( const SystemMemoryManager& sysmem_manager, CoreType dispatch_core_type, uint32_t cq_id) { @@ -97,42 +234,39 @@ ShardedBufferWriteDispatchParams initialize_sharded_buf_dispatch_params( return dispatch_params; } -InterleavedBufferWriteDispatchParams initialize_interleaved_buf_dispatch_params( +std::unique_ptr initialize_interleaved_buf_dispatch_params( Buffer& buffer, const BufferDispatchConstants& buf_dispatch_constants, uint32_t cq_id, tt::stl::Span expected_num_workers_completed, const BufferRegion& region) { - InterleavedBufferWriteDispatchParams dispatch_params; - dispatch_params.dst_page_index = region.offset / buffer.page_size(); - uint32_t num_pages = region.size / buffer.page_size(); + std::unique_ptr dispatch_params; - uint32_t padded_page_size = buffer.aligned_page_size(); - dispatch_params.total_pages_to_write = num_pages; - dispatch_params.total_pages_written = 0; - dispatch_params.write_partial_pages = padded_page_size > buf_dispatch_constants.max_data_sizeB; - dispatch_params.page_size_to_write = padded_page_size; - dispatch_params.padded_buffer_size = num_pages * padded_page_size; - - if (dispatch_params.write_partial_pages) { - uint32_t partial_size = DispatchSettings::BASE_PARTIAL_PAGE_SIZE; - uint32_t pcie_alignment = hal.get_alignment(HalMemType::HOST); - while (padded_page_size % partial_size != 0) { - partial_size += pcie_alignment; - } - dispatch_params.page_size_to_write = partial_size; - dispatch_params.total_pages_to_write = dispatch_params.padded_buffer_size / dispatch_params.page_size_to_write; - } - - dispatch_params.address = buffer.address(); - dispatch_params.device = buffer.device(); - dispatch_params.cq_id = cq_id; - dispatch_params.expected_num_workers_completed = expected_num_workers_completed; + uint32_t total_pages_to_write = region.size / buffer.page_size(); + const uint32_t dst_page_index = region.offset / buffer.page_size(); - if (dispatch_params.write_partial_pages) { - const uint32_t num_banks = buffer.device()->allocator()->get_num_banks(buffer.buffer_type()); - dispatch_params.address += (dispatch_params.dst_page_index / num_banks) * buffer.aligned_page_size(); - dispatch_params.dst_page_index %= num_banks; + const bool write_large_pages = buffer.aligned_page_size() > buf_dispatch_constants.max_data_sizeB; + if (write_large_pages) { + uint32_t partial_page_size = DispatchSettings::BASE_PARTIAL_PAGE_SIZE; + const uint32_t pcie_alignment = hal.get_alignment(HalMemType::HOST); + while (buffer.aligned_page_size() % partial_page_size != 0) { + partial_page_size += pcie_alignment; + } + const uint32_t page_size_to_write = partial_page_size; + const uint32_t padded_buffer_size = total_pages_to_write * buffer.aligned_page_size(); + const uint32_t num_full_pages = total_pages_to_write; + total_pages_to_write = padded_buffer_size / page_size_to_write; + dispatch_params = std::make_unique( + buffer, + dst_page_index, + page_size_to_write, + total_pages_to_write, + num_full_pages, + cq_id, + expected_num_workers_completed); + } else { + dispatch_params = std::make_unique( + buffer, dst_page_index, total_pages_to_write, cq_id, expected_num_workers_completed); } return dispatch_params; @@ -144,12 +278,12 @@ void populate_interleaved_buffer_write_dispatch_cmds( HugepageDeviceCommand& command_sequence, Buffer& buffer, InterleavedBufferWriteDispatchParams& dispatch_params) { - uint8_t is_dram = uint8_t(buffer.is_dram()); + const uint8_t is_dram = uint8_t(buffer.is_dram()); TT_ASSERT( - dispatch_params.dst_page_index <= 0xFFFF, + dispatch_params.dst_page_index <= CQ_DISPATCH_CMD_PAGED_WRITE_MAX_PAGE_INDEX, "Page offset needs to fit within range of uint16_t, bank_base_address was computed incorrectly!"); - uint16_t start_page = uint16_t(dispatch_params.dst_page_index & 0xFFFF); - bool flush_prefetch = true; + const uint16_t start_page = uint16_t(dispatch_params.dst_page_index & CQ_DISPATCH_CMD_PAGED_WRITE_MAX_PAGE_INDEX); + const bool flush_prefetch = true; command_sequence.add_dispatch_write_paged( flush_prefetch, is_dram, @@ -158,32 +292,37 @@ void populate_interleaved_buffer_write_dispatch_cmds( dispatch_params.page_size_to_write, dispatch_params.pages_per_txn); - uint32_t data_size_bytes = dispatch_params.pages_per_txn * dispatch_params.page_size_to_write; - uint32_t full_page_size = buffer.aligned_page_size(); // dispatch_params.page_size_to_write could be a partial - // page if buffer page size > MAX_PREFETCH_CMD_SIZE - bool write_partial_pages = dispatch_params.page_size_to_write < full_page_size; + const uint32_t data_size_bytes = dispatch_params.pages_per_txn * dispatch_params.page_size_to_write; // TODO: Consolidate - if (write_partial_pages) { - const uint32_t padding = full_page_size - buffer.page_size(); - const uint32_t num_partial_pages_per_full_page = - buffer.aligned_page_size() / dispatch_params.page_size_to_write; - const uint32_t num_full_pages_written = dispatch_params.total_pages_written / num_partial_pages_per_full_page; - const bool is_partial_page_start_of_full_page = - dispatch_params.total_pages_written % num_partial_pages_per_full_page == 0; - uint32_t src_address_offset = - dispatch_params.total_pages_written * dispatch_params.page_size_to_write - num_full_pages_written * padding; + if (dispatch_params.write_large_pages()) { + // const uint32_t num_partial_pages_per_full_page = + // buffer.aligned_page_size() / dispatch_params.page_size_to_write; + // const uint32_t num_full_pages_written = dispatch_params.total_pages_written / + // num_partial_pages_per_full_page; + const uint32_t num_full_pages_written = dispatch_params.num_full_pages_written(); + const uint32_t num_partial_pages_written = dispatch_params.total_pages_written; + const uint32_t num_partial_pages_per_full_page = dispatch_params.num_partial_pages_per_full_page(); + const uint32_t num_partial_pages_written_associated_with_current_full_pages = + num_partial_pages_written - (num_full_pages_written * num_partial_pages_per_full_page); + const uint32_t num_partial_pages_written_per_current_full_page = + num_partial_pages_written_associated_with_current_full_pages / dispatch_params.pages_per_txn; + uint32_t num_partial_pages_written_curr_txn = 0; for (uint32_t sysmem_address_offset = 0; sysmem_address_offset < data_size_bytes; sysmem_address_offset += dispatch_params.page_size_to_write) { uint32_t page_size_to_copy = dispatch_params.page_size_to_write; - if (src_address_offset + dispatch_params.page_size_to_write > - (num_full_pages_written + 1) * buffer.page_size()) { + uint32_t src_address_offset = num_full_pages_written * buffer.page_size() + + num_partial_pages_written_per_current_full_page * page_size_to_copy + + num_partial_pages_written_curr_txn * buffer.page_size(); + if (num_partial_pages_written_per_current_full_page == num_partial_pages_per_full_page - 1) { // last partial page being copied from unpadded src buffer + const uint32_t padding = buffer.aligned_page_size() - buffer.page_size(); page_size_to_copy -= padding; } command_sequence.add_data( (char*)src + src_address_offset, page_size_to_copy, dispatch_params.page_size_to_write); - src_address_offset += page_size_to_copy; + // src_address_offset += page_size_to_copy; + num_partial_pages_written_curr_txn += 1; } } else { uint32_t src_address_offset = dispatch_params.total_pages_written * buffer.page_size(); @@ -304,73 +443,89 @@ void write_interleaved_buffer_to_device( const BufferDispatchConstants& buf_dispatch_constants, tt::stl::Span sub_device_ids, CoreType dispatch_core_type) { - SystemMemoryManager& sysmem_manager = dispatch_params.device->sysmem_manager(); - uint32_t data_offsetB = hal.get_alignment(HalMemType::HOST); // data appended after CQ_PREFETCH_CMD_RELAY_INLINE - // + CQ_DISPATCH_CMD_WRITE_PAGED + uint32_t byte_offset_in_cq = + hal.get_alignment(HalMemType::HOST); // data appended after CQ_PREFETCH_CMD_RELAY_INLINE + // + CQ_DISPATCH_CMD_WRITE_PAGED const uint32_t num_banks = buffer.device()->allocator()->get_num_banks(buffer.buffer_type()); while (dispatch_params.total_pages_to_write > 0) { - dispatch_params.issue_wait = - dispatch_params.total_pages_written == 0; // only stall for the first write of the buffer - if (dispatch_params.issue_wait) { - data_offsetB *= 2; // commands prefixed with CQ_PREFETCH_CMD_RELAY_INLINE + CQ_DISPATCH_CMD_WAIT - } - - uint32_t space_availableB = std::min( - buf_dispatch_constants.issue_queue_cmd_limit - - sysmem_manager.get_issue_queue_write_ptr(dispatch_params.cq_id), - buf_dispatch_constants.max_prefetch_cmd_size); - int32_t num_pages_available = - (int32_t(space_availableB) - int32_t(data_offsetB)) / int32_t(dispatch_params.page_size_to_write); - - if (num_pages_available <= 0) { + // calculate issue wait + // dispatch_params.issue_wait = + // dispatch_params.total_pages_written == 0; // only stall for the first write of the buffer + dispatch_params.calculate_issue_wait(); + + // calculate num pages available in cq + // if (dispatch_params.issue_wait) { + // byte_offset_in_cq *= 2; // commands prefixed with CQ_PREFETCH_CMD_RELAY_INLINE + CQ_DISPATCH_CMD_WAIT + // } + update_byte_offset_in_cq(byte_offset_in_cq, dispatch_params.issue_wait); + + // uint32_t space_availableB = std::min( + // buf_dispatch_constants.issue_queue_cmd_limit - + // sysmem_manager.get_issue_queue_write_ptr(dispatch_params.cq_id), + // buf_dispatch_constants.max_prefetch_cmd_size); + // int32_t num_pages_available = + // (int32_t(space_availableB) - int32_t(data_offsetB)) / int32_t(dispatch_params.page_size_to_write); + + const int32_t num_pages_available_in_cq = + calculate_num_pages_available_in_cq(dispatch_params, buf_dispatch_constants, byte_offset_in_cq); + if (!dispatch_params.are_num_pages_available_in_cq_enough_for_transaction(num_pages_available_in_cq)) { + SystemMemoryManager& sysmem_manager = dispatch_params.device->sysmem_manager(); sysmem_manager.wrap_issue_queue_wr_ptr(dispatch_params.cq_id); continue; } - dispatch_params.pages_per_txn = std::min({(uint32_t)num_pages_available, dispatch_params.total_pages_to_write}); + // calculate num pages for current write transaction + // dispatch_params.pages_per_txn = std::min({(uint32_t)num_pages_available, + // dispatch_params.total_pages_to_write}); - if (dispatch_params.write_partial_pages) { - dispatch_params.pages_per_txn = std::min(dispatch_params.pages_per_txn, (uint32_t)1); - } + // if (dispatch_params.write_partial_pages) { + // dispatch_params.pages_per_txn = std::min(dispatch_params.pages_per_txn, num_banks); + // } + dispatch_params.calculate_num_pages_for_write_transaction(num_pages_available_in_cq); // Page offset in CQ_DISPATCH_CMD_WRITE_PAGED is uint16_t // To handle larger page offsets move bank base address up and update page offset to be relative to the new // bank address - if (dispatch_params.dst_page_index > 0xFFFF) { - TT_ASSERT(!dispatch_params.write_partial_pages); - uint32_t residual = dispatch_params.dst_page_index % num_banks; - uint32_t num_pages_written_per_bank = dispatch_params.dst_page_index / num_banks; - dispatch_params.address += num_pages_written_per_bank * dispatch_params.page_size_to_write; - dispatch_params.dst_page_index = residual; + // if page offset out of bounds, update_dispatch_params_for_out_bounds + if (dispatch_params.is_page_offset_out_of_bounds()) { + // TT_ASSERT(!dispatch_params.write_partial_pages); + // uint32_t residual = dispatch_params.dst_page_index % num_banks; + // uint32_t num_pages_written_per_bank = dispatch_params.dst_page_index / num_banks; + // dispatch_params.address += num_pages_written_per_bank * dispatch_params.page_size_to_write; + // dispatch_params.dst_page_index = residual; + dispatch_params.update_params_to_be_within_bounds(); } tt::log_debug(tt::LogDispatch, "EnqueueWriteBuffer for command queue {}", dispatch_params.cq_id); issue_buffer_dispatch_command_sequence(src, buffer, dispatch_params, sub_device_ids, dispatch_core_type); - dispatch_params.total_pages_written += dispatch_params.pages_per_txn; - - dispatch_params.total_pages_to_write -= dispatch_params.pages_per_txn; - if (dispatch_params.write_partial_pages) { - const uint32_t num_partial_pages_per_full_page = - buffer.aligned_page_size() / dispatch_params.page_size_to_write; - dispatch_params.address += (dispatch_params.pages_per_txn * dispatch_params.page_size_to_write); - const bool has_full_page_been_written = - dispatch_params.total_pages_written > 0 && - dispatch_params.total_pages_written % num_partial_pages_per_full_page == 0; - if (has_full_page_been_written) { - dispatch_params.dst_page_index += dispatch_params.pages_per_txn; - const bool will_next_page_be_round_robined = - (dispatch_params.dst_page_index / num_banks) != - ((dispatch_params.dst_page_index - dispatch_params.pages_per_txn) / num_banks); - if (will_next_page_be_round_robined) { - dispatch_params.dst_page_index = 0; - } else { - dispatch_params.address -= buffer.aligned_page_size(); - } - } - } else { - dispatch_params.dst_page_index += dispatch_params.pages_per_txn; - } + // update dispatch params after write transaction + dispatch_params.update_params_after_write_transaction(); + + // dispatch_params.total_pages_written += dispatch_params.pages_per_txn; + + // dispatch_params.total_pages_to_write -= dispatch_params.pages_per_txn; + // if (dispatch_params.write_large_pages()) { + // const uint32_t num_partial_pages_per_full_page = + // buffer.aligned_page_size() / dispatch_params.page_size_to_write; + // dispatch_params.address += dispatch_params.page_size_to_write; + // const bool have_full_pages_been_written = + // dispatch_params.total_pages_written > 0 && + // dispatch_params.total_pages_written % num_partial_pages_per_full_page == 0; + // if (have_full_pages_been_written) { + // dispatch_params.dst_page_index += dispatch_params.pages_per_txn; + // const bool will_next_page_be_round_robined = + // (dispatch_params.dst_page_index / num_banks) != + // ((dispatch_params.dst_page_index - dispatch_params.pages_per_txn) / num_banks); + // if (will_next_page_be_round_robined) { + // dispatch_params.dst_page_index = 0; + // } else { + // dispatch_params.address -= buffer.aligned_page_size(); + // } + // } + // } else { + // dispatch_params.dst_page_index += dispatch_params.pages_per_txn; + // } } } @@ -580,10 +735,11 @@ void write_to_device_buffer( dispatch_core_type); } } else { - InterleavedBufferWriteDispatchParams dispatch_params = initialize_interleaved_buf_dispatch_params( - buffer, buf_dispatch_constants, cq_id, expected_num_workers_completed, region); + std::unique_ptr dispatch_params = + initialize_interleaved_buf_dispatch_params( + buffer, buf_dispatch_constants, cq_id, expected_num_workers_completed, region); write_interleaved_buffer_to_device( - src, dispatch_params, buffer, buf_dispatch_constants, sub_device_ids, dispatch_core_type); + src, *dispatch_params, buffer, buf_dispatch_constants, sub_device_ids, dispatch_core_type); } } From c7633b27cdd3b249d4ee24a4e2203f728e60ad17 Mon Sep 17 00:00:00 2001 From: Samarth Agarwal Date: Thu, 6 Feb 2025 20:02:18 +0000 Subject: [PATCH 07/24] Cleanup --- ...queueWriteBuffer_and_EnqueueReadBuffer.cpp | 28 ++++-- tt_metal/impl/buffers/dispatch.cpp | 95 ++++--------------- 2 files changed, 38 insertions(+), 85 deletions(-) diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp index 0dc1ef1fb87..7ea342a4dd0 100644 --- a/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp @@ -253,12 +253,6 @@ void test_EnqueueWriteBuffer_and_EnqueueReadBuffer(IDevice* device, CommandQueue detail::ReadFromBuffer(*bufa, result); } - for (uint32_t i = 0; i < result.size(); i++) { - if (result[i] != i) { - std::cout << "i: " + std::to_string(i) + " result[i]: " + std::to_string(result[i]) << std::endl; - } - } - EXPECT_EQ(src, result); } } @@ -555,7 +549,7 @@ TEST_F(CommandQueueSingleCardBufferFixture, TestMultiplePagesLargerThanMaxPrefet const uint32_t page_size = max_prefetch_command_size + 2048; const uint32_t buffer_size = 40 * page_size; const uint32_t region_size = 5 * page_size; - const uint32_t region_offset = 3 * page_size; + const uint32_t region_offset = 30 * page_size; const BufferRegion region(region_offset, region_size); auto buffer = Buffer::create(device, buffer_size, page_size, BufferType::DRAM); @@ -1156,6 +1150,26 @@ TEST_F(CommandQueueSingleCardBufferFixture, TestMultipleNonOverlappingWritesShar } } +TEST_F(CommandQueueSingleCardBufferFixture, TestMultiplePagesLargerThanMaxPrefetchCommandSizeForL1) { + for (IDevice* device : devices_) { + CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(device->id()); + const uint32_t max_prefetch_command_size = DispatchMemMap::get(dispatch_core_type).max_prefetch_command_size(); + TestBufferConfig config = { + .num_pages = 30, .page_size = max_prefetch_command_size + 2048, .buftype = BufferType::L1}; + local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(device, device->command_queue(), config); + } +} + +TEST_F(CommandQueueSingleCardBufferFixture, TestMultipleUnalignedPagesLargerThanMaxPrefetchCommandSizeForL1) { + for (IDevice* device : devices_) { + CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(device->id()); + const uint32_t max_prefetch_command_size = DispatchMemMap::get(dispatch_core_type).max_prefetch_command_size(); + TestBufferConfig config = { + .num_pages = 30, .page_size = max_prefetch_command_size + 4, .buftype = BufferType::L1}; + local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(device, device->command_queue(), config); + } +} + TEST_F(CommandQueueSingleCardBufferFixture, TestMultipleNonOverlappingReadsShardedSubBufferForL1) { const uint32_t page_size = 64; const uint32_t buffer_size = 16 * page_size; diff --git a/tt_metal/impl/buffers/dispatch.cpp b/tt_metal/impl/buffers/dispatch.cpp index abeaec95618..9181328feca 100644 --- a/tt_metal/impl/buffers/dispatch.cpp +++ b/tt_metal/impl/buffers/dispatch.cpp @@ -87,16 +87,12 @@ struct InterleavedBufferWriteDispatchParams : BufferWriteDispatchParams { this->total_pages_to_write -= this->pages_per_txn; this->total_pages_written += this->pages_per_txn; this->dst_page_index += this->pages_per_txn; - this->address += this->page_size_to_write; } virtual bool write_large_pages() const { return false; } - virtual bool are_num_pages_available_in_cq_enough_for_transaction(uint32_t num_pages) const { - return num_pages > 0; - } - virtual uint32_t num_full_pages_written() const { return this->total_pages_written; } + virtual uint32_t num_partial_pages_per_full_page() const { return 1; } }; @@ -118,7 +114,9 @@ struct InterleavedBufferWriteLargePageDispatchParams : InterleavedBufferWriteDis } void calculate_num_pages_for_write_transaction(uint32_t num_pages_available_in_cq) override { - this->pages_per_txn = std::min({this->full_pages_to_write, this->num_banks, num_pages_available_in_cq}); + TT_ASSERT(this->num_banks > this->dst_page_index); + this->pages_per_txn = + std::min({this->full_pages_to_write, this->num_banks - this->dst_page_index, num_pages_available_in_cq}); } bool is_page_offset_out_of_bounds() const override { return this->dst_page_index >= this->num_banks; } @@ -136,6 +134,9 @@ struct InterleavedBufferWriteLargePageDispatchParams : InterleavedBufferWriteDis if (this->were_full_pages_written_in_last_write_transaction()) { this->full_pages_to_write -= this->pages_per_txn; this->full_pages_written += this->pages_per_txn; + if (!this->will_next_full_page_be_round_robined()) { + this->address -= this->full_page_size; + } this->dst_page_index += this->pages_per_txn; this->dst_page_index %= this->num_banks; } @@ -143,10 +144,6 @@ struct InterleavedBufferWriteLargePageDispatchParams : InterleavedBufferWriteDis bool write_large_pages() const override { return true; } - bool are_num_pages_available_in_cq_enough_for_transaction(uint32_t num_pages) const override { - return num_pages >= std::min(this->num_banks, this->full_pages_to_write); - } - uint32_t num_full_pages_written() const override { return this->full_pages_written; } uint32_t num_partial_pages_per_full_page() const override { return this->num_partial_pages_in_single_full_page; } @@ -161,6 +158,11 @@ struct InterleavedBufferWriteLargePageDispatchParams : InterleavedBufferWriteDis const uint32_t page_size = this->address - this->buffer.address(); return page_size > 0 && page_size % this->full_page_size == 0; } + + bool will_next_full_page_be_round_robined() const { + const uint32_t dst_page_index_next_txn = this->dst_page_index + this->pages_per_txn; + return dst_page_index_next_txn != (dst_page_index_next_txn % this->num_banks); + } }; // Parameters specific to sharded buffers @@ -296,10 +298,6 @@ void populate_interleaved_buffer_write_dispatch_cmds( // TODO: Consolidate if (dispatch_params.write_large_pages()) { - // const uint32_t num_partial_pages_per_full_page = - // buffer.aligned_page_size() / dispatch_params.page_size_to_write; - // const uint32_t num_full_pages_written = dispatch_params.total_pages_written / - // num_partial_pages_per_full_page; const uint32_t num_full_pages_written = dispatch_params.num_full_pages_written(); const uint32_t num_partial_pages_written = dispatch_params.total_pages_written; const uint32_t num_partial_pages_per_full_page = dispatch_params.num_partial_pages_per_full_page(); @@ -321,7 +319,6 @@ void populate_interleaved_buffer_write_dispatch_cmds( } command_sequence.add_data( (char*)src + src_address_offset, page_size_to_copy, dispatch_params.page_size_to_write); - // src_address_offset += page_size_to_copy; num_partial_pages_written_curr_txn += 1; } } else { @@ -448,84 +445,26 @@ void write_interleaved_buffer_to_device( // + CQ_DISPATCH_CMD_WRITE_PAGED const uint32_t num_banks = buffer.device()->allocator()->get_num_banks(buffer.buffer_type()); while (dispatch_params.total_pages_to_write > 0) { - // calculate issue wait - // dispatch_params.issue_wait = - // dispatch_params.total_pages_written == 0; // only stall for the first write of the buffer dispatch_params.calculate_issue_wait(); - - // calculate num pages available in cq - // if (dispatch_params.issue_wait) { - // byte_offset_in_cq *= 2; // commands prefixed with CQ_PREFETCH_CMD_RELAY_INLINE + CQ_DISPATCH_CMD_WAIT - // } update_byte_offset_in_cq(byte_offset_in_cq, dispatch_params.issue_wait); - // uint32_t space_availableB = std::min( - // buf_dispatch_constants.issue_queue_cmd_limit - - // sysmem_manager.get_issue_queue_write_ptr(dispatch_params.cq_id), - // buf_dispatch_constants.max_prefetch_cmd_size); - // int32_t num_pages_available = - // (int32_t(space_availableB) - int32_t(data_offsetB)) / int32_t(dispatch_params.page_size_to_write); + if (dispatch_params.is_page_offset_out_of_bounds()) { + dispatch_params.update_params_to_be_within_bounds(); + } const int32_t num_pages_available_in_cq = calculate_num_pages_available_in_cq(dispatch_params, buf_dispatch_constants, byte_offset_in_cq); - if (!dispatch_params.are_num_pages_available_in_cq_enough_for_transaction(num_pages_available_in_cq)) { + if (num_pages_available_in_cq <= 0) { SystemMemoryManager& sysmem_manager = dispatch_params.device->sysmem_manager(); sysmem_manager.wrap_issue_queue_wr_ptr(dispatch_params.cq_id); continue; } - // calculate num pages for current write transaction - // dispatch_params.pages_per_txn = std::min({(uint32_t)num_pages_available, - // dispatch_params.total_pages_to_write}); - - // if (dispatch_params.write_partial_pages) { - // dispatch_params.pages_per_txn = std::min(dispatch_params.pages_per_txn, num_banks); - // } - dispatch_params.calculate_num_pages_for_write_transaction(num_pages_available_in_cq); - - // Page offset in CQ_DISPATCH_CMD_WRITE_PAGED is uint16_t - // To handle larger page offsets move bank base address up and update page offset to be relative to the new - // bank address - // if page offset out of bounds, update_dispatch_params_for_out_bounds - if (dispatch_params.is_page_offset_out_of_bounds()) { - // TT_ASSERT(!dispatch_params.write_partial_pages); - // uint32_t residual = dispatch_params.dst_page_index % num_banks; - // uint32_t num_pages_written_per_bank = dispatch_params.dst_page_index / num_banks; - // dispatch_params.address += num_pages_written_per_bank * dispatch_params.page_size_to_write; - // dispatch_params.dst_page_index = residual; - dispatch_params.update_params_to_be_within_bounds(); - } - tt::log_debug(tt::LogDispatch, "EnqueueWriteBuffer for command queue {}", dispatch_params.cq_id); + dispatch_params.calculate_num_pages_for_write_transaction(num_pages_available_in_cq); issue_buffer_dispatch_command_sequence(src, buffer, dispatch_params, sub_device_ids, dispatch_core_type); - // update dispatch params after write transaction dispatch_params.update_params_after_write_transaction(); - - // dispatch_params.total_pages_written += dispatch_params.pages_per_txn; - - // dispatch_params.total_pages_to_write -= dispatch_params.pages_per_txn; - // if (dispatch_params.write_large_pages()) { - // const uint32_t num_partial_pages_per_full_page = - // buffer.aligned_page_size() / dispatch_params.page_size_to_write; - // dispatch_params.address += dispatch_params.page_size_to_write; - // const bool have_full_pages_been_written = - // dispatch_params.total_pages_written > 0 && - // dispatch_params.total_pages_written % num_partial_pages_per_full_page == 0; - // if (have_full_pages_been_written) { - // dispatch_params.dst_page_index += dispatch_params.pages_per_txn; - // const bool will_next_page_be_round_robined = - // (dispatch_params.dst_page_index / num_banks) != - // ((dispatch_params.dst_page_index - dispatch_params.pages_per_txn) / num_banks); - // if (will_next_page_be_round_robined) { - // dispatch_params.dst_page_index = 0; - // } else { - // dispatch_params.address -= buffer.aligned_page_size(); - // } - // } - // } else { - // dispatch_params.dst_page_index += dispatch_params.pages_per_txn; - // } } } From a006661039208bde80b9a1fa16858281fc13a885 Mon Sep 17 00:00:00 2001 From: Samarth Agarwal Date: Thu, 6 Feb 2025 20:08:02 +0000 Subject: [PATCH 08/24] Cleanup --- .../test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp | 1 - tt_metal/impl/buffers/dispatch.cpp | 1 - 2 files changed, 2 deletions(-) diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp index 7ea342a4dd0..e170fa3d523 100644 --- a/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp @@ -3,7 +3,6 @@ // SPDX-License-Identifier: Apache-2.0 #include -#include #include #include "buffer.hpp" diff --git a/tt_metal/impl/buffers/dispatch.cpp b/tt_metal/impl/buffers/dispatch.cpp index 9181328feca..f9281a3f1ae 100644 --- a/tt_metal/impl/buffers/dispatch.cpp +++ b/tt_metal/impl/buffers/dispatch.cpp @@ -443,7 +443,6 @@ void write_interleaved_buffer_to_device( uint32_t byte_offset_in_cq = hal.get_alignment(HalMemType::HOST); // data appended after CQ_PREFETCH_CMD_RELAY_INLINE // + CQ_DISPATCH_CMD_WRITE_PAGED - const uint32_t num_banks = buffer.device()->allocator()->get_num_banks(buffer.buffer_type()); while (dispatch_params.total_pages_to_write > 0) { dispatch_params.calculate_issue_wait(); update_byte_offset_in_cq(byte_offset_in_cq, dispatch_params.issue_wait); From 9e3dc10184e348b5fb2db46005d02f928a85cfcd Mon Sep 17 00:00:00 2001 From: Samarth Agarwal Date: Thu, 13 Feb 2025 19:34:17 +0000 Subject: [PATCH 09/24] Saving work --- ...queueWriteBuffer_and_EnqueueReadBuffer.cpp | 22 ++ .../api/tt-metalium/dispatch_settings.hpp | 9 +- tt_metal/distributed/mesh_command_queue.cpp | 6 +- tt_metal/impl/buffers/dispatch.cpp | 195 +++++++++++++----- tt_metal/impl/buffers/dispatch.hpp | 112 ++++++---- .../impl/dispatch/hardware_command_queue.cpp | 6 +- tt_metal/tt_metal.cpp | 124 ++++++++++- 7 files changed, 373 insertions(+), 101 deletions(-) diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp index e170fa3d523..de0c0f72018 100644 --- a/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp @@ -252,6 +252,18 @@ void test_EnqueueWriteBuffer_and_EnqueueReadBuffer(IDevice* device, CommandQueue detail::ReadFromBuffer(*bufa, result); } + std::cout << "write: " << cq_write << " read: " << cq_read << std::endl; + if (result.size() != src.size()) { + std::cout << "Unequal size" << std::endl; + } + for (uint32_t i = 0; i < result.size(); i++) { + if (i != result[i]) { + std::cout << "i: " << std::to_string(i) << " result[i]: " << std::to_string(result[i]) << std::endl; + // std::cout << "Fail" << std::endl; + break; + } + } + EXPECT_EQ(src, result); } } @@ -1159,6 +1171,16 @@ TEST_F(CommandQueueSingleCardBufferFixture, TestMultiplePagesLargerThanMaxPrefet } } +TEST_F(CommandQueueSingleCardBufferFixture, TestSingleUnalignedPageLargerThanMaxPrefetchCommandSizeForL1) { + for (IDevice* device : devices_) { + CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(device->id()); + const uint32_t max_prefetch_command_size = DispatchMemMap::get(dispatch_core_type).max_prefetch_command_size(); + TestBufferConfig config = { + .num_pages = 1, .page_size = max_prefetch_command_size + 4, .buftype = BufferType::L1}; + local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(device, device->command_queue(), config); + } +} + TEST_F(CommandQueueSingleCardBufferFixture, TestMultipleUnalignedPagesLargerThanMaxPrefetchCommandSizeForL1) { for (IDevice* device : devices_) { CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(device->id()); diff --git a/tt_metal/api/tt-metalium/dispatch_settings.hpp b/tt_metal/api/tt-metalium/dispatch_settings.hpp index 499116ed78b..56be7bb7cde 100644 --- a/tt_metal/api/tt-metalium/dispatch_settings.hpp +++ b/tt_metal/api/tt-metalium/dispatch_settings.hpp @@ -134,10 +134,11 @@ class DispatchSettings { static constexpr uint32_t EVENT_PADDED_SIZE = 16; - // When page size of buffer to write/read exceeds MAX_PREFETCH_COMMAND_SIZE, the PCIe aligned page size is broken - // down into equal sized partial pages BASE_PARTIAL_PAGE_SIZE denotes the initial partial page size to use, it is - // incremented by PCIe alignment until page size can be evenly split - static constexpr uint32_t BASE_PARTIAL_PAGE_SIZE = 4096; + // When page size of buffer to write/read exceeds the max prefetch command size, the PCIe-aligned page size is + // broken down into equal sized partial pages. UNPADDED_PARTIAL_PAGE_SIZE denotes the unpadded partial page size to + // use. The size of the padded partial page is the smallest value >= UNPADDED_PARTIAL_PAGE_SIZE that is + // PCIE-aligned. + static constexpr uint32_t UNPADDED_PARTIAL_PAGE_SIZE = 3072; static_assert( DISPATCH_MESSAGE_ENTRIES <= diff --git a/tt_metal/distributed/mesh_command_queue.cpp b/tt_metal/distributed/mesh_command_queue.cpp index 0902cd8f69a..8435cf446d6 100644 --- a/tt_metal/distributed/mesh_command_queue.cpp +++ b/tt_metal/distributed/mesh_command_queue.cpp @@ -269,12 +269,12 @@ void MeshCommandQueue::read_shard_from_device( auto dispatch_params = buffer_dispatch::initialize_interleaved_buf_read_dispatch_params( *shard_view, id_, expected_num_workers_completed_, region); buffer_dispatch::copy_interleaved_buffer_to_completion_queue( - dispatch_params, *shard_view, sub_device_ids, this->dispatch_core_type()); - if (dispatch_params.pages_per_txn > 0) { + *dispatch_params, *shard_view, sub_device_ids, this->dispatch_core_type()); + if (dispatch_params->pages_per_txn > 0) { num_txns_per_device[device]++; auto& read_descriptor_queue = this->get_read_descriptor_queue(device); read_descriptor_queue.push( - buffer_dispatch::generate_interleaved_buffer_read_descriptor(dst, dispatch_params, *shard_view)); + buffer_dispatch::generate_interleaved_buffer_read_descriptor(dst, dispatch_params.get(), *shard_view)); } } } diff --git a/tt_metal/impl/buffers/dispatch.cpp b/tt_metal/impl/buffers/dispatch.cpp index f9281a3f1ae..eca7a7e63af 100644 --- a/tt_metal/impl/buffers/dispatch.cpp +++ b/tt_metal/impl/buffers/dispatch.cpp @@ -5,6 +5,7 @@ #include #include #include "assert.hpp" +#include "math.hpp" #include "dispatch.hpp" #include #include @@ -42,6 +43,7 @@ struct BufferWriteDispatchParams { // Parameters specific to interleaved buffers struct InterleavedBufferWriteDispatchParams : BufferWriteDispatchParams { uint32_t num_banks = 0; + uint32_t data_size_per_page_size_to_write = 0; const Buffer& buffer; InterleavedBufferWriteDispatchParams( @@ -55,6 +57,7 @@ struct InterleavedBufferWriteDispatchParams : BufferWriteDispatchParams { this->address = buffer.address(); this->dst_page_index = dst_page_index; this->page_size_to_write = buffer.aligned_page_size(); + this->data_size_per_page_size_to_write = buffer.page_size(); this->total_pages_to_write = total_pages_to_write; this->device = buffer.device(); this->cq_id = cq_id; @@ -94,23 +97,28 @@ struct InterleavedBufferWriteDispatchParams : BufferWriteDispatchParams { virtual uint32_t num_full_pages_written() const { return this->total_pages_written; } virtual uint32_t num_partial_pages_per_full_page() const { return 1; } + + virtual uint32_t get_additional_padding_for_last_partial_page() const { return 0; } }; struct InterleavedBufferWriteLargePageDispatchParams : InterleavedBufferWriteDispatchParams { InterleavedBufferWriteLargePageDispatchParams( const Buffer& buffer, uint32_t dst_page_index, - uint32_t page_size_to_write, + const PartialPageSpec& partial_page_spec, uint32_t total_pages_to_write, + uint32_t full_page_size, uint32_t num_full_pages, uint32_t cq_id, tt::stl::Span expected_num_workers_completed) : InterleavedBufferWriteDispatchParams( buffer, dst_page_index, total_pages_to_write, cq_id, expected_num_workers_completed) { - this->page_size_to_write = page_size_to_write; + this->page_size_to_write = partial_page_spec.padded_partial_page_size; + this->data_size_per_page_size_to_write = partial_page_spec.unpadded_partial_page_size; this->full_pages_to_write = num_full_pages; - this->full_page_size = buffer.aligned_page_size(); - this->num_partial_pages_in_single_full_page = full_page_size / page_size_to_write; + this->full_page_size = full_page_size; + this->num_partial_pages_in_single_full_page = partial_page_spec.num_partial_pages_per_full_page; + this->last_partial_page_additional_padding = partial_page_spec.last_partial_page_additional_padding; } void calculate_num_pages_for_write_transaction(uint32_t num_pages_available_in_cq) override { @@ -148,8 +156,13 @@ struct InterleavedBufferWriteLargePageDispatchParams : InterleavedBufferWriteDis uint32_t num_partial_pages_per_full_page() const override { return this->num_partial_pages_in_single_full_page; } + uint32_t get_additional_padding_for_last_partial_page() const override { + return this->last_partial_page_additional_padding; + } + private: uint32_t num_partial_pages_in_single_full_page = 0; + uint32_t last_partial_page_additional_padding = 0; uint32_t full_page_size = 0; uint32_t full_pages_written = 0; uint32_t full_pages_to_write = 0; @@ -194,6 +207,17 @@ int32_t calculate_num_pages_available_in_cq( return num_pages_available; } +uint32_t calculate_max_data_size(const CoreType& dispatch_core_type) { + return DispatchMemMap::get(dispatch_core_type).max_prefetch_command_size() - + (hal.get_alignment(HalMemType::HOST) * 2); // * 2 to account for issue +} + +bool are_pages_large(const Buffer& buffer) { + const CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(buffer.device()->id()); + const uint32_t max_data_size = calculate_max_data_size(dispatch_core_type); + return buffer.aligned_page_size() > max_data_size; +} + // Generate dispatch constants BufferDispatchConstants generate_buffer_dispatch_constants( const SystemMemoryManager& sysmem_manager, CoreType dispatch_core_type, uint32_t cq_id) { @@ -201,8 +225,7 @@ BufferDispatchConstants generate_buffer_dispatch_constants( buf_dispatch_constants.issue_queue_cmd_limit = sysmem_manager.get_issue_queue_limit(cq_id); buf_dispatch_constants.max_prefetch_cmd_size = DispatchMemMap::get(dispatch_core_type).max_prefetch_command_size(); - buf_dispatch_constants.max_data_sizeB = buf_dispatch_constants.max_prefetch_cmd_size - - (hal.get_alignment(HalMemType::HOST) * 2); // * 2 to account for issue + buf_dispatch_constants.max_data_sizeB = calculate_max_data_size(dispatch_core_type); return buf_dispatch_constants; } @@ -236,8 +259,30 @@ ShardedBufferWriteDispatchParams initialize_sharded_buf_dispatch_params( return dispatch_params; } +PartialPageSpec calculate_partial_page_spec(const Buffer& buffer) { + PartialPageSpec partial_page_spec; + partial_page_spec.unpadded_partial_page_size = DispatchSettings::UNPADDED_PARTIAL_PAGE_SIZE; + // while (buffer.aligned_page_size() % partial_page.unpadded_partial_page_size != 0) { + // partial_page.unpadded_partial_page_size += 1; + // } + partial_page_spec.padded_partial_page_size = partial_page_spec.unpadded_partial_page_size; + const uint32_t pcie_alignment = hal.get_alignment(HalMemType::HOST); + const uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); + while (partial_page_spec.padded_partial_page_size % pcie_alignment != 0 || + partial_page_spec.padded_partial_page_size % l1_alignment != 0) { + partial_page_spec.padded_partial_page_size += 1; + } + partial_page_spec.num_partial_pages_per_full_page = + tt::div_up(buffer.aligned_page_size(), partial_page_spec.unpadded_partial_page_size); + // partial_page.last_partial_page_additional_padding = buffer.aligned_page_size() - buffer.page_size(); + partial_page_spec.last_partial_page_additional_padding = + (partial_page_spec.num_partial_pages_per_full_page * partial_page_spec.unpadded_partial_page_size) - + buffer.page_size(); + return partial_page_spec; +} + std::unique_ptr initialize_interleaved_buf_dispatch_params( - Buffer& buffer, + const Buffer& buffer, const BufferDispatchConstants& buf_dispatch_constants, uint32_t cq_id, tt::stl::Span expected_num_workers_completed, @@ -249,20 +294,32 @@ std::unique_ptr initialize_interleaved_buf const bool write_large_pages = buffer.aligned_page_size() > buf_dispatch_constants.max_data_sizeB; if (write_large_pages) { - uint32_t partial_page_size = DispatchSettings::BASE_PARTIAL_PAGE_SIZE; - const uint32_t pcie_alignment = hal.get_alignment(HalMemType::HOST); - while (buffer.aligned_page_size() % partial_page_size != 0) { - partial_page_size += pcie_alignment; - } - const uint32_t page_size_to_write = partial_page_size; - const uint32_t padded_buffer_size = total_pages_to_write * buffer.aligned_page_size(); + const PartialPageSpec partial_page_spec = calculate_partial_page_spec(buffer); + // uint32_t partial_page_size = DispatchSettings::BASE_PARTIAL_PAGE_SIZE; + // while (buffer.aligned_page_size() % partial_page_size != 0) { + // partial_page_size += 1; + // } + // const uint32_t data_size_per_partial_page = partial_page_size; + // const uint32_t num_partial_pages_per_full_page = buffer.aligned_page_size() / partial_page_size; + // const uint32_t pcie_alignment = hal.get_alignment(HalMemType::HOST); + // const uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); + // while (partial_page_size % pcie_alignment != 0 || partial_page_size % l1_alignment != 0) { + // partial_page_size += 1; + // } + // const uint32_t page_size_to_write = partial_page.padded_partial_page_size; + // const uint32_t num_partial_pages_per_full_page = buffer.aligned_page_size() / + // partial_page_spec.unpadded_partial_page_size; + const uint32_t full_page_size = + partial_page_spec.num_partial_pages_per_full_page * partial_page_spec.padded_partial_page_size; const uint32_t num_full_pages = total_pages_to_write; - total_pages_to_write = padded_buffer_size / page_size_to_write; + const uint32_t padded_buffer_size = total_pages_to_write * full_page_size; + total_pages_to_write = tt::div_up(padded_buffer_size, partial_page_spec.padded_partial_page_size); dispatch_params = std::make_unique( buffer, dst_page_index, - page_size_to_write, + partial_page_spec, total_pages_to_write, + full_page_size, num_full_pages, cq_id, expected_num_workers_completed); @@ -308,13 +365,13 @@ void populate_interleaved_buffer_write_dispatch_cmds( uint32_t num_partial_pages_written_curr_txn = 0; for (uint32_t sysmem_address_offset = 0; sysmem_address_offset < data_size_bytes; sysmem_address_offset += dispatch_params.page_size_to_write) { - uint32_t page_size_to_copy = dispatch_params.page_size_to_write; + uint32_t page_size_to_copy = dispatch_params.data_size_per_page_size_to_write; uint32_t src_address_offset = num_full_pages_written * buffer.page_size() + num_partial_pages_written_per_current_full_page * page_size_to_copy + num_partial_pages_written_curr_txn * buffer.page_size(); if (num_partial_pages_written_per_current_full_page == num_partial_pages_per_full_page - 1) { // last partial page being copied from unpadded src buffer - const uint32_t padding = buffer.aligned_page_size() - buffer.page_size(); + const uint32_t padding = dispatch_params.get_additional_padding_for_last_partial_page(); page_size_to_copy -= padding; } command_sequence.add_data( @@ -328,8 +385,10 @@ void populate_interleaved_buffer_write_dispatch_cmds( for (uint32_t sysmem_address_offset = 0; sysmem_address_offset < data_size_bytes; sysmem_address_offset += dispatch_params.page_size_to_write) { command_sequence.add_data( - (char*)src + src_address_offset, buffer.page_size(), dispatch_params.page_size_to_write); - src_address_offset += buffer.page_size(); + (char*)src + src_address_offset, + dispatch_params.data_size_per_page_size_to_write, + dispatch_params.page_size_to_write); + src_address_offset += dispatch_params.data_size_per_page_size_to_write; } } else { command_sequence.add_data((char*)src + src_address_offset, data_size_bytes, data_size_bytes); @@ -711,21 +770,34 @@ ShardedBufferReadDispatchParams initialize_sharded_buf_read_dispatch_params( return dispatch_params; } -BufferReadDispatchParams initialize_interleaved_buf_read_dispatch_params( +std::unique_ptr initialize_interleaved_buf_read_dispatch_params( Buffer& buffer, uint32_t cq_id, tt::stl::Span expected_num_workers_completed, const BufferRegion& region) { validate_buffer_region_conditions(buffer, region); - BufferReadDispatchParams dispatch_params; - dispatch_params.pages_per_txn = region.size / buffer.page_size(); - dispatch_params.src_page_index = region.offset / buffer.page_size(); - dispatch_params.cq_id = cq_id; - dispatch_params.device = buffer.device(); - dispatch_params.padded_page_size = buffer.aligned_page_size(); - dispatch_params.unpadded_dst_offset = 0; - dispatch_params.expected_num_workers_completed = expected_num_workers_completed; + std::unique_ptr dispatch_params; + const bool read_large_pages = are_pages_large(buffer); + if (read_large_pages) { + dispatch_params = std::make_unique(); + BufferReadLargePageDispatchParams* large_page_dispatch_params = + dynamic_cast(dispatch_params.get()); + const PartialPageSpec partial_page = calculate_partial_page_spec(buffer); + large_page_dispatch_params->partial_page_spec = partial_page; + large_page_dispatch_params->padded_page_size = + partial_page.num_partial_pages_per_full_page * partial_page.padded_partial_page_size; + } else { + dispatch_params = std::make_unique(); + dispatch_params->padded_page_size = buffer.aligned_page_size(); + } + dispatch_params->pages_per_txn = region.size / buffer.page_size(); + dispatch_params->src_page_index = region.offset / buffer.page_size(); + dispatch_params->cq_id = cq_id; + dispatch_params->device = buffer.device(); + dispatch_params->address = buffer.address(); + dispatch_params->unpadded_dst_offset = 0; + dispatch_params->expected_num_workers_completed = expected_num_workers_completed; return dispatch_params; } @@ -866,18 +938,12 @@ void copy_interleaved_buffer_to_completion_queue( tt::stl::Span sub_device_ids, CoreType dispatch_core_type) { if (dispatch_params.pages_per_txn > 0) { - uint32_t bank_base_address = buffer.address(); - // Only 8 bits are assigned for the page offset in CQPrefetchRelayPagedCmd // To handle larger page offsets move bank base address up and update page offset to be relative to the new // bank address if (dispatch_params.src_page_index > CQ_PREFETCH_RELAY_PAGED_START_PAGE_MASK) { - const uint32_t num_banks = dispatch_params.device->allocator()->get_num_banks(buffer.buffer_type()); - const uint32_t num_pages_per_bank = dispatch_params.src_page_index / num_banks; - bank_base_address += num_pages_per_bank * buffer.aligned_page_size(); - dispatch_params.src_page_index = dispatch_params.src_page_index % num_banks; + dispatch_params.update_params_to_be_within_bounds(buffer); } - dispatch_params.address = bank_base_address; issue_read_buffer_dispatch_command_sequence(buffer, dispatch_params, sub_device_ids, dispatch_core_type); } } @@ -903,16 +969,23 @@ std::shared_ptr generate_sharded_buffer_r } std::shared_ptr generate_interleaved_buffer_read_descriptor( - void* dst, BufferReadDispatchParams& dispatch_params, Buffer& buffer) { + void* dst, BufferReadDispatchParams* dispatch_params, Buffer& buffer) { + BufferReadLargePageDispatchParams* large_page_dispatch_params = + dynamic_cast(dispatch_params); + PartialPageSpec* partial_page_spec = + large_page_dispatch_params ? &(large_page_dispatch_params->partial_page_spec) : nullptr; return std::make_shared( std::in_place_type, buffer.buffer_layout(), buffer.page_size(), - dispatch_params.padded_page_size, + dispatch_params->padded_page_size, dst, - dispatch_params.unpadded_dst_offset, - dispatch_params.pages_per_txn, - dispatch_params.src_page_index); + dispatch_params->unpadded_dst_offset, + dispatch_params->pages_per_txn, + dispatch_params->src_page_index, + 0, + nullptr, + partial_page_spec); } void copy_completion_queue_data_into_user_space( @@ -922,11 +995,12 @@ void copy_completion_queue_data_into_user_space( uint32_t cq_id, SystemMemoryManager& sysmem_manager, volatile bool& exit_condition) { - const auto& [buffer_layout, page_size, padded_page_size, buffer_page_mapping, dst, dst_offset, num_pages_read, cur_dev_page_id, starting_host_page_id] = + const auto& [buffer_layout, page_size, padded_page_size, buffer_page_mapping, dst, dst_offset, num_pages_read, cur_dev_page_id, starting_host_page_id, partial_page_spec] = read_buffer_descriptor; - uint32_t padded_num_bytes = (num_pages_read * padded_page_size) + sizeof(CQDispatchCmd); + const uint32_t padded_num_bytes = (num_pages_read * padded_page_size) + sizeof(CQDispatchCmd); uint32_t contig_dst_offset = dst_offset; uint32_t remaining_bytes_to_read = padded_num_bytes; + uint32_t total_num_bytes_read = 0; uint32_t dev_page_id = cur_dev_page_id; // track the amount of bytes read in the last non-aligned page @@ -981,9 +1055,15 @@ void copy_completion_queue_data_into_user_space( offset_in_completion_q_data = 0; uint32_t dst_offset_bytes = 0; + const uint32_t page_size_to_read = + partial_page_spec ? partial_page_spec->unpadded_partial_page_size : page_size; + const uint32_t padded_page_size_to_read = + partial_page_spec ? partial_page_spec->padded_partial_page_size : padded_page_size; + pad_size_bytes = partial_page_spec ? padded_page_size_to_read - page_size_to_read : pad_size_bytes; + while (src_offset_bytes < bytes_xfered) { - uint32_t src_offset_increment = padded_page_size; - uint32_t num_bytes_to_copy; + uint32_t src_offset_increment = padded_page_size_to_read; + uint32_t num_bytes_to_copy = 0; if (remaining_bytes_of_nonaligned_page > 0) { // Case 1: Portion of the page was copied into user buffer on the previous completion queue pop. uint32_t num_bytes_remaining = bytes_xfered - src_offset_bytes; @@ -1001,18 +1081,29 @@ void copy_completion_queue_data_into_user_space( offset_in_completion_q_data = pad_size_bytes - rem_bytes_in_cq; } } - } else if (src_offset_bytes + padded_page_size >= bytes_xfered) { + } else if (src_offset_bytes + padded_page_size_to_read >= bytes_xfered) { // Case 2: Last page of data that was popped off the completion queue // Don't need to compute src_offset_increment since this is end of loop uint32_t num_bytes_remaining = bytes_xfered - src_offset_bytes; - num_bytes_to_copy = std::min(num_bytes_remaining, page_size); - remaining_bytes_of_nonaligned_page = page_size - num_bytes_to_copy; + num_bytes_to_copy = std::min(num_bytes_remaining, page_size_to_read); + remaining_bytes_of_nonaligned_page = page_size_to_read - num_bytes_to_copy; // We've copied needed data, start of next read is offset due to remaining pad bytes if (remaining_bytes_of_nonaligned_page == 0) { - offset_in_completion_q_data = padded_page_size - num_bytes_remaining; + offset_in_completion_q_data = padded_page_size_to_read - num_bytes_remaining; } } else { - num_bytes_to_copy = page_size; + num_bytes_to_copy = page_size_to_read; + } + + // if (partial_page_spec && (total_num_bytes_read % page_size) + num_bytes_to_copy == + // (padded_page_size / padded_page_size_to_read) * page_size_to_read) { + // if (partial_page_spec && partial_page_spec->num_partial_pages_per_full_page - 1 == + // ((total_num_bytes_read % page_size) + num_bytes_to_copy) / + // partial_page_spec->unpadded_partial_page_size) { + if (partial_page_spec && (total_num_bytes_read % page_size) + num_bytes_to_copy > page_size) { + // uint32_t extra_bytes = (total_num_bytes_read % page_size) + num_bytes_to_copy - page_size; + uint32_t extra_bytes = partial_page_spec->last_partial_page_additional_padding; + num_bytes_to_copy -= extra_bytes; } tt::Cluster::instance().read_sysmem( @@ -1022,6 +1113,10 @@ void copy_completion_queue_data_into_user_space( mmio_device_id, channel); + total_num_bytes_read += num_bytes_to_copy; + // if (total_num_bytes_read == page_size) { + // total_num_bytes_read = 0; + // } src_offset_bytes += src_offset_increment; dst_offset_bytes += num_bytes_to_copy; contig_dst_offset += num_bytes_to_copy; diff --git a/tt_metal/impl/buffers/dispatch.hpp b/tt_metal/impl/buffers/dispatch.hpp index 02ae71b627e..5a3b4a4ae53 100644 --- a/tt_metal/impl/buffers/dispatch.hpp +++ b/tt_metal/impl/buffers/dispatch.hpp @@ -8,42 +8,12 @@ #include #include #include "buffer.hpp" -#include "tt_metal/impl/event/dispatch.hpp" +#include "device.hpp" namespace tt::tt_metal { - -// Used so the host knows how to properly copy data into user space from the completion queue (in hugepages) -struct ReadBufferDescriptor { - TensorMemoryLayout buffer_layout; - uint32_t page_size; - uint32_t padded_page_size; - std::shared_ptr buffer_page_mapping; - void* dst; - uint32_t dst_offset; - uint32_t num_pages_read; - uint32_t cur_dev_page_id; - uint32_t starting_host_page_id; - - ReadBufferDescriptor( - TensorMemoryLayout buffer_layout, - uint32_t page_size, - uint32_t padded_page_size, - void* dst, - uint32_t dst_offset, - uint32_t num_pages_read, - uint32_t cur_dev_page_id, - uint32_t starting_host_page_id = 0, - const std::shared_ptr& buffer_page_mapping = nullptr) : - buffer_layout(buffer_layout), - page_size(page_size), - padded_page_size(padded_page_size), - buffer_page_mapping(buffer_page_mapping), - dst(dst), - dst_offset(dst_offset), - num_pages_read(num_pages_read), - cur_dev_page_id(cur_dev_page_id), - starting_host_page_id(starting_host_page_id) {} -}; +struct ReadBufferDescriptor; +struct ReadEventDescriptor; +using CompletionReaderVariant = std::variant; // Contains helper functions to interface with buffers on device namespace buffer_dispatch { @@ -57,6 +27,26 @@ struct BufferReadDispatchParams { uint32_t unpadded_dst_offset = 0; uint32_t pages_per_txn = 0; uint32_t address = 0; + + virtual ~BufferReadDispatchParams() = default; + + void update_params_to_be_within_bounds(const Buffer& buffer) { + const uint32_t num_banks = this->device->allocator()->get_num_banks(buffer.buffer_type()); + const uint32_t num_pages_per_bank = this->src_page_index / num_banks; + this->address += num_pages_per_bank * this->padded_page_size; + this->src_page_index = this->src_page_index % num_banks; + } +}; + +struct PartialPageSpec { + uint32_t unpadded_partial_page_size = 0; + uint32_t padded_partial_page_size = 0; + uint32_t last_partial_page_additional_padding = 0; + uint32_t num_partial_pages_per_full_page = 0; +}; + +struct BufferReadLargePageDispatchParams : BufferReadDispatchParams { + PartialPageSpec partial_page_spec; }; struct ShardedBufferReadDispatchParams : BufferReadDispatchParams { @@ -85,7 +75,7 @@ ShardedBufferReadDispatchParams initialize_sharded_buf_read_dispatch_params( tt::stl::Span expected_num_workers_completed, const BufferRegion& region); -BufferReadDispatchParams initialize_interleaved_buf_read_dispatch_params( +std::unique_ptr initialize_interleaved_buf_read_dispatch_params( Buffer& buffer, uint32_t cq_id, tt::stl::Span expected_num_workers_completed, @@ -123,8 +113,58 @@ tt::stl::Span select_sub_device_ids( std::shared_ptr<::tt::tt_metal::CompletionReaderVariant> generate_sharded_buffer_read_descriptor( void* dst, ShardedBufferReadDispatchParams& dispatch_params, Buffer& buffer); std::shared_ptr<::tt::tt_metal::CompletionReaderVariant> generate_interleaved_buffer_read_descriptor( - void* dst, BufferReadDispatchParams& dispatch_params, Buffer& buffer); + void* dst, BufferReadDispatchParams* dispatch_params, Buffer& buffer); +bool are_pages_large(const Buffer& buffer); + +PartialPageSpec calculate_partial_page_spec(const Buffer& buffer); } // namespace buffer_dispatch +// Used so the host knows how to properly copy data into user space from the completion queue (in hugepages) +struct ReadBufferDescriptor { + TensorMemoryLayout buffer_layout; + uint32_t page_size; + uint32_t padded_page_size; + std::shared_ptr buffer_page_mapping; + void* dst; + uint32_t dst_offset; + uint32_t num_pages_read; + uint32_t cur_dev_page_id; + uint32_t starting_host_page_id; + buffer_dispatch::PartialPageSpec* partial_page; + + ReadBufferDescriptor( + TensorMemoryLayout buffer_layout, + uint32_t page_size, + uint32_t padded_page_size, + void* dst, + uint32_t dst_offset, + uint32_t num_pages_read, + uint32_t cur_dev_page_id, + uint32_t starting_host_page_id = 0, + const std::shared_ptr& buffer_page_mapping = nullptr, + buffer_dispatch::PartialPageSpec* partial_page = nullptr) : + buffer_layout(buffer_layout), + page_size(page_size), + padded_page_size(padded_page_size), + buffer_page_mapping(buffer_page_mapping), + dst(dst), + dst_offset(dst_offset), + num_pages_read(num_pages_read), + cur_dev_page_id(cur_dev_page_id), + starting_host_page_id(starting_host_page_id), + partial_page(partial_page) {} +}; + +// Used so host knows data in completion queue is just an event ID +struct ReadEventDescriptor { + uint32_t event_id; + uint32_t global_offset; + + explicit ReadEventDescriptor(uint32_t event) : event_id(event), global_offset(0) {} + + void set_global_offset(uint32_t offset) { global_offset = offset; } + uint32_t get_global_event_id() { return global_offset + event_id; } +}; + } // namespace tt::tt_metal diff --git a/tt_metal/impl/dispatch/hardware_command_queue.cpp b/tt_metal/impl/dispatch/hardware_command_queue.cpp index e309e87d7f9..d44404e8926 100644 --- a/tt_metal/impl/dispatch/hardware_command_queue.cpp +++ b/tt_metal/impl/dispatch/hardware_command_queue.cpp @@ -210,13 +210,13 @@ void HWCommandQueue::enqueue_read_buffer( auto dispatch_params = buffer_dispatch::initialize_interleaved_buf_read_dispatch_params( buffer_obj, this->id_, this->expected_num_workers_completed, region); buffer_dispatch::copy_interleaved_buffer_to_completion_queue( - dispatch_params, + *dispatch_params, buffer_obj, sub_device_ids, dispatch_core_manager::instance().get_dispatch_core_type(device_->id())); - if (dispatch_params.pages_per_txn > 0) { + if (dispatch_params->pages_per_txn > 0) { this->issued_completion_q_reads.push( - buffer_dispatch::generate_interleaved_buffer_read_descriptor(dst, dispatch_params, buffer_obj)); + buffer_dispatch::generate_interleaved_buffer_read_descriptor(dst, dispatch_params.get(), buffer_obj)); this->increment_num_entries_in_completion_q(); } } diff --git a/tt_metal/tt_metal.cpp b/tt_metal/tt_metal.cpp index fa5fd4f5109..7c587be1ac3 100644 --- a/tt_metal/tt_metal.cpp +++ b/tt_metal/tt_metal.cpp @@ -15,6 +15,8 @@ #include #include #include +#include "buffer_constants.hpp" +#include "buffers/dispatch.hpp" #include "dprint_server.hpp" #include #include @@ -35,6 +37,7 @@ #include "tracy/Tracy.hpp" #include +#include #include "lightmetal/host_api_capture_helpers.hpp" #include "llrt.hpp" @@ -524,26 +527,127 @@ void WriteToBuffer(Buffer& buffer, tt::stl::Span host_buffer) { } } +void ReadFromDeviceInterleavedContiguousLargePage(const Buffer& buffer, uint8_t* host_buffer) { + const buffer_dispatch::PartialPageSpec& partial_page_spec = buffer_dispatch::calculate_partial_page_spec(buffer); + const uint32_t page_size = partial_page_spec.unpadded_partial_page_size; + const uint32_t padded_page_size = partial_page_spec.padded_partial_page_size; + const uint32_t num_partial_pages_per_full_page = partial_page_spec.num_partial_pages_per_full_page; + const uint32_t num_partial_pages = buffer.num_pages() * num_partial_pages_per_full_page; + + uint32_t num_full_pages_read = 0; + IDevice* device = buffer.device(); + const uint32_t num_banks = device->allocator()->get_num_banks(buffer.buffer_type()); + uint32_t host_idx = 0; + uint32_t bank_index = 0; + std::vector page; + page.resize(page_size / sizeof(uint32_t)); + DeviceAddr page_address = buffer.address(); + for (int page_index = 0; page_index < num_partial_pages; page_index++) { + // const bool is_start_of_full_page = page_index % num_partial_pages_per_full_page == 0; + // if (is_start_of_full_page) { + + // } + // auto absolute_address = buffer.page_address(bank_index, page_index); + // Get address offset of buffer in bank. Required when reading from DRAM. + // auto bank_local_address = buffer.bank_local_page_address(bank_index, page_index); + const uint32_t adjusted_page_index = + num_full_pages_read + (page_index - (num_full_pages_read * num_partial_pages_per_full_page)) * num_banks; + page.clear(); + switch (buffer.buffer_type()) { + case BufferType::DRAM: + case BufferType::TRACE: { + // page_address = buffer.bank_local_page_address(bank_index, adjusted_page_index); + ReadFromDeviceDRAMChannel(device, bank_index, page_address, page_size, page); + } break; + case BufferType::L1: + case BufferType::L1_SMALL: { + // page_address = buffer.page_address(bank_index, adjusted_page_index); + auto core_coordinates = + device->worker_core_from_logical_core(buffer.logical_core_from_bank_id(bank_index)); + tt::Cluster::instance().read_core( + page.data(), page_size, tt_cxy_pair(device->id(), core_coordinates), page_address); + } break; + default: TT_THROW("Unsupported buffer type to read from device!"); + } + + page_address += padded_page_size; + + uint32_t additional_padding = 0; + const bool is_last_partial_page_of_full_page = + page_index % num_partial_pages_per_full_page == num_partial_pages_per_full_page - 1; + if (is_last_partial_page_of_full_page) { + num_full_pages_read += 1; + const bool will_next_full_page_be_round_robined = num_full_pages_read % num_banks == 0; + if (!will_next_full_page_be_round_robined) { + const uint32_t full_page_size = num_partial_pages_per_full_page * padded_page_size; + page_address -= full_page_size; + } + bank_index = (bank_index + 1) % num_banks; + additional_padding = partial_page_spec.last_partial_page_additional_padding; + } + + std::memcpy(host_buffer + host_idx, page.data(), page_size - additional_padding); + host_idx += page_size - additional_padding; + } +} + +uint32_t RemovePaddingFromPartialPages( + const buffer_dispatch::PartialPageSpec& partial_page_spec, + const uint8_t* full_page_data, + uint32_t full_page_data_size_bytes, + uint8_t* page_without_padding_data) { + uint32_t total_num_bytes_copied = 0; + uint32_t num_partial_pages_processed = 0; + uint32_t i = 0; + while (i < full_page_data_size_bytes) { + uint32_t num_bytes_to_copy = 0; + if (num_partial_pages_processed == partial_page_spec.num_partial_pages_per_full_page - 1) { + num_bytes_to_copy = + partial_page_spec.unpadded_partial_page_size - partial_page_spec.last_partial_page_additional_padding; + } else { + num_bytes_to_copy = partial_page_spec.unpadded_partial_page_size; + } + // full_page_without_padding.insert( + // full_page_without_padding.end(), + // full_page_data.begin() + i, + // full_page_data.begin() + i + num_elements_to_copy); + std::memcpy(page_without_padding_data + total_num_bytes_copied, full_page_data + i, num_bytes_to_copy); + i += partial_page_spec.padded_partial_page_size; + num_partial_pages_processed += 1; + total_num_bytes_copied += num_bytes_to_copy; + } + return total_num_bytes_copied; +} + void ReadFromDeviceInterleavedContiguous(const Buffer& buffer, uint8_t* host_buffer) { - uint32_t page_size = buffer.page_size(); - uint32_t num_pages = buffer.num_pages(); + // if (buffer_dispatch::are_pages_large(buffer)) { + // ReadFromDeviceInterleavedContiguousLargePage(buffer, host_buffer); + // return; + // } + + const buffer_dispatch::PartialPageSpec& partial_page_spec = buffer_dispatch::calculate_partial_page_spec(buffer); + const uint32_t page_size = + partial_page_spec.padded_partial_page_size * partial_page_spec.num_partial_pages_per_full_page; + const uint32_t num_pages = buffer.num_pages(); auto device = buffer.device(); auto num_banks = device->allocator()->get_num_banks(buffer.buffer_type()); size_t host_idx = 0; + uint32_t num_round_robins = 0; uint32_t bank_index = 0; std::vector page; page.resize(page_size / sizeof(uint32_t)); for (int page_index = 0; page_index < num_pages; page_index++) { auto absolute_address = buffer.page_address(bank_index, page_index); // Get address offset of buffer in bank. Required when reading from DRAM. + auto addr1 = (num_round_robins > 0) ? (page_size - buffer.aligned_page_size()) * num_round_robins : 0; auto bank_local_address = buffer.bank_local_page_address(bank_index, page_index); page.clear(); switch (buffer.buffer_type()) { case BufferType::DRAM: case BufferType::TRACE: - ReadFromDeviceDRAMChannel(device, bank_index, bank_local_address, page_size, page); + ReadFromDeviceDRAMChannel(device, bank_index, bank_local_address + addr1, page_size, page); break; case BufferType::L1: case BufferType::L1_SMALL: { @@ -554,10 +658,20 @@ void ReadFromDeviceInterleavedContiguous(const Buffer& buffer, uint8_t* host_buf default: TT_THROW("Unsupported buffer type to read from device!"); } + uint8_t* page_without_padding = new uint8_t[page_size]; + uint32_t page_size_to_copy = RemovePaddingFromPartialPages( + partial_page_spec, reinterpret_cast(page.data()), page_size, page_without_padding); + // page_size = page_without_padding.size() * sizeof(uint32_t); + TT_ASSERT(page_size_to_copy == buffer.page_size()); + // Copy page into host buffer - std::memcpy(host_buffer + host_idx, page.data(), page_size); - host_idx += page_size; + std::memcpy(host_buffer + host_idx, page_without_padding, page_size_to_copy); + delete[] page_without_padding; + host_idx += page_size_to_copy; + if (bank_index + 1 == num_banks) { + num_round_robins += 1; + } bank_index = (bank_index + 1) % num_banks; } } From 23a00429b27972b586af5267a2dd08c1e8d0136f Mon Sep 17 00:00:00 2001 From: Samarth Agarwal Date: Thu, 13 Feb 2025 19:41:52 +0000 Subject: [PATCH 10/24] Fixing merge conflict --- tt_metal/impl/buffers/dispatch.hpp | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/tt_metal/impl/buffers/dispatch.hpp b/tt_metal/impl/buffers/dispatch.hpp index 5a3b4a4ae53..854c0f484df 100644 --- a/tt_metal/impl/buffers/dispatch.hpp +++ b/tt_metal/impl/buffers/dispatch.hpp @@ -9,6 +9,7 @@ #include #include "buffer.hpp" #include "device.hpp" +#include "tt_metal/impl/event/dispatch.hpp" namespace tt::tt_metal { struct ReadBufferDescriptor; @@ -156,15 +157,4 @@ struct ReadBufferDescriptor { partial_page(partial_page) {} }; -// Used so host knows data in completion queue is just an event ID -struct ReadEventDescriptor { - uint32_t event_id; - uint32_t global_offset; - - explicit ReadEventDescriptor(uint32_t event) : event_id(event), global_offset(0) {} - - void set_global_offset(uint32_t offset) { global_offset = offset; } - uint32_t get_global_event_id() { return global_offset + event_id; } -}; - } // namespace tt::tt_metal From 88780d0b691daca9ff50c91f595092f658340381 Mon Sep 17 00:00:00 2001 From: Samarth Agarwal Date: Thu, 13 Feb 2025 21:19:10 +0000 Subject: [PATCH 11/24] Saving work --- tt_metal/tt_metal.cpp | 153 ++++++++++++++++++------------------------ 1 file changed, 64 insertions(+), 89 deletions(-) diff --git a/tt_metal/tt_metal.cpp b/tt_metal/tt_metal.cpp index 7c587be1ac3..2cd65ddfe42 100644 --- a/tt_metal/tt_metal.cpp +++ b/tt_metal/tt_metal.cpp @@ -527,151 +527,126 @@ void WriteToBuffer(Buffer& buffer, tt::stl::Span host_buffer) { } } +uint32_t RemovePaddingFromPartialPages( + const buffer_dispatch::PartialPageSpec& partial_page_spec, + const uint8_t* full_page_data, + uint32_t full_page_data_size_bytes, + uint8_t* page_without_padding_data) { + uint32_t total_num_bytes_copied = 0; + uint32_t num_partial_pages_processed = 0; + uint32_t i = 0; + while (i < full_page_data_size_bytes) { + uint32_t num_bytes_to_copy = 0; + if (num_partial_pages_processed == partial_page_spec.num_partial_pages_per_full_page - 1) { + num_bytes_to_copy = + partial_page_spec.unpadded_partial_page_size - partial_page_spec.last_partial_page_additional_padding; + } else { + num_bytes_to_copy = partial_page_spec.unpadded_partial_page_size; + } + std::memcpy(page_without_padding_data + total_num_bytes_copied, full_page_data + i, num_bytes_to_copy); + i += partial_page_spec.padded_partial_page_size; + num_partial_pages_processed += 1; + total_num_bytes_copied += num_bytes_to_copy; + } + return total_num_bytes_copied; +} + void ReadFromDeviceInterleavedContiguousLargePage(const Buffer& buffer, uint8_t* host_buffer) { const buffer_dispatch::PartialPageSpec& partial_page_spec = buffer_dispatch::calculate_partial_page_spec(buffer); - const uint32_t page_size = partial_page_spec.unpadded_partial_page_size; - const uint32_t padded_page_size = partial_page_spec.padded_partial_page_size; - const uint32_t num_partial_pages_per_full_page = partial_page_spec.num_partial_pages_per_full_page; - const uint32_t num_partial_pages = buffer.num_pages() * num_partial_pages_per_full_page; + const uint32_t full_padded_page_size = + partial_page_spec.padded_partial_page_size * partial_page_spec.num_partial_pages_per_full_page; + const uint32_t num_full_pages = buffer.num_pages(); - uint32_t num_full_pages_read = 0; IDevice* device = buffer.device(); const uint32_t num_banks = device->allocator()->get_num_banks(buffer.buffer_type()); + + uint32_t num_round_robins = 0; uint32_t host_idx = 0; uint32_t bank_index = 0; std::vector page; - page.resize(page_size / sizeof(uint32_t)); - DeviceAddr page_address = buffer.address(); - for (int page_index = 0; page_index < num_partial_pages; page_index++) { - // const bool is_start_of_full_page = page_index % num_partial_pages_per_full_page == 0; - // if (is_start_of_full_page) { - - // } - // auto absolute_address = buffer.page_address(bank_index, page_index); - // Get address offset of buffer in bank. Required when reading from DRAM. - // auto bank_local_address = buffer.bank_local_page_address(bank_index, page_index); - const uint32_t adjusted_page_index = - num_full_pages_read + (page_index - (num_full_pages_read * num_partial_pages_per_full_page)) * num_banks; + page.resize(full_padded_page_size / sizeof(uint32_t)); + for (int page_index = 0; page_index < num_full_pages; page_index++) { + const DeviceAddr full_page_address_offset = + (num_round_robins > 0) ? (full_padded_page_size - buffer.aligned_page_size()) * num_round_robins : 0; page.clear(); switch (buffer.buffer_type()) { case BufferType::DRAM: case BufferType::TRACE: { - // page_address = buffer.bank_local_page_address(bank_index, adjusted_page_index); - ReadFromDeviceDRAMChannel(device, bank_index, page_address, page_size, page); + const DeviceAddr page_address = + buffer.bank_local_page_address(bank_index, page_index) + full_page_address_offset; + ReadFromDeviceDRAMChannel(device, bank_index, page_address, full_padded_page_size, page); } break; case BufferType::L1: case BufferType::L1_SMALL: { - // page_address = buffer.page_address(bank_index, adjusted_page_index); + const DeviceAddr page_address = buffer.page_address(bank_index, page_index) + full_page_address_offset; auto core_coordinates = device->worker_core_from_logical_core(buffer.logical_core_from_bank_id(bank_index)); tt::Cluster::instance().read_core( - page.data(), page_size, tt_cxy_pair(device->id(), core_coordinates), page_address); + page.data(), full_padded_page_size, tt_cxy_pair(device->id(), core_coordinates), page_address); } break; default: TT_THROW("Unsupported buffer type to read from device!"); } - page_address += padded_page_size; - - uint32_t additional_padding = 0; - const bool is_last_partial_page_of_full_page = - page_index % num_partial_pages_per_full_page == num_partial_pages_per_full_page - 1; - if (is_last_partial_page_of_full_page) { - num_full_pages_read += 1; - const bool will_next_full_page_be_round_robined = num_full_pages_read % num_banks == 0; - if (!will_next_full_page_be_round_robined) { - const uint32_t full_page_size = num_partial_pages_per_full_page * padded_page_size; - page_address -= full_page_size; - } - bank_index = (bank_index + 1) % num_banks; - additional_padding = partial_page_spec.last_partial_page_additional_padding; - } + uint8_t* full_page_data_without_padding = new uint8_t[full_padded_page_size]; + const uint32_t full_page_data_without_padding_size_bytes = RemovePaddingFromPartialPages( + partial_page_spec, + reinterpret_cast(page.data()), + full_padded_page_size, + full_page_data_without_padding); + TT_ASSERT(full_page_data_without_padding_size_bytes == buffer.page_size()); - std::memcpy(host_buffer + host_idx, page.data(), page_size - additional_padding); - host_idx += page_size - additional_padding; - } -} + // Copy page into host buffer + std::memcpy(host_buffer + host_idx, full_page_data_without_padding, full_page_data_without_padding_size_bytes); + delete[] full_page_data_without_padding; -uint32_t RemovePaddingFromPartialPages( - const buffer_dispatch::PartialPageSpec& partial_page_spec, - const uint8_t* full_page_data, - uint32_t full_page_data_size_bytes, - uint8_t* page_without_padding_data) { - uint32_t total_num_bytes_copied = 0; - uint32_t num_partial_pages_processed = 0; - uint32_t i = 0; - while (i < full_page_data_size_bytes) { - uint32_t num_bytes_to_copy = 0; - if (num_partial_pages_processed == partial_page_spec.num_partial_pages_per_full_page - 1) { - num_bytes_to_copy = - partial_page_spec.unpadded_partial_page_size - partial_page_spec.last_partial_page_additional_padding; - } else { - num_bytes_to_copy = partial_page_spec.unpadded_partial_page_size; + host_idx += full_page_data_without_padding_size_bytes; + if (bank_index + 1 == num_banks) { + num_round_robins += 1; } - // full_page_without_padding.insert( - // full_page_without_padding.end(), - // full_page_data.begin() + i, - // full_page_data.begin() + i + num_elements_to_copy); - std::memcpy(page_without_padding_data + total_num_bytes_copied, full_page_data + i, num_bytes_to_copy); - i += partial_page_spec.padded_partial_page_size; - num_partial_pages_processed += 1; - total_num_bytes_copied += num_bytes_to_copy; + bank_index = (bank_index + 1) % num_banks; } - return total_num_bytes_copied; } void ReadFromDeviceInterleavedContiguous(const Buffer& buffer, uint8_t* host_buffer) { - // if (buffer_dispatch::are_pages_large(buffer)) { - // ReadFromDeviceInterleavedContiguousLargePage(buffer, host_buffer); - // return; - // } + if (buffer_dispatch::are_pages_large(buffer)) { + ReadFromDeviceInterleavedContiguousLargePage(buffer, host_buffer); + return; + } - const buffer_dispatch::PartialPageSpec& partial_page_spec = buffer_dispatch::calculate_partial_page_spec(buffer); - const uint32_t page_size = - partial_page_spec.padded_partial_page_size * partial_page_spec.num_partial_pages_per_full_page; - const uint32_t num_pages = buffer.num_pages(); + uint32_t page_size = buffer.page_size(); + uint32_t num_pages = buffer.num_pages(); auto device = buffer.device(); auto num_banks = device->allocator()->get_num_banks(buffer.buffer_type()); - size_t host_idx = 0; - uint32_t num_round_robins = 0; + size_t host_idx = 0; uint32_t bank_index = 0; std::vector page; page.resize(page_size / sizeof(uint32_t)); for (int page_index = 0; page_index < num_pages; page_index++) { auto absolute_address = buffer.page_address(bank_index, page_index); // Get address offset of buffer in bank. Required when reading from DRAM. - auto addr1 = (num_round_robins > 0) ? (page_size - buffer.aligned_page_size()) * num_round_robins : 0; auto bank_local_address = buffer.bank_local_page_address(bank_index, page_index); page.clear(); switch (buffer.buffer_type()) { case BufferType::DRAM: case BufferType::TRACE: - ReadFromDeviceDRAMChannel(device, bank_index, bank_local_address + addr1, page_size, page); + ReadFromDeviceDRAMChannel(device, bank_index, bank_local_address, page_size, page); break; case BufferType::L1: case BufferType::L1_SMALL: { auto core_coordinates = device->worker_core_from_logical_core(buffer.logical_core_from_bank_id(bank_index)); - tt::Cluster::instance().read_core(page.data(), page_size, tt_cxy_pair(device->id(), core_coordinates), absolute_address); + tt::Cluster::instance().read_core( + page.data(), page_size, tt_cxy_pair(device->id(), core_coordinates), absolute_address); } break; default: TT_THROW("Unsupported buffer type to read from device!"); } - uint8_t* page_without_padding = new uint8_t[page_size]; - uint32_t page_size_to_copy = RemovePaddingFromPartialPages( - partial_page_spec, reinterpret_cast(page.data()), page_size, page_without_padding); - // page_size = page_without_padding.size() * sizeof(uint32_t); - TT_ASSERT(page_size_to_copy == buffer.page_size()); - // Copy page into host buffer - std::memcpy(host_buffer + host_idx, page_without_padding, page_size_to_copy); - delete[] page_without_padding; + std::memcpy(host_buffer + host_idx, page.data(), page_size); + host_idx += page_size; - host_idx += page_size_to_copy; - if (bank_index + 1 == num_banks) { - num_round_robins += 1; - } bank_index = (bank_index + 1) % num_banks; } } From 8773c86b8634c60de926a3434e685d8139a2aca4 Mon Sep 17 00:00:00 2001 From: Samarth Agarwal Date: Sun, 16 Feb 2025 17:51:08 +0000 Subject: [PATCH 12/24] Saving work --- ...queueWriteBuffer_and_EnqueueReadBuffer.cpp | 2 +- tt_metal/impl/buffers/dispatch.cpp | 92 ++++++++++++++-- tt_metal/tt_metal.cpp | 103 ++++++++++++++++-- 3 files changed, 178 insertions(+), 19 deletions(-) diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp index de0c0f72018..c08da28327b 100644 --- a/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp @@ -260,7 +260,7 @@ void test_EnqueueWriteBuffer_and_EnqueueReadBuffer(IDevice* device, CommandQueue if (i != result[i]) { std::cout << "i: " << std::to_string(i) << " result[i]: " << std::to_string(result[i]) << std::endl; // std::cout << "Fail" << std::endl; - break; + // break; } } diff --git a/tt_metal/impl/buffers/dispatch.cpp b/tt_metal/impl/buffers/dispatch.cpp index eca7a7e63af..fdcf547e9b8 100644 --- a/tt_metal/impl/buffers/dispatch.cpp +++ b/tt_metal/impl/buffers/dispatch.cpp @@ -7,6 +7,8 @@ #include "assert.hpp" #include "math.hpp" #include "dispatch.hpp" +#include +#include #include #include @@ -1009,6 +1011,9 @@ void copy_completion_queue_data_into_user_space( uint32_t offset_in_completion_q_data = sizeof(CQDispatchCmd); uint32_t pad_size_bytes = padded_page_size - page_size; + bool trigger = false; + + std::cout << "buffer page size: " << page_size << std::endl; while (remaining_bytes_to_read != 0) { uint32_t completion_queue_write_ptr_and_toggle = @@ -1055,15 +1060,59 @@ void copy_completion_queue_data_into_user_space( offset_in_completion_q_data = 0; uint32_t dst_offset_bytes = 0; - const uint32_t page_size_to_read = - partial_page_spec ? partial_page_spec->unpadded_partial_page_size : page_size; - const uint32_t padded_page_size_to_read = - partial_page_spec ? partial_page_spec->padded_partial_page_size : padded_page_size; - pad_size_bytes = partial_page_spec ? padded_page_size_to_read - page_size_to_read : pad_size_bytes; - while (src_offset_bytes < bytes_xfered) { + uint32_t page_size_to_read = + partial_page_spec ? partial_page_spec->unpadded_partial_page_size : page_size; + const uint32_t num_bytes_read_curr_full_page = total_num_bytes_read % page_size; + if (partial_page_spec && + // total_num_bytes_read % partial_page_spec->unpadded_partial_page_size == 0 && + num_bytes_read_curr_full_page / partial_page_spec->unpadded_partial_page_size == + partial_page_spec->num_partial_pages_per_full_page - 1) { + page_size_to_read -= partial_page_spec->last_partial_page_additional_padding; + } + + const uint32_t padded_page_size_to_read = + partial_page_spec ? partial_page_spec->padded_partial_page_size : padded_page_size; + + pad_size_bytes = partial_page_spec ? padded_page_size_to_read - page_size_to_read : pad_size_bytes; + // if (partial_page_spec && + // // total_num_bytes_read % partial_page_spec->unpadded_partial_page_size == 0 && + // total_num_bytes_read / partial_page_spec->unpadded_partial_page_size == + // partial_page_spec->num_partial_pages_per_full_page - 1) { + // pad_size_bytes += partial_page_spec->last_partial_page_additional_padding; + // } + + // if (partial_page_spec) { + // const uint32_t num_bytes_read_curr_full_page = total_num_bytes_read % page_size; + // if (num_bytes_read_curr_full_page + page_size_to_read > page_size) { + // // if (trigger) { + // // } + // const uint32_t extra_padding_bytes = + // partial_page_spec->last_partial_page_additional_padding; std::cout << "Output " << + // std::to_string(trigger) << " " << total_num_bytes_read << " " << page_size_to_read << " " + // << remaining_bytes_of_nonaligned_page << " " << extra_padding_bytes << std::endl; + // page_size_to_read -= extra_padding_bytes; + // // page_size_to_read -= std::min( + // // total_num_bytes_read + page_size_to_read - page_size, + // // page_size_to_read - extra_padding_bytes); + // if (remaining_bytes_of_nonaligned_page > 0) { + // // page_size_to_read - num_bytes_read_curr_partial_page => if <= 0, no more data to + // read, otherwise take min of this and remaining_bytes_of_nonaligned_page + // remaining_bytes_of_nonaligned_page = std::min(page_size_to_read, + // remaining_bytes_of_nonaligned_page); const uint32_t num_bytes_read_curr_partial_page + // = num_bytes_read_curr_full_page % partial_page_spec->unpadded_partial_page_size; + // pad_size_bytes += (partial_page_spec->unpadded_partial_page_size - + // remaining_bytes_of_nonaligned_page - num_bytes_read_curr_partial_page); + // } + // // else { + + // // } + // } + // } + uint32_t src_offset_increment = padded_page_size_to_read; uint32_t num_bytes_to_copy = 0; + if (remaining_bytes_of_nonaligned_page > 0) { // Case 1: Portion of the page was copied into user buffer on the previous completion queue pop. uint32_t num_bytes_remaining = bytes_xfered - src_offset_bytes; @@ -1076,23 +1125,41 @@ void copy_completion_queue_data_into_user_space( // There is more data after padding if (rem_bytes_in_cq >= pad_size_bytes) { src_offset_increment += pad_size_bytes; + if (trigger) { + std::cout << "rem_bytes_in_cq >= pad_size_bytes " << rem_bytes_in_cq << " " + << pad_size_bytes << " " << src_offset_increment << std::endl; + } // Only pad data left in queue } else { offset_in_completion_q_data = pad_size_bytes - rem_bytes_in_cq; + if (trigger) { + std::cout << "rem_bytes_in_cq < pad_size_bytes " << rem_bytes_in_cq << " " + << pad_size_bytes << " " << src_offset_increment << " " + << offset_in_completion_q_data << std::endl; + } } } + if (trigger) { + std::cout << num_bytes_remaining << " " << num_bytes_to_copy << " " + << remaining_bytes_of_nonaligned_page << " " << src_offset_increment << " " + << offset_in_completion_q_data << std::endl; + } + trigger = false; } else if (src_offset_bytes + padded_page_size_to_read >= bytes_xfered) { // Case 2: Last page of data that was popped off the completion queue // Don't need to compute src_offset_increment since this is end of loop + std::cout << "Total num bytes read: " << total_num_bytes_read << std::endl; uint32_t num_bytes_remaining = bytes_xfered - src_offset_bytes; num_bytes_to_copy = std::min(num_bytes_remaining, page_size_to_read); remaining_bytes_of_nonaligned_page = page_size_to_read - num_bytes_to_copy; + trigger = true; // We've copied needed data, start of next read is offset due to remaining pad bytes if (remaining_bytes_of_nonaligned_page == 0) { offset_in_completion_q_data = padded_page_size_to_read - num_bytes_remaining; } } else { num_bytes_to_copy = page_size_to_read; + trigger = false; } // if (partial_page_spec && (total_num_bytes_read % page_size) + num_bytes_to_copy == @@ -1100,11 +1167,11 @@ void copy_completion_queue_data_into_user_space( // if (partial_page_spec && partial_page_spec->num_partial_pages_per_full_page - 1 == // ((total_num_bytes_read % page_size) + num_bytes_to_copy) / // partial_page_spec->unpadded_partial_page_size) { - if (partial_page_spec && (total_num_bytes_read % page_size) + num_bytes_to_copy > page_size) { - // uint32_t extra_bytes = (total_num_bytes_read % page_size) + num_bytes_to_copy - page_size; - uint32_t extra_bytes = partial_page_spec->last_partial_page_additional_padding; - num_bytes_to_copy -= extra_bytes; - } + // if (partial_page_spec && (total_num_bytes_read % page_size) + num_bytes_to_copy > page_size) { + // // uint32_t extra_bytes = (total_num_bytes_read % page_size) + num_bytes_to_copy - page_size; + // uint32_t extra_bytes = partial_page_spec->last_partial_page_additional_padding; + // num_bytes_to_copy -= extra_bytes; + // } tt::Cluster::instance().read_sysmem( (char*)(uint64_t(contiguous_dst) + dst_offset_bytes), @@ -1113,6 +1180,9 @@ void copy_completion_queue_data_into_user_space( mmio_device_id, channel); + std::cout << "num bytes to copy " << num_bytes_to_copy << std::endl; + std::cout << "src offset increment" << src_offset_increment << std::endl; + total_num_bytes_read += num_bytes_to_copy; // if (total_num_bytes_read == page_size) { // total_num_bytes_read = 0; diff --git a/tt_metal/tt_metal.cpp b/tt_metal/tt_metal.cpp index 2cd65ddfe42..9b8c3b0ccdb 100644 --- a/tt_metal/tt_metal.cpp +++ b/tt_metal/tt_metal.cpp @@ -2,11 +2,10 @@ // // SPDX-License-Identifier: Apache-2.0 +#include #include #include -#include -#include #include #include #include @@ -15,8 +14,9 @@ #include #include #include -#include "buffer_constants.hpp" +#include "assert.hpp" #include "buffers/dispatch.hpp" +#include "device.hpp" #include "dprint_server.hpp" #include #include @@ -37,10 +37,10 @@ #include "tracy/Tracy.hpp" #include -#include #include "lightmetal/host_api_capture_helpers.hpp" #include "llrt.hpp" +#include "umd/device/tt_core_coordinates.h" namespace tt { @@ -461,6 +461,91 @@ void WriteToDeviceSharded(Buffer& buffer, tt::stl::Span host_buff } } +uint32_t AddPaddingToPartialPages( + const buffer_dispatch::PartialPageSpec& partial_page_spec, + const uint8_t* partial_pages_data, + uint32_t full_unpadded_page_data_size_bytes, + uint8_t* page_with_padding_data) { + uint32_t total_num_bytes_added = 0; + uint32_t num_partial_pages_processed = 0; + uint32_t i = 0; + while (i < full_unpadded_page_data_size_bytes) { + uint32_t num_bytes_data_to_add = 0; + uint32_t num_bytes_padding_to_add = 0; + if (num_partial_pages_processed == partial_page_spec.num_partial_pages_per_full_page - 1) { + num_bytes_data_to_add = + partial_page_spec.unpadded_partial_page_size - partial_page_spec.last_partial_page_additional_padding; + num_bytes_padding_to_add = + partial_page_spec.last_partial_page_additional_padding + + (partial_page_spec.padded_partial_page_size - partial_page_spec.unpadded_partial_page_size); + } else { + num_bytes_data_to_add = partial_page_spec.unpadded_partial_page_size; + num_bytes_padding_to_add = + partial_page_spec.padded_partial_page_size - partial_page_spec.unpadded_partial_page_size; + } + + std::memcpy(page_with_padding_data + total_num_bytes_added, partial_pages_data + i, num_bytes_data_to_add); + total_num_bytes_added += num_bytes_data_to_add; + + std::memset(page_with_padding_data + total_num_bytes_added, 0, num_bytes_padding_to_add); + total_num_bytes_added += num_bytes_padding_to_add; + + i += num_bytes_data_to_add + num_bytes_padding_to_add; + num_partial_pages_processed += 1; + } + return total_num_bytes_added; +} + +void WriteToDeviceInterleavedContiguousLargePage(const Buffer& buffer, tt::stl::Span host_buffer) { + const buffer_dispatch::PartialPageSpec& partial_page_spec = buffer_dispatch::calculate_partial_page_spec(buffer); + const uint32_t full_padded_page_size = + partial_page_spec.padded_partial_page_size * partial_page_spec.num_partial_pages_per_full_page; + // const uint32_t full_unpadded_page_size = + // partial_page_spec.unpadded_partial_page_size * partial_page_spec.num_partial_pages_per_full_page; + const uint32_t num_full_pages = buffer.num_pages(); + + IDevice* device = buffer.device(); + const uint32_t num_banks = device->allocator()->get_num_banks(buffer.buffer_type()); + + uint32_t num_round_robins = 0; + uint32_t bank_index = 0; + uint32_t data_index = 0; + std::vector page; + page.resize(full_padded_page_size / sizeof(uint32_t)); + for (uint32_t page_index = 0; page_index < num_full_pages; page_index++) { + auto absolute_address = buffer.page_address(bank_index, page_index); + // Get address offset of buffer in bank. Required when writing to DRAM. + auto bank_local_address = buffer.bank_local_page_address(bank_index, page_index); + // std::memcpy(page.data(), host_buffer.data() + data_index, full_padded_page_size); + const DeviceAddr full_page_address_offset = + (num_round_robins > 0) ? (full_padded_page_size - buffer.aligned_page_size()) * num_round_robins : 0; + const uint32_t full_page_data_with_padding_size_bytes = AddPaddingToPartialPages( + partial_page_spec, + host_buffer.data() + data_index, + buffer.page_size(), + reinterpret_cast(page.data())); + TT_ASSERT(full_page_data_with_padding_size_bytes == full_padded_page_size); + switch (buffer.buffer_type()) { + case BufferType::DRAM: + WriteToDeviceDRAMChannel(device, bank_index, bank_local_address + full_page_address_offset, page); + break; + case BufferType::L1: + case BufferType::L1_SMALL: { + CoreCoord logical_core = buffer.logical_core_from_bank_id(bank_index); + WriteToDeviceL1( + device, logical_core, absolute_address + full_page_address_offset, page, CoreType::WORKER); + } break; + default: TT_THROW("Unsupported buffer type to write to device!"); + } + + if (bank_index + 1 == num_banks) { + num_round_robins += 1; + } + bank_index = (bank_index + 1) % num_banks; + data_index += buffer.page_size(); + } +} + void WriteToDeviceInterleavedContiguous(const Buffer& buffer, tt::stl::Span host_buffer) { uint32_t host_buffer_size_bytes = host_buffer.size(); TT_FATAL( @@ -469,6 +554,11 @@ void WriteToDeviceInterleavedContiguous(const Buffer& buffer, tt::stl::Spanworker_core_from_logical_core(buffer.logical_core_from_bank_id(bank_index)); - llrt::write_hex_vec_to_core(device->id(), core_coordinates, page, absolute_address); + CoreCoord logical_core = buffer.logical_core_from_bank_id(bank_index); + WriteToDeviceL1(device, logical_core, absolute_address, page, CoreType::WORKER); } break; default: TT_THROW("Unsupported buffer type to write to device!"); } From 0af5b5741da4f60619478493ee41b5ae0e1eb7cf Mon Sep 17 00:00:00 2001 From: Samarth Agarwal Date: Wed, 19 Feb 2025 15:10:36 +0000 Subject: [PATCH 13/24] Saving work --- .../api/tt-metalium/dispatch_settings.hpp | 2 +- tt_metal/impl/buffers/dispatch.cpp | 55 ++- tt_metal/impl/buffers/dispatch.hpp | 43 +- tt_metal/tt_metal.cpp | 384 ++++++++++-------- 4 files changed, 281 insertions(+), 203 deletions(-) diff --git a/tt_metal/api/tt-metalium/dispatch_settings.hpp b/tt_metal/api/tt-metalium/dispatch_settings.hpp index 56be7bb7cde..95eb63aff81 100644 --- a/tt_metal/api/tt-metalium/dispatch_settings.hpp +++ b/tt_metal/api/tt-metalium/dispatch_settings.hpp @@ -138,7 +138,7 @@ class DispatchSettings { // broken down into equal sized partial pages. UNPADDED_PARTIAL_PAGE_SIZE denotes the unpadded partial page size to // use. The size of the padded partial page is the smallest value >= UNPADDED_PARTIAL_PAGE_SIZE that is // PCIE-aligned. - static constexpr uint32_t UNPADDED_PARTIAL_PAGE_SIZE = 3072; + static constexpr uint32_t UNPADDED_PARTIAL_PAGE_SIZE = 3044; static_assert( DISPATCH_MESSAGE_ENTRIES <= diff --git a/tt_metal/impl/buffers/dispatch.cpp b/tt_metal/impl/buffers/dispatch.cpp index fdcf547e9b8..c46e10ae991 100644 --- a/tt_metal/impl/buffers/dispatch.cpp +++ b/tt_metal/impl/buffers/dispatch.cpp @@ -115,7 +115,7 @@ struct InterleavedBufferWriteLargePageDispatchParams : InterleavedBufferWriteDis tt::stl::Span expected_num_workers_completed) : InterleavedBufferWriteDispatchParams( buffer, dst_page_index, total_pages_to_write, cq_id, expected_num_workers_completed) { - this->page_size_to_write = partial_page_spec.padded_partial_page_size; + this->page_size_to_write = partial_page_spec.unpadded_partial_page_size; this->data_size_per_page_size_to_write = partial_page_spec.unpadded_partial_page_size; this->full_pages_to_write = num_full_pages; this->full_page_size = full_page_size; @@ -267,12 +267,12 @@ PartialPageSpec calculate_partial_page_spec(const Buffer& buffer) { // while (buffer.aligned_page_size() % partial_page.unpadded_partial_page_size != 0) { // partial_page.unpadded_partial_page_size += 1; // } - partial_page_spec.padded_partial_page_size = partial_page_spec.unpadded_partial_page_size; + // partial_page_spec.padded_partial_page_size = partial_page_spec.unpadded_partial_page_size; const uint32_t pcie_alignment = hal.get_alignment(HalMemType::HOST); const uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); - while (partial_page_spec.padded_partial_page_size % pcie_alignment != 0 || - partial_page_spec.padded_partial_page_size % l1_alignment != 0) { - partial_page_spec.padded_partial_page_size += 1; + while (partial_page_spec.unpadded_partial_page_size % pcie_alignment != 0 || + partial_page_spec.unpadded_partial_page_size % l1_alignment != 0) { + partial_page_spec.unpadded_partial_page_size += 1; } partial_page_spec.num_partial_pages_per_full_page = tt::div_up(buffer.aligned_page_size(), partial_page_spec.unpadded_partial_page_size); @@ -312,10 +312,10 @@ std::unique_ptr initialize_interleaved_buf // const uint32_t num_partial_pages_per_full_page = buffer.aligned_page_size() / // partial_page_spec.unpadded_partial_page_size; const uint32_t full_page_size = - partial_page_spec.num_partial_pages_per_full_page * partial_page_spec.padded_partial_page_size; + partial_page_spec.num_partial_pages_per_full_page * partial_page_spec.unpadded_partial_page_size; const uint32_t num_full_pages = total_pages_to_write; - const uint32_t padded_buffer_size = total_pages_to_write * full_page_size; - total_pages_to_write = tt::div_up(padded_buffer_size, partial_page_spec.padded_partial_page_size); + const uint32_t padded_buffer_size = total_pages_to_write * buffer.aligned_page_size(); + total_pages_to_write = num_full_pages * partial_page_spec.num_partial_pages_per_full_page; dispatch_params = std::make_unique( buffer, dst_page_index, @@ -783,23 +783,31 @@ std::unique_ptr initialize_interleaved_buf_read_dispat const bool read_large_pages = are_pages_large(buffer); if (read_large_pages) { dispatch_params = std::make_unique(); - BufferReadLargePageDispatchParams* large_page_dispatch_params = - dynamic_cast(dispatch_params.get()); - const PartialPageSpec partial_page = calculate_partial_page_spec(buffer); - large_page_dispatch_params->partial_page_spec = partial_page; - large_page_dispatch_params->padded_page_size = - partial_page.num_partial_pages_per_full_page * partial_page.padded_partial_page_size; } else { dispatch_params = std::make_unique(); - dispatch_params->padded_page_size = buffer.aligned_page_size(); } - dispatch_params->pages_per_txn = region.size / buffer.page_size(); + + dispatch_params->total_pages_to_read = region.size / buffer.page_size(); dispatch_params->src_page_index = region.offset / buffer.page_size(); dispatch_params->cq_id = cq_id; dispatch_params->device = buffer.device(); dispatch_params->address = buffer.address(); dispatch_params->unpadded_dst_offset = 0; dispatch_params->expected_num_workers_completed = expected_num_workers_completed; + dispatch_params->num_banks = buffer.device()->allocator()->get_num_banks(buffer.buffer_type()); + + if (read_large_pages) { + BufferReadLargePageDispatchParams* large_page_dispatch_params = + dynamic_cast(dispatch_params.get()); + const PartialPageSpec partial_page_spec = calculate_partial_page_spec(buffer); + large_page_dispatch_params->partial_page_spec = partial_page_spec; + dispatch_params->padded_page_size = + partial_page_spec.unpadded_partial_page_size * partial_page_spec.num_partial_pages_per_full_page; + // large_page_dispatch_params->update_params_to_be_within_bounds(buffer); + } else { + dispatch_params->padded_page_size = buffer.aligned_page_size(); + } + return dispatch_params; } @@ -939,14 +947,16 @@ void copy_interleaved_buffer_to_completion_queue( Buffer& buffer, tt::stl::Span sub_device_ids, CoreType dispatch_core_type) { - if (dispatch_params.pages_per_txn > 0) { + if (dispatch_params.total_pages_to_read > 0) { // Only 8 bits are assigned for the page offset in CQPrefetchRelayPagedCmd // To handle larger page offsets move bank base address up and update page offset to be relative to the new // bank address if (dispatch_params.src_page_index > CQ_PREFETCH_RELAY_PAGED_START_PAGE_MASK) { dispatch_params.update_params_to_be_within_bounds(buffer); } + dispatch_params.calculate_num_pages_for_read_transaction(); issue_read_buffer_dispatch_command_sequence(buffer, dispatch_params, sub_device_ids, dispatch_core_type); + dispatch_params.update_params_after_read_transaction(); } } @@ -964,7 +974,7 @@ std::shared_ptr generate_sharded_buffer_r dispatch_params.padded_page_size, dst, dispatch_params.unpadded_dst_offset, - dispatch_params.pages_per_txn, + dispatch_params.total_pages_read, initial_src_page_index, dispatch_params.starting_src_host_page_index, dispatch_params.buffer_page_mapping); @@ -983,11 +993,11 @@ std::shared_ptr generate_interleaved_buff dispatch_params->padded_page_size, dst, dispatch_params->unpadded_dst_offset, - dispatch_params->pages_per_txn, + dispatch_params->total_pages_read, dispatch_params->src_page_index, 0, nullptr, - partial_page_spec); + nullptr); } void copy_completion_queue_data_into_user_space( @@ -1046,6 +1056,9 @@ void copy_completion_queue_data_into_user_space( if (buffer_page_mapping == nullptr) { void* contiguous_dst = (void*)(uint64_t(dst) + contig_dst_offset); if (page_size == padded_page_size) { + // if (!partial_page_spec) { + + // } uint32_t data_bytes_xfered = bytes_xfered - offset_in_completion_q_data; tt::Cluster::instance().read_sysmem( contiguous_dst, @@ -1072,7 +1085,7 @@ void copy_completion_queue_data_into_user_space( } const uint32_t padded_page_size_to_read = - partial_page_spec ? partial_page_spec->padded_partial_page_size : padded_page_size; + partial_page_spec ? partial_page_spec->unpadded_partial_page_size : padded_page_size; pad_size_bytes = partial_page_spec ? padded_page_size_to_read - page_size_to_read : pad_size_bytes; // if (partial_page_spec && diff --git a/tt_metal/impl/buffers/dispatch.hpp b/tt_metal/impl/buffers/dispatch.hpp index 854c0f484df..980e8b7bde3 100644 --- a/tt_metal/impl/buffers/dispatch.hpp +++ b/tt_metal/impl/buffers/dispatch.hpp @@ -28,26 +28,57 @@ struct BufferReadDispatchParams { uint32_t unpadded_dst_offset = 0; uint32_t pages_per_txn = 0; uint32_t address = 0; + uint32_t total_pages_to_read = 0; + uint32_t total_pages_read = 0; + uint32_t num_banks = 0; virtual ~BufferReadDispatchParams() = default; - void update_params_to_be_within_bounds(const Buffer& buffer) { - const uint32_t num_banks = this->device->allocator()->get_num_banks(buffer.buffer_type()); - const uint32_t num_pages_per_bank = this->src_page_index / num_banks; + virtual void update_params_to_be_within_bounds(const Buffer& buffer) { + const uint32_t num_pages_per_bank = this->src_page_index / this->num_banks; this->address += num_pages_per_bank * this->padded_page_size; - this->src_page_index = this->src_page_index % num_banks; + this->src_page_index = this->src_page_index % this->num_banks; + } + + virtual void calculate_num_pages_for_read_transaction() { this->pages_per_txn = this->total_pages_to_read; } + + virtual void update_params_after_read_transaction() { + this->total_pages_to_read -= this->pages_per_txn; + this->total_pages_read += this->pages_per_txn; + this->src_page_index += this->pages_per_txn; } }; struct PartialPageSpec { uint32_t unpadded_partial_page_size = 0; - uint32_t padded_partial_page_size = 0; + // uint32_t padded_partial_page_size = 0; uint32_t last_partial_page_additional_padding = 0; uint32_t num_partial_pages_per_full_page = 0; }; struct BufferReadLargePageDispatchParams : BufferReadDispatchParams { PartialPageSpec partial_page_spec; + + void update_params_to_be_within_bounds(const Buffer& buffer) override { + const uint32_t num_pages_per_bank = this->src_page_index / this->num_banks; + this->address += num_pages_per_bank * (this->partial_page_spec.num_partial_pages_per_full_page * + this->partial_page_spec.unpadded_partial_page_size); + this->src_page_index = this->src_page_index % this->num_banks; + } + + // void calculate_num_pages_for_read_transaction() override { + // this->pages_per_txn = + // std::min(this->total_pages_to_read, this->num_banks - (this->src_page_index % this->num_banks)); + // } + + void update_params_after_read_transaction() override { + this->total_pages_to_read -= this->pages_per_txn; + this->total_pages_read += this->pages_per_txn; + this->address += ((this->src_page_index + this->pages_per_txn) / this->num_banks) * + (this->partial_page_spec.num_partial_pages_per_full_page * + this->partial_page_spec.unpadded_partial_page_size); + this->src_page_index = (this->src_page_index + this->pages_per_txn) % this->num_banks; + } }; struct ShardedBufferReadDispatchParams : BufferReadDispatchParams { @@ -55,7 +86,7 @@ struct ShardedBufferReadDispatchParams : BufferReadDispatchParams { uint32_t initial_pages_skipped = 0; uint32_t starting_src_host_page_index = 0; std::shared_ptr buffer_page_mapping = nullptr; - uint32_t total_pages_to_read = 0; + // uint32_t total_pages_to_read = 0; uint32_t total_pages_read = 0; uint32_t max_pages_per_shard = 0; CoreCoord core; diff --git a/tt_metal/tt_metal.cpp b/tt_metal/tt_metal.cpp index 9b8c3b0ccdb..29ce64a4520 100644 --- a/tt_metal/tt_metal.cpp +++ b/tt_metal/tt_metal.cpp @@ -32,6 +32,7 @@ #include #include #include +#include "tt_align.hpp" #include "tt_metal/impl/dispatch/dispatch_query_manager.hpp" #include "tt_metal/include/tt_metal/program.hpp" #include "tracy/Tracy.hpp" @@ -461,90 +462,97 @@ void WriteToDeviceSharded(Buffer& buffer, tt::stl::Span host_buff } } -uint32_t AddPaddingToPartialPages( - const buffer_dispatch::PartialPageSpec& partial_page_spec, - const uint8_t* partial_pages_data, - uint32_t full_unpadded_page_data_size_bytes, - uint8_t* page_with_padding_data) { - uint32_t total_num_bytes_added = 0; - uint32_t num_partial_pages_processed = 0; - uint32_t i = 0; - while (i < full_unpadded_page_data_size_bytes) { - uint32_t num_bytes_data_to_add = 0; - uint32_t num_bytes_padding_to_add = 0; - if (num_partial_pages_processed == partial_page_spec.num_partial_pages_per_full_page - 1) { - num_bytes_data_to_add = - partial_page_spec.unpadded_partial_page_size - partial_page_spec.last_partial_page_additional_padding; - num_bytes_padding_to_add = - partial_page_spec.last_partial_page_additional_padding + - (partial_page_spec.padded_partial_page_size - partial_page_spec.unpadded_partial_page_size); - } else { - num_bytes_data_to_add = partial_page_spec.unpadded_partial_page_size; - num_bytes_padding_to_add = - partial_page_spec.padded_partial_page_size - partial_page_spec.unpadded_partial_page_size; - } - - std::memcpy(page_with_padding_data + total_num_bytes_added, partial_pages_data + i, num_bytes_data_to_add); - total_num_bytes_added += num_bytes_data_to_add; - - std::memset(page_with_padding_data + total_num_bytes_added, 0, num_bytes_padding_to_add); - total_num_bytes_added += num_bytes_padding_to_add; - - i += num_bytes_data_to_add + num_bytes_padding_to_add; - num_partial_pages_processed += 1; - } - return total_num_bytes_added; -} - -void WriteToDeviceInterleavedContiguousLargePage(const Buffer& buffer, tt::stl::Span host_buffer) { - const buffer_dispatch::PartialPageSpec& partial_page_spec = buffer_dispatch::calculate_partial_page_spec(buffer); - const uint32_t full_padded_page_size = - partial_page_spec.padded_partial_page_size * partial_page_spec.num_partial_pages_per_full_page; - // const uint32_t full_unpadded_page_size = - // partial_page_spec.unpadded_partial_page_size * partial_page_spec.num_partial_pages_per_full_page; - const uint32_t num_full_pages = buffer.num_pages(); - - IDevice* device = buffer.device(); - const uint32_t num_banks = device->allocator()->get_num_banks(buffer.buffer_type()); - - uint32_t num_round_robins = 0; - uint32_t bank_index = 0; - uint32_t data_index = 0; - std::vector page; - page.resize(full_padded_page_size / sizeof(uint32_t)); - for (uint32_t page_index = 0; page_index < num_full_pages; page_index++) { - auto absolute_address = buffer.page_address(bank_index, page_index); - // Get address offset of buffer in bank. Required when writing to DRAM. - auto bank_local_address = buffer.bank_local_page_address(bank_index, page_index); - // std::memcpy(page.data(), host_buffer.data() + data_index, full_padded_page_size); - const DeviceAddr full_page_address_offset = - (num_round_robins > 0) ? (full_padded_page_size - buffer.aligned_page_size()) * num_round_robins : 0; - const uint32_t full_page_data_with_padding_size_bytes = AddPaddingToPartialPages( - partial_page_spec, - host_buffer.data() + data_index, - buffer.page_size(), - reinterpret_cast(page.data())); - TT_ASSERT(full_page_data_with_padding_size_bytes == full_padded_page_size); - switch (buffer.buffer_type()) { - case BufferType::DRAM: - WriteToDeviceDRAMChannel(device, bank_index, bank_local_address + full_page_address_offset, page); - break; - case BufferType::L1: - case BufferType::L1_SMALL: { - CoreCoord logical_core = buffer.logical_core_from_bank_id(bank_index); - WriteToDeviceL1( - device, logical_core, absolute_address + full_page_address_offset, page, CoreType::WORKER); - } break; - default: TT_THROW("Unsupported buffer type to write to device!"); - } - - if (bank_index + 1 == num_banks) { - num_round_robins += 1; - } - bank_index = (bank_index + 1) % num_banks; - data_index += buffer.page_size(); - } -} +// uint32_t AddPaddingToPartialPages( +// const buffer_dispatch::PartialPageSpec& partial_page_spec, +// const uint8_t* partial_pages_data, +// uint32_t full_unpadded_page_data_size_bytes, +// uint8_t* page_with_padding_data) { +// uint32_t total_num_bytes_added = 0; +// uint32_t total_num_bytes_data_added = 0; +// uint32_t num_partial_pages_processed = 0; +// // uint32_t i = 0; +// while (total_num_bytes_data_added < full_unpadded_page_data_size_bytes) { +// uint32_t num_bytes_data_to_add = 0; +// uint32_t num_bytes_padding_to_add = 0; +// if (num_partial_pages_processed == partial_page_spec.num_partial_pages_per_full_page - 1) { +// num_bytes_data_to_add = +// partial_page_spec.unpadded_partial_page_size - +// partial_page_spec.last_partial_page_additional_padding; +// num_bytes_padding_to_add = +// partial_page_spec.last_partial_page_additional_padding + +// (partial_page_spec.padded_partial_page_size - partial_page_spec.unpadded_partial_page_size); +// } else { +// num_bytes_data_to_add = partial_page_spec.unpadded_partial_page_size; +// num_bytes_padding_to_add = +// partial_page_spec.padded_partial_page_size - partial_page_spec.unpadded_partial_page_size; +// } + +// std::memcpy( +// page_with_padding_data + total_num_bytes_added, +// partial_pages_data + total_num_bytes_data_added, +// num_bytes_data_to_add); +// total_num_bytes_added += num_bytes_data_to_add; +// total_num_bytes_data_added += num_bytes_data_to_add; + +// std::memset(page_with_padding_data + total_num_bytes_added, 0, num_bytes_padding_to_add); +// total_num_bytes_added += num_bytes_padding_to_add; + +// // i += num_bytes_data_to_add + num_bytes_padding_to_add; +// num_partial_pages_processed += 1; +// } +// return total_num_bytes_added; +// } + +// void WriteToDeviceInterleavedContiguousLargePage(const Buffer& buffer, tt::stl::Span host_buffer) { +// const buffer_dispatch::PartialPageSpec& partial_page_spec = buffer_dispatch::calculate_partial_page_spec(buffer); +// const uint32_t full_padded_page_size = +// partial_page_spec.padded_partial_page_size * partial_page_spec.num_partial_pages_per_full_page; +// // const uint32_t full_unpadded_page_size = +// // partial_page_spec.unpadded_partial_page_size * partial_page_spec.num_partial_pages_per_full_page; +// const uint32_t num_full_pages = buffer.num_pages(); + +// IDevice* device = buffer.device(); +// const uint32_t num_banks = device->allocator()->get_num_banks(buffer.buffer_type()); + +// uint32_t num_round_robins = 0; +// uint32_t bank_index = 0; +// uint32_t data_index = 0; +// std::vector page; +// page.resize(full_padded_page_size / sizeof(uint32_t)); +// for (uint32_t page_index = 0; page_index < num_full_pages; page_index++) { +// auto absolute_address = buffer.page_address(bank_index, page_index); +// // Get address offset of buffer in bank. Required when writing to DRAM. +// auto bank_local_address = buffer.bank_local_page_address(bank_index, page_index); +// // std::memcpy(page.data(), host_buffer.data() + data_index, full_padded_page_size); +// const DeviceAddr full_page_address_offset = +// (num_round_robins > 0) ? (full_padded_page_size - buffer.aligned_page_size()) * num_round_robins : 0; +// const uint32_t full_page_data_with_padding_size_bytes = AddPaddingToPartialPages( +// partial_page_spec, +// host_buffer.data() + data_index, +// buffer.page_size(), +// reinterpret_cast(page.data())); +// TT_ASSERT(full_page_data_with_padding_size_bytes == full_padded_page_size); +// switch (buffer.buffer_type()) { +// case BufferType::DRAM: +// WriteToDeviceDRAMChannel(device, bank_index, bank_local_address + full_page_address_offset, page); +// break; +// case BufferType::L1: +// case BufferType::L1_SMALL: { +// CoreCoord logical_core = buffer.logical_core_from_bank_id(bank_index); +// //page.resize(buffer.page_size() / 4); +// WriteToDeviceL1( +// device, logical_core, absolute_address + full_page_address_offset, page, CoreType::WORKER); +// } break; +// default: TT_THROW("Unsupported buffer type to write to device!"); +// } + +// if (bank_index + 1 == num_banks) { +// num_round_robins += 1; +// } +// bank_index = (bank_index + 1) % num_banks; +// data_index += buffer.page_size(); +// } +// } void WriteToDeviceInterleavedContiguous(const Buffer& buffer, tt::stl::Span host_buffer) { uint32_t host_buffer_size_bytes = host_buffer.size(); @@ -554,24 +562,31 @@ void WriteToDeviceInterleavedContiguous(const Buffer& buffer, tt::stl::Spanallocator()->get_num_banks(buffer.buffer_type()); + uint32_t num_round_robins = 0; uint32_t bank_index = 0; int data_index = 0; std::vector page; page.resize(page_size / sizeof(uint32_t)); for (int page_index = 0; page_index < num_pages; page_index++) { - auto absolute_address = buffer.page_address(bank_index, page_index); + const DeviceAddr full_page_address_offset = + (num_round_robins > 0) ? (full_padded_page_size - buffer.aligned_page_size()) * num_round_robins : 0; + auto absolute_address = buffer.page_address(bank_index, page_index) + full_page_address_offset; // Get address offset of buffer in bank. Required when writing to DRAM. - auto bank_local_address = buffer.bank_local_page_address(bank_index, page_index); + auto bank_local_address = buffer.bank_local_page_address(bank_index, page_index) + full_page_address_offset; std::memcpy(page.data(), host_buffer.data() + data_index, page_size); switch (buffer.buffer_type()) { case BufferType::DRAM: @@ -585,6 +600,10 @@ void WriteToDeviceInterleavedContiguous(const Buffer& buffer, tt::stl::Span host_buffer) { } } -uint32_t RemovePaddingFromPartialPages( - const buffer_dispatch::PartialPageSpec& partial_page_spec, - const uint8_t* full_page_data, - uint32_t full_page_data_size_bytes, - uint8_t* page_without_padding_data) { - uint32_t total_num_bytes_copied = 0; - uint32_t num_partial_pages_processed = 0; - uint32_t i = 0; - while (i < full_page_data_size_bytes) { - uint32_t num_bytes_to_copy = 0; - if (num_partial_pages_processed == partial_page_spec.num_partial_pages_per_full_page - 1) { - num_bytes_to_copy = - partial_page_spec.unpadded_partial_page_size - partial_page_spec.last_partial_page_additional_padding; - } else { - num_bytes_to_copy = partial_page_spec.unpadded_partial_page_size; - } - std::memcpy(page_without_padding_data + total_num_bytes_copied, full_page_data + i, num_bytes_to_copy); - i += partial_page_spec.padded_partial_page_size; - num_partial_pages_processed += 1; - total_num_bytes_copied += num_bytes_to_copy; - } - return total_num_bytes_copied; -} - -void ReadFromDeviceInterleavedContiguousLargePage(const Buffer& buffer, uint8_t* host_buffer) { - const buffer_dispatch::PartialPageSpec& partial_page_spec = buffer_dispatch::calculate_partial_page_spec(buffer); - const uint32_t full_padded_page_size = - partial_page_spec.padded_partial_page_size * partial_page_spec.num_partial_pages_per_full_page; - const uint32_t num_full_pages = buffer.num_pages(); - - IDevice* device = buffer.device(); - const uint32_t num_banks = device->allocator()->get_num_banks(buffer.buffer_type()); - - uint32_t num_round_robins = 0; - uint32_t host_idx = 0; - uint32_t bank_index = 0; - std::vector page; - page.resize(full_padded_page_size / sizeof(uint32_t)); - for (int page_index = 0; page_index < num_full_pages; page_index++) { - const DeviceAddr full_page_address_offset = - (num_round_robins > 0) ? (full_padded_page_size - buffer.aligned_page_size()) * num_round_robins : 0; - page.clear(); - switch (buffer.buffer_type()) { - case BufferType::DRAM: - case BufferType::TRACE: { - const DeviceAddr page_address = - buffer.bank_local_page_address(bank_index, page_index) + full_page_address_offset; - ReadFromDeviceDRAMChannel(device, bank_index, page_address, full_padded_page_size, page); - } break; - case BufferType::L1: - case BufferType::L1_SMALL: { - const DeviceAddr page_address = buffer.page_address(bank_index, page_index) + full_page_address_offset; - auto core_coordinates = - device->worker_core_from_logical_core(buffer.logical_core_from_bank_id(bank_index)); - tt::Cluster::instance().read_core( - page.data(), full_padded_page_size, tt_cxy_pair(device->id(), core_coordinates), page_address); - } break; - default: TT_THROW("Unsupported buffer type to read from device!"); - } - - uint8_t* full_page_data_without_padding = new uint8_t[full_padded_page_size]; - const uint32_t full_page_data_without_padding_size_bytes = RemovePaddingFromPartialPages( - partial_page_spec, - reinterpret_cast(page.data()), - full_padded_page_size, - full_page_data_without_padding); - TT_ASSERT(full_page_data_without_padding_size_bytes == buffer.page_size()); - - // Copy page into host buffer - std::memcpy(host_buffer + host_idx, full_page_data_without_padding, full_page_data_without_padding_size_bytes); - delete[] full_page_data_without_padding; - - host_idx += full_page_data_without_padding_size_bytes; - if (bank_index + 1 == num_banks) { - num_round_robins += 1; - } - bank_index = (bank_index + 1) % num_banks; - } -} +// uint32_t RemovePaddingFromPartialPages( +// const buffer_dispatch::PartialPageSpec& partial_page_spec, +// const uint8_t* full_page_data, +// uint32_t full_page_data_size_bytes, +// uint8_t* page_without_padding_data) { +// uint32_t total_num_bytes_copied = 0; +// uint32_t num_partial_pages_processed = 0; +// uint32_t i = 0; +// while (i < full_page_data_size_bytes) { +// uint32_t num_bytes_to_copy = 0; +// if (num_partial_pages_processed == partial_page_spec.num_partial_pages_per_full_page - 1) { +// num_bytes_to_copy = +// partial_page_spec.unpadded_partial_page_size - +// partial_page_spec.last_partial_page_additional_padding; +// } else { +// num_bytes_to_copy = partial_page_spec.unpadded_partial_page_size; +// } +// std::memcpy(page_without_padding_data + total_num_bytes_copied, full_page_data + i, num_bytes_to_copy); +// i += partial_page_spec.padded_partial_page_size; +// num_partial_pages_processed += 1; +// total_num_bytes_copied += num_bytes_to_copy; +// } +// return total_num_bytes_copied; +// } + +// void ReadFromDeviceInterleavedContiguousLargePage(const Buffer& buffer, uint8_t* host_buffer) { +// const buffer_dispatch::PartialPageSpec& partial_page_spec = buffer_dispatch::calculate_partial_page_spec(buffer); +// const uint32_t full_padded_page_size = +// partial_page_spec.padded_partial_page_size * partial_page_spec.num_partial_pages_per_full_page; +// const uint32_t num_full_pages = buffer.num_pages(); + +// IDevice* device = buffer.device(); +// const uint32_t num_banks = device->allocator()->get_num_banks(buffer.buffer_type()); + +// uint32_t num_round_robins = 0; +// uint32_t host_idx = 0; +// uint32_t bank_index = 0; +// std::vector page; +// page.resize(full_padded_page_size / sizeof(uint32_t)); +// for (int page_index = 0; page_index < num_full_pages; page_index++) { +// const DeviceAddr full_page_address_offset = +// (num_round_robins > 0) ? (full_padded_page_size - buffer.aligned_page_size()) * num_round_robins : 0; +// page.clear(); +// switch (buffer.buffer_type()) { +// case BufferType::DRAM: +// case BufferType::TRACE: { +// const DeviceAddr page_address = +// buffer.bank_local_page_address(bank_index, page_index) + full_page_address_offset; +// ReadFromDeviceDRAMChannel(device, bank_index, page_address, full_padded_page_size, page); +// } break; +// case BufferType::L1: +// case BufferType::L1_SMALL: { +// const DeviceAddr page_address = buffer.page_address(bank_index, page_index) + +// full_page_address_offset; auto core_coordinates = +// device->worker_core_from_logical_core(buffer.logical_core_from_bank_id(bank_index)); +// auto l1_size = tt::tt_metal::hal.get_dev_size(tt::tt_metal::HalProgrammableCoreType::TENSIX, +// tt::tt_metal::HalL1MemAddrType::BASE); auto l1_base = +// tt::tt_metal::hal.get_dev_addr(tt::tt_metal::HalProgrammableCoreType::TENSIX, +// tt::tt_metal::HalL1MemAddrType::BASE); tt::Cluster::instance().read_core( +// page.data(), full_padded_page_size, tt_cxy_pair(device->id(), core_coordinates), page_address); +// } break; +// default: TT_THROW("Unsupported buffer type to read from device!"); +// } + +// uint8_t* full_page_data_without_padding = new uint8_t[full_padded_page_size]; +// const uint32_t full_page_data_without_padding_size_bytes = RemovePaddingFromPartialPages( +// partial_page_spec, +// reinterpret_cast(page.data()), +// full_padded_page_size, +// full_page_data_without_padding); +// TT_ASSERT(full_page_data_without_padding_size_bytes == buffer.page_size()); + +// // Copy page into host buffer +// std::memcpy(host_buffer + host_idx, full_page_data_without_padding, +// full_page_data_without_padding_size_bytes); delete[] full_page_data_without_padding; + +// host_idx += full_page_data_without_padding_size_bytes; +// if (bank_index + 1 == num_banks) { +// num_round_robins += 1; +// } +// bank_index = (bank_index + 1) % num_banks; +// } +// } void ReadFromDeviceInterleavedContiguous(const Buffer& buffer, uint8_t* host_buffer) { - if (buffer_dispatch::are_pages_large(buffer)) { - ReadFromDeviceInterleavedContiguousLargePage(buffer, host_buffer); - return; - } + // if (buffer_dispatch::are_pages_large(buffer)) { + // ReadFromDeviceInterleavedContiguousLargePage(buffer, host_buffer); + // return; + // } uint32_t page_size = buffer.page_size(); uint32_t num_pages = buffer.num_pages(); + const buffer_dispatch::PartialPageSpec& partial_page_spec = buffer_dispatch::calculate_partial_page_spec(buffer); + const uint32_t full_padded_page_size = + partial_page_spec.unpadded_partial_page_size * partial_page_spec.num_partial_pages_per_full_page; + auto device = buffer.device(); auto num_banks = device->allocator()->get_num_banks(buffer.buffer_type()); + uint32_t num_round_robins = 0; size_t host_idx = 0; uint32_t bank_index = 0; std::vector page; page.resize(page_size / sizeof(uint32_t)); for (int page_index = 0; page_index < num_pages; page_index++) { - auto absolute_address = buffer.page_address(bank_index, page_index); + const DeviceAddr full_page_address_offset = + (num_round_robins > 0) ? (full_padded_page_size - buffer.aligned_page_size()) * num_round_robins : 0; + auto absolute_address = buffer.page_address(bank_index, page_index) + full_page_address_offset; // Get address offset of buffer in bank. Required when reading from DRAM. - auto bank_local_address = buffer.bank_local_page_address(bank_index, page_index); + auto bank_local_address = buffer.bank_local_page_address(bank_index, page_index) + full_page_address_offset; page.clear(); switch (buffer.buffer_type()) { case BufferType::DRAM: @@ -736,6 +766,10 @@ void ReadFromDeviceInterleavedContiguous(const Buffer& buffer, uint8_t* host_buf std::memcpy(host_buffer + host_idx, page.data(), page_size); host_idx += page_size; + if (bank_index + 1 == num_banks) { + num_round_robins += 1; + } + bank_index = (bank_index + 1) % num_banks; } } From 840e87b176b4b67c7bbb6f2e95af32b7c88c9e56 Mon Sep 17 00:00:00 2001 From: Samarth Agarwal Date: Wed, 19 Feb 2025 20:50:23 +0000 Subject: [PATCH 14/24] Saving work --- ...queueWriteBuffer_and_EnqueueReadBuffer.cpp | 4 +- .../api/tt-metalium/dispatch_settings.hpp | 8 +- tt_metal/impl/buffers/dispatch.cpp | 76 +++--- tt_metal/impl/buffers/dispatch.hpp | 8 +- tt_metal/tt_metal.cpp | 244 +++--------------- 5 files changed, 76 insertions(+), 264 deletions(-) diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp index c08da28327b..dc6b6df44c0 100644 --- a/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp @@ -259,8 +259,8 @@ void test_EnqueueWriteBuffer_and_EnqueueReadBuffer(IDevice* device, CommandQueue for (uint32_t i = 0; i < result.size(); i++) { if (i != result[i]) { std::cout << "i: " << std::to_string(i) << " result[i]: " << std::to_string(result[i]) << std::endl; - // std::cout << "Fail" << std::endl; - // break; + std::cout << "Fail" << std::endl; + break; } } diff --git a/tt_metal/api/tt-metalium/dispatch_settings.hpp b/tt_metal/api/tt-metalium/dispatch_settings.hpp index 95eb63aff81..731d9a3e595 100644 --- a/tt_metal/api/tt-metalium/dispatch_settings.hpp +++ b/tt_metal/api/tt-metalium/dispatch_settings.hpp @@ -135,10 +135,10 @@ class DispatchSettings { static constexpr uint32_t EVENT_PADDED_SIZE = 16; // When page size of buffer to write/read exceeds the max prefetch command size, the PCIe-aligned page size is - // broken down into equal sized partial pages. UNPADDED_PARTIAL_PAGE_SIZE denotes the unpadded partial page size to - // use. The size of the padded partial page is the smallest value >= UNPADDED_PARTIAL_PAGE_SIZE that is - // PCIE-aligned. - static constexpr uint32_t UNPADDED_PARTIAL_PAGE_SIZE = 3044; + // broken down into equal sized partial pages. BASE_PARTIAL_PAGE_SIZE is incremented until the partial page size + // is PCIE-aligned. If the resulting partial page size doesn't evenly divide the full page size, the last partial + // page size is padded appropriately. + static constexpr uint32_t BASE_PARTIAL_PAGE_SIZE = 4096; static_assert( DISPATCH_MESSAGE_ENTRIES <= diff --git a/tt_metal/impl/buffers/dispatch.cpp b/tt_metal/impl/buffers/dispatch.cpp index c46e10ae991..9ad8b90a342 100644 --- a/tt_metal/impl/buffers/dispatch.cpp +++ b/tt_metal/impl/buffers/dispatch.cpp @@ -7,8 +7,6 @@ #include "assert.hpp" #include "math.hpp" #include "dispatch.hpp" -#include -#include #include #include @@ -261,19 +259,25 @@ ShardedBufferWriteDispatchParams initialize_sharded_buf_dispatch_params( return dispatch_params; } -PartialPageSpec calculate_partial_page_spec(const Buffer& buffer) { - PartialPageSpec partial_page_spec; - partial_page_spec.unpadded_partial_page_size = DispatchSettings::UNPADDED_PARTIAL_PAGE_SIZE; - // while (buffer.aligned_page_size() % partial_page.unpadded_partial_page_size != 0) { - // partial_page.unpadded_partial_page_size += 1; - // } - // partial_page_spec.padded_partial_page_size = partial_page_spec.unpadded_partial_page_size; +uint32_t calculate_partial_page_size(const Buffer& buffer) { + uint32_t partial_page_size = DispatchSettings::BASE_PARTIAL_PAGE_SIZE; const uint32_t pcie_alignment = hal.get_alignment(HalMemType::HOST); - const uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); - while (partial_page_spec.unpadded_partial_page_size % pcie_alignment != 0 || - partial_page_spec.unpadded_partial_page_size % l1_alignment != 0) { - partial_page_spec.unpadded_partial_page_size += 1; + uint32_t mem_alignment = 0; + if (buffer.is_dram()) { + mem_alignment = hal.get_alignment(HalMemType::DRAM); + } else { + TT_ASSERT(buffer.is_l1()); + mem_alignment = hal.get_alignment(HalMemType::L1); + } + while (partial_page_size % pcie_alignment != 0 || partial_page_size % mem_alignment != 0) { + partial_page_size += 1; } + return partial_page_size; +} + +PartialPageSpec calculate_partial_page_spec(const Buffer& buffer) { + PartialPageSpec partial_page_spec; + partial_page_spec.unpadded_partial_page_size = calculate_partial_page_size(buffer); partial_page_spec.num_partial_pages_per_full_page = tt::div_up(buffer.aligned_page_size(), partial_page_spec.unpadded_partial_page_size); // partial_page.last_partial_page_additional_padding = buffer.aligned_page_size() - buffer.page_size(); @@ -974,7 +978,7 @@ std::shared_ptr generate_sharded_buffer_r dispatch_params.padded_page_size, dst, dispatch_params.unpadded_dst_offset, - dispatch_params.total_pages_read, + dispatch_params.pages_per_txn, initial_src_page_index, dispatch_params.starting_src_host_page_index, dispatch_params.buffer_page_mapping); @@ -994,10 +998,7 @@ std::shared_ptr generate_interleaved_buff dst, dispatch_params->unpadded_dst_offset, dispatch_params->total_pages_read, - dispatch_params->src_page_index, - 0, - nullptr, - nullptr); + dispatch_params->src_page_index); } void copy_completion_queue_data_into_user_space( @@ -1007,13 +1008,14 @@ void copy_completion_queue_data_into_user_space( uint32_t cq_id, SystemMemoryManager& sysmem_manager, volatile bool& exit_condition) { - const auto& [buffer_layout, page_size, padded_page_size, buffer_page_mapping, dst, dst_offset, num_pages_read, cur_dev_page_id, starting_host_page_id, partial_page_spec] = + const auto& [buffer_layout, page_size, padded_page_size, buffer_page_mapping, dst, dst_offset, num_pages_read, cur_dev_page_id, starting_host_page_id] = read_buffer_descriptor; const uint32_t padded_num_bytes = (num_pages_read * padded_page_size) + sizeof(CQDispatchCmd); uint32_t contig_dst_offset = dst_offset; uint32_t remaining_bytes_to_read = padded_num_bytes; uint32_t total_num_bytes_read = 0; uint32_t dev_page_id = cur_dev_page_id; + PartialPageSpec* partial_page_spec = nullptr; // track the amount of bytes read in the last non-aligned page uint32_t remaining_bytes_of_nonaligned_page = 0; @@ -1023,7 +1025,7 @@ void copy_completion_queue_data_into_user_space( uint32_t pad_size_bytes = padded_page_size - page_size; bool trigger = false; - std::cout << "buffer page size: " << page_size << std::endl; + // std::cout << "buffer page size: " << page_size << std::endl; while (remaining_bytes_to_read != 0) { uint32_t completion_queue_write_ptr_and_toggle = @@ -1138,30 +1140,30 @@ void copy_completion_queue_data_into_user_space( // There is more data after padding if (rem_bytes_in_cq >= pad_size_bytes) { src_offset_increment += pad_size_bytes; - if (trigger) { - std::cout << "rem_bytes_in_cq >= pad_size_bytes " << rem_bytes_in_cq << " " - << pad_size_bytes << " " << src_offset_increment << std::endl; - } + // if (trigger) { + // std::cout << "rem_bytes_in_cq >= pad_size_bytes " << rem_bytes_in_cq << " " + // << pad_size_bytes << " " << src_offset_increment << std::endl; + // } // Only pad data left in queue } else { offset_in_completion_q_data = pad_size_bytes - rem_bytes_in_cq; - if (trigger) { - std::cout << "rem_bytes_in_cq < pad_size_bytes " << rem_bytes_in_cq << " " - << pad_size_bytes << " " << src_offset_increment << " " - << offset_in_completion_q_data << std::endl; - } + // if (trigger) { + // std::cout << "rem_bytes_in_cq < pad_size_bytes " << rem_bytes_in_cq << " " + // << pad_size_bytes << " " << src_offset_increment << " " + // << offset_in_completion_q_data << std::endl; + // } } } - if (trigger) { - std::cout << num_bytes_remaining << " " << num_bytes_to_copy << " " - << remaining_bytes_of_nonaligned_page << " " << src_offset_increment << " " - << offset_in_completion_q_data << std::endl; - } + // if (trigger) { + // std::cout << num_bytes_remaining << " " << num_bytes_to_copy << " " + // << remaining_bytes_of_nonaligned_page << " " << src_offset_increment << " " + // << offset_in_completion_q_data << std::endl; + // } trigger = false; } else if (src_offset_bytes + padded_page_size_to_read >= bytes_xfered) { // Case 2: Last page of data that was popped off the completion queue // Don't need to compute src_offset_increment since this is end of loop - std::cout << "Total num bytes read: " << total_num_bytes_read << std::endl; + // std::cout << "Total num bytes read: " << total_num_bytes_read << std::endl; uint32_t num_bytes_remaining = bytes_xfered - src_offset_bytes; num_bytes_to_copy = std::min(num_bytes_remaining, page_size_to_read); remaining_bytes_of_nonaligned_page = page_size_to_read - num_bytes_to_copy; @@ -1193,8 +1195,8 @@ void copy_completion_queue_data_into_user_space( mmio_device_id, channel); - std::cout << "num bytes to copy " << num_bytes_to_copy << std::endl; - std::cout << "src offset increment" << src_offset_increment << std::endl; + // std::cout << "num bytes to copy " << num_bytes_to_copy << std::endl; + // std::cout << "src offset increment" << src_offset_increment << std::endl; total_num_bytes_read += num_bytes_to_copy; // if (total_num_bytes_read == page_size) { diff --git a/tt_metal/impl/buffers/dispatch.hpp b/tt_metal/impl/buffers/dispatch.hpp index 980e8b7bde3..ad1257d4b40 100644 --- a/tt_metal/impl/buffers/dispatch.hpp +++ b/tt_metal/impl/buffers/dispatch.hpp @@ -8,7 +8,6 @@ #include #include #include "buffer.hpp" -#include "device.hpp" #include "tt_metal/impl/event/dispatch.hpp" namespace tt::tt_metal { @@ -163,7 +162,6 @@ struct ReadBufferDescriptor { uint32_t num_pages_read; uint32_t cur_dev_page_id; uint32_t starting_host_page_id; - buffer_dispatch::PartialPageSpec* partial_page; ReadBufferDescriptor( TensorMemoryLayout buffer_layout, @@ -174,8 +172,7 @@ struct ReadBufferDescriptor { uint32_t num_pages_read, uint32_t cur_dev_page_id, uint32_t starting_host_page_id = 0, - const std::shared_ptr& buffer_page_mapping = nullptr, - buffer_dispatch::PartialPageSpec* partial_page = nullptr) : + const std::shared_ptr& buffer_page_mapping = nullptr) : buffer_layout(buffer_layout), page_size(page_size), padded_page_size(padded_page_size), @@ -184,8 +181,7 @@ struct ReadBufferDescriptor { dst_offset(dst_offset), num_pages_read(num_pages_read), cur_dev_page_id(cur_dev_page_id), - starting_host_page_id(starting_host_page_id), - partial_page(partial_page) {} + starting_host_page_id(starting_host_page_id) {} }; } // namespace tt::tt_metal diff --git a/tt_metal/tt_metal.cpp b/tt_metal/tt_metal.cpp index 29ce64a4520..c1c164967f4 100644 --- a/tt_metal/tt_metal.cpp +++ b/tt_metal/tt_metal.cpp @@ -2,7 +2,6 @@ // // SPDX-License-Identifier: Apache-2.0 -#include #include #include @@ -32,7 +31,6 @@ #include #include #include -#include "tt_align.hpp" #include "tt_metal/impl/dispatch/dispatch_query_manager.hpp" #include "tt_metal/include/tt_metal/program.hpp" #include "tracy/Tracy.hpp" @@ -41,8 +39,6 @@ #include "lightmetal/host_api_capture_helpers.hpp" #include "llrt.hpp" -#include "umd/device/tt_core_coordinates.h" - namespace tt { namespace tt_metal { @@ -462,97 +458,27 @@ void WriteToDeviceSharded(Buffer& buffer, tt::stl::Span host_buff } } -// uint32_t AddPaddingToPartialPages( -// const buffer_dispatch::PartialPageSpec& partial_page_spec, -// const uint8_t* partial_pages_data, -// uint32_t full_unpadded_page_data_size_bytes, -// uint8_t* page_with_padding_data) { -// uint32_t total_num_bytes_added = 0; -// uint32_t total_num_bytes_data_added = 0; -// uint32_t num_partial_pages_processed = 0; -// // uint32_t i = 0; -// while (total_num_bytes_data_added < full_unpadded_page_data_size_bytes) { -// uint32_t num_bytes_data_to_add = 0; -// uint32_t num_bytes_padding_to_add = 0; -// if (num_partial_pages_processed == partial_page_spec.num_partial_pages_per_full_page - 1) { -// num_bytes_data_to_add = -// partial_page_spec.unpadded_partial_page_size - -// partial_page_spec.last_partial_page_additional_padding; -// num_bytes_padding_to_add = -// partial_page_spec.last_partial_page_additional_padding + -// (partial_page_spec.padded_partial_page_size - partial_page_spec.unpadded_partial_page_size); -// } else { -// num_bytes_data_to_add = partial_page_spec.unpadded_partial_page_size; -// num_bytes_padding_to_add = -// partial_page_spec.padded_partial_page_size - partial_page_spec.unpadded_partial_page_size; -// } - -// std::memcpy( -// page_with_padding_data + total_num_bytes_added, -// partial_pages_data + total_num_bytes_data_added, -// num_bytes_data_to_add); -// total_num_bytes_added += num_bytes_data_to_add; -// total_num_bytes_data_added += num_bytes_data_to_add; - -// std::memset(page_with_padding_data + total_num_bytes_added, 0, num_bytes_padding_to_add); -// total_num_bytes_added += num_bytes_padding_to_add; - -// // i += num_bytes_data_to_add + num_bytes_padding_to_add; -// num_partial_pages_processed += 1; -// } -// return total_num_bytes_added; -// } - -// void WriteToDeviceInterleavedContiguousLargePage(const Buffer& buffer, tt::stl::Span host_buffer) { -// const buffer_dispatch::PartialPageSpec& partial_page_spec = buffer_dispatch::calculate_partial_page_spec(buffer); -// const uint32_t full_padded_page_size = -// partial_page_spec.padded_partial_page_size * partial_page_spec.num_partial_pages_per_full_page; -// // const uint32_t full_unpadded_page_size = -// // partial_page_spec.unpadded_partial_page_size * partial_page_spec.num_partial_pages_per_full_page; -// const uint32_t num_full_pages = buffer.num_pages(); - -// IDevice* device = buffer.device(); -// const uint32_t num_banks = device->allocator()->get_num_banks(buffer.buffer_type()); - -// uint32_t num_round_robins = 0; -// uint32_t bank_index = 0; -// uint32_t data_index = 0; -// std::vector page; -// page.resize(full_padded_page_size / sizeof(uint32_t)); -// for (uint32_t page_index = 0; page_index < num_full_pages; page_index++) { -// auto absolute_address = buffer.page_address(bank_index, page_index); -// // Get address offset of buffer in bank. Required when writing to DRAM. -// auto bank_local_address = buffer.bank_local_page_address(bank_index, page_index); -// // std::memcpy(page.data(), host_buffer.data() + data_index, full_padded_page_size); -// const DeviceAddr full_page_address_offset = -// (num_round_robins > 0) ? (full_padded_page_size - buffer.aligned_page_size()) * num_round_robins : 0; -// const uint32_t full_page_data_with_padding_size_bytes = AddPaddingToPartialPages( -// partial_page_spec, -// host_buffer.data() + data_index, -// buffer.page_size(), -// reinterpret_cast(page.data())); -// TT_ASSERT(full_page_data_with_padding_size_bytes == full_padded_page_size); -// switch (buffer.buffer_type()) { -// case BufferType::DRAM: -// WriteToDeviceDRAMChannel(device, bank_index, bank_local_address + full_page_address_offset, page); -// break; -// case BufferType::L1: -// case BufferType::L1_SMALL: { -// CoreCoord logical_core = buffer.logical_core_from_bank_id(bank_index); -// //page.resize(buffer.page_size() / 4); -// WriteToDeviceL1( -// device, logical_core, absolute_address + full_page_address_offset, page, CoreType::WORKER); -// } break; -// default: TT_THROW("Unsupported buffer type to write to device!"); -// } - -// if (bank_index + 1 == num_banks) { -// num_round_robins += 1; -// } -// bank_index = (bank_index + 1) % num_banks; -// data_index += buffer.page_size(); -// } -// } +DeviceAddr CalculateAddressDeviceInterleavedContiguous( + const Buffer& buffer, uint32_t bank_index, uint32_t page_index, uint32_t num_round_robins) { + DeviceAddr addr = 0; + if (buffer.is_dram()) { + addr = buffer.bank_local_page_address(bank_index, page_index); + } else { + TT_ASSERT(buffer.is_l1()); + addr = buffer.page_address(bank_index, page_index); + } + + if (buffer_dispatch::are_pages_large(buffer)) { + const buffer_dispatch::PartialPageSpec& partial_page_spec = + buffer_dispatch::calculate_partial_page_spec(buffer); + const uint32_t full_padded_page_size = + partial_page_spec.unpadded_partial_page_size * partial_page_spec.num_partial_pages_per_full_page; + const DeviceAddr full_page_address_offset = + (num_round_robins > 0) ? (full_padded_page_size - buffer.aligned_page_size()) * num_round_robins : 0; + addr += full_page_address_offset; + } + return addr; +} void WriteToDeviceInterleavedContiguous(const Buffer& buffer, tt::stl::Span host_buffer) { uint32_t host_buffer_size_bytes = host_buffer.size(); @@ -562,18 +488,9 @@ void WriteToDeviceInterleavedContiguous(const Buffer& buffer, tt::stl::Spanallocator()->get_num_banks(buffer.buffer_type()); uint32_t num_round_robins = 0; @@ -582,20 +499,15 @@ void WriteToDeviceInterleavedContiguous(const Buffer& buffer, tt::stl::Span page; page.resize(page_size / sizeof(uint32_t)); for (int page_index = 0; page_index < num_pages; page_index++) { - const DeviceAddr full_page_address_offset = - (num_round_robins > 0) ? (full_padded_page_size - buffer.aligned_page_size()) * num_round_robins : 0; - auto absolute_address = buffer.page_address(bank_index, page_index) + full_page_address_offset; - // Get address offset of buffer in bank. Required when writing to DRAM. - auto bank_local_address = buffer.bank_local_page_address(bank_index, page_index) + full_page_address_offset; + const DeviceAddr address = + CalculateAddressDeviceInterleavedContiguous(buffer, bank_index, page_index, num_round_robins); std::memcpy(page.data(), host_buffer.data() + data_index, page_size); switch (buffer.buffer_type()) { - case BufferType::DRAM: - WriteToDeviceDRAMChannel(device, bank_index, bank_local_address, page); - break; + case BufferType::DRAM: WriteToDeviceDRAMChannel(device, bank_index, address, page); break; case BufferType::L1: case BufferType::L1_SMALL: { CoreCoord logical_core = buffer.logical_core_from_bank_id(bank_index); - WriteToDeviceL1(device, logical_core, absolute_address, page, CoreType::WORKER); + WriteToDeviceL1(device, logical_core, address, page, CoreType::WORKER); } break; default: TT_THROW("Unsupported buffer type to write to device!"); } @@ -635,103 +547,10 @@ void WriteToBuffer(Buffer& buffer, tt::stl::Span host_buffer) { } } -// uint32_t RemovePaddingFromPartialPages( -// const buffer_dispatch::PartialPageSpec& partial_page_spec, -// const uint8_t* full_page_data, -// uint32_t full_page_data_size_bytes, -// uint8_t* page_without_padding_data) { -// uint32_t total_num_bytes_copied = 0; -// uint32_t num_partial_pages_processed = 0; -// uint32_t i = 0; -// while (i < full_page_data_size_bytes) { -// uint32_t num_bytes_to_copy = 0; -// if (num_partial_pages_processed == partial_page_spec.num_partial_pages_per_full_page - 1) { -// num_bytes_to_copy = -// partial_page_spec.unpadded_partial_page_size - -// partial_page_spec.last_partial_page_additional_padding; -// } else { -// num_bytes_to_copy = partial_page_spec.unpadded_partial_page_size; -// } -// std::memcpy(page_without_padding_data + total_num_bytes_copied, full_page_data + i, num_bytes_to_copy); -// i += partial_page_spec.padded_partial_page_size; -// num_partial_pages_processed += 1; -// total_num_bytes_copied += num_bytes_to_copy; -// } -// return total_num_bytes_copied; -// } - -// void ReadFromDeviceInterleavedContiguousLargePage(const Buffer& buffer, uint8_t* host_buffer) { -// const buffer_dispatch::PartialPageSpec& partial_page_spec = buffer_dispatch::calculate_partial_page_spec(buffer); -// const uint32_t full_padded_page_size = -// partial_page_spec.padded_partial_page_size * partial_page_spec.num_partial_pages_per_full_page; -// const uint32_t num_full_pages = buffer.num_pages(); - -// IDevice* device = buffer.device(); -// const uint32_t num_banks = device->allocator()->get_num_banks(buffer.buffer_type()); - -// uint32_t num_round_robins = 0; -// uint32_t host_idx = 0; -// uint32_t bank_index = 0; -// std::vector page; -// page.resize(full_padded_page_size / sizeof(uint32_t)); -// for (int page_index = 0; page_index < num_full_pages; page_index++) { -// const DeviceAddr full_page_address_offset = -// (num_round_robins > 0) ? (full_padded_page_size - buffer.aligned_page_size()) * num_round_robins : 0; -// page.clear(); -// switch (buffer.buffer_type()) { -// case BufferType::DRAM: -// case BufferType::TRACE: { -// const DeviceAddr page_address = -// buffer.bank_local_page_address(bank_index, page_index) + full_page_address_offset; -// ReadFromDeviceDRAMChannel(device, bank_index, page_address, full_padded_page_size, page); -// } break; -// case BufferType::L1: -// case BufferType::L1_SMALL: { -// const DeviceAddr page_address = buffer.page_address(bank_index, page_index) + -// full_page_address_offset; auto core_coordinates = -// device->worker_core_from_logical_core(buffer.logical_core_from_bank_id(bank_index)); -// auto l1_size = tt::tt_metal::hal.get_dev_size(tt::tt_metal::HalProgrammableCoreType::TENSIX, -// tt::tt_metal::HalL1MemAddrType::BASE); auto l1_base = -// tt::tt_metal::hal.get_dev_addr(tt::tt_metal::HalProgrammableCoreType::TENSIX, -// tt::tt_metal::HalL1MemAddrType::BASE); tt::Cluster::instance().read_core( -// page.data(), full_padded_page_size, tt_cxy_pair(device->id(), core_coordinates), page_address); -// } break; -// default: TT_THROW("Unsupported buffer type to read from device!"); -// } - -// uint8_t* full_page_data_without_padding = new uint8_t[full_padded_page_size]; -// const uint32_t full_page_data_without_padding_size_bytes = RemovePaddingFromPartialPages( -// partial_page_spec, -// reinterpret_cast(page.data()), -// full_padded_page_size, -// full_page_data_without_padding); -// TT_ASSERT(full_page_data_without_padding_size_bytes == buffer.page_size()); - -// // Copy page into host buffer -// std::memcpy(host_buffer + host_idx, full_page_data_without_padding, -// full_page_data_without_padding_size_bytes); delete[] full_page_data_without_padding; - -// host_idx += full_page_data_without_padding_size_bytes; -// if (bank_index + 1 == num_banks) { -// num_round_robins += 1; -// } -// bank_index = (bank_index + 1) % num_banks; -// } -// } - void ReadFromDeviceInterleavedContiguous(const Buffer& buffer, uint8_t* host_buffer) { - // if (buffer_dispatch::are_pages_large(buffer)) { - // ReadFromDeviceInterleavedContiguousLargePage(buffer, host_buffer); - // return; - // } - uint32_t page_size = buffer.page_size(); uint32_t num_pages = buffer.num_pages(); - const buffer_dispatch::PartialPageSpec& partial_page_spec = buffer_dispatch::calculate_partial_page_spec(buffer); - const uint32_t full_padded_page_size = - partial_page_spec.unpadded_partial_page_size * partial_page_spec.num_partial_pages_per_full_page; - auto device = buffer.device(); auto num_banks = device->allocator()->get_num_banks(buffer.buffer_type()); @@ -741,23 +560,18 @@ void ReadFromDeviceInterleavedContiguous(const Buffer& buffer, uint8_t* host_buf std::vector page; page.resize(page_size / sizeof(uint32_t)); for (int page_index = 0; page_index < num_pages; page_index++) { - const DeviceAddr full_page_address_offset = - (num_round_robins > 0) ? (full_padded_page_size - buffer.aligned_page_size()) * num_round_robins : 0; - auto absolute_address = buffer.page_address(bank_index, page_index) + full_page_address_offset; - // Get address offset of buffer in bank. Required when reading from DRAM. - auto bank_local_address = buffer.bank_local_page_address(bank_index, page_index) + full_page_address_offset; + const DeviceAddr address = + CalculateAddressDeviceInterleavedContiguous(buffer, bank_index, page_index, num_round_robins); page.clear(); switch (buffer.buffer_type()) { case BufferType::DRAM: - case BufferType::TRACE: - ReadFromDeviceDRAMChannel(device, bank_index, bank_local_address, page_size, page); - break; + case BufferType::TRACE: ReadFromDeviceDRAMChannel(device, bank_index, address, page_size, page); break; case BufferType::L1: case BufferType::L1_SMALL: { auto core_coordinates = device->worker_core_from_logical_core(buffer.logical_core_from_bank_id(bank_index)); tt::Cluster::instance().read_core( - page.data(), page_size, tt_cxy_pair(device->id(), core_coordinates), absolute_address); + page.data(), page_size, tt_cxy_pair(device->id(), core_coordinates), address); } break; default: TT_THROW("Unsupported buffer type to read from device!"); } From 5dfd47e4533e5fd03aeb30c177a4c697eb3bed90 Mon Sep 17 00:00:00 2001 From: Samarth Agarwal Date: Wed, 19 Feb 2025 21:38:51 +0000 Subject: [PATCH 15/24] Cleanup --- tt_metal/impl/buffers/dispatch.cpp | 147 ++++------------------------- tt_metal/impl/buffers/dispatch.hpp | 18 ++-- tt_metal/tt_metal.cpp | 7 +- 3 files changed, 26 insertions(+), 146 deletions(-) diff --git a/tt_metal/impl/buffers/dispatch.cpp b/tt_metal/impl/buffers/dispatch.cpp index 9ad8b90a342..b00e0a4d961 100644 --- a/tt_metal/impl/buffers/dispatch.cpp +++ b/tt_metal/impl/buffers/dispatch.cpp @@ -5,7 +5,6 @@ #include #include #include "assert.hpp" -#include "math.hpp" #include "dispatch.hpp" #include #include @@ -98,7 +97,7 @@ struct InterleavedBufferWriteDispatchParams : BufferWriteDispatchParams { virtual uint32_t num_partial_pages_per_full_page() const { return 1; } - virtual uint32_t get_additional_padding_for_last_partial_page() const { return 0; } + virtual uint32_t get_padding_for_last_partial_page() const { return 0; } }; struct InterleavedBufferWriteLargePageDispatchParams : InterleavedBufferWriteDispatchParams { @@ -113,12 +112,12 @@ struct InterleavedBufferWriteLargePageDispatchParams : InterleavedBufferWriteDis tt::stl::Span expected_num_workers_completed) : InterleavedBufferWriteDispatchParams( buffer, dst_page_index, total_pages_to_write, cq_id, expected_num_workers_completed) { - this->page_size_to_write = partial_page_spec.unpadded_partial_page_size; - this->data_size_per_page_size_to_write = partial_page_spec.unpadded_partial_page_size; + this->page_size_to_write = partial_page_spec.partial_page_size; + this->data_size_per_page_size_to_write = partial_page_spec.partial_page_size; this->full_pages_to_write = num_full_pages; this->full_page_size = full_page_size; this->num_partial_pages_in_single_full_page = partial_page_spec.num_partial_pages_per_full_page; - this->last_partial_page_additional_padding = partial_page_spec.last_partial_page_additional_padding; + this->last_partial_page_padding = partial_page_spec.last_partial_page_padding; } void calculate_num_pages_for_write_transaction(uint32_t num_pages_available_in_cq) override { @@ -156,13 +155,11 @@ struct InterleavedBufferWriteLargePageDispatchParams : InterleavedBufferWriteDis uint32_t num_partial_pages_per_full_page() const override { return this->num_partial_pages_in_single_full_page; } - uint32_t get_additional_padding_for_last_partial_page() const override { - return this->last_partial_page_additional_padding; - } + uint32_t get_padding_for_last_partial_page() const override { return this->last_partial_page_padding; } private: uint32_t num_partial_pages_in_single_full_page = 0; - uint32_t last_partial_page_additional_padding = 0; + uint32_t last_partial_page_padding = 0; uint32_t full_page_size = 0; uint32_t full_pages_written = 0; uint32_t full_pages_to_write = 0; @@ -277,13 +274,11 @@ uint32_t calculate_partial_page_size(const Buffer& buffer) { PartialPageSpec calculate_partial_page_spec(const Buffer& buffer) { PartialPageSpec partial_page_spec; - partial_page_spec.unpadded_partial_page_size = calculate_partial_page_size(buffer); + partial_page_spec.partial_page_size = calculate_partial_page_size(buffer); partial_page_spec.num_partial_pages_per_full_page = - tt::div_up(buffer.aligned_page_size(), partial_page_spec.unpadded_partial_page_size); - // partial_page.last_partial_page_additional_padding = buffer.aligned_page_size() - buffer.page_size(); - partial_page_spec.last_partial_page_additional_padding = - (partial_page_spec.num_partial_pages_per_full_page * partial_page_spec.unpadded_partial_page_size) - - buffer.page_size(); + tt::div_up(buffer.aligned_page_size(), partial_page_spec.partial_page_size); + partial_page_spec.last_partial_page_padding = + (partial_page_spec.num_partial_pages_per_full_page * partial_page_spec.partial_page_size) - buffer.page_size(); return partial_page_spec; } @@ -301,22 +296,8 @@ std::unique_ptr initialize_interleaved_buf const bool write_large_pages = buffer.aligned_page_size() > buf_dispatch_constants.max_data_sizeB; if (write_large_pages) { const PartialPageSpec partial_page_spec = calculate_partial_page_spec(buffer); - // uint32_t partial_page_size = DispatchSettings::BASE_PARTIAL_PAGE_SIZE; - // while (buffer.aligned_page_size() % partial_page_size != 0) { - // partial_page_size += 1; - // } - // const uint32_t data_size_per_partial_page = partial_page_size; - // const uint32_t num_partial_pages_per_full_page = buffer.aligned_page_size() / partial_page_size; - // const uint32_t pcie_alignment = hal.get_alignment(HalMemType::HOST); - // const uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); - // while (partial_page_size % pcie_alignment != 0 || partial_page_size % l1_alignment != 0) { - // partial_page_size += 1; - // } - // const uint32_t page_size_to_write = partial_page.padded_partial_page_size; - // const uint32_t num_partial_pages_per_full_page = buffer.aligned_page_size() / - // partial_page_spec.unpadded_partial_page_size; const uint32_t full_page_size = - partial_page_spec.num_partial_pages_per_full_page * partial_page_spec.unpadded_partial_page_size; + partial_page_spec.num_partial_pages_per_full_page * partial_page_spec.partial_page_size; const uint32_t num_full_pages = total_pages_to_write; const uint32_t padded_buffer_size = total_pages_to_write * buffer.aligned_page_size(); total_pages_to_write = num_full_pages * partial_page_spec.num_partial_pages_per_full_page; @@ -377,7 +358,7 @@ void populate_interleaved_buffer_write_dispatch_cmds( num_partial_pages_written_curr_txn * buffer.page_size(); if (num_partial_pages_written_per_current_full_page == num_partial_pages_per_full_page - 1) { // last partial page being copied from unpadded src buffer - const uint32_t padding = dispatch_params.get_additional_padding_for_last_partial_page(); + const uint32_t padding = dispatch_params.get_padding_for_last_partial_page(); page_size_to_copy -= padding; } command_sequence.add_data( @@ -806,8 +787,7 @@ std::unique_ptr initialize_interleaved_buf_read_dispat const PartialPageSpec partial_page_spec = calculate_partial_page_spec(buffer); large_page_dispatch_params->partial_page_spec = partial_page_spec; dispatch_params->padded_page_size = - partial_page_spec.unpadded_partial_page_size * partial_page_spec.num_partial_pages_per_full_page; - // large_page_dispatch_params->update_params_to_be_within_bounds(buffer); + partial_page_spec.partial_page_size * partial_page_spec.num_partial_pages_per_full_page; } else { dispatch_params->padded_page_size = buffer.aligned_page_size(); } @@ -1013,9 +993,7 @@ void copy_completion_queue_data_into_user_space( const uint32_t padded_num_bytes = (num_pages_read * padded_page_size) + sizeof(CQDispatchCmd); uint32_t contig_dst_offset = dst_offset; uint32_t remaining_bytes_to_read = padded_num_bytes; - uint32_t total_num_bytes_read = 0; uint32_t dev_page_id = cur_dev_page_id; - PartialPageSpec* partial_page_spec = nullptr; // track the amount of bytes read in the last non-aligned page uint32_t remaining_bytes_of_nonaligned_page = 0; @@ -1025,8 +1003,6 @@ void copy_completion_queue_data_into_user_space( uint32_t pad_size_bytes = padded_page_size - page_size; bool trigger = false; - // std::cout << "buffer page size: " << page_size << std::endl; - while (remaining_bytes_to_read != 0) { uint32_t completion_queue_write_ptr_and_toggle = sysmem_manager.completion_queue_wait_front(cq_id, exit_condition); @@ -1058,9 +1034,6 @@ void copy_completion_queue_data_into_user_space( if (buffer_page_mapping == nullptr) { void* contiguous_dst = (void*)(uint64_t(dst) + contig_dst_offset); if (page_size == padded_page_size) { - // if (!partial_page_spec) { - - // } uint32_t data_bytes_xfered = bytes_xfered - offset_in_completion_q_data; tt::Cluster::instance().read_sysmem( contiguous_dst, @@ -1076,56 +1049,7 @@ void copy_completion_queue_data_into_user_space( uint32_t dst_offset_bytes = 0; while (src_offset_bytes < bytes_xfered) { - uint32_t page_size_to_read = - partial_page_spec ? partial_page_spec->unpadded_partial_page_size : page_size; - const uint32_t num_bytes_read_curr_full_page = total_num_bytes_read % page_size; - if (partial_page_spec && - // total_num_bytes_read % partial_page_spec->unpadded_partial_page_size == 0 && - num_bytes_read_curr_full_page / partial_page_spec->unpadded_partial_page_size == - partial_page_spec->num_partial_pages_per_full_page - 1) { - page_size_to_read -= partial_page_spec->last_partial_page_additional_padding; - } - - const uint32_t padded_page_size_to_read = - partial_page_spec ? partial_page_spec->unpadded_partial_page_size : padded_page_size; - - pad_size_bytes = partial_page_spec ? padded_page_size_to_read - page_size_to_read : pad_size_bytes; - // if (partial_page_spec && - // // total_num_bytes_read % partial_page_spec->unpadded_partial_page_size == 0 && - // total_num_bytes_read / partial_page_spec->unpadded_partial_page_size == - // partial_page_spec->num_partial_pages_per_full_page - 1) { - // pad_size_bytes += partial_page_spec->last_partial_page_additional_padding; - // } - - // if (partial_page_spec) { - // const uint32_t num_bytes_read_curr_full_page = total_num_bytes_read % page_size; - // if (num_bytes_read_curr_full_page + page_size_to_read > page_size) { - // // if (trigger) { - // // } - // const uint32_t extra_padding_bytes = - // partial_page_spec->last_partial_page_additional_padding; std::cout << "Output " << - // std::to_string(trigger) << " " << total_num_bytes_read << " " << page_size_to_read << " " - // << remaining_bytes_of_nonaligned_page << " " << extra_padding_bytes << std::endl; - // page_size_to_read -= extra_padding_bytes; - // // page_size_to_read -= std::min( - // // total_num_bytes_read + page_size_to_read - page_size, - // // page_size_to_read - extra_padding_bytes); - // if (remaining_bytes_of_nonaligned_page > 0) { - // // page_size_to_read - num_bytes_read_curr_partial_page => if <= 0, no more data to - // read, otherwise take min of this and remaining_bytes_of_nonaligned_page - // remaining_bytes_of_nonaligned_page = std::min(page_size_to_read, - // remaining_bytes_of_nonaligned_page); const uint32_t num_bytes_read_curr_partial_page - // = num_bytes_read_curr_full_page % partial_page_spec->unpadded_partial_page_size; - // pad_size_bytes += (partial_page_spec->unpadded_partial_page_size - - // remaining_bytes_of_nonaligned_page - num_bytes_read_curr_partial_page); - // } - // // else { - - // // } - // } - // } - - uint32_t src_offset_increment = padded_page_size_to_read; + uint32_t src_offset_increment = padded_page_size; uint32_t num_bytes_to_copy = 0; if (remaining_bytes_of_nonaligned_page > 0) { @@ -1140,54 +1064,28 @@ void copy_completion_queue_data_into_user_space( // There is more data after padding if (rem_bytes_in_cq >= pad_size_bytes) { src_offset_increment += pad_size_bytes; - // if (trigger) { - // std::cout << "rem_bytes_in_cq >= pad_size_bytes " << rem_bytes_in_cq << " " - // << pad_size_bytes << " " << src_offset_increment << std::endl; - // } // Only pad data left in queue } else { offset_in_completion_q_data = pad_size_bytes - rem_bytes_in_cq; - // if (trigger) { - // std::cout << "rem_bytes_in_cq < pad_size_bytes " << rem_bytes_in_cq << " " - // << pad_size_bytes << " " << src_offset_increment << " " - // << offset_in_completion_q_data << std::endl; - // } } } - // if (trigger) { - // std::cout << num_bytes_remaining << " " << num_bytes_to_copy << " " - // << remaining_bytes_of_nonaligned_page << " " << src_offset_increment << " " - // << offset_in_completion_q_data << std::endl; - // } trigger = false; - } else if (src_offset_bytes + padded_page_size_to_read >= bytes_xfered) { + } else if (src_offset_bytes + padded_page_size >= bytes_xfered) { // Case 2: Last page of data that was popped off the completion queue // Don't need to compute src_offset_increment since this is end of loop - // std::cout << "Total num bytes read: " << total_num_bytes_read << std::endl; uint32_t num_bytes_remaining = bytes_xfered - src_offset_bytes; - num_bytes_to_copy = std::min(num_bytes_remaining, page_size_to_read); - remaining_bytes_of_nonaligned_page = page_size_to_read - num_bytes_to_copy; + num_bytes_to_copy = std::min(num_bytes_remaining, page_size); + remaining_bytes_of_nonaligned_page = page_size - num_bytes_to_copy; trigger = true; // We've copied needed data, start of next read is offset due to remaining pad bytes if (remaining_bytes_of_nonaligned_page == 0) { - offset_in_completion_q_data = padded_page_size_to_read - num_bytes_remaining; + offset_in_completion_q_data = padded_page_size - num_bytes_remaining; } } else { - num_bytes_to_copy = page_size_to_read; + num_bytes_to_copy = page_size; trigger = false; } - // if (partial_page_spec && (total_num_bytes_read % page_size) + num_bytes_to_copy == - // (padded_page_size / padded_page_size_to_read) * page_size_to_read) { - // if (partial_page_spec && partial_page_spec->num_partial_pages_per_full_page - 1 == - // ((total_num_bytes_read % page_size) + num_bytes_to_copy) / - // partial_page_spec->unpadded_partial_page_size) { - // if (partial_page_spec && (total_num_bytes_read % page_size) + num_bytes_to_copy > page_size) { - // // uint32_t extra_bytes = (total_num_bytes_read % page_size) + num_bytes_to_copy - page_size; - // uint32_t extra_bytes = partial_page_spec->last_partial_page_additional_padding; - // num_bytes_to_copy -= extra_bytes; - // } - tt::Cluster::instance().read_sysmem( (char*)(uint64_t(contiguous_dst) + dst_offset_bytes), num_bytes_to_copy, @@ -1195,13 +1093,6 @@ void copy_completion_queue_data_into_user_space( mmio_device_id, channel); - // std::cout << "num bytes to copy " << num_bytes_to_copy << std::endl; - // std::cout << "src offset increment" << src_offset_increment << std::endl; - - total_num_bytes_read += num_bytes_to_copy; - // if (total_num_bytes_read == page_size) { - // total_num_bytes_read = 0; - // } src_offset_bytes += src_offset_increment; dst_offset_bytes += num_bytes_to_copy; contig_dst_offset += num_bytes_to_copy; diff --git a/tt_metal/impl/buffers/dispatch.hpp b/tt_metal/impl/buffers/dispatch.hpp index ad1257d4b40..2b74a437838 100644 --- a/tt_metal/impl/buffers/dispatch.hpp +++ b/tt_metal/impl/buffers/dispatch.hpp @@ -49,9 +49,8 @@ struct BufferReadDispatchParams { }; struct PartialPageSpec { - uint32_t unpadded_partial_page_size = 0; - // uint32_t padded_partial_page_size = 0; - uint32_t last_partial_page_additional_padding = 0; + uint32_t partial_page_size = 0; + uint32_t last_partial_page_padding = 0; uint32_t num_partial_pages_per_full_page = 0; }; @@ -61,21 +60,16 @@ struct BufferReadLargePageDispatchParams : BufferReadDispatchParams { void update_params_to_be_within_bounds(const Buffer& buffer) override { const uint32_t num_pages_per_bank = this->src_page_index / this->num_banks; this->address += num_pages_per_bank * (this->partial_page_spec.num_partial_pages_per_full_page * - this->partial_page_spec.unpadded_partial_page_size); + this->partial_page_spec.partial_page_size); this->src_page_index = this->src_page_index % this->num_banks; } - // void calculate_num_pages_for_read_transaction() override { - // this->pages_per_txn = - // std::min(this->total_pages_to_read, this->num_banks - (this->src_page_index % this->num_banks)); - // } - void update_params_after_read_transaction() override { this->total_pages_to_read -= this->pages_per_txn; this->total_pages_read += this->pages_per_txn; - this->address += ((this->src_page_index + this->pages_per_txn) / this->num_banks) * - (this->partial_page_spec.num_partial_pages_per_full_page * - this->partial_page_spec.unpadded_partial_page_size); + this->address += + ((this->src_page_index + this->pages_per_txn) / this->num_banks) * + (this->partial_page_spec.num_partial_pages_per_full_page * this->partial_page_spec.partial_page_size); this->src_page_index = (this->src_page_index + this->pages_per_txn) % this->num_banks; } }; diff --git a/tt_metal/tt_metal.cpp b/tt_metal/tt_metal.cpp index c1c164967f4..0831892c1ac 100644 --- a/tt_metal/tt_metal.cpp +++ b/tt_metal/tt_metal.cpp @@ -6,16 +6,11 @@ #include #include -#include -#include -#include #include #include #include -#include "assert.hpp" #include "buffers/dispatch.hpp" -#include "device.hpp" #include "dprint_server.hpp" #include #include @@ -472,7 +467,7 @@ DeviceAddr CalculateAddressDeviceInterleavedContiguous( const buffer_dispatch::PartialPageSpec& partial_page_spec = buffer_dispatch::calculate_partial_page_spec(buffer); const uint32_t full_padded_page_size = - partial_page_spec.unpadded_partial_page_size * partial_page_spec.num_partial_pages_per_full_page; + partial_page_spec.partial_page_size * partial_page_spec.num_partial_pages_per_full_page; const DeviceAddr full_page_address_offset = (num_round_robins > 0) ? (full_padded_page_size - buffer.aligned_page_size()) * num_round_robins : 0; addr += full_page_address_offset; From 011cbb7ec1ed70c2f1885c6606564c12de7db8db Mon Sep 17 00:00:00 2001 From: Samarth Agarwal Date: Wed, 19 Feb 2025 21:46:04 +0000 Subject: [PATCH 16/24] More cleanup --- ...queueWriteBuffer_and_EnqueueReadBuffer.cpp | 12 ---- tt_metal/impl/buffers/dispatch.cpp | 4 -- tt_metal/impl/buffers/dispatch.hpp | 70 +++++++++---------- 3 files changed, 34 insertions(+), 52 deletions(-) diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp index dc6b6df44c0..2466c600343 100644 --- a/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp @@ -252,18 +252,6 @@ void test_EnqueueWriteBuffer_and_EnqueueReadBuffer(IDevice* device, CommandQueue detail::ReadFromBuffer(*bufa, result); } - std::cout << "write: " << cq_write << " read: " << cq_read << std::endl; - if (result.size() != src.size()) { - std::cout << "Unequal size" << std::endl; - } - for (uint32_t i = 0; i < result.size(); i++) { - if (i != result[i]) { - std::cout << "i: " << std::to_string(i) << " result[i]: " << std::to_string(result[i]) << std::endl; - std::cout << "Fail" << std::endl; - break; - } - } - EXPECT_EQ(src, result); } } diff --git a/tt_metal/impl/buffers/dispatch.cpp b/tt_metal/impl/buffers/dispatch.cpp index b00e0a4d961..ba61f1770fd 100644 --- a/tt_metal/impl/buffers/dispatch.cpp +++ b/tt_metal/impl/buffers/dispatch.cpp @@ -1001,7 +1001,6 @@ void copy_completion_queue_data_into_user_space( uint32_t offset_in_completion_q_data = sizeof(CQDispatchCmd); uint32_t pad_size_bytes = padded_page_size - page_size; - bool trigger = false; while (remaining_bytes_to_read != 0) { uint32_t completion_queue_write_ptr_and_toggle = @@ -1069,21 +1068,18 @@ void copy_completion_queue_data_into_user_space( offset_in_completion_q_data = pad_size_bytes - rem_bytes_in_cq; } } - trigger = false; } else if (src_offset_bytes + padded_page_size >= bytes_xfered) { // Case 2: Last page of data that was popped off the completion queue // Don't need to compute src_offset_increment since this is end of loop uint32_t num_bytes_remaining = bytes_xfered - src_offset_bytes; num_bytes_to_copy = std::min(num_bytes_remaining, page_size); remaining_bytes_of_nonaligned_page = page_size - num_bytes_to_copy; - trigger = true; // We've copied needed data, start of next read is offset due to remaining pad bytes if (remaining_bytes_of_nonaligned_page == 0) { offset_in_completion_q_data = padded_page_size - num_bytes_remaining; } } else { num_bytes_to_copy = page_size; - trigger = false; } tt::Cluster::instance().read_sysmem( diff --git a/tt_metal/impl/buffers/dispatch.hpp b/tt_metal/impl/buffers/dispatch.hpp index 2b74a437838..47e5bc2c2c9 100644 --- a/tt_metal/impl/buffers/dispatch.hpp +++ b/tt_metal/impl/buffers/dispatch.hpp @@ -11,8 +11,40 @@ #include "tt_metal/impl/event/dispatch.hpp" namespace tt::tt_metal { -struct ReadBufferDescriptor; -struct ReadEventDescriptor; + +// Used so the host knows how to properly copy data into user space from the completion queue (in hugepages) +struct ReadBufferDescriptor { + TensorMemoryLayout buffer_layout; + uint32_t page_size; + uint32_t padded_page_size; + std::shared_ptr buffer_page_mapping; + void* dst; + uint32_t dst_offset; + uint32_t num_pages_read; + uint32_t cur_dev_page_id; + uint32_t starting_host_page_id; + + ReadBufferDescriptor( + TensorMemoryLayout buffer_layout, + uint32_t page_size, + uint32_t padded_page_size, + void* dst, + uint32_t dst_offset, + uint32_t num_pages_read, + uint32_t cur_dev_page_id, + uint32_t starting_host_page_id = 0, + const std::shared_ptr& buffer_page_mapping = nullptr) : + buffer_layout(buffer_layout), + page_size(page_size), + padded_page_size(padded_page_size), + buffer_page_mapping(buffer_page_mapping), + dst(dst), + dst_offset(dst_offset), + num_pages_read(num_pages_read), + cur_dev_page_id(cur_dev_page_id), + starting_host_page_id(starting_host_page_id) {} +}; + using CompletionReaderVariant = std::variant; // Contains helper functions to interface with buffers on device @@ -79,7 +111,6 @@ struct ShardedBufferReadDispatchParams : BufferReadDispatchParams { uint32_t initial_pages_skipped = 0; uint32_t starting_src_host_page_index = 0; std::shared_ptr buffer_page_mapping = nullptr; - // uint32_t total_pages_to_read = 0; uint32_t total_pages_read = 0; uint32_t max_pages_per_shard = 0; CoreCoord core; @@ -145,37 +176,4 @@ bool are_pages_large(const Buffer& buffer); PartialPageSpec calculate_partial_page_spec(const Buffer& buffer); } // namespace buffer_dispatch -// Used so the host knows how to properly copy data into user space from the completion queue (in hugepages) -struct ReadBufferDescriptor { - TensorMemoryLayout buffer_layout; - uint32_t page_size; - uint32_t padded_page_size; - std::shared_ptr buffer_page_mapping; - void* dst; - uint32_t dst_offset; - uint32_t num_pages_read; - uint32_t cur_dev_page_id; - uint32_t starting_host_page_id; - - ReadBufferDescriptor( - TensorMemoryLayout buffer_layout, - uint32_t page_size, - uint32_t padded_page_size, - void* dst, - uint32_t dst_offset, - uint32_t num_pages_read, - uint32_t cur_dev_page_id, - uint32_t starting_host_page_id = 0, - const std::shared_ptr& buffer_page_mapping = nullptr) : - buffer_layout(buffer_layout), - page_size(page_size), - padded_page_size(padded_page_size), - buffer_page_mapping(buffer_page_mapping), - dst(dst), - dst_offset(dst_offset), - num_pages_read(num_pages_read), - cur_dev_page_id(cur_dev_page_id), - starting_host_page_id(starting_host_page_id) {} -}; - } // namespace tt::tt_metal From ce66863c3b33582a240cf8f59c27e85e3403e4bb Mon Sep 17 00:00:00 2001 From: Samarth Agarwal Date: Wed, 19 Feb 2025 21:58:20 +0000 Subject: [PATCH 17/24] Cleanup --- tt_metal/impl/buffers/dispatch.cpp | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/tt_metal/impl/buffers/dispatch.cpp b/tt_metal/impl/buffers/dispatch.cpp index ba61f1770fd..dd2fb5dd891 100644 --- a/tt_metal/impl/buffers/dispatch.cpp +++ b/tt_metal/impl/buffers/dispatch.cpp @@ -42,7 +42,7 @@ struct BufferWriteDispatchParams { // Parameters specific to interleaved buffers struct InterleavedBufferWriteDispatchParams : BufferWriteDispatchParams { uint32_t num_banks = 0; - uint32_t data_size_per_page_size_to_write = 0; + uint32_t data_size_to_copy = 0; const Buffer& buffer; InterleavedBufferWriteDispatchParams( @@ -56,7 +56,7 @@ struct InterleavedBufferWriteDispatchParams : BufferWriteDispatchParams { this->address = buffer.address(); this->dst_page_index = dst_page_index; this->page_size_to_write = buffer.aligned_page_size(); - this->data_size_per_page_size_to_write = buffer.page_size(); + this->data_size_to_copy = buffer.page_size(); this->total_pages_to_write = total_pages_to_write; this->device = buffer.device(); this->cq_id = cq_id; @@ -113,7 +113,7 @@ struct InterleavedBufferWriteLargePageDispatchParams : InterleavedBufferWriteDis InterleavedBufferWriteDispatchParams( buffer, dst_page_index, total_pages_to_write, cq_id, expected_num_workers_completed) { this->page_size_to_write = partial_page_spec.partial_page_size; - this->data_size_per_page_size_to_write = partial_page_spec.partial_page_size; + this->data_size_to_copy = partial_page_spec.partial_page_size; this->full_pages_to_write = num_full_pages; this->full_page_size = full_page_size; this->num_partial_pages_in_single_full_page = partial_page_spec.num_partial_pages_per_full_page; @@ -352,7 +352,7 @@ void populate_interleaved_buffer_write_dispatch_cmds( uint32_t num_partial_pages_written_curr_txn = 0; for (uint32_t sysmem_address_offset = 0; sysmem_address_offset < data_size_bytes; sysmem_address_offset += dispatch_params.page_size_to_write) { - uint32_t page_size_to_copy = dispatch_params.data_size_per_page_size_to_write; + uint32_t page_size_to_copy = dispatch_params.data_size_to_copy; uint32_t src_address_offset = num_full_pages_written * buffer.page_size() + num_partial_pages_written_per_current_full_page * page_size_to_copy + num_partial_pages_written_curr_txn * buffer.page_size(); @@ -373,9 +373,9 @@ void populate_interleaved_buffer_write_dispatch_cmds( sysmem_address_offset += dispatch_params.page_size_to_write) { command_sequence.add_data( (char*)src + src_address_offset, - dispatch_params.data_size_per_page_size_to_write, + dispatch_params.data_size_to_copy, dispatch_params.page_size_to_write); - src_address_offset += dispatch_params.data_size_per_page_size_to_write; + src_address_offset += dispatch_params.data_size_to_copy; } } else { command_sequence.add_data((char*)src + src_address_offset, data_size_bytes, data_size_bytes); @@ -966,10 +966,6 @@ std::shared_ptr generate_sharded_buffer_r std::shared_ptr generate_interleaved_buffer_read_descriptor( void* dst, BufferReadDispatchParams* dispatch_params, Buffer& buffer) { - BufferReadLargePageDispatchParams* large_page_dispatch_params = - dynamic_cast(dispatch_params); - PartialPageSpec* partial_page_spec = - large_page_dispatch_params ? &(large_page_dispatch_params->partial_page_spec) : nullptr; return std::make_shared( std::in_place_type, buffer.buffer_layout(), From 1e7336094c81387d9a497fe3a95fa0f4ef594a78 Mon Sep 17 00:00:00 2001 From: Samarth Agarwal Date: Tue, 25 Feb 2025 03:23:35 +0000 Subject: [PATCH 18/24] Fixing bug --- ...t_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp | 15 +++++++++++++++ tt_metal/impl/buffers/dispatch.cpp | 1 + 2 files changed, 16 insertions(+) diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp index 2466c600343..3d5710967e4 100644 --- a/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp @@ -1047,6 +1047,21 @@ TEST_F(MultiCommandQueueSingleDeviceBufferFixture, TestIssueMultipleReadWriteCom local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config)); } +TEST_F(CommandQueueMultiDeviceBufferFixture, TestMultipleUnalignedPagesLargerThanMaxPrefetchCommandSize) { + for (IDevice* device : devices_) { + tt::log_info("Running On Device {}", device->id()); + CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(device->id()); + const uint32_t max_prefetch_command_size = DispatchMemMap::get(dispatch_core_type).max_prefetch_command_size(); + TestBufferConfig config = { + .num_pages = 50, .page_size = max_prefetch_command_size + 4, .buftype = BufferType::DRAM}; + + CommandQueue& a = device->command_queue(0); + vector> cqs = {a}; + EXPECT_TRUE( + local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config)); + } +} + } // end namespace dram_tests namespace l1_tests { diff --git a/tt_metal/impl/buffers/dispatch.cpp b/tt_metal/impl/buffers/dispatch.cpp index dd2fb5dd891..8e22dea2019 100644 --- a/tt_metal/impl/buffers/dispatch.cpp +++ b/tt_metal/impl/buffers/dispatch.cpp @@ -1061,6 +1061,7 @@ void copy_completion_queue_data_into_user_space( src_offset_increment += pad_size_bytes; // Only pad data left in queue } else { + src_offset_increment += rem_bytes_in_cq; offset_in_completion_q_data = pad_size_bytes - rem_bytes_in_cq; } } From fc9dad4726cac5793d9e1d6c16567c1e9fdb28d8 Mon Sep 17 00:00:00 2001 From: Samarth Agarwal Date: Tue, 25 Feb 2025 15:02:22 +0000 Subject: [PATCH 19/24] Replacing loop with align function --- tt_metal/impl/buffers/dispatch.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tt_metal/impl/buffers/dispatch.cpp b/tt_metal/impl/buffers/dispatch.cpp index 8e22dea2019..f190d5e4b93 100644 --- a/tt_metal/impl/buffers/dispatch.cpp +++ b/tt_metal/impl/buffers/dispatch.cpp @@ -266,9 +266,7 @@ uint32_t calculate_partial_page_size(const Buffer& buffer) { TT_ASSERT(buffer.is_l1()); mem_alignment = hal.get_alignment(HalMemType::L1); } - while (partial_page_size % pcie_alignment != 0 || partial_page_size % mem_alignment != 0) { - partial_page_size += 1; - } + partial_page_size = tt::align(partial_page_size, std::lcm(pcie_alignment, mem_alignment)); return partial_page_size; } From 467b71884115ea7dc8a13d0355a177e3c9f97829 Mon Sep 17 00:00:00 2001 From: Samarth Agarwal Date: Fri, 28 Feb 2025 15:47:13 +0000 Subject: [PATCH 20/24] Making changes --- tt_metal/api/tt-metalium/buffer.hpp | 1 + tt_metal/api/tt-metalium/hal.hpp | 9 ++ tt_metal/distributed/mesh_command_queue.cpp | 12 +- tt_metal/impl/buffers/buffer.cpp | 11 ++ tt_metal/impl/buffers/dispatch.cpp | 116 ++++++++++-------- tt_metal/impl/buffers/dispatch.hpp | 9 +- .../impl/dispatch/hardware_command_queue.cpp | 12 +- tt_metal/llrt/blackhole/bh_hal.cpp | 8 ++ tt_metal/llrt/grayskull/gs_hal.cpp | 7 ++ tt_metal/llrt/wormhole/wh_hal.cpp | 8 ++ tt_metal/tt_metal.cpp | 2 +- 11 files changed, 134 insertions(+), 61 deletions(-) diff --git a/tt_metal/api/tt-metalium/buffer.hpp b/tt_metal/api/tt-metalium/buffer.hpp index 7dccc0dfef6..c676e30ecd2 100644 --- a/tt_metal/api/tt-metalium/buffer.hpp +++ b/tt_metal/api/tt-metalium/buffer.hpp @@ -209,6 +209,7 @@ class Buffer final { uint32_t num_dev_pages() const; BufferType buffer_type() const { return buffer_type_; } + HalMemType memory_type() const; CoreType core_type() const; bool is_l1() const; diff --git a/tt_metal/api/tt-metalium/hal.hpp b/tt_metal/api/tt-metalium/hal.hpp index 42a33d44ab5..d20f0501e2e 100644 --- a/tt_metal/api/tt-metalium/hal.hpp +++ b/tt_metal/api/tt-metalium/hal.hpp @@ -150,6 +150,7 @@ class Hal { std::vector dram_bases_; std::vector dram_sizes_; std::vector mem_alignments_; + std::vector mem_alignments_with_pcie_; uint32_t num_nocs_; uint32_t noc_addr_node_id_bits_; uint32_t noc_coord_reg_offset_; @@ -251,6 +252,8 @@ class Hal { uint32_t get_dev_size(HalDramMemAddrType addr_type) const; uint32_t get_alignment(HalMemType memory_type) const; + // Returns an alignment that is aligned with PCIE and the given memory type + uint32_t get_common_alignment_with_pcie(HalMemType memory_type) const; bool get_supports_cbs(uint32_t programmable_core_type_index) const; @@ -350,6 +353,12 @@ inline uint32_t Hal::get_alignment(HalMemType memory_type) const { return this->mem_alignments_[index]; } +inline uint32_t Hal::get_common_alignment_with_pcie(HalMemType memory_type) const { + uint32_t index = utils::underlying_type(memory_type); + TT_ASSERT(index < this->mem_alignments_.size()); + return this->mem_alignments_with_pcie_[index]; +} + inline bool Hal::get_supports_cbs(uint32_t programmable_core_type_index) const { return this->core_info_[programmable_core_type_index].supports_cbs_; } diff --git a/tt_metal/distributed/mesh_command_queue.cpp b/tt_metal/distributed/mesh_command_queue.cpp index 8435cf446d6..75fc04a6f7f 100644 --- a/tt_metal/distributed/mesh_command_queue.cpp +++ b/tt_metal/distributed/mesh_command_queue.cpp @@ -266,15 +266,21 @@ void MeshCommandQueue::read_shard_from_device( } } } else { - auto dispatch_params = buffer_dispatch::initialize_interleaved_buf_read_dispatch_params( - *shard_view, id_, expected_num_workers_completed_, region); + buffer_dispatch::BufferReadDispatchParamsVariant dispatch_params_variant = + buffer_dispatch::initialize_interleaved_buf_read_dispatch_params( + *shard_view, id_, expected_num_workers_completed_, region); + + buffer_dispatch::BufferReadDispatchParams* dispatch_params = std::visit( + [](auto& val) { return static_cast(&val); }, + dispatch_params_variant); + buffer_dispatch::copy_interleaved_buffer_to_completion_queue( *dispatch_params, *shard_view, sub_device_ids, this->dispatch_core_type()); if (dispatch_params->pages_per_txn > 0) { num_txns_per_device[device]++; auto& read_descriptor_queue = this->get_read_descriptor_queue(device); read_descriptor_queue.push( - buffer_dispatch::generate_interleaved_buffer_read_descriptor(dst, dispatch_params.get(), *shard_view)); + buffer_dispatch::generate_interleaved_buffer_read_descriptor(dst, dispatch_params, *shard_view)); } } } diff --git a/tt_metal/impl/buffers/buffer.cpp b/tt_metal/impl/buffers/buffer.cpp index d51bdcffea0..dc73f02a190 100644 --- a/tt_metal/impl/buffers/buffer.cpp +++ b/tt_metal/impl/buffers/buffer.cpp @@ -17,6 +17,7 @@ #include #include #include +#include "hal.hpp" #include "umd/device/tt_soc_descriptor.h" #include "fmt/base.h" #include @@ -489,6 +490,16 @@ uint32_t Buffer::num_dev_pages() const { return this->shard_spec().num_pages() * this->num_cores().value(); } +HalMemType Buffer::memory_type() const { + if (this->is_dram()) { + return HalMemType::DRAM; + } else if (this->is_l1()) { + return HalMemType::L1; + } else { + TT_THROW("Unknown HAL memory type for {} buffer type", this->buffer_type()); + } +} + CoreType Buffer::core_type() const { switch (this->buffer_type_) { case BufferType::DRAM: diff --git a/tt_metal/impl/buffers/dispatch.cpp b/tt_metal/impl/buffers/dispatch.cpp index f190d5e4b93..cc07c1a749c 100644 --- a/tt_metal/impl/buffers/dispatch.cpp +++ b/tt_metal/impl/buffers/dispatch.cpp @@ -9,6 +9,7 @@ #include #include +#include "hal.hpp" #include "tt_cluster.hpp" namespace tt::tt_metal { @@ -40,18 +41,16 @@ struct BufferWriteDispatchParams { }; // Parameters specific to interleaved buffers -struct InterleavedBufferWriteDispatchParams : BufferWriteDispatchParams { - uint32_t num_banks = 0; +class InterleavedBufferWriteDispatchParams : public BufferWriteDispatchParams { +public: uint32_t data_size_to_copy = 0; - const Buffer& buffer; InterleavedBufferWriteDispatchParams( const Buffer& buffer, uint32_t dst_page_index, uint32_t total_pages_to_write, uint32_t cq_id, - tt::stl::Span expected_num_workers_completed) : - buffer(buffer) { + tt::stl::Span expected_num_workers_completed) { this->num_banks = buffer.device()->allocator()->get_num_banks(buffer.buffer_type()); this->address = buffer.address(); this->dst_page_index = dst_page_index; @@ -62,6 +61,8 @@ struct InterleavedBufferWriteDispatchParams : BufferWriteDispatchParams { this->cq_id = cq_id; this->expected_num_workers_completed = expected_num_workers_completed; } + + InterleavedBufferWriteDispatchParams() = default; virtual ~InterleavedBufferWriteDispatchParams() = default; void calculate_issue_wait() { @@ -98,9 +99,13 @@ struct InterleavedBufferWriteDispatchParams : BufferWriteDispatchParams { virtual uint32_t num_partial_pages_per_full_page() const { return 1; } virtual uint32_t get_padding_for_last_partial_page() const { return 0; } + +protected: + uint32_t num_banks = 0; }; -struct InterleavedBufferWriteLargePageDispatchParams : InterleavedBufferWriteDispatchParams { +class InterleavedBufferWriteLargePageDispatchParams : public InterleavedBufferWriteDispatchParams { +public: InterleavedBufferWriteLargePageDispatchParams( const Buffer& buffer, uint32_t dst_page_index, @@ -112,6 +117,7 @@ struct InterleavedBufferWriteLargePageDispatchParams : InterleavedBufferWriteDis tt::stl::Span expected_num_workers_completed) : InterleavedBufferWriteDispatchParams( buffer, dst_page_index, total_pages_to_write, cq_id, expected_num_workers_completed) { + this->buffer_address = buffer.address(); this->page_size_to_write = partial_page_spec.partial_page_size; this->data_size_to_copy = partial_page_spec.partial_page_size; this->full_pages_to_write = num_full_pages; @@ -120,6 +126,8 @@ struct InterleavedBufferWriteLargePageDispatchParams : InterleavedBufferWriteDis this->last_partial_page_padding = partial_page_spec.last_partial_page_padding; } + InterleavedBufferWriteLargePageDispatchParams() = default; + void calculate_num_pages_for_write_transaction(uint32_t num_pages_available_in_cq) override { TT_ASSERT(this->num_banks > this->dst_page_index); this->pages_per_txn = @@ -158,14 +166,15 @@ struct InterleavedBufferWriteLargePageDispatchParams : InterleavedBufferWriteDis uint32_t get_padding_for_last_partial_page() const override { return this->last_partial_page_padding; } private: - uint32_t num_partial_pages_in_single_full_page = 0; - uint32_t last_partial_page_padding = 0; - uint32_t full_page_size = 0; + uint32_t buffer_address; + uint32_t num_partial_pages_in_single_full_page; + uint32_t last_partial_page_padding; + uint32_t full_page_size; uint32_t full_pages_written = 0; uint32_t full_pages_to_write = 0; bool were_full_pages_written_in_last_write_transaction() const { - const uint32_t page_size = this->address - this->buffer.address(); + const uint32_t page_size = this->address - this->buffer_address; return page_size > 0 && page_size % this->full_page_size == 0; } @@ -209,7 +218,7 @@ uint32_t calculate_max_data_size(const CoreType& dispatch_core_type) { (hal.get_alignment(HalMemType::HOST) * 2); // * 2 to account for issue } -bool are_pages_large(const Buffer& buffer) { +bool are_pages_larger_than_max_prefetch_cmd_size(const Buffer& buffer) { const CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(buffer.device()->id()); const uint32_t max_data_size = calculate_max_data_size(dispatch_core_type); return buffer.aligned_page_size() > max_data_size; @@ -258,15 +267,8 @@ ShardedBufferWriteDispatchParams initialize_sharded_buf_dispatch_params( uint32_t calculate_partial_page_size(const Buffer& buffer) { uint32_t partial_page_size = DispatchSettings::BASE_PARTIAL_PAGE_SIZE; - const uint32_t pcie_alignment = hal.get_alignment(HalMemType::HOST); - uint32_t mem_alignment = 0; - if (buffer.is_dram()) { - mem_alignment = hal.get_alignment(HalMemType::DRAM); - } else { - TT_ASSERT(buffer.is_l1()); - mem_alignment = hal.get_alignment(HalMemType::L1); - } - partial_page_size = tt::align(partial_page_size, std::lcm(pcie_alignment, mem_alignment)); + const HalMemType buffer_mem_type = buffer.memory_type(); + partial_page_size = tt::align(partial_page_size, hal.get_common_alignment_with_pcie(buffer_mem_type)); return partial_page_size; } @@ -280,26 +282,28 @@ PartialPageSpec calculate_partial_page_spec(const Buffer& buffer) { return partial_page_spec; } -std::unique_ptr initialize_interleaved_buf_dispatch_params( +using InterleavedBufferWriteDispatchParamsVariant = + std::variant; + +InterleavedBufferWriteDispatchParamsVariant initialize_interleaved_buf_dispatch_params( const Buffer& buffer, const BufferDispatchConstants& buf_dispatch_constants, uint32_t cq_id, tt::stl::Span expected_num_workers_completed, const BufferRegion& region) { - std::unique_ptr dispatch_params; + InterleavedBufferWriteDispatchParamsVariant dispatch_params; uint32_t total_pages_to_write = region.size / buffer.page_size(); const uint32_t dst_page_index = region.offset / buffer.page_size(); - const bool write_large_pages = buffer.aligned_page_size() > buf_dispatch_constants.max_data_sizeB; - if (write_large_pages) { + if (are_pages_larger_than_max_prefetch_cmd_size(buffer)) { const PartialPageSpec partial_page_spec = calculate_partial_page_spec(buffer); const uint32_t full_page_size = partial_page_spec.num_partial_pages_per_full_page * partial_page_spec.partial_page_size; const uint32_t num_full_pages = total_pages_to_write; const uint32_t padded_buffer_size = total_pages_to_write * buffer.aligned_page_size(); total_pages_to_write = num_full_pages * partial_page_spec.num_partial_pages_per_full_page; - dispatch_params = std::make_unique( + dispatch_params.emplace( buffer, dst_page_index, partial_page_spec, @@ -309,7 +313,7 @@ std::unique_ptr initialize_interleaved_buf cq_id, expected_num_workers_completed); } else { - dispatch_params = std::make_unique( + dispatch_params.emplace( buffer, dst_page_index, total_pages_to_write, cq_id, expected_num_workers_completed); } @@ -717,9 +721,14 @@ void write_to_device_buffer( dispatch_core_type); } } else { - std::unique_ptr dispatch_params = + InterleavedBufferWriteDispatchParamsVariant dispatch_params_variant = initialize_interleaved_buf_dispatch_params( buffer, buf_dispatch_constants, cq_id, expected_num_workers_completed, region); + + InterleavedBufferWriteDispatchParams* dispatch_params = std::visit( + [](auto& val) { return static_cast(&val); }, + dispatch_params_variant); + write_interleaved_buffer_to_device( src, *dispatch_params, buffer, buf_dispatch_constants, sub_device_ids, dispatch_core_type); } @@ -755,41 +764,46 @@ ShardedBufferReadDispatchParams initialize_sharded_buf_read_dispatch_params( return dispatch_params; } -std::unique_ptr initialize_interleaved_buf_read_dispatch_params( +BufferReadDispatchParamsVariant initialize_interleaved_buf_read_dispatch_params( Buffer& buffer, uint32_t cq_id, tt::stl::Span expected_num_workers_completed, const BufferRegion& region) { validate_buffer_region_conditions(buffer, region); - std::unique_ptr dispatch_params; - const bool read_large_pages = are_pages_large(buffer); - if (read_large_pages) { - dispatch_params = std::make_unique(); - } else { - dispatch_params = std::make_unique(); - } - - dispatch_params->total_pages_to_read = region.size / buffer.page_size(); - dispatch_params->src_page_index = region.offset / buffer.page_size(); - dispatch_params->cq_id = cq_id; - dispatch_params->device = buffer.device(); - dispatch_params->address = buffer.address(); - dispatch_params->unpadded_dst_offset = 0; - dispatch_params->expected_num_workers_completed = expected_num_workers_completed; - dispatch_params->num_banks = buffer.device()->allocator()->get_num_banks(buffer.buffer_type()); + BufferReadDispatchParamsVariant dispatch_params; + const bool read_large_pages = are_pages_larger_than_max_prefetch_cmd_size(buffer); if (read_large_pages) { - BufferReadLargePageDispatchParams* large_page_dispatch_params = - dynamic_cast(dispatch_params.get()); - const PartialPageSpec partial_page_spec = calculate_partial_page_spec(buffer); - large_page_dispatch_params->partial_page_spec = partial_page_spec; - dispatch_params->padded_page_size = - partial_page_spec.partial_page_size * partial_page_spec.num_partial_pages_per_full_page; + dispatch_params = BufferReadLargePageDispatchParams{}; } else { - dispatch_params->padded_page_size = buffer.aligned_page_size(); + dispatch_params = BufferReadDispatchParams{}; } + IDevice* device = buffer.device(); + + std::visit( + [&](auto& params) { + params.total_pages_to_read = region.size / buffer.page_size(); + params.src_page_index = region.offset / buffer.page_size(); + params.cq_id = cq_id; + params.device = device; + params.address = buffer.address(); + params.unpadded_dst_offset = 0; + params.expected_num_workers_completed = expected_num_workers_completed; + params.num_banks = device->allocator()->get_num_banks(buffer.buffer_type()); + + if constexpr (std::is_same_v, BufferReadLargePageDispatchParams>) { + const PartialPageSpec partial_page_spec = calculate_partial_page_spec(buffer); + params.partial_page_spec = partial_page_spec; + params.padded_page_size = + partial_page_spec.partial_page_size * partial_page_spec.num_partial_pages_per_full_page; + } else { + params.padded_page_size = buffer.aligned_page_size(); + } + }, + dispatch_params); + return dispatch_params; } diff --git a/tt_metal/impl/buffers/dispatch.hpp b/tt_metal/impl/buffers/dispatch.hpp index 47e5bc2c2c9..dda1a96558b 100644 --- a/tt_metal/impl/buffers/dispatch.hpp +++ b/tt_metal/impl/buffers/dispatch.hpp @@ -86,7 +86,8 @@ struct PartialPageSpec { uint32_t num_partial_pages_per_full_page = 0; }; -struct BufferReadLargePageDispatchParams : BufferReadDispatchParams { +class BufferReadLargePageDispatchParams : public BufferReadDispatchParams { +public: PartialPageSpec partial_page_spec; void update_params_to_be_within_bounds(const Buffer& buffer) override { @@ -106,6 +107,8 @@ struct BufferReadLargePageDispatchParams : BufferReadDispatchParams { } }; +using BufferReadDispatchParamsVariant = std::variant; + struct ShardedBufferReadDispatchParams : BufferReadDispatchParams { bool width_split = false; uint32_t initial_pages_skipped = 0; @@ -131,7 +134,7 @@ ShardedBufferReadDispatchParams initialize_sharded_buf_read_dispatch_params( tt::stl::Span expected_num_workers_completed, const BufferRegion& region); -std::unique_ptr initialize_interleaved_buf_read_dispatch_params( +BufferReadDispatchParamsVariant initialize_interleaved_buf_read_dispatch_params( Buffer& buffer, uint32_t cq_id, tt::stl::Span expected_num_workers_completed, @@ -171,7 +174,7 @@ std::shared_ptr<::tt::tt_metal::CompletionReaderVariant> generate_sharded_buffer std::shared_ptr<::tt::tt_metal::CompletionReaderVariant> generate_interleaved_buffer_read_descriptor( void* dst, BufferReadDispatchParams* dispatch_params, Buffer& buffer); -bool are_pages_large(const Buffer& buffer); +bool are_pages_larger_than_max_prefetch_cmd_size(const Buffer& buffer); PartialPageSpec calculate_partial_page_spec(const Buffer& buffer); } // namespace buffer_dispatch diff --git a/tt_metal/impl/dispatch/hardware_command_queue.cpp b/tt_metal/impl/dispatch/hardware_command_queue.cpp index d44404e8926..cf07fd4a65b 100644 --- a/tt_metal/impl/dispatch/hardware_command_queue.cpp +++ b/tt_metal/impl/dispatch/hardware_command_queue.cpp @@ -207,8 +207,14 @@ void HWCommandQueue::enqueue_read_buffer( } else { // Forward data from device to the completion queue. // Then have the completion queue reader thread copy this data to user space. - auto dispatch_params = buffer_dispatch::initialize_interleaved_buf_read_dispatch_params( - buffer_obj, this->id_, this->expected_num_workers_completed, region); + buffer_dispatch::BufferReadDispatchParamsVariant dispatch_params_variant = + buffer_dispatch::initialize_interleaved_buf_read_dispatch_params( + buffer_obj, this->id_, this->expected_num_workers_completed, region); + + buffer_dispatch::BufferReadDispatchParams* dispatch_params = std::visit( + [](auto& val) { return static_cast(&val); }, + dispatch_params_variant); + buffer_dispatch::copy_interleaved_buffer_to_completion_queue( *dispatch_params, buffer_obj, @@ -216,7 +222,7 @@ void HWCommandQueue::enqueue_read_buffer( dispatch_core_manager::instance().get_dispatch_core_type(device_->id())); if (dispatch_params->pages_per_txn > 0) { this->issued_completion_q_reads.push( - buffer_dispatch::generate_interleaved_buffer_read_descriptor(dst, dispatch_params.get(), buffer_obj)); + buffer_dispatch::generate_interleaved_buffer_read_descriptor(dst, dispatch_params, buffer_obj)); this->increment_num_entries_in_completion_q(); } } diff --git a/tt_metal/llrt/blackhole/bh_hal.cpp b/tt_metal/llrt/blackhole/bh_hal.cpp index b19d3683e07..6f4f74895da 100644 --- a/tt_metal/llrt/blackhole/bh_hal.cpp +++ b/tt_metal/llrt/blackhole/bh_hal.cpp @@ -4,6 +4,7 @@ #include #include +#include #include "core_config.h" // ProgrammableCoreType #include "dev_mem_map.h" @@ -56,6 +57,13 @@ void Hal::initialize_bh() { this->mem_alignments_[static_cast(HalMemType::DRAM)] = DRAM_ALIGNMENT; this->mem_alignments_[static_cast(HalMemType::HOST)] = PCIE_ALIGNMENT; + this->mem_alignments_with_pcie_.resize(static_cast(HalMemType::COUNT)); + this->mem_alignments_with_pcie_[static_cast(HalMemType::L1)] = std::lcm(L1_ALIGNMENT, PCIE_ALIGNMENT); + this->mem_alignments_with_pcie_[static_cast(HalMemType::DRAM)] = + std::lcm(DRAM_ALIGNMENT, PCIE_ALIGNMENT); + this->mem_alignments_with_pcie_[static_cast(HalMemType::HOST)] = + std::lcm(PCIE_ALIGNMENT, PCIE_ALIGNMENT); + this->relocate_func_ = [](uint64_t addr, uint64_t local_init_addr) { if ((addr & MEM_LOCAL_BASE) == MEM_LOCAL_BASE) { // Move addresses in the local memory range to l1 (copied by kernel) diff --git a/tt_metal/llrt/grayskull/gs_hal.cpp b/tt_metal/llrt/grayskull/gs_hal.cpp index ea6cbd8a31d..2041c10f661 100644 --- a/tt_metal/llrt/grayskull/gs_hal.cpp +++ b/tt_metal/llrt/grayskull/gs_hal.cpp @@ -156,6 +156,13 @@ void Hal::initialize_gs() { this->mem_alignments_[static_cast(HalMemType::DRAM)] = DRAM_ALIGNMENT; this->mem_alignments_[static_cast(HalMemType::HOST)] = PCIE_ALIGNMENT; + this->mem_alignments_with_pcie_.resize(static_cast(HalMemType::COUNT)); + this->mem_alignments_with_pcie_[static_cast(HalMemType::L1)] = std::lcm(L1_ALIGNMENT, PCIE_ALIGNMENT); + this->mem_alignments_with_pcie_[static_cast(HalMemType::DRAM)] = + std::lcm(DRAM_ALIGNMENT, PCIE_ALIGNMENT); + this->mem_alignments_with_pcie_[static_cast(HalMemType::HOST)] = + std::lcm(PCIE_ALIGNMENT, PCIE_ALIGNMENT); + this->relocate_func_ = [](uint64_t addr, uint64_t local_init_addr) { if ((addr & MEM_LOCAL_BASE) == MEM_LOCAL_BASE) { // Move addresses in the local memory range to l1 (copied by kernel) diff --git a/tt_metal/llrt/wormhole/wh_hal.cpp b/tt_metal/llrt/wormhole/wh_hal.cpp index af3de9d9e34..df6fc1a6bbc 100644 --- a/tt_metal/llrt/wormhole/wh_hal.cpp +++ b/tt_metal/llrt/wormhole/wh_hal.cpp @@ -4,6 +4,7 @@ #include #include +#include #include "core_config.h" // ProgrammableCoreType #include "dev_mem_map.h" // MEM_LOCAL_BASE @@ -57,6 +58,13 @@ void Hal::initialize_wh() { this->mem_alignments_[static_cast(HalMemType::DRAM)] = DRAM_ALIGNMENT; this->mem_alignments_[static_cast(HalMemType::HOST)] = PCIE_ALIGNMENT; + this->mem_alignments_with_pcie_.resize(static_cast(HalMemType::COUNT)); + this->mem_alignments_with_pcie_[static_cast(HalMemType::L1)] = std::lcm(L1_ALIGNMENT, PCIE_ALIGNMENT); + this->mem_alignments_with_pcie_[static_cast(HalMemType::DRAM)] = + std::lcm(DRAM_ALIGNMENT, PCIE_ALIGNMENT); + this->mem_alignments_with_pcie_[static_cast(HalMemType::HOST)] = + std::lcm(PCIE_ALIGNMENT, PCIE_ALIGNMENT); + this->relocate_func_ = [](uint64_t addr, uint64_t local_init_addr) { if ((addr & MEM_LOCAL_BASE) == MEM_LOCAL_BASE) { // Move addresses in the local memory range to l1 (copied by kernel) diff --git a/tt_metal/tt_metal.cpp b/tt_metal/tt_metal.cpp index 0831892c1ac..51984ffa874 100644 --- a/tt_metal/tt_metal.cpp +++ b/tt_metal/tt_metal.cpp @@ -463,7 +463,7 @@ DeviceAddr CalculateAddressDeviceInterleavedContiguous( addr = buffer.page_address(bank_index, page_index); } - if (buffer_dispatch::are_pages_large(buffer)) { + if (buffer_dispatch::are_pages_larger_than_max_prefetch_cmd_size(buffer)) { const buffer_dispatch::PartialPageSpec& partial_page_spec = buffer_dispatch::calculate_partial_page_spec(buffer); const uint32_t full_padded_page_size = From 073c42244291a7e697588aedc2e4a8e10e561f43 Mon Sep 17 00:00:00 2001 From: Samarth Agarwal Date: Wed, 5 Mar 2025 05:18:54 +0000 Subject: [PATCH 21/24] Saving work --- ...queueWriteBuffer_and_EnqueueReadBuffer.cpp | 4 + tt_metal/api/tt-metalium/hal.hpp | 2 +- tt_metal/impl/buffers/dispatch.cpp | 153 +++++++++++++----- tt_metal/impl/buffers/dispatch.hpp | 32 ++-- tt_metal/tt_metal.cpp | 14 +- 5 files changed, 142 insertions(+), 63 deletions(-) diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp index 3d5710967e4..96ea52c59f8 100644 --- a/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp @@ -9,6 +9,7 @@ #include "buffer_constants.hpp" #include "command_queue_fixture.hpp" #include "core_coord.hpp" +#include "hal.hpp" #include "math.hpp" #include "shape2d.hpp" #include "multi_command_queue_fixture.hpp" @@ -213,6 +214,9 @@ void test_EnqueueWriteBuffer_and_EnqueueReadBuffer(IDevice* device, CommandQueue mmio_device_id, channel); + std::cout << "HAL L1 Size: " << HAL_MEM_L1_SIZE << std::endl; + std::cout << "HAL L1 Base: " << HAL_MEM_L1_BASE << std::endl; + for (const bool cq_write : {true, false}) { for (const bool cq_read : {true, false}) { if constexpr (cq_dispatch_only) { diff --git a/tt_metal/api/tt-metalium/hal.hpp b/tt_metal/api/tt-metalium/hal.hpp index d20f0501e2e..13614245428 100644 --- a/tt_metal/api/tt-metalium/hal.hpp +++ b/tt_metal/api/tt-metalium/hal.hpp @@ -355,7 +355,7 @@ inline uint32_t Hal::get_alignment(HalMemType memory_type) const { inline uint32_t Hal::get_common_alignment_with_pcie(HalMemType memory_type) const { uint32_t index = utils::underlying_type(memory_type); - TT_ASSERT(index < this->mem_alignments_.size()); + TT_ASSERT(index < this->mem_alignments_with_pcie_.size()); return this->mem_alignments_with_pcie_[index]; } diff --git a/tt_metal/impl/buffers/dispatch.cpp b/tt_metal/impl/buffers/dispatch.cpp index cc07c1a749c..6ba64f36d2d 100644 --- a/tt_metal/impl/buffers/dispatch.cpp +++ b/tt_metal/impl/buffers/dispatch.cpp @@ -8,8 +8,10 @@ #include "dispatch.hpp" #include #include +#include #include "hal.hpp" +#include "logger.hpp" #include "tt_cluster.hpp" namespace tt::tt_metal { @@ -62,7 +64,6 @@ class InterleavedBufferWriteDispatchParams : public BufferWriteDispatchParams { this->expected_num_workers_completed = expected_num_workers_completed; } - InterleavedBufferWriteDispatchParams() = default; virtual ~InterleavedBufferWriteDispatchParams() = default; void calculate_issue_wait() { @@ -98,7 +99,9 @@ class InterleavedBufferWriteDispatchParams : public BufferWriteDispatchParams { virtual uint32_t num_partial_pages_per_full_page() const { return 1; } - virtual uint32_t get_padding_for_last_partial_page() const { return 0; } + virtual uint32_t partial_page_size() const { return this->page_size_to_write; } + + virtual uint32_t get_additional_padding_for_last_partial_page() const { return 0; } protected: uint32_t num_banks = 0; @@ -111,23 +114,23 @@ class InterleavedBufferWriteLargePageDispatchParams : public InterleavedBufferWr uint32_t dst_page_index, const PartialPageSpec& partial_page_spec, uint32_t total_pages_to_write, - uint32_t full_page_size, uint32_t num_full_pages, uint32_t cq_id, tt::stl::Span expected_num_workers_completed) : InterleavedBufferWriteDispatchParams( - buffer, dst_page_index, total_pages_to_write, cq_id, expected_num_workers_completed) { - this->buffer_address = buffer.address(); + buffer, dst_page_index, total_pages_to_write, cq_id, expected_num_workers_completed), + buffer(buffer) { + this->size_of_partial_page = partial_page_spec.partial_page_size; this->page_size_to_write = partial_page_spec.partial_page_size; this->data_size_to_copy = partial_page_spec.partial_page_size; this->full_pages_to_write = num_full_pages; - this->full_page_size = full_page_size; + // this->full_page_size = partial_page_spec.num_partial_pages_per_full_page * + // partial_page_spec.partial_page_size; this->num_partial_pages_in_single_full_page = partial_page_spec.num_partial_pages_per_full_page; - this->last_partial_page_padding = partial_page_spec.last_partial_page_padding; + this->last_partial_page_additional_padding = partial_page_spec.last_partial_page_additional_padding; + this->curr_full_pages_start_address = buffer.address(); } - InterleavedBufferWriteLargePageDispatchParams() = default; - void calculate_num_pages_for_write_transaction(uint32_t num_pages_available_in_cq) override { TT_ASSERT(this->num_banks > this->dst_page_index); this->pages_per_txn = @@ -138,7 +141,8 @@ class InterleavedBufferWriteLargePageDispatchParams : public InterleavedBufferWr void update_params_to_be_within_bounds() override { const uint32_t num_pages_written_per_bank = this->dst_page_index / this->num_banks; - this->address += num_pages_written_per_bank * this->full_page_size; + this->address += num_pages_written_per_bank * this->buffer.aligned_page_size(); + this->curr_full_pages_start_address = this->address; this->dst_page_index %= this->num_banks; } @@ -150,10 +154,24 @@ class InterleavedBufferWriteLargePageDispatchParams : public InterleavedBufferWr this->full_pages_to_write -= this->pages_per_txn; this->full_pages_written += this->pages_per_txn; if (!this->will_next_full_page_be_round_robined()) { - this->address -= this->full_page_size; + // this->address -= this->full_page_size; + this->address = this->curr_full_pages_start_address; + } else { + // this->address += this->page_size_to_write; } + TT_ASSERT((this->address - this->curr_full_pages_start_address) % this->buffer.aligned_page_size() == 0); + this->curr_full_pages_start_address = this->address; this->dst_page_index += this->pages_per_txn; this->dst_page_index %= this->num_banks; + this->page_size_to_write = this->size_of_partial_page; + this->data_size_to_copy = this->size_of_partial_page; + } else if (this->will_full_pages_be_written_in_next_write_transaction()) { + // this->address += this->page_size_to_write; + this->page_size_to_write = + this->buffer.aligned_page_size() - (this->address - this->curr_full_pages_start_address); + this->data_size_to_copy = this->buffer.page_size() - (this->address - this->curr_full_pages_start_address); + } else { + // this->address += this->page_size_to_write; } } @@ -163,25 +181,43 @@ class InterleavedBufferWriteLargePageDispatchParams : public InterleavedBufferWr uint32_t num_partial_pages_per_full_page() const override { return this->num_partial_pages_in_single_full_page; } - uint32_t get_padding_for_last_partial_page() const override { return this->last_partial_page_padding; } + uint32_t partial_page_size() const override { return this->size_of_partial_page; } + + uint32_t get_additional_padding_for_last_partial_page() const override { + return this->last_partial_page_additional_padding; + } private: - uint32_t buffer_address; - uint32_t num_partial_pages_in_single_full_page; - uint32_t last_partial_page_padding; - uint32_t full_page_size; + const Buffer& buffer; + uint32_t curr_full_pages_start_address = 0; + uint32_t size_of_partial_page = 0; + uint32_t num_partial_pages_in_single_full_page = 0; + uint32_t last_partial_page_additional_padding = 0; + // uint32_t full_page_size = 0; uint32_t full_pages_written = 0; uint32_t full_pages_to_write = 0; bool were_full_pages_written_in_last_write_transaction() const { - const uint32_t page_size = this->address - this->buffer_address; - return page_size > 0 && page_size % this->full_page_size == 0; + const int32_t page_size = this->address - this->curr_full_pages_start_address; + return page_size == this->buffer.aligned_page_size(); + } + + bool will_full_pages_be_written_in_next_write_transaction() const { + const int32_t page_size = this->address + this->page_size_to_write - this->curr_full_pages_start_address; + return page_size >= this->buffer.aligned_page_size(); } bool will_next_full_page_be_round_robined() const { const uint32_t dst_page_index_next_txn = this->dst_page_index + this->pages_per_txn; return dst_page_index_next_txn != (dst_page_index_next_txn % this->num_banks); } + + // void update_address_after_write_transaction() { + // if (this->address - this->curr_full_pages_start_address + this->page_size_to_write > + // this->buffer.aligned_page_size()) { + + // } + // } }; // Parameters specific to sharded buffers @@ -277,13 +313,14 @@ PartialPageSpec calculate_partial_page_spec(const Buffer& buffer) { partial_page_spec.partial_page_size = calculate_partial_page_size(buffer); partial_page_spec.num_partial_pages_per_full_page = tt::div_up(buffer.aligned_page_size(), partial_page_spec.partial_page_size); - partial_page_spec.last_partial_page_padding = - (partial_page_spec.num_partial_pages_per_full_page * partial_page_spec.partial_page_size) - buffer.page_size(); + partial_page_spec.last_partial_page_additional_padding = + (partial_page_spec.num_partial_pages_per_full_page * partial_page_spec.partial_page_size) - + buffer.aligned_page_size(); return partial_page_spec; } using InterleavedBufferWriteDispatchParamsVariant = - std::variant; + std::variant; InterleavedBufferWriteDispatchParamsVariant initialize_interleaved_buf_dispatch_params( const Buffer& buffer, @@ -297,22 +334,34 @@ InterleavedBufferWriteDispatchParamsVariant initialize_interleaved_buf_dispatch_ const uint32_t dst_page_index = region.offset / buffer.page_size(); if (are_pages_larger_than_max_prefetch_cmd_size(buffer)) { + tt::log_info( + LogDispatch, + "Initializing large page write params - buffer id: {}. buffer page size: {}, buffer num pages: {}, buffer " + "addr: {}", + buffer.unique_id(), + buffer.page_size(), + buffer.num_pages(), + buffer.address()); const PartialPageSpec partial_page_spec = calculate_partial_page_spec(buffer); - const uint32_t full_page_size = - partial_page_spec.num_partial_pages_per_full_page * partial_page_spec.partial_page_size; const uint32_t num_full_pages = total_pages_to_write; - const uint32_t padded_buffer_size = total_pages_to_write * buffer.aligned_page_size(); + // const uint32_t padded_buffer_size = total_pages_to_write * buffer.aligned_page_size(); total_pages_to_write = num_full_pages * partial_page_spec.num_partial_pages_per_full_page; dispatch_params.emplace( buffer, dst_page_index, partial_page_spec, total_pages_to_write, - full_page_size, num_full_pages, cq_id, expected_num_workers_completed); } else { + tt::log_info( + LogDispatch, + "Initializing write params - buffer id: {}, buffer page size: {}, buffer num pages {}, buffer addr: {}", + buffer.unique_id(), + buffer.page_size(), + buffer.num_pages(), + buffer.address()); dispatch_params.emplace( buffer, dst_page_index, total_pages_to_write, cq_id, expected_num_workers_completed); } @@ -354,17 +403,19 @@ void populate_interleaved_buffer_write_dispatch_cmds( uint32_t num_partial_pages_written_curr_txn = 0; for (uint32_t sysmem_address_offset = 0; sysmem_address_offset < data_size_bytes; sysmem_address_offset += dispatch_params.page_size_to_write) { - uint32_t page_size_to_copy = dispatch_params.data_size_to_copy; - uint32_t src_address_offset = num_full_pages_written * buffer.page_size() + - num_partial_pages_written_per_current_full_page * page_size_to_copy + - num_partial_pages_written_curr_txn * buffer.page_size(); - if (num_partial_pages_written_per_current_full_page == num_partial_pages_per_full_page - 1) { - // last partial page being copied from unpadded src buffer - const uint32_t padding = dispatch_params.get_padding_for_last_partial_page(); - page_size_to_copy -= padding; - } + // uint32_t page_size_to_copy = dispatch_params.data_size_to_copy; + uint32_t src_address_offset = + num_full_pages_written * buffer.page_size() + + num_partial_pages_written_per_current_full_page * dispatch_params.partial_page_size() + + num_partial_pages_written_curr_txn * buffer.page_size(); + // if (num_partial_pages_written_per_current_full_page == num_partial_pages_per_full_page - 1) { + // // last partial page being copied from unpadded src buffer + // const uint32_t padding = (buffer.aligned_page_size() - buffer.page_size()) + + // dispatch_params.get_additional_padding_for_last_partial_page(); + // page_size_to_copy -= padding; + // } command_sequence.add_data( - (char*)src + src_address_offset, page_size_to_copy, dispatch_params.page_size_to_write); + (char*)src + src_address_offset, dispatch_params.data_size_to_copy, dispatch_params.page_size_to_write); num_partial_pages_written_curr_txn += 1; } } else { @@ -726,7 +777,12 @@ void write_to_device_buffer( buffer, buf_dispatch_constants, cq_id, expected_num_workers_completed, region); InterleavedBufferWriteDispatchParams* dispatch_params = std::visit( - [](auto& val) { return static_cast(&val); }, + [](auto& val) -> InterleavedBufferWriteDispatchParams* { + if constexpr (!std::is_same_v, std::monostate>) { + return static_cast(&val); + } + return nullptr; + }, dispatch_params_variant); write_interleaved_buffer_to_device( @@ -772,11 +828,27 @@ BufferReadDispatchParamsVariant initialize_interleaved_buf_read_dispatch_params( validate_buffer_region_conditions(buffer, region); BufferReadDispatchParamsVariant dispatch_params; - + // This is the issue const bool read_large_pages = are_pages_larger_than_max_prefetch_cmd_size(buffer); if (read_large_pages) { + tt::log_info( + LogDispatch, + "Initializing large page read params - buffer id: {}, buffer page size: {}, buffer aligned page size: {}, " + "buffer num pages: {}, buffer addr: {}", + buffer.unique_id(), + buffer.page_size(), + buffer.aligned_page_size(), + buffer.num_pages(), + buffer.address()); dispatch_params = BufferReadLargePageDispatchParams{}; } else { + tt::log_info( + LogDispatch, + "Initializing read params - buffer id: {}, buffer page size: {}, buffer num pages: {}, buffer addr: {}", + buffer.unique_id(), + buffer.page_size(), + buffer.num_pages(), + buffer.address()); dispatch_params = BufferReadDispatchParams{}; } @@ -796,8 +868,8 @@ BufferReadDispatchParamsVariant initialize_interleaved_buf_read_dispatch_params( if constexpr (std::is_same_v, BufferReadLargePageDispatchParams>) { const PartialPageSpec partial_page_spec = calculate_partial_page_spec(buffer); params.partial_page_spec = partial_page_spec; - params.padded_page_size = - partial_page_spec.partial_page_size * partial_page_spec.num_partial_pages_per_full_page; + params.padded_page_size = buffer.aligned_page_size(); + // partial_page_spec.partial_page_size * partial_page_spec.num_partial_pages_per_full_page; } else { params.padded_page_size = buffer.aligned_page_size(); } @@ -950,6 +1022,7 @@ void copy_interleaved_buffer_to_completion_queue( if (dispatch_params.src_page_index > CQ_PREFETCH_RELAY_PAGED_START_PAGE_MASK) { dispatch_params.update_params_to_be_within_bounds(buffer); } + tt::log_info(tt::LogDispatch, "copy_interleaved_buffer_to_completion_queue"); dispatch_params.calculate_num_pages_for_read_transaction(); issue_read_buffer_dispatch_command_sequence(buffer, dispatch_params, sub_device_ids, dispatch_core_type); dispatch_params.update_params_after_read_transaction(); @@ -1003,6 +1076,8 @@ void copy_completion_queue_data_into_user_space( uint32_t remaining_bytes_to_read = padded_num_bytes; uint32_t dev_page_id = cur_dev_page_id; + tt::log_info(tt::LogDispatch, "copy_completion_queue_data_into_user_space"); + // track the amount of bytes read in the last non-aligned page uint32_t remaining_bytes_of_nonaligned_page = 0; std::optional host_page_id = std::nullopt; diff --git a/tt_metal/impl/buffers/dispatch.hpp b/tt_metal/impl/buffers/dispatch.hpp index dda1a96558b..822ea496bae 100644 --- a/tt_metal/impl/buffers/dispatch.hpp +++ b/tt_metal/impl/buffers/dispatch.hpp @@ -82,7 +82,7 @@ struct BufferReadDispatchParams { struct PartialPageSpec { uint32_t partial_page_size = 0; - uint32_t last_partial_page_padding = 0; + uint32_t last_partial_page_additional_padding = 0; uint32_t num_partial_pages_per_full_page = 0; }; @@ -90,21 +90,21 @@ class BufferReadLargePageDispatchParams : public BufferReadDispatchParams { public: PartialPageSpec partial_page_spec; - void update_params_to_be_within_bounds(const Buffer& buffer) override { - const uint32_t num_pages_per_bank = this->src_page_index / this->num_banks; - this->address += num_pages_per_bank * (this->partial_page_spec.num_partial_pages_per_full_page * - this->partial_page_spec.partial_page_size); - this->src_page_index = this->src_page_index % this->num_banks; - } - - void update_params_after_read_transaction() override { - this->total_pages_to_read -= this->pages_per_txn; - this->total_pages_read += this->pages_per_txn; - this->address += - ((this->src_page_index + this->pages_per_txn) / this->num_banks) * - (this->partial_page_spec.num_partial_pages_per_full_page * this->partial_page_spec.partial_page_size); - this->src_page_index = (this->src_page_index + this->pages_per_txn) % this->num_banks; - } + // void update_params_to_be_within_bounds(const Buffer& buffer) override { + // const uint32_t num_pages_per_bank = this->src_page_index / this->num_banks; + // this->address += num_pages_per_bank * (this->partial_page_spec.num_partial_pages_per_full_page * + // this->partial_page_spec.partial_page_size); + // this->src_page_index = this->src_page_index % this->num_banks; + // } + + // void update_params_after_read_transaction() override { + // this->total_pages_to_read -= this->pages_per_txn; + // this->total_pages_read += this->pages_per_txn; + // this->address += + // ((this->src_page_index + this->pages_per_txn) / this->num_banks) * + // (this->partial_page_spec.num_partial_pages_per_full_page * this->partial_page_spec.partial_page_size); + // this->src_page_index = (this->src_page_index + this->pages_per_txn) % this->num_banks; + // } }; using BufferReadDispatchParamsVariant = std::variant; diff --git a/tt_metal/tt_metal.cpp b/tt_metal/tt_metal.cpp index 51984ffa874..77a22e83448 100644 --- a/tt_metal/tt_metal.cpp +++ b/tt_metal/tt_metal.cpp @@ -464,13 +464,13 @@ DeviceAddr CalculateAddressDeviceInterleavedContiguous( } if (buffer_dispatch::are_pages_larger_than_max_prefetch_cmd_size(buffer)) { - const buffer_dispatch::PartialPageSpec& partial_page_spec = - buffer_dispatch::calculate_partial_page_spec(buffer); - const uint32_t full_padded_page_size = - partial_page_spec.partial_page_size * partial_page_spec.num_partial_pages_per_full_page; - const DeviceAddr full_page_address_offset = - (num_round_robins > 0) ? (full_padded_page_size - buffer.aligned_page_size()) * num_round_robins : 0; - addr += full_page_address_offset; + // const buffer_dispatch::PartialPageSpec& partial_page_spec = + // buffer_dispatch::calculate_partial_page_spec(buffer); + // const uint32_t full_padded_page_size = + // partial_page_spec.partial_page_size * partial_page_spec.num_partial_pages_per_full_page; + // const DeviceAddr full_page_address_offset = + // (num_round_robins > 0) ? (full_padded_page_size - buffer.aligned_page_size()) * num_round_robins : 0; + // addr += (buffer.aligned_page_size() * num_round_robins); } return addr; } From 2e4736eefc0f469bb17578b5043062e4aa74d94d Mon Sep 17 00:00:00 2001 From: Samarth Agarwal Date: Wed, 5 Mar 2025 15:08:58 +0000 Subject: [PATCH 22/24] Cleanup --- .../test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp index 96ea52c59f8..3d5710967e4 100644 --- a/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp @@ -9,7 +9,6 @@ #include "buffer_constants.hpp" #include "command_queue_fixture.hpp" #include "core_coord.hpp" -#include "hal.hpp" #include "math.hpp" #include "shape2d.hpp" #include "multi_command_queue_fixture.hpp" @@ -214,9 +213,6 @@ void test_EnqueueWriteBuffer_and_EnqueueReadBuffer(IDevice* device, CommandQueue mmio_device_id, channel); - std::cout << "HAL L1 Size: " << HAL_MEM_L1_SIZE << std::endl; - std::cout << "HAL L1 Base: " << HAL_MEM_L1_BASE << std::endl; - for (const bool cq_write : {true, false}) { for (const bool cq_read : {true, false}) { if constexpr (cq_dispatch_only) { From a0f352fc5436cd37a587260b0a5d73c156196969 Mon Sep 17 00:00:00 2001 From: Samarth Agarwal Date: Wed, 5 Mar 2025 18:37:22 +0000 Subject: [PATCH 23/24] Saving work --- tt_metal/api/tt-metalium/dispatch_settings.hpp | 9 ++++++--- tt_metal/impl/buffers/dispatch.cpp | 9 ++++++++- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/tt_metal/api/tt-metalium/dispatch_settings.hpp b/tt_metal/api/tt-metalium/dispatch_settings.hpp index 731d9a3e595..9da28d05825 100644 --- a/tt_metal/api/tt-metalium/dispatch_settings.hpp +++ b/tt_metal/api/tt-metalium/dispatch_settings.hpp @@ -135,10 +135,13 @@ class DispatchSettings { static constexpr uint32_t EVENT_PADDED_SIZE = 16; // When page size of buffer to write/read exceeds the max prefetch command size, the PCIe-aligned page size is - // broken down into equal sized partial pages. BASE_PARTIAL_PAGE_SIZE is incremented until the partial page size + // broken down into equal sized partial pages. The base partial page size is incremented until it // is PCIE-aligned. If the resulting partial page size doesn't evenly divide the full page size, the last partial - // page size is padded appropriately. - static constexpr uint32_t BASE_PARTIAL_PAGE_SIZE = 4096; + // page size is padded appropriately. The base partial page size is different for tensix dispatch and eth dispatch + // because the max prefetch command size is different depending on the dispatch core type. + static constexpr uint32_t BASE_PARTIAL_PAGE_SIZE_TENSIX_DISPATCH = 4096; + static constexpr uint32_t BASE_PARTIAL_PAGE_SIZE_ETH_DISPATCH = BASE_PARTIAL_PAGE_SIZE_TENSIX_DISPATCH / 4; + static_assert(BASE_PARTIAL_PAGE_SIZE_TENSIX_DISPATCH % 4 == 0); static_assert( DISPATCH_MESSAGE_ENTRIES <= diff --git a/tt_metal/impl/buffers/dispatch.cpp b/tt_metal/impl/buffers/dispatch.cpp index 6ba64f36d2d..d81f7018b9f 100644 --- a/tt_metal/impl/buffers/dispatch.cpp +++ b/tt_metal/impl/buffers/dispatch.cpp @@ -302,7 +302,14 @@ ShardedBufferWriteDispatchParams initialize_sharded_buf_dispatch_params( } uint32_t calculate_partial_page_size(const Buffer& buffer) { - uint32_t partial_page_size = DispatchSettings::BASE_PARTIAL_PAGE_SIZE; + uint32_t partial_page_size = 0; + const CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(buffer.device()->id()); + if (dispatch_core_type == CoreType::WORKER) { + partial_page_size = DispatchSettings::BASE_PARTIAL_PAGE_SIZE_TENSIX_DISPATCH; + } else { + TT_ASSERT(dispatch_core_type == CoreType::ETH); + partial_page_size = DispatchSettings::BASE_PARTIAL_PAGE_SIZE_ETH_DISPATCH; + } const HalMemType buffer_mem_type = buffer.memory_type(); partial_page_size = tt::align(partial_page_size, hal.get_common_alignment_with_pcie(buffer_mem_type)); return partial_page_size; From b14186bcfc90d8a8d88e4c193eedf51d397858ba Mon Sep 17 00:00:00 2001 From: Samarth Agarwal Date: Thu, 6 Mar 2025 17:05:34 +0000 Subject: [PATCH 24/24] Cleanup --- tt_metal/impl/buffers/dispatch.cpp | 82 ++---------------------------- tt_metal/impl/buffers/dispatch.hpp | 20 +------- tt_metal/tt_metal.cpp | 29 ++--------- 3 files changed, 8 insertions(+), 123 deletions(-) diff --git a/tt_metal/impl/buffers/dispatch.cpp b/tt_metal/impl/buffers/dispatch.cpp index d81f7018b9f..4834ee9c623 100644 --- a/tt_metal/impl/buffers/dispatch.cpp +++ b/tt_metal/impl/buffers/dispatch.cpp @@ -8,10 +8,7 @@ #include "dispatch.hpp" #include #include -#include -#include "hal.hpp" -#include "logger.hpp" #include "tt_cluster.hpp" namespace tt::tt_metal { @@ -101,8 +98,6 @@ class InterleavedBufferWriteDispatchParams : public BufferWriteDispatchParams { virtual uint32_t partial_page_size() const { return this->page_size_to_write; } - virtual uint32_t get_additional_padding_for_last_partial_page() const { return 0; } - protected: uint32_t num_banks = 0; }; @@ -124,10 +119,7 @@ class InterleavedBufferWriteLargePageDispatchParams : public InterleavedBufferWr this->page_size_to_write = partial_page_spec.partial_page_size; this->data_size_to_copy = partial_page_spec.partial_page_size; this->full_pages_to_write = num_full_pages; - // this->full_page_size = partial_page_spec.num_partial_pages_per_full_page * - // partial_page_spec.partial_page_size; this->num_partial_pages_in_single_full_page = partial_page_spec.num_partial_pages_per_full_page; - this->last_partial_page_additional_padding = partial_page_spec.last_partial_page_additional_padding; this->curr_full_pages_start_address = buffer.address(); } @@ -154,10 +146,7 @@ class InterleavedBufferWriteLargePageDispatchParams : public InterleavedBufferWr this->full_pages_to_write -= this->pages_per_txn; this->full_pages_written += this->pages_per_txn; if (!this->will_next_full_page_be_round_robined()) { - // this->address -= this->full_page_size; this->address = this->curr_full_pages_start_address; - } else { - // this->address += this->page_size_to_write; } TT_ASSERT((this->address - this->curr_full_pages_start_address) % this->buffer.aligned_page_size() == 0); this->curr_full_pages_start_address = this->address; @@ -166,12 +155,9 @@ class InterleavedBufferWriteLargePageDispatchParams : public InterleavedBufferWr this->page_size_to_write = this->size_of_partial_page; this->data_size_to_copy = this->size_of_partial_page; } else if (this->will_full_pages_be_written_in_next_write_transaction()) { - // this->address += this->page_size_to_write; this->page_size_to_write = this->buffer.aligned_page_size() - (this->address - this->curr_full_pages_start_address); this->data_size_to_copy = this->buffer.page_size() - (this->address - this->curr_full_pages_start_address); - } else { - // this->address += this->page_size_to_write; } } @@ -183,17 +169,11 @@ class InterleavedBufferWriteLargePageDispatchParams : public InterleavedBufferWr uint32_t partial_page_size() const override { return this->size_of_partial_page; } - uint32_t get_additional_padding_for_last_partial_page() const override { - return this->last_partial_page_additional_padding; - } - private: const Buffer& buffer; uint32_t curr_full_pages_start_address = 0; uint32_t size_of_partial_page = 0; uint32_t num_partial_pages_in_single_full_page = 0; - uint32_t last_partial_page_additional_padding = 0; - // uint32_t full_page_size = 0; uint32_t full_pages_written = 0; uint32_t full_pages_to_write = 0; @@ -211,13 +191,6 @@ class InterleavedBufferWriteLargePageDispatchParams : public InterleavedBufferWr const uint32_t dst_page_index_next_txn = this->dst_page_index + this->pages_per_txn; return dst_page_index_next_txn != (dst_page_index_next_txn % this->num_banks); } - - // void update_address_after_write_transaction() { - // if (this->address - this->curr_full_pages_start_address + this->page_size_to_write > - // this->buffer.aligned_page_size()) { - - // } - // } }; // Parameters specific to sharded buffers @@ -320,9 +293,6 @@ PartialPageSpec calculate_partial_page_spec(const Buffer& buffer) { partial_page_spec.partial_page_size = calculate_partial_page_size(buffer); partial_page_spec.num_partial_pages_per_full_page = tt::div_up(buffer.aligned_page_size(), partial_page_spec.partial_page_size); - partial_page_spec.last_partial_page_additional_padding = - (partial_page_spec.num_partial_pages_per_full_page * partial_page_spec.partial_page_size) - - buffer.aligned_page_size(); return partial_page_spec; } @@ -341,17 +311,8 @@ InterleavedBufferWriteDispatchParamsVariant initialize_interleaved_buf_dispatch_ const uint32_t dst_page_index = region.offset / buffer.page_size(); if (are_pages_larger_than_max_prefetch_cmd_size(buffer)) { - tt::log_info( - LogDispatch, - "Initializing large page write params - buffer id: {}. buffer page size: {}, buffer num pages: {}, buffer " - "addr: {}", - buffer.unique_id(), - buffer.page_size(), - buffer.num_pages(), - buffer.address()); const PartialPageSpec partial_page_spec = calculate_partial_page_spec(buffer); const uint32_t num_full_pages = total_pages_to_write; - // const uint32_t padded_buffer_size = total_pages_to_write * buffer.aligned_page_size(); total_pages_to_write = num_full_pages * partial_page_spec.num_partial_pages_per_full_page; dispatch_params.emplace( buffer, @@ -362,13 +323,6 @@ InterleavedBufferWriteDispatchParamsVariant initialize_interleaved_buf_dispatch_ cq_id, expected_num_workers_completed); } else { - tt::log_info( - LogDispatch, - "Initializing write params - buffer id: {}, buffer page size: {}, buffer num pages {}, buffer addr: {}", - buffer.unique_id(), - buffer.page_size(), - buffer.num_pages(), - buffer.address()); dispatch_params.emplace( buffer, dst_page_index, total_pages_to_write, cq_id, expected_num_workers_completed); } @@ -410,17 +364,10 @@ void populate_interleaved_buffer_write_dispatch_cmds( uint32_t num_partial_pages_written_curr_txn = 0; for (uint32_t sysmem_address_offset = 0; sysmem_address_offset < data_size_bytes; sysmem_address_offset += dispatch_params.page_size_to_write) { - // uint32_t page_size_to_copy = dispatch_params.data_size_to_copy; - uint32_t src_address_offset = + const uint32_t src_address_offset = num_full_pages_written * buffer.page_size() + num_partial_pages_written_per_current_full_page * dispatch_params.partial_page_size() + num_partial_pages_written_curr_txn * buffer.page_size(); - // if (num_partial_pages_written_per_current_full_page == num_partial_pages_per_full_page - 1) { - // // last partial page being copied from unpadded src buffer - // const uint32_t padding = (buffer.aligned_page_size() - buffer.page_size()) + - // dispatch_params.get_additional_padding_for_last_partial_page(); - // page_size_to_copy -= padding; - // } command_sequence.add_data( (char*)src + src_address_offset, dispatch_params.data_size_to_copy, dispatch_params.page_size_to_write); num_partial_pages_written_curr_txn += 1; @@ -835,27 +782,11 @@ BufferReadDispatchParamsVariant initialize_interleaved_buf_read_dispatch_params( validate_buffer_region_conditions(buffer, region); BufferReadDispatchParamsVariant dispatch_params; - // This is the issue + const bool read_large_pages = are_pages_larger_than_max_prefetch_cmd_size(buffer); if (read_large_pages) { - tt::log_info( - LogDispatch, - "Initializing large page read params - buffer id: {}, buffer page size: {}, buffer aligned page size: {}, " - "buffer num pages: {}, buffer addr: {}", - buffer.unique_id(), - buffer.page_size(), - buffer.aligned_page_size(), - buffer.num_pages(), - buffer.address()); dispatch_params = BufferReadLargePageDispatchParams{}; } else { - tt::log_info( - LogDispatch, - "Initializing read params - buffer id: {}, buffer page size: {}, buffer num pages: {}, buffer addr: {}", - buffer.unique_id(), - buffer.page_size(), - buffer.num_pages(), - buffer.address()); dispatch_params = BufferReadDispatchParams{}; } @@ -871,14 +802,11 @@ BufferReadDispatchParamsVariant initialize_interleaved_buf_read_dispatch_params( params.unpadded_dst_offset = 0; params.expected_num_workers_completed = expected_num_workers_completed; params.num_banks = device->allocator()->get_num_banks(buffer.buffer_type()); + params.padded_page_size = buffer.aligned_page_size(); if constexpr (std::is_same_v, BufferReadLargePageDispatchParams>) { const PartialPageSpec partial_page_spec = calculate_partial_page_spec(buffer); params.partial_page_spec = partial_page_spec; - params.padded_page_size = buffer.aligned_page_size(); - // partial_page_spec.partial_page_size * partial_page_spec.num_partial_pages_per_full_page; - } else { - params.padded_page_size = buffer.aligned_page_size(); } }, dispatch_params); @@ -1029,7 +957,7 @@ void copy_interleaved_buffer_to_completion_queue( if (dispatch_params.src_page_index > CQ_PREFETCH_RELAY_PAGED_START_PAGE_MASK) { dispatch_params.update_params_to_be_within_bounds(buffer); } - tt::log_info(tt::LogDispatch, "copy_interleaved_buffer_to_completion_queue"); + dispatch_params.calculate_num_pages_for_read_transaction(); issue_read_buffer_dispatch_command_sequence(buffer, dispatch_params, sub_device_ids, dispatch_core_type); dispatch_params.update_params_after_read_transaction(); @@ -1083,8 +1011,6 @@ void copy_completion_queue_data_into_user_space( uint32_t remaining_bytes_to_read = padded_num_bytes; uint32_t dev_page_id = cur_dev_page_id; - tt::log_info(tt::LogDispatch, "copy_completion_queue_data_into_user_space"); - // track the amount of bytes read in the last non-aligned page uint32_t remaining_bytes_of_nonaligned_page = 0; std::optional host_page_id = std::nullopt; diff --git a/tt_metal/impl/buffers/dispatch.hpp b/tt_metal/impl/buffers/dispatch.hpp index 822ea496bae..92f50314ff4 100644 --- a/tt_metal/impl/buffers/dispatch.hpp +++ b/tt_metal/impl/buffers/dispatch.hpp @@ -82,29 +82,11 @@ struct BufferReadDispatchParams { struct PartialPageSpec { uint32_t partial_page_size = 0; - uint32_t last_partial_page_additional_padding = 0; uint32_t num_partial_pages_per_full_page = 0; }; -class BufferReadLargePageDispatchParams : public BufferReadDispatchParams { -public: +struct BufferReadLargePageDispatchParams : BufferReadDispatchParams { PartialPageSpec partial_page_spec; - - // void update_params_to_be_within_bounds(const Buffer& buffer) override { - // const uint32_t num_pages_per_bank = this->src_page_index / this->num_banks; - // this->address += num_pages_per_bank * (this->partial_page_spec.num_partial_pages_per_full_page * - // this->partial_page_spec.partial_page_size); - // this->src_page_index = this->src_page_index % this->num_banks; - // } - - // void update_params_after_read_transaction() override { - // this->total_pages_to_read -= this->pages_per_txn; - // this->total_pages_read += this->pages_per_txn; - // this->address += - // ((this->src_page_index + this->pages_per_txn) / this->num_banks) * - // (this->partial_page_spec.num_partial_pages_per_full_page * this->partial_page_spec.partial_page_size); - // this->src_page_index = (this->src_page_index + this->pages_per_txn) % this->num_banks; - // } }; using BufferReadDispatchParamsVariant = std::variant; diff --git a/tt_metal/tt_metal.cpp b/tt_metal/tt_metal.cpp index 77a22e83448..0074c938ffc 100644 --- a/tt_metal/tt_metal.cpp +++ b/tt_metal/tt_metal.cpp @@ -10,7 +10,6 @@ #include #include #include -#include "buffers/dispatch.hpp" #include "dprint_server.hpp" #include #include @@ -453,8 +452,7 @@ void WriteToDeviceSharded(Buffer& buffer, tt::stl::Span host_buff } } -DeviceAddr CalculateAddressDeviceInterleavedContiguous( - const Buffer& buffer, uint32_t bank_index, uint32_t page_index, uint32_t num_round_robins) { +DeviceAddr CalculateAddressDeviceInterleavedContiguous(const Buffer& buffer, uint32_t bank_index, uint32_t page_index) { DeviceAddr addr = 0; if (buffer.is_dram()) { addr = buffer.bank_local_page_address(bank_index, page_index); @@ -463,15 +461,6 @@ DeviceAddr CalculateAddressDeviceInterleavedContiguous( addr = buffer.page_address(bank_index, page_index); } - if (buffer_dispatch::are_pages_larger_than_max_prefetch_cmd_size(buffer)) { - // const buffer_dispatch::PartialPageSpec& partial_page_spec = - // buffer_dispatch::calculate_partial_page_spec(buffer); - // const uint32_t full_padded_page_size = - // partial_page_spec.partial_page_size * partial_page_spec.num_partial_pages_per_full_page; - // const DeviceAddr full_page_address_offset = - // (num_round_robins > 0) ? (full_padded_page_size - buffer.aligned_page_size()) * num_round_robins : 0; - // addr += (buffer.aligned_page_size() * num_round_robins); - } return addr; } @@ -488,14 +477,12 @@ void WriteToDeviceInterleavedContiguous(const Buffer& buffer, tt::stl::Spanallocator()->get_num_banks(buffer.buffer_type()); - uint32_t num_round_robins = 0; uint32_t bank_index = 0; int data_index = 0; std::vector page; page.resize(page_size / sizeof(uint32_t)); for (int page_index = 0; page_index < num_pages; page_index++) { - const DeviceAddr address = - CalculateAddressDeviceInterleavedContiguous(buffer, bank_index, page_index, num_round_robins); + const DeviceAddr address = CalculateAddressDeviceInterleavedContiguous(buffer, bank_index, page_index); std::memcpy(page.data(), host_buffer.data() + data_index, page_size); switch (buffer.buffer_type()) { case BufferType::DRAM: WriteToDeviceDRAMChannel(device, bank_index, address, page); break; @@ -507,10 +494,6 @@ void WriteToDeviceInterleavedContiguous(const Buffer& buffer, tt::stl::Spanallocator()->get_num_banks(buffer.buffer_type()); - uint32_t num_round_robins = 0; size_t host_idx = 0; uint32_t bank_index = 0; std::vector page; page.resize(page_size / sizeof(uint32_t)); for (int page_index = 0; page_index < num_pages; page_index++) { - const DeviceAddr address = - CalculateAddressDeviceInterleavedContiguous(buffer, bank_index, page_index, num_round_robins); + const DeviceAddr address = CalculateAddressDeviceInterleavedContiguous(buffer, bank_index, page_index); page.clear(); switch (buffer.buffer_type()) { case BufferType::DRAM: @@ -575,10 +556,6 @@ void ReadFromDeviceInterleavedContiguous(const Buffer& buffer, uint8_t* host_buf std::memcpy(host_buffer + host_idx, page.data(), page_size); host_idx += page_size; - if (bank_index + 1 == num_banks) { - num_round_robins += 1; - } - bank_index = (bank_index + 1) % num_banks; } }