diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 9dcb53b0..9284c221 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -12,6 +12,7 @@ add_library(arcae SHARED arcae/complex_type.cc arcae/configuration.cc arcae/descriptor.cc + arcae/base_column_map.cc arcae/service_locator.cc arcae/safe_table_proxy.cc arcae/table_factory.cc diff --git a/cpp/arcae/base_column_map.cc b/cpp/arcae/base_column_map.cc new file mode 100644 index 00000000..5fe22f5b --- /dev/null +++ b/cpp/arcae/base_column_map.cc @@ -0,0 +1,9 @@ +#include "arcae/base_column_map.h" + +namespace arcae { + +std::ptrdiff_t SelectDim(std::size_t dim, std::size_t sdims, std::size_t ndims) { + return std::ptrdiff_t(dim) + std::ptrdiff_t(sdims) - std::ptrdiff_t(ndims); +} + +} // namespace arcae \ No newline at end of file diff --git a/cpp/arcae/map_iterator.h b/cpp/arcae/base_column_map.h similarity index 75% rename from cpp/arcae/map_iterator.h rename to cpp/arcae/base_column_map.h index 71e061e0..98830768 100644 --- a/cpp/arcae/map_iterator.h +++ b/cpp/arcae/base_column_map.h @@ -6,6 +6,8 @@ #include #include +#include + #include #include #include @@ -15,6 +17,15 @@ namespace arcae { enum MapOrder {C_ORDER=0, F_ORDER}; +// Return a selection dimension given +// +// 1. FORTRAN ordered dim +// 2. Number of selection dimensions +// 3. Number of column dimensions +// +// A return of < 0 indicates a non-existent selection +std::ptrdiff_t SelectDim(std::size_t dim, std::size_t sdims, std::size_t ndims); + using RowIds = std::vector; using ColumnSelection = std::vector; @@ -552,6 +563,178 @@ std::size_t BaseColumnMap::nElements() const { } + +// Create a Column Map from a selection of row id's in different dimensions +template +ColumnMaps MapFactory(const SP & shape_prov, const ColumnSelection & selection) { + ColumnMaps column_maps; + auto ndim = shape_prov.nDim(); + column_maps.reserve(ndim); + + for(auto dim=std::size_t{0}; dim < ndim; ++dim) { + // Dimension needs to be adjusted for + // 1. We may not have selections matching all dimensions + // 2. Selections are FORTRAN ordered + auto sdim = SelectDim(dim, selection.size(), ndim); + + if(sdim < 0 || selection.size() == 0 || selection[sdim].size() == 0) { + column_maps.emplace_back(ColumnMap{}); + continue; + } + + const auto & dim_ids = selection[sdim]; + ColumnMap column_map; + column_map.reserve(dim_ids.size()); + + for(auto [disk_it, mem] = std::tuple{std::begin(dim_ids), casacore::rownr_t{0}}; + disk_it != std::end(dim_ids); ++mem, ++disk_it) { + column_map.push_back({*disk_it, mem}); + } + + std::sort(std::begin(column_map), std::end(column_map), + [](const auto & lhs, const auto & rhs) { + return lhs.disk < rhs.disk; }); + + column_maps.emplace_back(std::move(column_map)); + } + + return column_maps; +} + +// Make ranges for fixed shape data +// In this case, each row has the same shape +// so we can make ranges that span multiple rows +template +arrow::Result +FixedRangeFactory(const SP & shape_prov, const ColumnMaps & maps) { + //assert(shape_prov.IsDataFixed()); + auto ndim = shape_prov.nDim(); + ColumnRanges column_ranges; + column_ranges.reserve(ndim); + + for(std::size_t dim=0; dim < ndim; ++dim) { + // If no mapping exists for this dimension, create a range + // from the column shape + if(dim >= maps.size() || maps[dim].size() == 0) { + ARROW_ASSIGN_OR_RAISE(auto dim_size, shape_prov.DimSize(dim)); + column_ranges.emplace_back(ColumnRange{Range{0, dim_size, Range::FREE}}); + continue; + } + + // A mapping exists for this dimension, create ranges + // from contiguous segments + const auto & column_map = maps[dim]; + auto column_range = ColumnRange{}; + auto current = Range{0, 1, Range::MAP}; + + for(auto [i, prev, next] = std::tuple{ + casacore::rownr_t{1}, + std::begin(column_map), + std::next(std::begin(column_map))}; + next != std::end(column_map); ++i, ++prev, ++next) { + + if(next->disk - prev->disk == 1) { + current.end += 1; + } else { + column_range.push_back(current); + current = Range{i, i + 1, Range::MAP}; + } + } + + column_range.emplace_back(std::move(current)); + column_ranges.emplace_back(std::move(column_range)); + } + + assert(ndim == column_ranges.size()); + return column_ranges; +} + +// Make ranges for variably shaped data +// In this case, each row may have a different shape +// so we create a separate range for each row and VARYING +// ranges for other dimensions whose size cannot be determined. +template +arrow::Result +VariableRangeFactory(const SP & shape_prov, const ColumnMaps & maps) { + //assert(shape_prov.IsDataVarying()); + auto ndim = shape_prov.nDim(); + auto row_dim = ndim - 1; + ColumnRanges column_ranges; + column_ranges.reserve(ndim); + + + // Handle non-row dimensions first + for(std::size_t dim=0; dim < row_dim; ++dim) { + // If no mapping exists for this dimension + // create a single VARYING range + if(dim >= maps.size() || maps[dim].size() == 0) { + column_ranges.emplace_back(ColumnRange{Range{0, 0, Range::VARYING}}); + continue; + } + + // A mapping exists for this dimension, create ranges + // from contiguous segments + const auto & column_map = maps[dim]; + auto column_range = ColumnRange{}; + auto current = Range{0, 1, Range::MAP}; + + for(auto [i, prev, next] = std::tuple{ + casacore::rownr_t{1}, + std::begin(column_map), + std::next(std::begin(column_map))}; + next != std::end(column_map); ++i, ++prev, ++next) { + + if(next->disk - prev->disk == 1) { + current.end += 1; + } else { + column_range.push_back(current); + current = Range{i, i + 1, Range::MAP}; + } + } + + column_range.emplace_back(std::move(current)); + column_ranges.emplace_back(std::move(column_range)); + } + + // Lastly, the row dimension + auto row_range = ColumnRange{}; + + // Split the row dimension into ranges of exactly one row + if(maps.size() == 0 || maps[row_dim].size() == 0) { + // No maps provided, derive from shape + ARROW_ASSIGN_OR_RAISE(auto dim_size, shape_prov.DimSize(row_dim)); + row_range.reserve(dim_size); + for(std::size_t r=0; r < dim_size; ++r) { + row_range.emplace_back(Range{r, r + 1, Range::FREE}); + } + } else { + // Derive from mapping + const auto & row_maps = maps[row_dim]; + row_range.reserve(row_maps.size()); + for(std::size_t r=0; r < row_maps.size(); ++r) { + row_range.emplace_back(Range{r, r + 1, Range::MAP}); + } + } + + column_ranges.emplace_back(std::move(row_range)); + + + assert(ndim == column_ranges.size()); + return column_ranges; +} + +// Make ranges for each dimension +template +arrow::Result +RangeFactory(const SP & shape_prov, const ColumnMaps & maps) { + if(shape_prov.IsDataFixed()) { + return FixedRangeFactory(shape_prov, maps); + } + + return VariableRangeFactory(shape_prov, maps); +} + + } // namespace arcea #endif // ARCAE_MAP_ITERATOR_H \ No newline at end of file diff --git a/cpp/arcae/column_read_map.cc b/cpp/arcae/column_read_map.cc index b03dc5da..f9b27989 100644 --- a/cpp/arcae/column_read_map.cc +++ b/cpp/arcae/column_read_map.cc @@ -27,18 +27,6 @@ namespace arcae { namespace { -// Return a selection dimension given -// -// 1. FORTRAN ordered dim -// 2. Number of selection dimensions -// 3. Number of column dimensions -// -// A return of < 0 indicates a non-existent selection -std::ptrdiff_t SelectDim(std::size_t dim, std::size_t sdims, std::size_t ndims) { - return std::ptrdiff_t(dim) + std::ptrdiff_t(sdims) - std::ptrdiff_t(ndims); -} - - // Clip supplied shape based on the column selection arrow::Result ClipShape( const casacore::IPosition & shape, @@ -69,174 +57,6 @@ arrow::Result ClipShape( return clipped; } - - -// Create a Column Map from a selection of row id's in different dimensions -ColumnMaps MapFactory(const ShapeProvider & shape_prov, const ColumnSelection & selection) { - ColumnMaps column_maps; - auto ndim = shape_prov.nDim(); - column_maps.reserve(ndim); - - for(auto dim=std::size_t{0}; dim < ndim; ++dim) { - // Dimension needs to be adjusted for - // 1. We may not have selections matching all dimensions - // 2. Selections are FORTRAN ordered - auto sdim = SelectDim(dim, selection.size(), ndim); - - if(sdim < 0 || selection.size() == 0 || selection[sdim].size() == 0) { - column_maps.emplace_back(ColumnMap{}); - continue; - } - - const auto & dim_ids = selection[sdim]; - ColumnMap column_map; - column_map.reserve(dim_ids.size()); - - for(auto [disk_it, mem] = std::tuple{std::begin(dim_ids), casacore::rownr_t{0}}; - disk_it != std::end(dim_ids); ++mem, ++disk_it) { - column_map.push_back({*disk_it, mem}); - } - - std::sort(std::begin(column_map), std::end(column_map), - [](const auto & lhs, const auto & rhs) { - return lhs.disk < rhs.disk; }); - - column_maps.emplace_back(std::move(column_map)); - } - - return column_maps; -} - -// Make ranges for fixed shape columns -// In this case, each row has the same shape -// so we can make ranges that span multiple rows -arrow::Result -FixedRangeFactory(const ShapeProvider & shape_prov, const ColumnMaps & maps) { - assert(shape_prov.IsActuallyFixed()); - auto ndim = shape_prov.nDim(); - ColumnRanges column_ranges; - column_ranges.reserve(ndim); - - for(std::size_t dim=0; dim < ndim; ++dim) { - // If no mapping exists for this dimension, create a range - // from the column shape - if(dim >= maps.size() || maps[dim].size() == 0) { - ARROW_ASSIGN_OR_RAISE(auto dim_size, shape_prov.DimSize(dim)); - column_ranges.emplace_back(ColumnRange{Range{0, dim_size, Range::FREE}}); - continue; - } - - // A mapping exists for this dimension, create ranges - // from contiguous segments - const auto & column_map = maps[dim]; - auto column_range = ColumnRange{}; - auto current = Range{0, 1, Range::MAP}; - - for(auto [i, prev, next] = std::tuple{ - casacore::rownr_t{1}, - std::begin(column_map), - std::next(std::begin(column_map))}; - next != std::end(column_map); ++i, ++prev, ++next) { - - if(next->disk - prev->disk == 1) { - current.end += 1; - } else { - column_range.push_back(current); - current = Range{i, i + 1, Range::MAP}; - } - } - - column_range.emplace_back(std::move(current)); - column_ranges.emplace_back(std::move(column_range)); - } - - assert(ndim == column_ranges.size()); - return column_ranges; -} - -// Make ranges for variably shaped columns -// In this case, each row may have a different shape -// so we create a separate range for each row and VARYING -// ranges for other dimensions whose size cannot be determined. -arrow::Result -VariableRangeFactory(const ShapeProvider & shape_prov, const ColumnMaps & maps) { - assert(!shape_prov.IsActuallyFixed()); - auto ndim = shape_prov.nDim(); - auto row_dim = ndim - 1; - ColumnRanges column_ranges; - column_ranges.reserve(ndim); - - - // Handle non-row dimensions first - for(std::size_t dim=0; dim < row_dim; ++dim) { - // If no mapping exists for this dimension - // create a single VARYING range - if(dim >= maps.size() || maps[dim].size() == 0) { - column_ranges.emplace_back(ColumnRange{Range{0, 0, Range::VARYING}}); - continue; - } - - // A mapping exists for this dimension, create ranges - // from contiguous segments - const auto & column_map = maps[dim]; - auto column_range = ColumnRange{}; - auto current = Range{0, 1, Range::MAP}; - - for(auto [i, prev, next] = std::tuple{ - casacore::rownr_t{1}, - std::begin(column_map), - std::next(std::begin(column_map))}; - next != std::end(column_map); ++i, ++prev, ++next) { - - if(next->disk - prev->disk == 1) { - current.end += 1; - } else { - column_range.push_back(current); - current = Range{i, i + 1, Range::MAP}; - } - } - - column_range.emplace_back(std::move(current)); - column_ranges.emplace_back(std::move(column_range)); - } - - // Lastly, the row dimension - auto row_range = ColumnRange{}; - - // Split the row dimension into ranges of exactly one row - if(maps.size() == 0 || maps[row_dim].size() == 0) { - // No maps provided, derive from shape - ARROW_ASSIGN_OR_RAISE(auto dim_size, shape_prov.DimSize(row_dim)); - row_range.reserve(dim_size); - for(std::size_t r=0; r < dim_size; ++r) { - row_range.emplace_back(Range{r, r + 1, Range::FREE}); - } - } else { - // Derive from mapping - const auto & row_maps = maps[row_dim]; - row_range.reserve(row_maps.size()); - for(std::size_t r=0; r < row_maps.size(); ++r) { - row_range.emplace_back(Range{r, r + 1, Range::MAP}); - } - } - - column_ranges.emplace_back(std::move(row_range)); - - - assert(ndim == column_ranges.size()); - return column_ranges; -} - -// Make ranges for each dimension -arrow::Result -RangeFactory(const ShapeProvider & shape_prov, const ColumnMaps & maps) { - if(shape_prov.IsActuallyFixed()) { - return FixedRangeFactory(shape_prov, maps); - } - - return VariableRangeFactory(shape_prov, maps); -} - // Derive an output shape from the selection ranges // This may not be possible for variably shaped columns std::optional @@ -267,6 +87,17 @@ MaybeMakeOutputShape(const ColumnRanges & ranges) { return shape; } +// Make ranges for each dimension +arrow::Result +RangeFactory(const ShapeProvider & shape_prov, const ColumnMaps & maps) { + if(shape_prov.IsActuallyFixed()) { + return FixedRangeFactory(shape_prov, maps); + } + + return VariableRangeFactory(shape_prov, maps); +} + + } // namespace arrow::Result GetColumnRowShape( diff --git a/cpp/arcae/column_read_map.h b/cpp/arcae/column_read_map.h index 59f6937c..b94b188d 100644 --- a/cpp/arcae/column_read_map.h +++ b/cpp/arcae/column_read_map.h @@ -1,7 +1,6 @@ #ifndef ARCAE_COLUMN_READ_MAP_H #define ARCAE_COLUMN_READ_MAP_H -#include #include #include #include @@ -16,7 +15,7 @@ #include #include -#include "arcae/map_iterator.h" +#include "arcae/base_column_map.h" namespace arcae { diff --git a/cpp/arcae/column_write_map.cc b/cpp/arcae/column_write_map.cc index 067e7a7f..627cfae1 100644 --- a/cpp/arcae/column_write_map.cc +++ b/cpp/arcae/column_write_map.cc @@ -6,7 +6,6 @@ #include #include #include -#include #include #include #include @@ -28,24 +27,12 @@ #include #include -#include "arcae/map_iterator.h" +#include "arcae/base_column_map.h" namespace arcae { namespace { -// Return a selection dimension given -// -// 1. FORTRAN ordered dim -// 2. Number of selection dimensions -// 3. Number of column dimensions -// -// A return of < 0 indicates a non-existent selection -std::ptrdiff_t SelectDim(std::size_t dim, std::size_t sdims, std::size_t ndims) { - return std::ptrdiff_t(dim) + std::ptrdiff_t(sdims) - std::ptrdiff_t(ndims); -} - - // Reconcile Data Shape with selection indices to decide // the shape of the column row. // Selection indices might refer to indices that are greater @@ -71,164 +58,6 @@ ReconcileDataAndSelectionShape(const ColumnSelection & selection, } } - - -// Create a Column Map from a selection of row id's in different dimensions -ColumnMaps MapFactory(const ArrowShapeProvider & shape_prov, const ColumnSelection & selection) { - ColumnMaps column_maps; - auto ndim = shape_prov.nDim(); - column_maps.reserve(ndim); - - for(auto dim=std::size_t{0}; dim < ndim; ++dim) { - // Dimension needs to be adjusted for - // 1. We may not have selections matching all dimensions - // 2. Selections are FORTRAN ordered - auto sdim = SelectDim(dim, selection.size(), ndim); - - if(sdim < 0 || selection.size() == 0 || selection[sdim].size() == 0) { - column_maps.emplace_back(ColumnMap{}); - continue; - } - - const auto & dim_ids = selection[sdim]; - ColumnMap column_map; - column_map.reserve(dim_ids.size()); - - for(auto [disk_it, mem] = std::tuple{std::begin(dim_ids), casacore::rownr_t{0}}; - disk_it != std::end(dim_ids); ++mem, ++disk_it) { - column_map.push_back({*disk_it, mem}); - } - - std::sort(std::begin(column_map), std::end(column_map), - [](const auto & lhs, const auto & rhs) { - return lhs.disk < rhs.disk; }); - - column_maps.emplace_back(std::move(column_map)); - } - - return column_maps; -} - -// Make ranges for fixed shape data -// In this case, each row has the same shape -// so we can make ranges that span multiple rows -arrow::Result -FixedRangeFactory(const ArrowShapeProvider & shape_prov, const ColumnMaps & maps) { - assert(shape_prov.IsDataFixed()); - auto ndim = shape_prov.nDim(); - ColumnRanges column_ranges; - column_ranges.reserve(ndim); - - for(std::size_t dim=0; dim < ndim; ++dim) { - // If no mapping exists for this dimension, create a range - // from the column shape - if(dim >= maps.size() || maps[dim].size() == 0) { - ARROW_ASSIGN_OR_RAISE(auto dim_size, shape_prov.DimSize(dim)); - column_ranges.emplace_back(ColumnRange{Range{0, dim_size, Range::FREE}}); - continue; - } - - // A mapping exists for this dimension, create ranges - // from contiguous segments - const auto & column_map = maps[dim]; - auto column_range = ColumnRange{}; - auto current = Range{0, 1, Range::MAP}; - - for(auto [i, prev, next] = std::tuple{ - casacore::rownr_t{1}, - std::begin(column_map), - std::next(std::begin(column_map))}; - next != std::end(column_map); ++i, ++prev, ++next) { - - if(next->disk - prev->disk == 1) { - current.end += 1; - } else { - column_range.push_back(current); - current = Range{i, i + 1, Range::MAP}; - } - } - - column_range.emplace_back(std::move(current)); - column_ranges.emplace_back(std::move(column_range)); - } - - assert(ndim == column_ranges.size()); - return column_ranges; -} - -// Make ranges for variably shaped data -// In this case, each row may have a different shape -// so we create a separate range for each row and VARYING -// ranges for other dimensions whose size cannot be determined. -arrow::Result -VariableRangeFactory(const ArrowShapeProvider & shape_prov, const ColumnMaps & maps) { - assert(shape_prov.IsDataVarying()); - auto ndim = shape_prov.nDim(); - auto row_dim = ndim - 1; - ColumnRanges column_ranges; - column_ranges.reserve(ndim); - - - // Handle non-row dimensions first - for(std::size_t dim=0; dim < row_dim; ++dim) { - // If no mapping exists for this dimension - // create a single VARYING range - if(dim >= maps.size() || maps[dim].size() == 0) { - column_ranges.emplace_back(ColumnRange{Range{0, 0, Range::VARYING}}); - continue; - } - - // A mapping exists for this dimension, create ranges - // from contiguous segments - const auto & column_map = maps[dim]; - auto column_range = ColumnRange{}; - auto current = Range{0, 1, Range::MAP}; - - for(auto [i, prev, next] = std::tuple{ - casacore::rownr_t{1}, - std::begin(column_map), - std::next(std::begin(column_map))}; - next != std::end(column_map); ++i, ++prev, ++next) { - - if(next->disk - prev->disk == 1) { - current.end += 1; - } else { - column_range.push_back(current); - current = Range{i, i + 1, Range::MAP}; - } - } - - column_range.emplace_back(std::move(current)); - column_ranges.emplace_back(std::move(column_range)); - } - - // Lastly, the row dimension - auto row_range = ColumnRange{}; - - // Split the row dimension into ranges of exactly one row - if(maps.size() == 0 || maps[row_dim].size() == 0) { - // No maps provided, derive from shape - ARROW_ASSIGN_OR_RAISE(auto dim_size, shape_prov.DimSize(row_dim)); - row_range.reserve(dim_size); - for(std::size_t r=0; r < dim_size; ++r) { - row_range.emplace_back(Range{r, r + 1, Range::FREE}); - } - } else { - // Derive from mapping - const auto & row_maps = maps[row_dim]; - row_range.reserve(row_maps.size()); - for(std::size_t r=0; r < row_maps.size(); ++r) { - row_range.emplace_back(Range{r, r + 1, Range::MAP}); - } - } - - column_ranges.emplace_back(std::move(row_range)); - - - assert(ndim == column_ranges.size()); - return column_ranges; -} - // Make ranges for each dimension arrow::Result RangeFactory(const ArrowShapeProvider & shape_prov, const ColumnMaps & maps) { @@ -239,6 +68,7 @@ RangeFactory(const ArrowShapeProvider & shape_prov, const ColumnMaps & maps) { return VariableRangeFactory(shape_prov, maps); } + struct DataProperties { std::optional shape; std::size_t ndim; diff --git a/cpp/arcae/column_write_map.h b/cpp/arcae/column_write_map.h index e2038dc5..d358664a 100644 --- a/cpp/arcae/column_write_map.h +++ b/cpp/arcae/column_write_map.h @@ -15,7 +15,7 @@ #include #include -#include "arcae/map_iterator.h" +#include "arcae/base_column_map.h" #include "arrow/array/array_base.h" #include "arrow/type.h"