Skip to content

Commit

Permalink
fix(parquet): Reading UUID columns (#173)
Browse files Browse the repository at this point in the history
Split from #171 to be a more focused PR.

Currently, `pqarrow` correctly writes Arrow data that uses the canonical
UUID extension type as a Parquet UUID column. This PR makes it possible
to read that data back as the `extensions.UUID` data type, even when
there is no stored schema.

Added a case to the `TestArrowExtensionTypeRoundTrip` test to ensure a
proper round trip without a stored schema.

---------

Co-authored-by: Sutou Kouhei <kou@cozmixng.org>
  • Loading branch information
zeroshade and kou authored Oct 26, 2024
1 parent 19bd313 commit fe4bd93
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 9 deletions.
4 changes: 2 additions & 2 deletions arrow/extensions/extensions.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ import (
)

var canonicalExtensionTypes = []arrow.ExtensionType{
&Bool8Type{},
&UUIDType{},
NewBool8Type(),
NewUUIDType(),
&OpaqueType{},
&JSONType{},
}
Expand Down
1 change: 1 addition & 0 deletions parquet/pqarrow/encode_arrow_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2057,6 +2057,7 @@ func (ps *ParquetIOTestSuite) TestArrowExtensionTypeRoundTrip() {
defer tbl.Release()

ps.roundTripTable(mem, tbl, true)
ps.roundTripTable(mem, tbl, false)
}

func (ps *ParquetIOTestSuite) TestArrowUnknownExtensionTypeRoundTrip() {
Expand Down
21 changes: 14 additions & 7 deletions parquet/pqarrow/schema.go
Original file line number Diff line number Diff line change
Expand Up @@ -514,8 +514,14 @@ func arrowFromFLBA(logical schema.LogicalType, length int) (arrow.DataType, erro
switch logtype := logical.(type) {
case schema.DecimalLogicalType:
return arrowDecimal(logtype), nil
case schema.NoLogicalType, schema.IntervalLogicalType, schema.UUIDLogicalType:
case schema.NoLogicalType, schema.IntervalLogicalType:
return &arrow.FixedSizeBinaryType{ByteWidth: int(length)}, nil
case schema.UUIDLogicalType:
uuidType := arrow.GetExtensionType("arrow.uuid")
if uuidType == nil {
return &arrow.FixedSizeBinaryType{ByteWidth: int(length)}, nil
}
return uuidType, nil
case schema.Float16LogicalType:
return &arrow.Float16Type{}, nil
default:
Expand Down Expand Up @@ -984,13 +990,14 @@ func applyOriginalStorageMetadata(origin arrow.Field, inferred *SchemaField) (mo
return
}

if !arrow.TypeEqual(extType.StorageType(), inferred.Field.Type) {
return modified, fmt.Errorf("%w: mismatch storage type '%s' for extension type '%s'",
arrow.ErrInvalid, inferred.Field.Type, extType)
}
if modified && !arrow.TypeEqual(extType, inferred.Field.Type) {
if !arrow.TypeEqual(extType.StorageType(), inferred.Field.Type) {
return modified, fmt.Errorf("%w: mismatch storage type '%s' for extension type '%s'",
arrow.ErrInvalid, inferred.Field.Type, extType)
}

inferred.Field.Type = extType
modified = true
inferred.Field.Type = extType
}
case arrow.SPARSE_UNION, arrow.DENSE_UNION:
err = xerrors.New("unimplemented type")
case arrow.STRUCT:
Expand Down

0 comments on commit fe4bd93

Please sign in to comment.