Skip to content

Commit

Permalink
apacheGH-35718: [Go][Parquet] Fix for null-only encoding panic (apach…
Browse files Browse the repository at this point in the history
…e#39497)

### Rationale for this change

closes: apache#35718 

### What changes are included in this PR?

Fix panic when writing with DeltaBinaryPacked or DeltaByteArray encodings to a column that only has nulls

### Are these changes tested?

Yes

- add a test writing nulls to columns with DeltaBinaryPacked / DeltaByteArray / DeltaLengthByteArray encodings

### Are there any user-facing changes?

No

* Closes: apache#35718

Lead-authored-by: yufanmo <yufan.mo@transwarp.io>
Co-authored-by: Matt Topol <zotthewizard@gmail.com>
Signed-off-by: Matt Topol <zotthewizard@gmail.com>
  • Loading branch information
2 people authored and clayburn committed Jan 23, 2024
1 parent 641f8cb commit 327f4f1
Show file tree
Hide file tree
Showing 2 changed files with 67 additions and 1 deletion.
10 changes: 9 additions & 1 deletion go/parquet/internal/encoding/delta_byte_array.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,15 @@ type DeltaByteArrayEncoder struct {
}

// EstimatedDataEncodedSize returns the current estimated size, in bytes, of the
// encoded prefix and suffix streams combined.
//
// Both sub-encoders are created lazily (see initEncoders), so when a column
// contains only nulls neither encoder may exist yet; each one is nil-checked
// and counted as 0 in that case to avoid a nil-pointer panic (GH-35718).
func (enc *DeltaByteArrayEncoder) EstimatedDataEncodedSize() int64 {
	prefixEstimatedSize := int64(0)
	if enc.prefixEncoder != nil {
		prefixEstimatedSize = enc.prefixEncoder.EstimatedDataEncodedSize()
	}
	suffixEstimatedSize := int64(0)
	if enc.suffixEncoder != nil {
		suffixEstimatedSize = enc.suffixEncoder.EstimatedDataEncodedSize()
	}
	return prefixEstimatedSize + suffixEstimatedSize
}

func (enc *DeltaByteArrayEncoder) initEncoders() {
Expand Down
58 changes: 58 additions & 0 deletions go/parquet/pqarrow/encode_arrow_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -473,6 +473,64 @@ func TestWriteEmptyLists(t *testing.T) {
require.NoError(t, err)
}

// TestWriteAllNullsWithDeltaEncoding writes a single all-null record through
// every delta-style encoding (plus Plain and RLE controls, with and without
// dictionary fallback) and requires that the write and close succeed without
// panicking.
func TestWriteAllNullsWithDeltaEncoding(t *testing.T) {
	schema := arrow.NewSchema([]arrow.Field{
		{Name: "f1", Type: arrow.PrimitiveTypes.Int64, Nullable: true},
		{Name: "f2", Type: arrow.ListOf(arrow.FixedWidthTypes.Date32)},
		{Name: "f3", Type: arrow.BinaryTypes.String, Nullable: true},
		{Name: "f4", Type: arrow.ListOf(arrow.BinaryTypes.String)},
		{Name: "f5", Type: arrow.BinaryTypes.LargeString, Nullable: true},
		{Name: "f6", Type: arrow.ListOf(arrow.BinaryTypes.LargeString)},
		{Name: "f7", Type: arrow.PrimitiveTypes.Float64, Nullable: true},
		{Name: "f8", Type: arrow.ListOf(arrow.FixedWidthTypes.Date64)},
		{Name: "f9", Type: arrow.BinaryTypes.String, Nullable: true},
		{Name: "f10", Type: arrow.ListOf(arrow.BinaryTypes.LargeString)},
		{Name: "f11", Type: arrow.FixedWidthTypes.Boolean, Nullable: true},
		{Name: "f12", Type: arrow.ListOf(arrow.FixedWidthTypes.Boolean)},
		{Name: "f13", Type: arrow.PrimitiveTypes.Int32, Nullable: true},
		{Name: "f14", Type: arrow.ListOf(arrow.PrimitiveTypes.Float32)},
	}, nil)

	// Build one record where every column holds exactly one null value.
	recBldr := array.NewRecordBuilder(memory.DefaultAllocator, schema)
	defer recBldr.Release()
	for _, fieldBldr := range recBldr.Fields() {
		fieldBldr.AppendNull()
	}
	record := recBldr.NewRecord()
	defer record.Release()

	writerProps := parquet.NewWriterProperties(
		parquet.WithVersion(parquet.V1_0),
		parquet.WithDictionaryDefault(false),
		parquet.WithDictionaryFor("f9", true),
		parquet.WithDictionaryFor("f10", true),
		parquet.WithDictionaryFor("f13", true),
		parquet.WithDictionaryFor("f14", true),
		parquet.WithEncodingFor("f1", parquet.Encodings.DeltaBinaryPacked),
		parquet.WithEncodingFor("f2", parquet.Encodings.DeltaBinaryPacked),
		parquet.WithEncodingFor("f3", parquet.Encodings.DeltaByteArray),
		parquet.WithEncodingFor("f4", parquet.Encodings.DeltaByteArray),
		parquet.WithEncodingFor("f5", parquet.Encodings.DeltaLengthByteArray),
		parquet.WithEncodingFor("f6", parquet.Encodings.DeltaLengthByteArray),
		parquet.WithEncodingFor("f7", parquet.Encodings.Plain),
		parquet.WithEncodingFor("f8", parquet.Encodings.Plain),
		parquet.WithEncodingFor("f9", parquet.Encodings.Plain),
		parquet.WithEncodingFor("f10", parquet.Encodings.Plain),
		parquet.WithEncodingFor("f11", parquet.Encodings.RLE),
		parquet.WithEncodingFor("f12", parquet.Encodings.RLE),
		parquet.WithEncodingFor("f13", parquet.Encodings.RLE),
		parquet.WithEncodingFor("f14", parquet.Encodings.RLE),
	)

	// Round the record through a Parquet writer backed by an in-memory buffer.
	var sink bytes.Buffer
	writer, err := pqarrow.NewFileWriter(schema, &sink, writerProps, pqarrow.DefaultWriterProps())
	require.NoError(t, err)
	require.NoError(t, writer.Write(record))
	require.NoError(t, writer.Close())
}

func TestArrowReadWriteTableChunkedCols(t *testing.T) {
chunkSizes := []int{2, 4, 10, 2}
const totalLen = int64(18)
Expand Down

0 comments on commit 327f4f1

Please sign in to comment.