Skip to content

Commit

Permalink
fix(parquet/metadata): fix default unsigned int statistics (#210)
Browse files Browse the repository at this point in the history
### Rationale for this change
fixes #209 

### What changes are included in this PR?
Fixing `UpdateSpaced` statistics methods to properly check unsigned
values to ensure proper handling of default min/max values.

### Are these changes tested?
Yes, unit test was added to confirm.

### Are there any user-facing changes?
None, other than fixing the reported bug.
  • Loading branch information
zeroshade authored Dec 10, 2024
1 parent 90f5515 commit ebda0f3
Show file tree
Hide file tree
Showing 3 changed files with 90 additions and 39 deletions.
30 changes: 30 additions & 0 deletions parquet/metadata/statistics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ import (
"github.com/apache/arrow-go/v18/parquet/metadata"
"github.com/apache/arrow-go/v18/parquet/schema"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)

// NOTE(zeroshade): tests will be added and updated after merging the "file" package
Expand Down Expand Up @@ -260,3 +261,32 @@ func TestBooleanStatisticsEncoding(t *testing.T) {
assert.Equal(t, []byte{1}, maxEnc)
assert.Equal(t, []byte{0}, minEnc)
}

func TestUnsignedDefaultStats(t *testing.T) {
// testing issue github.com/apache/arrow-go/issues/209
// stats for unsigned columns should not have invalid min values
{
n, err := schema.NewPrimitiveNodeLogical("uint16", parquet.Repetitions.Optional,
schema.NewIntLogicalType(16, false), parquet.Types.Int32, 4, -1)
require.NoError(t, err)
descr := schema.NewColumn(n, 1, 0)
s := metadata.NewStatistics(descr, nil).(*metadata.Int32Statistics)
s.UpdateSpaced([]int32{0}, []byte{1}, 0, 0)

minv, maxv := s.Min(), s.Max()
assert.Zero(t, minv)
assert.Zero(t, maxv)
}
{
n, err := schema.NewPrimitiveNodeLogical("uint64", parquet.Repetitions.Optional,
schema.NewIntLogicalType(64, false), parquet.Types.Int64, 4, -1)
require.NoError(t, err)
descr := schema.NewColumn(n, 1, 0)
s := metadata.NewStatistics(descr, nil).(*metadata.Int64Statistics)
s.UpdateSpaced([]int64{0}, []byte{1}, 0, 0)

minv, maxv := s.Min(), s.Max()
assert.Zero(t, minv)
assert.Zero(t, maxv)
}
}
66 changes: 40 additions & 26 deletions parquet/metadata/statistics_types.gen.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

33 changes: 20 additions & 13 deletions parquet/metadata/statistics_types.gen.go.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -248,13 +248,26 @@ func (s *{{.Name}}Statistics) getMinMaxSpaced(values []{{.name}}, validBits []by
max = s.defaultMax()

{{- if or (eq .name "int32") (eq .name "int64")}}
var fn func([]{{.name}}) ({{.name}}, {{.name}})
var fn func([]{{.name}})
if s.order == schema.SortSIGNED {
fn = shared_utils.GetMinMax{{.Name}}
fn = func(v []{{.name}}) {
localMin, localMax := shared_utils.GetMinMax{{.Name}}(v)
if min > localMin {
min = localMin
}
if max < localMax {
max = localMax
}
}
} else {
fn = func(v []{{.name}}) ({{.name}}, {{.name}}) {
umin, umax := shared_utils.GetMinMaxU{{.name}}(arrow.U{{.name}}Traits.CastFromBytes(arrow.{{.Name}}Traits.CastToBytes(values)))
return {{.name}}(umin), {{.name}}(umax)
fn = func(v []{{.name}}) {
umin, umax := shared_utils.GetMinMaxU{{.name}}(arrow.U{{.name}}Traits.CastFromBytes(arrow.{{.Name}}Traits.CastToBytes(v)))
if u{{.name}}(min) > umin {
min = {{.name}}(umin)
}
if u{{.name}}(max) < umax {
max = {{.name}}(umax)
}
}
}
{{- end}}
Expand All @@ -271,13 +284,7 @@ func (s *{{.Name}}Statistics) getMinMaxSpaced(values []{{.name}}, validBits []by
break
}
{{- if or (eq .name "int32") (eq .name "int64")}}
localMin, localMax := fn(values[int(run.Pos):int(run.Pos+run.Length)])
if min > localMin {
min = localMin
}
if max < localMax {
max = localMax
}
fn(values[int(run.Pos):int(run.Pos+run.Length)])
{{- else}}
for _, v := range values[int(run.Pos):int(run.Pos+run.Length)] {
{{- if or (eq .name "float32") (eq .name "float64") (eq .Name "Float16") }}
Expand Down Expand Up @@ -385,7 +392,7 @@ func (s *{{.Name}}Statistics) UpdateFromArrow(values arrow.Array, updateCounts b
)

for i := 0; i < arr.Len(); i++ {
nextOffset := arr.ValueOffset64(i + 1)
nextOffset := curOffset + int64(arr.ValueLen(i))
v := data[curOffset:nextOffset]
curOffset = nextOffset

Expand Down

0 comments on commit ebda0f3

Please sign in to comment.