Skip to content

Commit

Permalink
feat: (GH-56) Add arrayApproxEqualString to handle null characters in…
Browse files Browse the repository at this point in the history
… strings

Signed-off-by: Saurabh Kumar Singh <singh1203.ss@gmail.com>
  • Loading branch information
singh1203 committed Feb 20, 2025
1 parent 6e2e50b commit 1d07d32
Show file tree
Hide file tree
Showing 2 changed files with 155 additions and 3 deletions.
47 changes: 44 additions & 3 deletions arrow/array/compare.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package array
import (
"fmt"
"math"
"strings"

"github.com/apache/arrow-go/v18/arrow"
"github.com/apache/arrow-go/v18/arrow/float16"
Expand Down Expand Up @@ -487,19 +488,19 @@ func arrayApproxEqual(left, right arrow.Array, opt equalOption) bool {
return arrayEqualBinary(l, r)
case *String:
r := right.(*String)
return arrayEqualString(l, r)
return arrayApproxEqualString(l, r)
case *LargeBinary:
r := right.(*LargeBinary)
return arrayEqualLargeBinary(l, r)
case *LargeString:
r := right.(*LargeString)
return arrayEqualLargeString(l, r)
return arrayApproxEqualLargeString(l, r)
case *BinaryView:
r := right.(*BinaryView)
return arrayEqualBinaryView(l, r)
case *StringView:
r := right.(*StringView)
return arrayEqualStringView(l, r)
return arrayApproxEqualStringView(l, r)
case *Int8:
r := right.(*Int8)
return arrayEqualInt8(l, r)
Expand Down Expand Up @@ -644,6 +645,46 @@ func validityBitmapEqual(left, right arrow.Array) bool {
return true
}

// arrayApproxEqualString reports whether every non-null slot of left holds the
// same string as the corresponding slot of right once trailing NUL bytes have
// been removed from both values.
//
// Slots that are null in left are skipped without looking at right.
// NOTE(review): only left's validity is consulted and right is indexed up to
// left.Len(); presumably the caller has already verified that lengths and
// validity bitmaps match — confirm against the call site.
func arrayApproxEqualString(left, right *String) bool {
	for idx, n := 0, left.Len(); idx < n; idx++ {
		if !left.IsNull(idx) && stripNulls(left.Value(idx)) != stripNulls(right.Value(idx)) {
			return false
		}
	}
	return true
}

// arrayApproxEqualLargeString reports whether every non-null slot of left
// holds the same string as the corresponding slot of right once trailing NUL
// bytes have been removed from both values.
//
// Slots that are null in left are skipped without looking at right.
// NOTE(review): only left's validity is consulted and right is indexed up to
// left.Len(); presumably the caller has already verified that lengths and
// validity bitmaps match — confirm against the call site.
func arrayApproxEqualLargeString(left, right *LargeString) bool {
	for idx, n := 0, left.Len(); idx < n; idx++ {
		if !left.IsNull(idx) && stripNulls(left.Value(idx)) != stripNulls(right.Value(idx)) {
			return false
		}
	}
	return true
}

// arrayApproxEqualStringView reports whether every non-null slot of left holds
// the same string as the corresponding slot of right once trailing NUL bytes
// have been removed from both values.
//
// Slots that are null in left are skipped without looking at right.
// NOTE(review): only left's validity is consulted and right is indexed up to
// left.Len(); presumably the caller has already verified that lengths and
// validity bitmaps match — confirm against the call site.
func arrayApproxEqualStringView(left, right *StringView) bool {
	for idx, n := 0, left.Len(); idx < n; idx++ {
		if !left.IsNull(idx) && stripNulls(left.Value(idx)) != stripNulls(right.Value(idx)) {
			return false
		}
	}
	return true
}

// stripNulls returns s with every trailing NUL ("\x00") byte removed.
// NUL bytes that appear at the start or in the middle of s are kept.
func stripNulls(s string) string {
	const nul = "\x00"
	// Peel one trailing NUL per iteration until the string no longer ends in one.
	for strings.HasSuffix(s, nul) {
		s = strings.TrimSuffix(s, nul)
	}
	return s
}

func arrayApproxEqualFloat16(left, right *Float16, opt equalOption) bool {
for i := 0; i < left.Len(); i++ {
if left.IsNull(i) {
Expand Down
111 changes: 111 additions & 0 deletions arrow/array/compare_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,94 @@ func TestArrayApproxEqual(t *testing.T) {
}
}

// TestArrayApproxEqualStrings checks array.ApproxEqual on String, LargeString
// and StringView arrays, in particular that trailing NUL bytes in a value are
// ignored by the comparison.
//
// Every case is run against every array kind. The kind is selected by an
// explicit builder function rather than by matching on the case name, so
// renaming a case cannot silently change which array type is exercised, and
// every subtest gets a unique "<kind>/<case>" name.
func TestArrayApproxEqualStrings(t *testing.T) {
	type buildFunc func(mem memory.Allocator, a []string, valids []bool) arrow.Array

	builders := []struct {
		kind  string
		build buildFunc
	}{
		{
			kind: "string",
			// arrayOf takes an interface{} payload, so adapt it to buildFunc.
			build: func(mem memory.Allocator, a []string, valids []bool) arrow.Array {
				return arrayOf(mem, a, valids)
			},
		},
		{kind: "large string", build: arrayOfLargeString},
		{kind: "string view", build: arrayOfStringView},
	}

	cases := []struct {
		name string
		a1   []string
		a2   []string
		want bool
	}{
		{
			name: "identical",
			a1:   []string{"a", "b", "c", "d", "e", "f"},
			a2:   []string{"a", "b", "c", "d", "e", "f"},
			want: true,
		},
		{
			name: "trailing null on one side",
			a1:   []string{"a", "b\x00"},
			a2:   []string{"a", "b"},
			want: true,
		},
		{
			name: "trailing nulls on both sides",
			a1:   []string{"Apache", "Arrow\x00"},
			a2:   []string{"Apache\x00", "Arrow"},
			want: true,
		},
		{
			// Guards against a comparison that always reports equality.
			name: "different values",
			a1:   []string{"a", "b"},
			a2:   []string{"a", "c"},
			want: false,
		},
		{
			// Only trailing NUL bytes are stripped; an interior one is significant.
			name: "interior null",
			a1:   []string{"a\x00b"},
			a2:   []string{"ab"},
			want: false,
		},
		{
			// Only trailing NUL bytes are stripped; a leading one is significant.
			name: "leading null",
			a1:   []string{"\x00a"},
			a2:   []string{"a"},
			want: false,
		},
	}

	for _, b := range builders {
		for _, tc := range cases {
			t.Run(b.kind+"/"+tc.name, func(t *testing.T) {
				mem := memory.NewCheckedAllocator(memory.NewGoAllocator())
				defer mem.AssertSize(t, 0)

				a1 := b.build(mem, tc.a1, nil)
				defer a1.Release()
				a2 := b.build(mem, tc.a2, nil)
				defer a2.Release()

				if got, want := array.ApproxEqual(a1, a2), tc.want; got != want {
					t.Fatalf("invalid comparison: got=%v, want=%v\na1: %v\na2: %v\n", got, want, a1, a2)
				}
			})
		}
	}
}

func TestArrayApproxEqualFloats(t *testing.T) {
f16sFrom := func(vs []float64) []float16.Num {
o := make([]float16.Num, len(vs))
Expand Down Expand Up @@ -445,11 +533,34 @@ func arrayOf(mem memory.Allocator, a interface{}, valids []bool) arrow.Array {
bldr.AppendValues(a, valids)
return bldr.NewFloat64Array()

case []string:
bldr := array.NewStringBuilder(mem)
defer bldr.Release()

bldr.AppendValues(a, valids)
return bldr.NewStringArray()

default:
panic(fmt.Errorf("arrdata: invalid data slice type %T", a))
}
}

// arrayOfLargeString is a test helper that builds a LargeString array from a
// using a builder backed by mem. valids is handed to the builder's
// AppendValues unchanged. The builder itself is released before returning.
// NOTE(review): callers in this file release the returned array themselves.
func arrayOfLargeString(mem memory.Allocator, a []string, valids []bool) arrow.Array {
	builder := array.NewLargeStringBuilder(mem)
	defer builder.Release()

	builder.AppendValues(a, valids)
	arr := builder.NewLargeStringArray()
	return arr
}

// arrayOfStringView is a test helper that builds a StringView array from a
// using a builder backed by mem. valids is handed to the builder's
// AppendValues unchanged. The builder itself is released before returning.
// NOTE(review): callers in this file release the returned array themselves.
func arrayOfStringView(mem memory.Allocator, a []string, valids []bool) arrow.Array {
	builder := array.NewStringViewBuilder(mem)
	defer builder.Release()

	builder.AppendValues(a, valids)
	arr := builder.NewStringViewArray()
	return arr
}

func TestArrayEqualBaseArray(t *testing.T) {
mem := memory.NewCheckedAllocator(memory.NewGoAllocator())
defer mem.AssertSize(t, 0)
Expand Down

0 comments on commit 1d07d32

Please sign in to comment.