From 3d5393ea67076cc1662a3c9a2a03591c214ab4ba Mon Sep 17 00:00:00 2001
From: Matt Sherman
Date: Fri, 17 Apr 2020 16:32:08 -0400
Subject: [PATCH] Implement Unicode normalization filters

Not sure how to test
---
 filters/norm/filter.go | 42 ++++++++++++++++++++++++++++++++++++++++++
 go.mod                 |  1 +
 2 files changed, 43 insertions(+)
 create mode 100644 filters/norm/filter.go

diff --git a/filters/norm/filter.go b/filters/norm/filter.go
new file mode 100644
index 0000000..4e562ee
--- /dev/null
+++ b/filters/norm/filter.go
@@ -0,0 +1,42 @@
+// Package norm provides jargon filters that apply Unicode normalization to
+// token text.
+package norm
+
+import (
+	"github.com/clipperhouse/jargon"
+	"github.com/clipperhouse/jargon/filters/mapper"
+	"golang.org/x/text/unicode/norm"
+)
+
+// One filter is exported per Unicode normalization form.
+var (
+	// NFC normalizes tokens into Unicode Normalization Form C.
+	NFC = newFilter(norm.NFC)
+
+	// NFD normalizes tokens into Unicode Normalization Form D.
+	NFD = newFilter(norm.NFD)
+
+	// NFKC normalizes tokens into Unicode Normalization Form KC.
+	NFKC = newFilter(norm.NFKC)
+
+	// NFKD normalizes tokens into Unicode Normalization Form KD.
+	NFKD = newFilter(norm.NFKD)
+)
+
+// newFilter wraps a per-token normalizer for the given form in a filter built
+// by mapper.NewFilter.
+func newFilter(form norm.Form) jargon.Filter {
+	normalize := func(token *jargon.Token) *jargon.Token {
+		text := token.String()
+		if !form.IsNormalString(text) {
+			// NOTE(review): the true argument presumably flags the new token
+			// as a lemma; confirm against jargon.NewToken.
+			return jargon.NewToken(form.String(text), true)
+		}
+
+		// Already in the requested form, so the same token is handed back.
+		return token
+	}
+
+	return mapper.NewFilter(normalize)
+}
diff --git a/go.mod b/go.mod
index a6ea2b3..7ab6271 100644
--- a/go.mod
+++ b/go.mod
@@ -7,6 +7,7 @@ require (
 	github.com/kljensen/snowball v0.6.0
 	github.com/spf13/afero v1.2.2
 	golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e
+	golang.org/x/text v0.3.2
 )
 
 go 1.14