-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconvenience.go
150 lines (139 loc) · 3.95 KB
/
convenience.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
// Convenience functions that use DoubleMetaphone.
// Created 2022-12-16 by Ron Charlton and placed in the public domain.
//
// $Id: convenience.go,v 1.23 2024-06-16 15:47:55-04 ron Exp $
package metaphone
import (
"bytes"
"compress/gzip"
"errors"
"fmt"
"io"
"os"
"strings"
)
// MetaphMap is a sound-alike index for a word list: it maps DoubleMetaphone
// codes to the words that produced them, and remembers the maximum code
// length that was used to build it. Create one with NewMetaphMap or
// NewMetaphMapFromFile and query it with MatchWord.
type MetaphMap struct {
// mapper maps each non-empty DoubleMetaphone code to the words that
// yielded that code, in the order the words were added.
mapper map[string][]string
// maximum length of metaph and metaph2 in DoubleMetaphone.
// The same value is passed to DoubleMetaphone when building the map and
// when looking up a word.
maxlen int
}
// NewMetaphMap builds a MetaphMap from wordlist, asking DoubleMetaphone for
// codes of at most maxLen characters.
// Every word is filed under each non-empty code DoubleMetaphone returns for
// it, so the result can be passed to MatchWord to find all words that sound
// like a given word or misspelling.
// Argument maxLen is 4 in the original Double Metaphone algorithm.
// Case is ignored in the words in wordlist, as are non-alphabetic
// characters.
func NewMetaphMap(wordlist []string, maxLen int) *MetaphMap {
	index := make(map[string][]string)
	for i := range wordlist {
		primary, secondary := DoubleMetaphone(wordlist[i], maxLen)
		// File the word under the primary code first, then the secondary;
		// an empty code means "no code" and is skipped.
		for _, code := range [...]string{primary, secondary} {
			if code == "" {
				continue
			}
			index[code] = append(index[code], wordlist[i])
		}
	}
	return &MetaphMap{mapper: index, maxlen: maxLen}
}
// NewMetaphMapFromFile returns a MetaphMap made from a file containing a
// word list, one word per line, and using a maximum length for the
// DoubleMetaphone return values.
// The file can be a gzipped file with its name ending with ".gz".
// The MetaphMap can be used with MatchWord to find all words in the
// MetaphMap that sound like a given word or misspelling.
// Argument maxLen is 4 in the original Double Metaphone algorithm.
// Letter case and non-alphabetic characters in the file are ignored.
//
// The whole file is read into memory and carriage returns are removed
// before it is split into lines.
//
// If the file cannot be opened, decompressed or read, metaph is nil and err
// wraps the underlying error, so callers can inspect it with errors.Is and
// errors.As. Errors from closing the file or the gzip reader are joined
// into err as well; in that case metaph may still be non-nil.
func NewMetaphMapFromFile(fileName string, maxLen int) (
	metaph *MetaphMap, err error) {

	fp, err := os.Open(fileName)
	if err != nil {
		return nil, fmt.Errorf("trying to open file %s: %w", fileName, err)
	}
	// Deferred closes run after the return values are set, so they report
	// their errors through the named result err.
	defer func() {
		err = errors.Join(err, fp.Close())
	}()

	var r io.Reader = fp
	if strings.HasSuffix(fileName, ".gz") {
		gr, gzErr := gzip.NewReader(fp)
		if gzErr != nil {
			return nil, fmt.Errorf(
				"trying to make a gzip reader for file %s: %w", fileName, gzErr)
		}
		// Registered after the file's defer, so the gzip reader is closed
		// before the underlying file.
		defer func() {
			err = errors.Join(err, gr.Close())
		}()
		r = gr
	}

	b, err := io.ReadAll(r)
	if err != nil {
		return nil, fmt.Errorf("trying to read file %s: %w", fileName, err)
	}
	lines := strings.Split(string(noCRs(b)), "\n")
	return NewMetaphMap(lines, maxLen), nil
}
// Len reports how many sound-alike entries metaph holds, that is, the
// number of distinct DoubleMetaphone codes in the map (not the number of
// words filed under them).
func (metaph *MetaphMap) Len() int {
	groups := metaph.mapper
	return len(groups)
}
// MatchWord returns all words in metaph that sound like word.
// Case and non-alphabetic characters in word are ignored.
// Each matching word appears once in the result; callers should not rely
// on the order of the result. The result is nil when nothing matches.
// MatchWord only reads metaph: the returned slice never shares storage
// with the map. Typical use:
//
//	import "fmt"
//	import "metaphone"
//	// ...
//	// File wordlistFileName should contain a comprehensive word
//	// list, one word per line. Errors are ignored here.
//	metaphMap, _ := metaphone.NewMetaphMapFromFile(wordlistFileName, 4)
//	matches := metaphMap.MatchWord("knewmoanya")
//	for _, word := range matches {
//		fmt.Println(word)
//	}
func (metaph *MetaphMap) MatchWord(word string) (output []string) {
	m, m2 := DoubleMetaphone(word, metaph.maxlen)

	var primary, secondary []string
	if len(m) > 0 {
		primary = metaph.mapper[m]
	}
	// An identical secondary code would only contribute duplicates.
	if len(m2) > 0 && m2 != m {
		secondary = metaph.mapper[m2]
	}

	// Copy into a fresh slice. Appending directly to the slice stored in
	// the map could write into its spare capacity, mutating shared state
	// on what should be a read-only lookup.
	combined := make([]string, 0, len(primary)+len(secondary))
	combined = append(combined, primary...)
	combined = append(combined, secondary...)
	return removeDups(combined)
}
// removeDups returns the distinct strings of s, keeping the first
// occurrence of each and preserving their original relative order, so the
// result is deterministic.
// s itself is never modified (it may alias storage owned by a caller).
// The result is nil when s is empty.
func removeDups(s []string) (out []string) {
	if len(s) == 0 {
		return nil
	}
	seen := make(map[string]struct{}, len(s))
	out = make([]string, 0, len(s))
	for _, w := range s {
		if _, dup := seen[w]; dup {
			continue
		}
		seen[w] = struct{}{}
		out = append(out, w)
	}
	return out
}
// noCRs strips every carriage return ('\r') from b and returns the
// shortened slice. Assumes UTF-8, ANSI, iso-8859-n or ASCII encoding.
// The filtering is done in place: the result shares b's backing array, and
// b is returned unchanged when it contains no CR.
func noCRs(b []byte) []byte {
	// Filtering in place avoids the extra len(b) buffer that bytes.Map
	// would allocate, and is faster.
	first := bytes.IndexByte(b, '\r')
	if first < 0 {
		return b
	}
	// Everything before the first CR is already in its final position.
	// The write position always trails the read position, so appending
	// into b's own storage never clobbers bytes that are still unread.
	kept := b[:first]
	for _, c := range b[first+1:] {
		if c != '\r' {
			kept = append(kept, c)
		}
	}
	return kept
}