-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathtokenise.go
94 lines (77 loc) · 1.69 KB
/
tokenise.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
package formulation
import (
"github.com/dan-locke/clean-html"
"github.com/hscells/go-unidecode"
"strings"
"unicode"
)
type (
	// tokeniseOutput is the result type of tokenise: one byte slice per token.
	tokeniseOutput [][]byte

	// currType labels the class of the rune currently being scanned.
	currType int
)

// Rune classes used while scanning. char is the zero value of currType.
const (
	// char marks a letter.
	char currType = iota
	// num marks a numeric rune.
	num
	// space marks whitespace.
	space
	// other marks anything that is not a letter, number or whitespace.
	other
)
// tokenise lower-cases text, passes it through unidecode.Unidecode and
// extracts word tokens from every range reported by clean_html.TextPos.
//
// Each entry of portions.Positions is used as a [start, end) byte range into
// the normalised text (presumably the text found between markup; confirm
// against clean_html). Every range is tokenised on its own, so a word is
// never sliced across the gap between two ranges, and a word that ends a
// range is emitted even when that range does not end at the last byte of the
// input.
//
// It returns the tokens in input order, or the error reported by
// clean_html.TextPos. The result is nil when no token is found.
func tokenise(text string) (tokeniseOutput, error) {
	txt := unidecode.Unidecode(strings.ToLower(text))
	portions, err := clean_html.TextPos([]byte(txt))
	if err != nil {
		return nil, err
	}

	var tokens [][]byte
	for _, pos := range portions.Positions {
		tokens = appendPortionTokens(tokens, txt[pos[0]:pos[1]])
	}
	return tokens, nil
}

// classifyRune maps r to its currType, testing whitespace first, then
// numbers, then letters; anything else is other.
func classifyRune(r rune) currType {
	switch {
	case unicode.IsSpace(r):
		return space
	case unicode.IsNumber(r):
		return num
	case unicode.IsLetter(r):
		return char
	default:
		return other
	}
}

// appendPortionTokens appends the word tokens found in portion to tokens and
// returns the extended slice.
//
// A token is a run of letters that is terminated by whitespace or by the end
// of portion. A run of letters terminated by a number or by any other rune is
// discarded.
// NOTE(review): discarding words that are directly followed by punctuation or
// a digit (e.g. "word," or "abc1") is kept from the previous implementation;
// confirm that this is the intended behaviour.
func appendPortionTokens(tokens [][]byte, portion string) [][]byte {
	// wordStart is the byte offset of the pending letter run, or -1 if none.
	wordStart := -1
	for i, r := range portion {
		switch classifyRune(r) {
		case char:
			if wordStart < 0 {
				wordStart = i
			}
		case space:
			if wordStart >= 0 {
				tokens = append(tokens, []byte(portion[wordStart:i]))
				wordStart = -1
			}
		default:
			// num or other: drop whatever letter run was pending.
			wordStart = -1
		}
	}
	// Flush a word that runs up to the end of the portion.
	if wordStart >= 0 {
		tokens = append(tokens, []byte(portion[wordStart:]))
	}
	return tokens
}