-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfeature.go
113 lines (97 loc) · 2.65 KB
/
feature.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
// Copyright 2015, Yahoo Inc. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package distance
import (
"bytes"
"io"
"github.com/mfonda/simhash"
"golang.org/x/net/html"
)
// Fingerprint generates the fingerprint of an HTML from the io.Reader r and a shingle factor.
// Shingle refers to the level of shuffling.
// E.g. with shingle factor =2, input "a", "b", "c" will be converted to "a b", "b c"
func Fingerprint(r io.Reader, shingle int) uint64 {
if shingle < 1 {
shingle = 1
}
// collect the features via this cf channel.
cf := make(chan string, 1000)
cs := make(chan uint64, 1000)
v := simhash.Vector{}
// Tokenize and then Generate Features. .
go func() {
defer close(cf)
z := html.NewTokenizer(r)
// TODO - export the max token count as an function argument.
count := 0
for tt := z.Next(); count < 5000 && tt != html.ErrorToken; tt = z.Next() {
t := z.Token()
count++
genFeatures(&t, cf)
}
}()
// Collect the features.
go func() {
defer close(cs)
a := make([][]byte, shingle)
for f := <-cf; f != ""; f = <-cf {
// shingle: generate the k-gram token as a single feature.
a = append(a[1:], []byte(f))
// fmt.Printf("%#v\n", a)
// fmt.Printf("%s\n", bytes.Join(a, []byte(" ")))
cs <- simhash.NewFeature(bytes.Join(a, []byte(" "))).Sum()
// cs <- simhash.NewFeature([]byte(f)).Sum()
}
}()
// from the checksum (of feature), append to vector.
for s := <-cs; s != 0; s = <-cs {
for i := uint8(0); i < 64; i++ {
bit := ((s >> i) & 1)
if bit == 1 {
v[i]++
} else {
v[i]--
}
}
}
return simhash.Fingerprint(v)
}
// genFeatures sends the features of token t on cf: first one feature for the
// token itself, then one feature per attribute, in the order the attributes
// appear in t.Attr.
//
// The token feature is a type code followed by the token's atom name:
// "A:" start tag, "B:" end tag, "C:" self-closing tag, "D:" doctype,
// "E:" comment, "F:" text, "Z:" error. A token type outside this list
// produces the empty string.
//
// Attribute features have the form "G:<atom>:<key>". For the keys "class",
// "name" and "rel" the attribute value is appended as ":<value>".
//
// NOTE(review): the name part comes from t.DataAtom, which is presumably zero
// (empty name) for text/comment tokens and for tags that have no known atom -
// confirm against the x/net/html documentation.
func genFeatures(t *html.Token, cf chan<- string) {
	name := t.DataAtom.String()

	// Map the token type to its one-letter feature code.
	var prefix string
	switch t.Type {
	case html.StartTagToken:
		prefix = "A:"
	case html.EndTagToken:
		prefix = "B:"
	case html.SelfClosingTagToken:
		prefix = "C:"
	case html.DoctypeToken:
		prefix = "D:"
	case html.CommentToken:
		prefix = "E:"
	case html.TextToken:
		prefix = "F:"
	case html.ErrorToken:
		prefix = "Z:"
	}

	// An unrecognised token type still emits a feature, but an empty one.
	tokenFeature := ""
	if prefix != "" {
		tokenFeature = prefix + name
	}
	cf <- tokenFeature

	attrBase := "G:" + name + ":"
	for i := range t.Attr {
		key, val := t.Attr[i].Key, t.Attr[i].Val
		feature := attrBase + key
		switch key {
		// Only these keys contribute their value; "id" is not in this list,
		// so for it (and every other key) only the key is recorded.
		case "class", "name", "rel":
			feature += ":" + val
		}
		cf <- feature
	}
}