-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconll.go
303 lines (253 loc) · 7.85 KB
/
conll.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
// Copyright 2015, 2016 The conllx Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package conllx
import (
	"bytes"
	"fmt"
	"sort"
	"strconv"
	"strings"
)
// fields is a bit set with one bit per CONLL-X annotation layer. Token uses
// it to record which of its layers have been set.
type fields uint32

// One bit per annotation layer, assigned in column order starting at the
// lowest bit (formBit == 1, lemmaBit == 2, and so on).
const (
	formBit = fields(1) << iota
	lemmaBit
	coarsePosTagBit
	posTagBit
	featuresBit
	headBit
	headRelBit
	pHeadBit
	pHeadRelBit
)
// Features holds the contents of the CONLL-X features field of a token,
// both in serialized form and as a key-value mapping.
type Features struct {
	// featuresString is the features field in its serialized form.
	featuresString string
	// featuresMap is the key-value view of the features. It is nil until
	// FeaturesMap builds it from featuresString, unless a map was supplied
	// through Token.SetFeatures.
	featuresMap map[string]string
}
// newFeatures wraps a serialized features string in a Features value. The
// key-value map is left nil; FeaturesMap builds it on demand.
func newFeatures(featuresString string) *Features {
	f := new(Features)
	f.featuresString = featuresString
	return f
}
// FeaturesString returns the token features in serialized form. For
// features created from CONLL-X data this is the string exactly as it was
// passed in; no parsing or normalization is applied.
func (f *Features) FeaturesString() string {
	return f.featuresString
}
// FeaturesMap returns the token features as a key-value mapping.
//
// The features string is split on "|"; within each part, the text before
// the first ':' is the key and everything after it is the value. Parts
// without a ':' do not follow the expected format and are skipped.
//
// The map is built lazily on the first call and cached, so no parsing is
// done if this method is never called. The cached map itself is returned,
// not a copy.
func (f *Features) FeaturesMap() map[string]string {
	if f.featuresMap != nil {
		return f.featuresMap
	}

	parsed := make(map[string]string)
	for _, pair := range strings.Split(f.featuresString, "|") {
		colon := strings.IndexByte(pair, ':')
		if colon == -1 {
			// Not an attribute:value pair.
			continue
		}
		parsed[pair[:colon]] = pair[colon+1:]
	}

	f.featuresMap = parsed
	return parsed
}
// Compile-time check that Token satisfies fmt.Stringer.
var _ fmt.Stringer = Token{}

// Token stores a token with the CONLL-X annotation layers.
//
// Every layer is optional. A layer counts as present only when its bit is
// set in available; the Set* methods set that bit and the getter methods
// report it as their second result.
type Token struct {
	// available is the bit set (formBit, lemmaBit, ...) of layers that
	// have been set on this token.
	available fields

	form         string
	lemma        string
	coarsePosTag string
	posTag       string
	features     *Features
	head         uint
	headRel      string
	pHead        uint
	pHeadRel     string
}
// NewToken returns a fresh Token on which every layer is absent.
//
// The Sentence type used by readers and writers is a slice of Token
// values, yet this constructor deliberately returns a pointer: the Set*
// methods have pointer receivers and return the token, so a pointer lets
// callers build a token by chaining them (builder pattern).
func NewToken() *Token {
	return new(Token)
}
// Form returns the form (the actual token). The second result is false
// when no form is stored in this token.
func (t *Token) Form() (string, bool) {
	present := t.available&formBit != 0
	return t.form, present
}
// Lemma returns the lemma of the token. The second result is false when
// no lemma is stored in this token.
func (t *Token) Lemma() (string, bool) {
	present := t.available&lemmaBit != 0
	return t.lemma, present
}
// CoarsePosTag returns the coarse-grained POS tag of the token. The second
// result is false when no coarse-grained tag is stored in this token.
func (t *Token) CoarsePosTag() (string, bool) {
	present := t.available&coarsePosTagBit != 0
	return t.coarsePosTag, present
}
// PosTag returns the fine-grained POS tag of the token. The second result
// is false when no fine-grained tag is stored in this token.
func (t *Token) PosTag() (string, bool) {
	present := t.available&posTagBit != 0
	return t.posTag, present
}
// Features returns the features field of the token. The second result is
// false when no features are stored in this token.
func (t *Token) Features() (*Features, bool) {
	present := t.available&featuresBit != 0
	return t.features, present
}
// Head returns the head of the token. The second result is false when no
// head is stored in this token.
func (t *Token) Head() (uint, bool) {
	present := t.available&headBit != 0
	return t.head, present
}
// HeadRel returns the relation of the token to its head. The second
// result is false when no head relation is stored in this token.
func (t *Token) HeadRel() (string, bool) {
	present := t.available&headRelBit != 0
	return t.headRel, present
}
// PHead returns the projective head of the token. The second result is
// false when no projective head is stored in this token.
func (t *Token) PHead() (uint, bool) {
	present := t.available&pHeadBit != 0
	return t.pHead, present
}
// PHeadRel returns the relation of the token to its projective head. The
// second result is false when no projective head relation is stored in
// this token.
func (t *Token) PHeadRel() (string, bool) {
	present := t.available&pHeadRelBit != 0
	return t.pHeadRel, present
}
// SetFeatures sets the features for this token. The token itself is
// returned to allow method chaining.
//
// The features string is built from the map as "key:value" pairs joined
// by "|". Pairs are written in ascending key order so that the same map
// always produces the same string: Go does not specify map iteration
// order, so without sorting the serialized token could differ from one
// run to the next.
//
// The map is stored as given, not copied. A non-nil map is the one later
// returned by Features.FeaturesMap, and changes made to it after this call
// are not reflected in the features string.
func (t *Token) SetFeatures(features map[string]string) *Token {
	keys := make([]string, 0, len(features))
	for k := range features {
		keys = append(keys, k)
	}
	sort.Strings(keys)

	pairs := make([]string, 0, len(keys))
	for _, k := range keys {
		pairs = append(pairs, k+":"+features[k])
	}

	t.features = &Features{
		featuresString: strings.Join(pairs, "|"),
		featuresMap:    features,
	}
	t.available |= featuresBit
	return t
}
// SetForm stores form as the form of this token and marks the form layer
// as present. The token itself is returned to allow method chaining.
func (t *Token) SetForm(form string) *Token {
	t.available |= formBit
	t.form = form
	return t
}
// SetLemma stores lemma as the lemma of this token and marks the lemma
// layer as present. The token itself is returned to allow method chaining.
func (t *Token) SetLemma(lemma string) *Token {
	t.available |= lemmaBit
	t.lemma = lemma
	return t
}
// SetCoarsePosTag stores coarsePosTag as the coarse-grained POS tag of
// this token and marks that layer as present. The token itself is returned
// to allow method chaining.
func (t *Token) SetCoarsePosTag(coarsePosTag string) *Token {
	t.available |= coarsePosTagBit
	t.coarsePosTag = coarsePosTag
	return t
}
// SetPosTag stores posTag as the fine-grained POS tag of this token and
// marks that layer as present. The token itself is returned to allow
// method chaining.
func (t *Token) SetPosTag(posTag string) *Token {
	t.available |= posTagBit
	t.posTag = posTag
	return t
}
// SetHead stores head as the head of this token and marks the head layer
// as present. The token itself is returned to allow method chaining.
func (t *Token) SetHead(head uint) *Token {
	t.available |= headBit
	t.head = head
	return t
}
// SetHeadRel stores rel as the relation of this token to its head and
// marks that layer as present. The token itself is returned to allow
// method chaining.
func (t *Token) SetHeadRel(rel string) *Token {
	t.available |= headRelBit
	t.headRel = rel
	return t
}
// SetPHead stores head as the projective head of this token and marks
// that layer as present. The token itself is returned to allow method
// chaining.
func (t *Token) SetPHead(head uint) *Token {
	t.available |= pHeadBit
	t.pHead = head
	return t
}
// SetPHeadRel stores rel as the relation of this token to its projective
// head and marks that layer as present. The token itself is returned to
// allow method chaining.
func (t *Token) SetPHeadRel(rel string) *Token {
	t.available |= pHeadRelBit
	t.pHeadRel = rel
	return t
}
// String renders the token as nine tab-separated columns: form, lemma,
// coarse-grained POS tag, fine-grained POS tag, features, head, head
// relation, projective head and projective head relation. A layer that is
// not present is written as "_". The token identifier column is not
// included.
func (t Token) String() string {
	columns := []string{
		stringForField(t.Form),
		stringForField(t.Lemma),
		stringForField(t.CoarsePosTag),
		stringForField(t.PosTag),
		stringForFeatures(t.Features),
		stringForUintField(t.Head),
		stringForField(t.HeadRel),
		stringForUintField(t.PHead),
		stringForField(t.PHeadRel),
	}
	return strings.Join(columns, "\t")
}
// stringForField calls the string-layer getter f and returns its value, or
// the placeholder "_" when the getter reports the layer as absent.
func stringForField(f func() (string, bool)) string {
	value, present := f()
	if !present {
		return "_"
	}
	return value
}
// stringForFeatures calls the features getter f and returns the features
// string of the result, or the placeholder "_" when the getter reports the
// features layer as absent.
func stringForFeatures(f func() (*Features, bool)) string {
	features, present := f()
	if !present {
		return "_"
	}
	return features.FeaturesString()
}
// stringForUintField calls the numeric-layer getter f and returns its
// value in decimal, or the placeholder "_" when the getter reports the
// layer as absent.
func stringForUintField(f func() (uint, bool)) string {
	value, present := f()
	if !present {
		return "_"
	}
	return strconv.FormatUint(uint64(value), 10)
}
// Compile-time check that Sentence satisfies fmt.Stringer.
var _ fmt.Stringer = Sentence{}

// A Sentence is a slice of Tokens. Tokens are held by value; a token's
// position in the slice plus one is the identifier written by String.
type Sentence []Token
// String renders the sentence with one token per line. Each line starts
// with the 1-based position of the token in the sentence, followed by a
// tab and the token's own String output. Lines are separated by "\n" and
// there is no trailing newline; an empty sentence yields the empty string.
func (s Sentence) String() string {
	var out bytes.Buffer

	for i := range s {
		// Separate lines rather than terminating them, so the last
		// token is not followed by a newline.
		if i > 0 {
			out.WriteByte('\n')
		}

		// Token identifier column.
		out.WriteString(strconv.Itoa(i + 1))
		out.WriteByte('\t')
		out.WriteString(s[i].String())
	}

	return out.String()
}