Skip to content
This repository was archived by the owner on Dec 15, 2022. It is now read-only.

Commit 5f11ffd

Browse files
committed
In hunspell, handle apostrophes and ignore words with non-English letters
1 parent 4262eb3 commit 5f11ffd

File tree

2 files changed

+58
-19
lines changed

2 files changed

+58
-19
lines changed

spec/spellchecker-spec.coffee

+18
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,24 @@ describe "SpellChecker", ->
3535
{start: 20, end: 25},
3636
]
3737

38+
it "does not treat non-english letters as word boundaries", ->
39+
SpellChecker.add("cliché")
40+
expect(SpellChecker.checkSpelling("what cliché nonsense")).toEqual []
41+
42+
it "handles words with apostrophes", ->
43+
string = "doesn't isn't aint hasn't"
44+
expect(SpellChecker.checkSpelling(string)).toEqual [
45+
{start: string.indexOf("aint"), end: string.indexOf("aint") + 4}
46+
]
47+
48+
string = "you say you're 'certain', but are you really?"
49+
expect(SpellChecker.checkSpelling(string)).toEqual []
50+
51+
string = "you say you're 'sertan', but are you really?"
52+
expect(SpellChecker.checkSpelling(string)).toEqual [
53+
{start: string.indexOf("sertan"), end: string.indexOf("',")}
54+
]
55+
3856
it "handles invalid inputs", ->
3957
expect(SpellChecker.checkSpelling("")).toEqual []
4058
expect(-> SpellChecker.checkSpelling()).toThrow("Bad argument")

src/spellchecker_hunspell.cc

+40-19
Original file line numberDiff line numberDiff line change
@@ -63,28 +63,49 @@ std::vector<MisspelledRange> HunspellSpellchecker::CheckSpelling(const uint16_t
6363

6464
std::vector<char> utf8_buffer(256);
6565

66-
size_t word_start = 0;
67-
bool within_word = false;
68-
for (size_t i = 0; i < utf16_length; i++) {
66+
enum {
67+
unknown,
68+
in_separator,
69+
in_word,
70+
} state = in_separator;
71+
72+
for (size_t word_start = 0, i = 0; i < utf16_length; i++) {
6973
uint16_t c = utf16_text[i];
70-
bool is_word_character = iswalpha(c);
71-
if (within_word) {
72-
if (!is_word_character) {
73-
within_word = false;
74-
75-
bool converted = TranscodeUTF16ToUTF8(transcoder, (char *)utf8_buffer.data(), utf8_buffer.size(), utf16_text + word_start, i - word_start);
76-
if (converted) {
77-
if (hunspell->spell(utf8_buffer.data()) == 0) {
78-
MisspelledRange range;
79-
range.start = word_start;
80-
range.end = i;
81-
result.push_back(range);
74+
75+
switch (state) {
76+
case unknown:
77+
if (iswpunct(c) || iswspace(c)) {
78+
state = in_separator;
79+
}
80+
break;
81+
82+
case in_separator:
83+
if (iswalpha(c)) {
84+
word_start = i;
85+
state = in_word;
86+
} else if (!iswpunct(c) && !iswspace(c)) {
87+
state = unknown;
88+
}
89+
break;
90+
91+
case in_word:
92+
if (c == '\'' && iswalpha(utf16_text[i + 1])) {
93+
i++;
94+
} else if (c == 0 || iswpunct(c) || iswspace(c)) {
95+
state = in_separator;
96+
bool converted = TranscodeUTF16ToUTF8(transcoder, (char *)utf8_buffer.data(), utf8_buffer.size(), utf16_text + word_start, i - word_start);
97+
if (converted) {
98+
if (hunspell->spell(utf8_buffer.data()) == 0) {
99+
MisspelledRange range;
100+
range.start = word_start;
101+
range.end = i;
102+
result.push_back(range);
103+
}
82104
}
105+
} else if (!iswalpha(c)) {
106+
state = unknown;
83107
}
84-
}
85-
} else if (is_word_character) {
86-
word_start = i;
87-
within_word = true;
108+
break;
88109
}
89110
}
90111

0 commit comments

Comments
 (0)