BERT tokenization: Korean decoding problems with lower case #28

Open · wants to merge 2 commits into base: master
model/tokenization.py: 26 changes (17 additions, 9 deletions)

@@ -22,8 +22,8 @@
 import collections
 import unicodedata
 import six
-import tensorflow.compat.v1 as tf
+import re
+import tensorflow as tf


 def convert_to_unicode(text):
@@ -73,7 +73,7 @@ def load_vocab(vocab_file):
   """Loads a vocabulary file into a dictionary."""
   vocab = collections.OrderedDict()
   index = 0
-  with tf.io.gfile.GFile(vocab_file, "r") as reader:
+  with tf.gfile.GFile(vocab_file, "r") as reader:
     while True:
       token = convert_to_unicode(reader.readline())
       if not token:
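A side note on the two hunks above: `tf.gfile.GFile` only exists in TensorFlow 1.x, while `tf.io.gfile.GFile` (and the `tensorflow.compat.v1` shim) also work on TensorFlow 2.x. The following is a minimal, version-agnostic sketch of the vocab-loading step; it is an illustration under that assumption, not code from this PR, and it simplifies the original read loop:

# Sketch only (not part of this PR): load the vocabulary in a way that works
# on both TF 1.x and TF 2.x, since tf.gfile was removed from the top-level
# namespace in TF 2.x while tf.io.gfile only exists in newer releases.
import collections

try:
  import tensorflow.compat.v1 as tf   # TF 2.x and recent TF 1.x
except ImportError:
  import tensorflow as tf             # older TF 1.x

# Prefer the tf.io.gfile API when it exists, fall back to tf.gfile otherwise.
_gfile = tf.io.gfile if hasattr(tf, "io") and hasattr(tf.io, "gfile") else tf.gfile


def load_vocab(vocab_file):
  """Loads a vocabulary file into an OrderedDict mapping token -> index."""
  vocab = collections.OrderedDict()
  with _gfile.GFile(vocab_file, "r") as reader:
    for index, line in enumerate(reader):
      vocab[line.strip()] = index
  return vocab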
@@ -138,7 +138,6 @@ class BasicTokenizer(object):

   def __init__(self, do_lower_case=True):
     """Constructs a BasicTokenizer.
-
     Args:
       do_lower_case: Whether to lower case the input.
     """
@@ -162,14 +161,27 @@ def tokenize(self, text):
     for token in orig_tokens:
       if self.do_lower_case:
         token = token.lower()
-        token = self._run_strip_accents(token)
+      token = self._run_strip_accents(token)
       split_tokens.extend(self._run_split_on_punc(token))

     output_tokens = whitespace_tokenize(" ".join(split_tokens))
     return output_tokens

   def _run_strip_accents(self, text):
     """Strips accents from a piece of text."""
+
+    # Skip using _run_strip_accents() for Korean substrings, since normalizing
+    # Korean characters with NFD and joining them back results in a seemingly
+    # same but different text, which causes a bug.
+    to_char = chr if six.PY3 else unichr
+    korean = "%s-%s%s-%s" % (to_char(0xac00), to_char(0xd7a3),
+                             to_char(0x3131), to_char(0x3163))
+    if re.search("[%s]+" % korean, text):
+      return "".join(
+          substr if re.search("^[%s]+$" % korean, substr)
+          else self._run_strip_accents(substr)
+          for substr in re.findall("[%s]+|[^%s]+" % (korean, korean), text))
+
     text = unicodedata.normalize("NFD", text)
     output = []
     for char in text:
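The comment added in this hunk describes the root cause: NFD normalization decomposes precomposed Hangul syllables into conjoining Jamo, so the result renders identically but no longer matches vocabulary entries built from precomposed text. A small standalone illustration of both the bug and the range-based split the patch relies on (not part of the PR; the sample strings are made up):

# Standalone illustration (not from the PR) of why NFD breaks Korean tokens
# and how the character-range split above keeps Hangul runs untouched.
import re
import unicodedata

word = "한국어"                                  # precomposed Hangul syllables
decomposed = unicodedata.normalize("NFD", word)  # conjoining Jamo sequence

print(word == decomposed)          # False: different code points, same rendering
print(len(word), len(decomposed))  # 3 vs 8: each syllable splits into its Jamo
# A vocab built from precomposed text would miss `decomposed`, so the token
# would end up as [UNK] after lowercasing + accent stripping.

# The same ranges the patch builds with to_char(): Hangul syllables
# U+AC00-U+D7A3 plus compatibility Jamo U+3131-U+3163.
korean = "\uac00-\ud7a3\u3131-\u3163"
print(re.findall("[%s]+|[^%s]+" % (korean, korean), "Apple사 iPhone12"))
# ['Apple', '사', ' iPhone12'] -- Korean runs can skip accent stripping while
# the surrounding text is still NFD-normalized.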
@@ -258,18 +270,14 @@ def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200):

   def tokenize(self, text):
     """Tokenizes a piece of text into its word pieces.
-
     This uses a greedy longest-match-first algorithm to perform tokenization
     using the given vocabulary.
-
     For example:
       input = "unaffable"
       output = ["un", "##aff", "##able"]
-
     Args:
       text: A single token or whitespace separated tokens. This should have
         already been passed through `BasicTokenizer.
-
     Returns:
       A list of wordpiece tokens.
     """
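For readers skimming the docstring context above: the greedy longest-match-first loop it refers to works roughly as sketched below. This is a paraphrase of the upstream BERT WordpieceTokenizer for a single input token, not code touched by this PR, and the toy vocabulary is made up:

# Rough sketch of the greedy longest-match-first WordPiece loop described in
# the docstring above (paraphrased from upstream BERT, not this PR).
def wordpiece_tokenize(token, vocab, unk_token="[UNK]", max_chars=200):
  if len(token) > max_chars:
    return [unk_token]
  sub_tokens = []
  start = 0
  while start < len(token):
    end = len(token)
    cur_substr = None
    # Shrink the window from the right until the piece is in the vocabulary.
    while start < end:
      substr = token[start:end]
      if start > 0:
        substr = "##" + substr        # continuation pieces carry the ## prefix
      if substr in vocab:
        cur_substr = substr
        break
      end -= 1
    if cur_substr is None:            # no piece matched: give up on the token
      return [unk_token]
    sub_tokens.append(cur_substr)
    start = end
  return sub_tokens

# Toy example mirroring the docstring (vocabulary entries are made up):
toy_vocab = {"un", "##aff", "##able"}
print(wordpiece_tokenize("unaffable", toy_vocab))  # ['un', '##aff', '##able']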