From 64a72d1d9a57bfd8e8a46af492c0c09f8d9123ad Mon Sep 17 00:00:00 2001
From: DeokJin <33983084+qute012@users.noreply.github.com>
Date: Sat, 28 Mar 2020 19:19:57 +0900
Subject: [PATCH 1/2] Update tokenization.py

Fix Korean decoding when lower casing is applied.
---
 model/tokenization.py | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/model/tokenization.py b/model/tokenization.py
index b345256..943dcab 100644
--- a/model/tokenization.py
+++ b/model/tokenization.py
@@ -22,8 +22,8 @@
 import collections
 import unicodedata
 import six
-import tensorflow.compat.v1 as tf
-
+import re
+import tensorflow as tf
 
 
 def convert_to_unicode(text):
@@ -73,7 +73,7 @@ def load_vocab(vocab_file):
   """Loads a vocabulary file into a dictionary."""
   vocab = collections.OrderedDict()
   index = 0
-  with tf.io.gfile.GFile(vocab_file, "r") as reader:
+  with tf.gfile.GFile(vocab_file, "r") as reader:
     while True:
       token = convert_to_unicode(reader.readline())
       if not token:
@@ -138,7 +138,6 @@ class BasicTokenizer(object):
 
   def __init__(self, do_lower_case=True):
     """Constructs a BasicTokenizer.
-
     Args:
       do_lower_case: Whether to lower case the input.
     """
@@ -170,6 +169,19 @@ def tokenize(self, text):
 
   def _run_strip_accents(self, text):
     """Strips accents from a piece of text."""
+
+    # Skip using _run_strip_accents() for Korean substrings, since normalizing
+    # Korean characters with NFD and joining them back results in a seemingly
+    # same but different text, which causes a bug.
+    to_char = chr if six.PY3 else unichr
+    korean = "%s-%s%s-%s" % (to_char(0xac00), to_char(0xd7a3),
+                             to_char(0x3131), to_char(0x3163))
+    if re.search("[%s]+" % korean, text):
+      return "".join(
+          substr if re.search("^[%s]+$" % korean, substr)
+          else self._run_strip_accents(substr)
+          for substr in re.findall("[%s]+|[^%s]+" % (korean, korean), text))
+
     text = unicodedata.normalize("NFD", text)
     output = []
     for char in text:
@@ -258,18 +270,14 @@ def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200):
 
   def tokenize(self, text):
     """Tokenizes a piece of text into its word pieces.
-
     This uses a greedy longest-match-first algorithm to perform tokenization
     using the given vocabulary.
-
     For example:
       input = "unaffable"
       output = ["un", "##aff", "##able"]
-
     Args:
       text: A single token or whitespace separated tokens. This should have
         already been passed through `BasicTokenizer.
-
     Returns:
       A list of wordpiece tokens.
     """

From a6c731d88340a5e7d2d87789cdcab3087972841b Mon Sep 17 00:00:00 2001
From: DeokJin <33983084+qute012@users.noreply.github.com>
Date: Sat, 28 Mar 2020 19:25:20 +0900
Subject: [PATCH 2/2] Update tokenization.py

Move line 164 out of the 'if' block.
---
 model/tokenization.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/model/tokenization.py b/model/tokenization.py
index 943dcab..8849d3c 100644
--- a/model/tokenization.py
+++ b/model/tokenization.py
@@ -161,7 +161,7 @@ def tokenize(self, text):
     for token in orig_tokens:
       if self.do_lower_case:
         token = token.lower()
-        token = self._run_strip_accents(token)
+      token = self._run_strip_accents(token)
       split_tokens.extend(self._run_split_on_punc(token))
 
     output_tokens = whitespace_tokenize(" ".join(split_tokens))
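
For context on why the first patch bypasses _run_strip_accents() for Korean substrings, the short sketch below (illustrative only, not part of the patches; the helper name strip_accents_like_basic_tokenizer is made up) mimics the original accent-stripping step and shows that a precomposed Hangul string comes back as decomposed conjoining jamo: it renders the same but is no longer byte-equal to the input, so later vocabulary lookups can fail.

# Illustrative sketch, assuming the standard BERT-style accent stripping:
# NFD-normalize the text and drop combining marks (category "Mn").
import unicodedata


def strip_accents_like_basic_tokenizer(text):
  """Mimics BasicTokenizer._run_strip_accents without the Korean bypass."""
  text = unicodedata.normalize("NFD", text)
  return "".join(ch for ch in text if unicodedata.category(ch) != "Mn")


word = u"\ud55c\uad6d\uc5b4"           # "Korean" as precomposed Hangul syllables
stripped = strip_accents_like_basic_tokenizer(word)

print(word == stripped)                # False: the result stays decomposed
print([hex(ord(c)) for c in word])     # ['0xd55c', '0xad6d', '0xc5b4']
print([hex(ord(c)) for c in stripped]) # ['0x1112', '0x1161', '0x11ab', '0x1100', ...]

With the first patch applied, runs of Hangul characters are returned unchanged, so precomposed forms keep matching the vocabulary; the second patch then moves the _run_strip_accents() call out of the do_lower_case branch so this handling also runs when lower casing is disabled.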