tokenizer.py
# -*- coding: utf-8 -*-
"""
@author: rishabbh-sahu
"""
import bert
import tensorflow_hub as hub
def create_bert_tokenizer(model_path):
    '''
    Language: en. Build a BERT tokenizer that splits words into their appropriate
    sub-tokens, which are then used as part of the embedding layer.
    :param model_path: TensorFlow Hub path of the BERT variant
    :return: BERT tokenizer
    '''
    BertTokenizer = bert.bert_tokenization.FullTokenizer
    # Load the BERT variant from TF Hub; its vocabulary file and casing flag
    # are used to configure the tokenizer.
    bert_layer = hub.KerasLayer(model_path, trainable=False)
    vocabulary_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
    to_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
    tokenizer = BertTokenizer(vocabulary_file, to_lower_case)
    print('vocabulary_file:', type(vocabulary_file), '\nto_lower_case:', type(to_lower_case))
    print('tokenizer.vocab:', len(tokenizer.vocab))
    return tokenizer
def tokenize_text(text, tokenizer):
    '''
    :param text: text to tokenize
    :param tokenizer: tokenizer used for word splitting
    :return: stream of sub-token ids after tokenization
    '''
    return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
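

# A minimal usage sketch, not part of the original module. Assumption: MODEL_PATH
# points at the small_bert variant on TF Hub; any BERT model exposing vocab_file
# and do_lower_case in its resolved_object should work the same way.
if __name__ == '__main__':
    MODEL_PATH = 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/1'
    bert_tokenizer = create_bert_tokenizer(MODEL_PATH)
    token_ids = tokenize_text('hello world', bert_tokenizer)
    print('token ids:', token_ids)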