tokenizer.py
# Rana Hani and Sulaeman Ahmed - Text-Processing Group 3
import json
import re
import time
import timeit
import typing


class Tokenizer:
    """Base class: subclasses implement tokenize()."""

    def tokenize(self, document_text: str) -> typing.List[str]:
        pass
class NaiveTokenizer(Tokenizer):
    def tokenize(self, document_text: str) -> typing.List[str]:
        start_time = timeit.default_timer()
        # Pad periods and commas with spaces, lowercase, and split on whitespace.
        tokens = document_text.replace('.', ' . ').replace(',', ' , ').lower().split()
        end_time = timeit.default_timer()
        print(f"NaiveTokenizer execution time: {end_time - start_time} seconds")
        return tokens
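

# A minimal usage sketch (illustrative, not part of the original assignment
# code), assuming the split-based NaiveTokenizer above: only '.' and ',' are
# split off, other punctuation stays attached to the preceding word, and
# decimal numbers are broken apart. The example strings are hypothetical.
def _naive_tokenizer_demo():
    naive = NaiveTokenizer()
    # '!' is not padded with spaces, so it stays glued to 'world'.
    print(naive.tokenize("Hello, world!"))  # ['hello', ',', 'world!']
    # '.' is padded everywhere, so 3.14 is split into '3', '.', '14'.
    print(naive.tokenize("Pi is 3.14."))    # ['pi', 'is', '3', '.', '14', '.']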
class RegexTokenizer(Tokenizer):
    def tokenize(self, text):
        start_time = time.time()
        tokens = re.findall(
            r"\b"                   # Match a word boundary
            r"(?!https?://)"        # Negative lookahead to exclude URLs starting with "http://" or "https://"
            r"(?<!\.)"              # Negative lookbehind to exclude tokens preceded by a dot
            r"\b[\w.]+"             # Match one or more word characters or dots (words and numbers with decimal points)
            r"(?:'\w+)?"            # Optionally match an apostrophe followed by one or more word characters
            r"\b(?!\.\w)"           # Negative lookahead to exclude tokens followed by a dot and a word character
            r"|"                    # or
            r"[^\s\w]+"             # Match one or more non-space, non-word characters (punctuation such as ',' and '!')
            r"|"                    # or
            r"(?:(?<!://)"          # Negative lookbehind so "www." is not matched separately right after "://"
            r"(?:https?://|www\.)"  # Match "http://" or "https://" or "www."
            r"[^\"'\s]+"            # Match one or more non-space, non-quote characters after the scheme or "www."
            r")",                   # End non-capturing group for URLs
            # Equivalent one-line pattern:
            # r"\b(?!https?://)(?<!\.)\b[\w.]+(?:'\w+)?\b(?!\.\w)|[^\s\w]+|(?:(?<!://)(?:https?://|www\.)[^\"'\s]+)",
            text.lower())
        end_time = time.time()
        print(f"RegexTokenizer execution time: {end_time - start_time} seconds")
        return tokens
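

# A minimal usage sketch (illustrative, not part of the original assignment
# code): it shows that the pattern above keeps decimal numbers and URLs as
# single tokens while splitting other words apart. The example string is
# hypothetical.
def _regex_tokenizer_demo():
    regex = RegexTokenizer()
    # Expected output:
    # ['version', '2.0', 'ships', 'at', 'https://example.org', 'today']
    print(regex.tokenize("Version 2.0 ships at https://example.org today"))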
import unittest


class TokenizerTestCase(unittest.TestCase):
    def setUp(self):
        self.naive_tokenizer = NaiveTokenizer()
        self.regex_tokenizer = RegexTokenizer()

    # def test_naive_tokenizer(self):
    #     # Test basic tokenization
    #     self.assertEqual(self.naive_tokenizer.tokenize("Hello, world!"), ['hello', ',', 'world', '!'])
    #     # Test handling of apostrophes
    #     self.assertEqual(self.naive_tokenizer.tokenize("The quick brown fox jumped over the lazy dog's back."), ['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', "dog's", 'back', '.'])
    #     # Test handling where the apostrophe should be cut off
    #     self.assertEqual(self.regex_tokenizer.tokenize("The company has Denis' car."), ['the', 'company', 'has', 'denis', "'", 'car', '.'])
    #     # Test handling of numbers
    #     self.assertEqual(self.naive_tokenizer.tokenize("My favorite number is 3.14."), ['my', 'favorite', 'number', 'is', '3.14', '.'])
    #     # Test handling of URLs
    #     self.assertEqual(self.naive_tokenizer.tokenize("Check out my website: https://www.example.com"), ['check', 'out', 'my', 'website', ':', 'https://www.example.com'])

    def test_regex_tokenizer(self):
        # Test basic tokenization
        self.assertEqual(self.regex_tokenizer.tokenize("Hello, world!"), ['hello', ',', 'world', '!'])
        # Test handling of apostrophes
        self.assertEqual(self.regex_tokenizer.tokenize("The quick brown fox jumped over the lazy dog's back."),
                         ['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', "dog's", 'back', '.'])
        # Test handling where the apostrophe should be cut off
        self.assertEqual(self.regex_tokenizer.tokenize("The company has Denis' car."),
                         ['the', 'company', 'has', 'denis', "'", 'car', '.'])
        # Test handling of numbers
        self.assertEqual(self.regex_tokenizer.tokenize("My favorite number is 3.14."),
                         ['my', 'favorite', 'number', 'is', '3.14', '.'])
        # Test handling of URLs
        self.assertEqual(self.regex_tokenizer.tokenize("Check out my website: https://www.example.com"),
                         ['check', 'out', 'my', 'website', ':', 'https://www.example.com'])


if __name__ == '__main__':
    unittest.main()
# Tokenizing wiki_small.json
import json
import documents
from tokenizer import RegexTokenizer

# Load the documents from the JSON file
with open('wiki_small.json') as f:
    data = json.load(f)

# Create a RegexTokenizer object
regex_tokenizer = RegexTokenizer()

# Tokenize each document's text
for document in data:
    text = document['init_text']
    tokens = regex_tokenizer.tokenize(text)
    print(tokens)