Skip to content

Commit cc94bfd

Browse files
committed
Merge thaidd/develop to master
2 parents 889b23f + 91f17f8 commit cc94bfd

15 files changed

+201
-191
lines changed

sctokenizer/assets/c_keywords.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
keyword_set = {
1+
c_keyword_set = {
22
'auto',
33
'break',
44
'case',

sctokenizer/assets/c_operators.py

+8
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
c_operator_set = {
2+
'++', '--', '.', '->', '~', '!', '+', '-', \
3+
'&', '_Alignof', 'sizeof', '?:', ',', '*', \
4+
'/', '%', '<<', '>>', '<', '>', '<=', '>=', \
5+
'==', '!=', '^', '|', '&&', '||', '=', '*=', \
6+
'/=', '%=', '+=', '-=', '>>=', '<<=', '&=', \
7+
'^=', '|=', '?', ':'
8+
} # Be careful with the ?: operator

sctokenizer/assets/cpp_keywords.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
keyword_set = {
1+
cpp_keyword_set = {
22
'alignas',
33
'alignof',
44
'and',

sctokenizer/assets/cpp_operators.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,5 @@
44
'*', '/', '%', '<<', '>>', '<', '>', '<=', '>=', \
55
'==', '!=', '^', '|', '&&', '||', '=', '*=', '/=', \
66
'%=', '+=', '-=', '>>=', '<<=', '&=', '^=', '|=', ',', \
7-
'?:', 'and', 'or','xor', 'not'
7+
'?:', 'and', 'or','xor', 'not', '?', ':'
88
} # Be careful with the ?: operator

sctokenizer/assets/java_keywords.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
keyword_set = {
1+
java_keyword_set = {
22
'abstract',
33
'assert',
44
'boolean',

sctokenizer/assets/java_operators.py

+5-6
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
1-
cpp_operator_set = {
2-
'::', '++', '--', '.', '->', '~', '!', '+', '-', \
3-
'&', '*', 'new', 'delete', 'sizeof', '.*', '->*', \
4-
'*', '/', '%', '<<', '>>', '<', '>', '<=', '>=', \
1+
java_operator_set = {
2+
'::', '++', '--', '~', '!', '+', '-', 'instanceof', \
3+
'&', '*', '/', '%', '<<', '>>', '<', '>', '<=', '>=', \
54
'==', '!=', '^', '|', '&&', '||', '=', '*=', '/=', \
6-
'%=', '+=', '-=', '>>=', '<<=', '&=', '^=', '|=', ',', \
7-
'?:'
5+
'%=', '+=', '-=', '>>=', '<<=', '&=', '^=', '|=', \
6+
'?:', '>>>', '>>>=', '?', ':'
87
} # Be careful with the ?: operator

sctokenizer/assets/python_keywords.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
keyword_set = {
1+
python_keyword_set = {
22
'and',
33
'as',
44
'assert',
+7
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
python_operator_set = {
2+
'+', '-', '*', '/', '%', '**', '//', '=', \
3+
'+=', '-=', '*=', '/=', '%=', '//=', '**=', \
4+
'&=', '|=', '^=', '>>=', '<<=', '==', '!=', \
5+
'>', '<', '>=', '<=', 'and', 'or', 'not', 'is', \
6+
'in', '&', '|', '^', '~', '<<', '>>'
7+
}

sctokenizer/c_tokenizer.py

+5-3
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
from __future__ import absolute_import
22

33
from sctokenizer.cpp_tokenizer import CppTokenizer
4-
from sctokenizer.assets.c_keywords import keyword_set
5-
# from sctokenizer.aset.c_operator_set import c_operator_set
4+
from sctokenizer.assets.c_keywords import c_keyword_set
5+
from sctokenizer.assets.c_operators import c_operator_set
66
from sctokenizer.token import TokenType, Token
77

88
class CTokenizer(CppTokenizer):
99
def __init__(self):
1010
super().__init__()
11-
self.keyword_set = keyword_set
11+
self.keyword_set = c_keyword_set
12+
self.operator_set = c_operator_set
13+

sctokenizer/cpp_tokenizer.py

+29-14
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
11
from __future__ import absolute_import
22

33
from sctokenizer.tokenizer import Tokenizer, TokenizerState
4-
from sctokenizer.assets.cpp_keywords import keyword_set
4+
from sctokenizer.assets.cpp_keywords import cpp_keyword_set
55
from sctokenizer.assets.cpp_operators import cpp_operator_set
66
from sctokenizer.token import TokenType, Token
77

88
class CppTokenizer(Tokenizer):
99
def __init__(self):
1010
super().__init__()
11-
self.keyword_set = keyword_set
11+
self.keyword_set = cpp_keyword_set
1212
self.operator_set = cpp_operator_set
1313

1414
def tokenize(self, source_str):
@@ -50,6 +50,7 @@ def tokenize(self, source_str):
5050
if first_no_space_in_word == '':
5151
first_no_space_in_word = cur
5252
self.colnumber = i
53+
5354
if state == TokenizerState.IN_COMMENT:
5455
# Check end of block comment
5556
if cur == '*':
@@ -59,10 +60,12 @@ def tokenize(self, source_str):
5960
i += 1
6061
state = TokenizerState.REGULAR
6162
continue
63+
6264
elif state == TokenizerState.IN_LINECOMMENT:
6365
# Check end of line comment
6466
if cur == '\n':
6567
state = TokenizerState.REGULAR
68+
6669
elif state == TokenizerState.IN_MACRO:
6770
# Get first char after # in macro
6871
if cur == ' ' or cur == '\t':
@@ -88,6 +91,7 @@ def tokenize(self, source_str):
8891
first_no_space_in_macro = ''
8992
if self.is_alpha(cur):
9093
pending += cur
94+
9195
elif state == TokenizerState.IN_INCLUDE:
9296
if cur == '<' or cur == '"':
9397
state = TokenizerState.IN_INCLUDE_HEADER
@@ -98,6 +102,7 @@ def tokenize(self, source_str):
98102
self.add_pending(tokens, cur, TokenType.SPECIAL_SYMBOL, len_lines, t)
99103
elif cur != ' ' and cur != '\t':
100104
pending += cur
105+
101106
elif state == TokenizerState.IN_INCLUDE_HEADER:
102107
if cur == '>' or cur == '"':
103108
state = TokenizerState.REGULAR
@@ -108,6 +113,7 @@ def tokenize(self, source_str):
108113
self.add_pending(tokens, cur, TokenType.SPECIAL_SYMBOL, len_lines, t)
109114
elif cur != ' ' and cur != '\t':
110115
pending += cur
116+
111117
elif state == TokenizerState.IN_STRING:
112118
# Check end of string
113119
if cur == '"' and prev != '\\':
@@ -119,6 +125,7 @@ def tokenize(self, source_str):
119125
self.add_pending(tokens, cur, TokenType.SPECIAL_SYMBOL, len_lines, t)
120126
else:
121127
pending += cur
128+
122129
elif state == TokenizerState.IN_CHAR:
123130
# Check end of char
124131
if cur == "'" and prev != '\\':
@@ -130,26 +137,36 @@ def tokenize(self, source_str):
130137
self.add_pending(tokens, cur, TokenType.SPECIAL_SYMBOL, len_lines, t)
131138
else:
132139
pending += cur
140+
133141
elif state == TokenizerState.IN_NUMBER:
134142
if (cur >= '0' and cur <= '9') or \
135143
cur == '.' or cur == 'E' or cur == 'e':
136144
pending += cur
137145
i += 1
138-
# self.colnumber += 1
139146
continue
140147
if (cur == '-' or cur == '+') and \
141148
(prev == 'E' or prev == 'e'):
142149
pending += cur
143150
i += 1
144-
# self.colnumber += 1
145151
continue
146152
self.add_pending(tokens, pending, TokenType.CONSTANT, len_lines, t)
147-
pending = ''
148-
first_no_space_in_word = ''
153+
first_no_space_in_word = cur
154+
pending = cur
149155
self.colnumber = i
150-
self.add_pending(tokens, cur, TokenType.SPECIAL_SYMBOL, len_lines, t)
151156
state = TokenizerState.REGULAR
157+
152158
elif state == TokenizerState.REGULAR:
159+
if pending in self.operator_set:
160+
if (pending + cur) in self.operator_set:
161+
pending += cur
162+
i += 1
163+
continue
164+
else:
165+
self.add_pending(tokens, pending, TokenType.OPERATOR, len_lines, t)
166+
pending = ''
167+
first_no_space_in_word = cur
168+
self.colnumber = i
169+
153170
if cur == '/':
154171
if next == '*': # Begin block comments
155172
state = TokenizerState.IN_COMMENT
@@ -194,27 +211,25 @@ def tokenize(self, source_str):
194211
if first_no_space_in_word == cur:
195212
state = TokenizerState.IN_NUMBER
196213
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
214+
self.colnumber = i
197215
# first_no_space_in_word = ''
198216
pending = cur
199217
else:
200218
pending += cur
201219
elif self.is_alpha(cur):
202220
pending += cur
203221
elif cur in self.operator_set: # cur = + - * / , ...
204-
if pending in self.operator_set and (pending+cur) in self.operator_set:
205-
pending += cur
206-
else:
207-
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
208-
pending = cur
209-
first_no_space_in_word = ''
222+
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
223+
pending = cur
224+
first_no_space_in_word = cur
225+
self.colnumber = i
210226
else: # cur = ;, ', space
211227
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
212228
pending = ''
213229
first_no_space_in_word = ''
214230
if cur > ' ':
215231
self.colnumber = i
216232
self.add_pending(tokens, cur, TokenType.SPECIAL_SYMBOL, len_lines, t)
217-
pending = ''
218233
i += 1
219234
# is Cpp always ends with } ?
220235
if len(cur) > 1 or self.is_alpha(cur):

0 commit comments

Comments
 (0)