Skip to content

Commit 196e5f2

Browse files
committed
Fix some bugs
1 parent 1e91263 commit 196e5f2

File tree

5 files changed

+173
-44
lines changed

5 files changed

+173
-44
lines changed

sctokenizer/cpp_tokenizer.py

+45-12
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
44
from sctokenizer.assets.cpp_keywords import cpp_keyword_set
55
from sctokenizer.assets.cpp_operators import cpp_operator_set
66
from sctokenizer.token import TokenType, Token
7+
# from icecream import ic
78

89
class CppTokenizer(Tokenizer):
910
def __init__(self):
@@ -147,8 +148,10 @@ def tokenize(self, source_str):
147148
pending += cur
148149

149150
elif state == TokenizerState.IN_NUMBER:
150-
if (cur >= '0' and cur <= '9') or \
151-
cur == '.' or cur == 'E' or cur == 'e':
151+
if (cur >= '0' and cur <= '9') or cur == '.' \
152+
or (cur >= 'A' and cur <= 'F') \
153+
or (cur >= 'a' and cur <= 'f') \
154+
or cur == 'X' or cur == 'x':
152155
pending += cur
153156
i += 1
154157
continue
@@ -174,11 +177,20 @@ def tokenize(self, source_str):
174177
pending = ''
175178
first_no_space_in_word = cur
176179
self.colnumber = i
180+
181+
if len(pending) == 1 and not self.is_identifier(pending):
182+
self.add_pending(tokens, pending, TokenType.SPECIAL_SYMBOL, len_lines, t)
183+
pending = ''
184+
first_no_space_in_word = cur
185+
self.colnumber = i
177186

178187
if cur == '/':
179188
if next == '*': # Begin block comments
180189
state = TokenizerState.IN_COMMENT
181-
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
190+
if self.is_identifier(pending):
191+
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
192+
else:
193+
self.add_pending(tokens, pending, TokenType.SPECIAL_SYMBOL, len_lines, t)
182194
pending = ''
183195
first_no_space_in_word = ''
184196
self.colnumber = i
@@ -187,7 +199,10 @@ def tokenize(self, source_str):
187199
continue
188200
if next == '/': # Begin line comment
189201
state = TokenizerState.IN_LINECOMMENT
190-
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
202+
if self.is_identifier(pending):
203+
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
204+
else:
205+
self.add_pending(tokens, pending, TokenType.SPECIAL_SYMBOL, len_lines, t)
191206
pending = ''
192207
first_no_space_in_word = ''
193208
self.colnumber = i
@@ -196,29 +211,41 @@ def tokenize(self, source_str):
196211
continue
197212
elif cur == '"':
198213
state = TokenizerState.IN_STRING
199-
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
214+
if self.is_identifier(pending):
215+
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
216+
else:
217+
self.add_pending(tokens, pending, TokenType.SPECIAL_SYMBOL, len_lines, t)
200218
pending = ''
201219
first_no_space_in_word = ''
202220
self.colnumber = i
203221
self.add_pending(tokens, '"', TokenType.SPECIAL_SYMBOL, len_lines, t)
204222
elif cur == "'":
205223
state = TokenizerState.IN_CHAR
206-
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
224+
if self.is_identifier(pending):
225+
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
226+
else:
227+
self.add_pending(tokens, pending, TokenType.SPECIAL_SYMBOL, len_lines, t)
207228
pending = ''
208229
first_no_space_in_word = ''
209230
self.colnumber = i
210231
self.add_pending(tokens, "'", TokenType.SPECIAL_SYMBOL, len_lines, t)
211232
elif cur == '#' and first_no_space == cur:
212233
state = TokenizerState.IN_MACRO
213-
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
234+
if self.is_identifier(pending):
235+
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
236+
else:
237+
self.add_pending(tokens, pending, TokenType.SPECIAL_SYMBOL, len_lines, t)
214238
pending = ''
215239
first_no_space_in_word = ''
216240
self.colnumber = i
217241
self.add_pending(tokens, '#', TokenType.SPECIAL_SYMBOL, len_lines, t)
218242
elif cur >= '0' and cur <= '9':
219243
if first_no_space_in_word == cur:
220244
state = TokenizerState.IN_NUMBER
221-
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
245+
if self.is_identifier(pending):
246+
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
247+
else:
248+
self.add_pending(tokens, pending, TokenType.SPECIAL_SYMBOL, len_lines, t)
222249
self.colnumber = i
223250
# first_no_space_in_word = ''
224251
pending = cur
@@ -227,21 +254,27 @@ def tokenize(self, source_str):
227254
elif self.is_alpha(cur):
228255
pending += cur
229256
elif cur in self.operator_set: # cur = + - * / , ...
230-
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
257+
if self.is_identifier(pending):
258+
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
259+
else:
260+
self.add_pending(tokens, pending, TokenType.SPECIAL_SYMBOL, len_lines, t)
231261
pending = cur
232262
first_no_space_in_word = cur
233263
self.colnumber = i
234264
else: # cur = ;, ', space
235-
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
265+
if self.is_identifier(pending):
266+
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
267+
else:
268+
self.add_pending(tokens, pending, TokenType.SPECIAL_SYMBOL, len_lines, t)
236269
pending = ''
237270
first_no_space_in_word = ''
238271
if cur > ' ':
239272
self.colnumber = i
240273
self.add_pending(tokens, cur, TokenType.SPECIAL_SYMBOL, len_lines, t)
241274
i += 1
242275
# Does C++ source always end with '}'? Flush any still-pending text in case it does not.
243-
if len(cur) > 1 or self.is_alpha(cur):
244-
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
276+
if self.is_identifier(pending):
277+
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
245278
else:
246279
self.add_pending(tokens, pending, TokenType.SPECIAL_SYMBOL, len_lines, t)
247280
return tokens

sctokenizer/java_tokenizer.py

+40-11
Original file line number | Diff line number | Diff line change
@@ -86,8 +86,10 @@ def tokenize(self, source_str):
8686
else:
8787
pending += cur
8888
elif state == TokenizerState.IN_NUMBER:
89-
if (cur >= '0' and cur <= '9') or \
90-
cur == '.' or cur == 'E' or cur == 'e':
89+
if (cur >= '0' and cur <= '9') or cur == '.' \
90+
or (cur >= 'A' and cur <= 'F') \
91+
or (cur >= 'a' and cur <= 'f') \
92+
or cur == 'X' or cur == 'x':
9193
pending += cur
9294
i += 1
9395
# self.colnumber += 1
@@ -115,10 +117,19 @@ def tokenize(self, source_str):
115117
first_no_space_in_word = cur
116118
self.colnumber = i
117119

120+
if len(pending) == 1 and not self.is_identifier(pending, ['_', '$']):
121+
self.add_pending(tokens, pending, TokenType.SPECIAL_SYMBOL, len_lines, t)
122+
pending = ''
123+
first_no_space_in_word = cur
124+
self.colnumber = i
125+
118126
if cur == '/':
119127
if next == '*': # Begin block comments
120128
state = TokenizerState.IN_COMMENT
121-
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
129+
if self.is_identifier(pending, ['_', '$']):
130+
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
131+
else:
132+
self.add_pending(tokens, pending, TokenType.SPECIAL_SYMBOL, len_lines, t)
122133
pending = ''
123134
first_no_space_in_word = ''
124135
self.colnumber = i
@@ -127,7 +138,10 @@ def tokenize(self, source_str):
127138
continue
128139
if next == '/': # Begin line comment
129140
state = TokenizerState.IN_LINECOMMENT
130-
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
141+
if self.is_identifier(pending, ['_', '$']):
142+
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
143+
else:
144+
self.add_pending(tokens, pending, TokenType.SPECIAL_SYMBOL, len_lines, t)
131145
pending = ''
132146
first_no_space_in_word = ''
133147
self.colnumber = i
@@ -136,44 +150,59 @@ def tokenize(self, source_str):
136150
continue
137151
elif cur == '"':
138152
state = TokenizerState.IN_STRING
139-
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
153+
if self.is_identifier(pending, ['_', '$']):
154+
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
155+
else:
156+
self.add_pending(tokens, pending, TokenType.SPECIAL_SYMBOL, len_lines, t)
140157
pending = ''
141158
first_no_space_in_word = ''
142159
self.colnumber = i
143160
self.add_pending(tokens, '"', TokenType.SPECIAL_SYMBOL, len_lines, t)
144161
elif cur == "'":
145162
state = TokenizerState.IN_CHAR
146-
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
163+
if self.is_identifier(pending, ['_', '$']):
164+
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
165+
else:
166+
self.add_pending(tokens, pending, TokenType.SPECIAL_SYMBOL, len_lines, t)
147167
pending = ''
148168
first_no_space_in_word = ''
149169
self.colnumber = i
150170
self.add_pending(tokens, "'", TokenType.SPECIAL_SYMBOL, len_lines, t)
151171
elif cur >= '0' and cur <= '9':
152172
if first_no_space_in_word == cur:
153173
state = TokenizerState.IN_NUMBER
154-
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
174+
if self.is_identifier(pending, ['_', '$']):
175+
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
176+
else:
177+
self.add_pending(tokens, pending, TokenType.SPECIAL_SYMBOL, len_lines, t)
155178
# first_no_space_in_word = ''
156179
pending = cur
157180
else:
158181
pending += cur
159182
elif self.is_alpha(cur):
160183
pending += cur
161184
elif cur in self.operator_set: # cur = + - * / , ...
162-
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
185+
if self.is_identifier(pending, ['_', '$']):
186+
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
187+
else:
188+
self.add_pending(tokens, pending, TokenType.SPECIAL_SYMBOL, len_lines, t)
163189
pending = cur
164190
first_no_space_in_word = cur
165191
self.colnumber = i
166192
else: # cur = ;, ', space
167-
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
193+
if self.is_identifier(pending, ['_', '$']):
194+
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
195+
else:
196+
self.add_pending(tokens, pending, TokenType.SPECIAL_SYMBOL, len_lines, t)
168197
pending = ''
169198
first_no_space_in_word = ''
170199
if cur > ' ':
171200
self.colnumber = i
172201
self.add_pending(tokens, cur, TokenType.SPECIAL_SYMBOL, len_lines, t)
173202
i += 1
174203
# Does Java source always end with '}'? Flush any still-pending text in case it does not.
175-
if len(cur) > 1 or self.is_alpha(cur):
176-
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
204+
if self.is_identifier(pending, ['_', '$']):
205+
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
177206
else:
178207
self.add_pending(tokens, pending, TokenType.SPECIAL_SYMBOL, len_lines, t)
179208
return tokens

0 commit comments

Comments
 (0)