Skip to content

Commit a58f154

Browse files
authored
Merge pull request #5 from BK-SCOSS/thaidd/dev
Fix problems with #if, #ifdef and #ifndef in C/CPP
2 parents d4a714a + 196e5f2 commit a58f154

6 files changed

+182
-51
lines changed

sctokenizer/cpp_tokenizer.py

+52-16
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from sctokenizer.assets.cpp_keywords import cpp_keyword_set
55
from sctokenizer.assets.cpp_operators import cpp_operator_set
66
from sctokenizer.token import TokenType, Token
7+
# from icecream import ic
78

89
class CppTokenizer(Tokenizer):
910
def __init__(self):
@@ -21,6 +22,7 @@ def tokenize(self, source_str):
2122
first_no_space = ''
2223
last_no_space = ''
2324
first_no_space_in_macro = ''
25+
second_no_space_in_macro = ''
2426
first_no_space_in_word = ''
2527
cur = ''
2628
prev = ''
@@ -76,22 +78,26 @@ def tokenize(self, source_str):
7678
continue
7779
if cur != ' ' and cur != '\t' and first_no_space_in_macro == '':
7880
first_no_space_in_macro = cur
81+
second_no_space_in_macro = next
7982
# Check end of macro
8083
if cur == '\n' and last_no_space != '\\':
8184
state = TokenizerState.REGULAR
8285
first_no_space_in_macro = ''
86+
second_no_space_in_macro = ''
8387

8488
# Can handle:
8589
# include <bits/stdc++.h>
8690
# define circleArea(r) (3.1415*(r)*(r))
8791
# define PI 3.1415
8892
# handle #include vs #define, undef, pragma
89-
if first_no_space_in_macro == 'i':
93+
if first_no_space_in_macro == 'i' and second_no_space_in_macro == 'n':
9094
state = TokenizerState.IN_INCLUDE
9195
first_no_space_in_macro = ''
96+
second_no_space_in_macro = ''
9297
else:
9398
state = TokenizerState.REGULAR
9499
first_no_space_in_macro = ''
100+
second_no_space_in_macro = ''
95101
if self.is_alpha(cur):
96102
pending += cur
97103

@@ -142,8 +148,10 @@ def tokenize(self, source_str):
142148
pending += cur
143149

144150
elif state == TokenizerState.IN_NUMBER:
145-
if (cur >= '0' and cur <= '9') or \
146-
cur == '.' or cur == 'E' or cur == 'e':
151+
if (cur >= '0' and cur <= '9') or cur == '.' \
152+
or (cur >= 'A' and cur <= 'F') \
153+
or (cur >= 'a' and cur <= 'f') \
154+
or cur == 'X' or cur == 'x':
147155
pending += cur
148156
i += 1
149157
continue
@@ -169,11 +177,20 @@ def tokenize(self, source_str):
169177
pending = ''
170178
first_no_space_in_word = cur
171179
self.colnumber = i
180+
181+
if len(pending) == 1 and not self.is_identifier(pending):
182+
self.add_pending(tokens, pending, TokenType.SPECIAL_SYMBOL, len_lines, t)
183+
pending = ''
184+
first_no_space_in_word = cur
185+
self.colnumber = i
172186

173187
if cur == '/':
174188
if next == '*': # Begin block comments
175189
state = TokenizerState.IN_COMMENT
176-
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
190+
if self.is_identifier(pending):
191+
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
192+
else:
193+
self.add_pending(tokens, pending, TokenType.SPECIAL_SYMBOL, len_lines, t)
177194
pending = ''
178195
first_no_space_in_word = ''
179196
self.colnumber = i
@@ -182,7 +199,10 @@ def tokenize(self, source_str):
182199
continue
183200
if next == '/': # Begin line comment
184201
state = TokenizerState.IN_LINECOMMENT
185-
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
202+
if self.is_identifier(pending):
203+
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
204+
else:
205+
self.add_pending(tokens, pending, TokenType.SPECIAL_SYMBOL, len_lines, t)
186206
pending = ''
187207
first_no_space_in_word = ''
188208
self.colnumber = i
@@ -191,29 +211,41 @@ def tokenize(self, source_str):
191211
continue
192212
elif cur == '"':
193213
state = TokenizerState.IN_STRING
194-
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
214+
if self.is_identifier(pending):
215+
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
216+
else:
217+
self.add_pending(tokens, pending, TokenType.SPECIAL_SYMBOL, len_lines, t)
195218
pending = ''
196219
first_no_space_in_word = ''
197220
self.colnumber = i
198221
self.add_pending(tokens, '"', TokenType.SPECIAL_SYMBOL, len_lines, t)
199222
elif cur == "'":
200223
state = TokenizerState.IN_CHAR
201-
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
224+
if self.is_identifier(pending):
225+
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
226+
else:
227+
self.add_pending(tokens, pending, TokenType.SPECIAL_SYMBOL, len_lines, t)
202228
pending = ''
203229
first_no_space_in_word = ''
204230
self.colnumber = i
205231
self.add_pending(tokens, "'", TokenType.SPECIAL_SYMBOL, len_lines, t)
206232
elif cur == '#' and first_no_space == cur:
207233
state = TokenizerState.IN_MACRO
208-
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
234+
if self.is_identifier(pending):
235+
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
236+
else:
237+
self.add_pending(tokens, pending, TokenType.SPECIAL_SYMBOL, len_lines, t)
209238
pending = ''
210239
first_no_space_in_word = ''
211240
self.colnumber = i
212241
self.add_pending(tokens, '#', TokenType.SPECIAL_SYMBOL, len_lines, t)
213242
elif cur >= '0' and cur <= '9':
214243
if first_no_space_in_word == cur:
215244
state = TokenizerState.IN_NUMBER
216-
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
245+
if self.is_identifier(pending):
246+
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
247+
else:
248+
self.add_pending(tokens, pending, TokenType.SPECIAL_SYMBOL, len_lines, t)
217249
self.colnumber = i
218250
# first_no_space_in_word = ''
219251
pending = cur
@@ -222,23 +254,27 @@ def tokenize(self, source_str):
222254
elif self.is_alpha(cur):
223255
pending += cur
224256
elif cur in self.operator_set: # cur = + - * / , ...
225-
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
257+
if self.is_identifier(pending):
258+
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
259+
else:
260+
self.add_pending(tokens, pending, TokenType.SPECIAL_SYMBOL, len_lines, t)
226261
pending = cur
227262
first_no_space_in_word = cur
228263
self.colnumber = i
229264
else: # cur = ;, ', space
230-
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
265+
if self.is_identifier(pending):
266+
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
267+
else:
268+
self.add_pending(tokens, pending, TokenType.SPECIAL_SYMBOL, len_lines, t)
231269
pending = ''
232270
first_no_space_in_word = ''
233271
if cur > ' ':
234272
self.colnumber = i
235273
self.add_pending(tokens, cur, TokenType.SPECIAL_SYMBOL, len_lines, t)
236274
i += 1
237275
# does C++ always end with } ?
238-
if len(cur) > 1 or self.is_alpha(cur):
239-
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
276+
if self.is_identifier(pending):
277+
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
240278
else:
241279
self.add_pending(tokens, pending, TokenType.SPECIAL_SYMBOL, len_lines, t)
242-
return tokens
243-
244-
280+
return tokens

sctokenizer/java_tokenizer.py

+40-11
Original file line numberDiff line numberDiff line change
@@ -86,8 +86,10 @@ def tokenize(self, source_str):
8686
else:
8787
pending += cur
8888
elif state == TokenizerState.IN_NUMBER:
89-
if (cur >= '0' and cur <= '9') or \
90-
cur == '.' or cur == 'E' or cur == 'e':
89+
if (cur >= '0' and cur <= '9') or cur == '.' \
90+
or (cur >= 'A' and cur <= 'F') \
91+
or (cur >= 'a' and cur <= 'f') \
92+
or cur == 'X' or cur == 'x':
9193
pending += cur
9294
i += 1
9395
# self.colnumber += 1
@@ -115,10 +117,19 @@ def tokenize(self, source_str):
115117
first_no_space_in_word = cur
116118
self.colnumber = i
117119

120+
if len(pending) == 1 and not self.is_identifier(pending, ['_', '$']):
121+
self.add_pending(tokens, pending, TokenType.SPECIAL_SYMBOL, len_lines, t)
122+
pending = ''
123+
first_no_space_in_word = cur
124+
self.colnumber = i
125+
118126
if cur == '/':
119127
if next == '*': # Begin block comments
120128
state = TokenizerState.IN_COMMENT
121-
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
129+
if self.is_identifier(pending, ['_', '$']):
130+
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
131+
else:
132+
self.add_pending(tokens, pending, TokenType.SPECIAL_SYMBOL, len_lines, t)
122133
pending = ''
123134
first_no_space_in_word = ''
124135
self.colnumber = i
@@ -127,7 +138,10 @@ def tokenize(self, source_str):
127138
continue
128139
if next == '/': # Begin line comment
129140
state = TokenizerState.IN_LINECOMMENT
130-
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
141+
if self.is_identifier(pending, ['_', '$']):
142+
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
143+
else:
144+
self.add_pending(tokens, pending, TokenType.SPECIAL_SYMBOL, len_lines, t)
131145
pending = ''
132146
first_no_space_in_word = ''
133147
self.colnumber = i
@@ -136,44 +150,59 @@ def tokenize(self, source_str):
136150
continue
137151
elif cur == '"':
138152
state = TokenizerState.IN_STRING
139-
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
153+
if self.is_identifier(pending, ['_', '$']):
154+
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
155+
else:
156+
self.add_pending(tokens, pending, TokenType.SPECIAL_SYMBOL, len_lines, t)
140157
pending = ''
141158
first_no_space_in_word = ''
142159
self.colnumber = i
143160
self.add_pending(tokens, '"', TokenType.SPECIAL_SYMBOL, len_lines, t)
144161
elif cur == "'":
145162
state = TokenizerState.IN_CHAR
146-
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
163+
if self.is_identifier(pending, ['_', '$']):
164+
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
165+
else:
166+
self.add_pending(tokens, pending, TokenType.SPECIAL_SYMBOL, len_lines, t)
147167
pending = ''
148168
first_no_space_in_word = ''
149169
self.colnumber = i
150170
self.add_pending(tokens, "'", TokenType.SPECIAL_SYMBOL, len_lines, t)
151171
elif cur >= '0' and cur <= '9':
152172
if first_no_space_in_word == cur:
153173
state = TokenizerState.IN_NUMBER
154-
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
174+
if self.is_identifier(pending, ['_', '$']):
175+
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
176+
else:
177+
self.add_pending(tokens, pending, TokenType.SPECIAL_SYMBOL, len_lines, t)
155178
# first_no_space_in_word = ''
156179
pending = cur
157180
else:
158181
pending += cur
159182
elif self.is_alpha(cur):
160183
pending += cur
161184
elif cur in self.operator_set: # cur = + - * / , ...
162-
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
185+
if self.is_identifier(pending, ['_', '$']):
186+
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
187+
else:
188+
self.add_pending(tokens, pending, TokenType.SPECIAL_SYMBOL, len_lines, t)
163189
pending = cur
164190
first_no_space_in_word = cur
165191
self.colnumber = i
166192
else: # cur = ;, ', space
167-
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
193+
if self.is_identifier(pending, ['_', '$']):
194+
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
195+
else:
196+
self.add_pending(tokens, pending, TokenType.SPECIAL_SYMBOL, len_lines, t)
168197
pending = ''
169198
first_no_space_in_word = ''
170199
if cur > ' ':
171200
self.colnumber = i
172201
self.add_pending(tokens, cur, TokenType.SPECIAL_SYMBOL, len_lines, t)
173202
i += 1
174203
# does Java always end with } ?
175-
if len(cur) > 1 or self.is_alpha(cur):
176-
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
204+
if self.is_identifier(pending, ['_', '$']):
205+
self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
177206
else:
178207
self.add_pending(tokens, pending, TokenType.SPECIAL_SYMBOL, len_lines, t)
179208
return tokens

0 commit comments

Comments
 (0)