 from sctokenizer.assets.cpp_keywords import cpp_keyword_set
 from sctokenizer.assets.cpp_operators import cpp_operator_set
 from sctokenizer.token import TokenType, Token
+# from icecream import ic
 
 class CppTokenizer(Tokenizer):
     def __init__(self):
@@ -147,8 +148,10 @@ def tokenize(self, source_str):
                     pending += cur
 
             elif state == TokenizerState.IN_NUMBER:
-                if (cur >= '0' and cur <= '9') or \
-                    cur == '.' or cur == 'E' or cur == 'e':
+                if (cur >= '0' and cur <= '9') or cur == '.' \
+                    or (cur >= 'A' and cur <= 'F') \
+                    or (cur >= 'a' and cur <= 'f') \
+                    or cur == 'X' or cur == 'x':
                     pending += cur
                     i += 1
                     continue
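
The hunk above widens the IN_NUMBER character test so that hexadecimal digits and the 'x'/'X' prefix character keep the tokenizer in the number state, letting a literal such as 0x1F come out as a single token; 'e'/'E' still pass because they fall inside the A-F/a-f ranges. Below is a minimal standalone sketch of just that condition, using a hypothetical helper name (continues_number) that is not part of the library:

# Standalone sketch of the widened IN_NUMBER test above; only the character
# condition is reproduced here, not the tokenizer state machine itself.
def continues_number(cur: str) -> bool:
    """Return True if `cur` keeps the tokenizer in the number state."""
    return (('0' <= cur <= '9') or cur == '.'
            or ('A' <= cur <= 'F')
            or ('a' <= cur <= 'f')
            or cur in ('X', 'x'))

# A hex literal such as 0x1F now stays in the number state character by character,
assert all(continues_number(c) for c in '0x1F')
# while an integer suffix such as 'u' or 'L' still terminates the token.
assert not continues_number('u') and not continues_number('L')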
@@ -174,11 +177,20 @@ def tokenize(self, source_str):
                     pending = ''
                     first_no_space_in_word = cur
                     self.colnumber = i
+
+                if len(pending) == 1 and not self.is_identifier(pending):
+                    self.add_pending(tokens, pending, TokenType.SPECIAL_SYMBOL, len_lines, t)
+                    pending = ''
+                    first_no_space_in_word = cur
+                    self.colnumber = i
 
                 if cur == '/':
                     if next == '*':  # Begin block comments
                         state = TokenizerState.IN_COMMENT
-                        self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
+                        if self.is_identifier(pending):
+                            self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
+                        else:
+                            self.add_pending(tokens, pending, TokenType.SPECIAL_SYMBOL, len_lines, t)
                         pending = ''
                         first_no_space_in_word = ''
                         self.colnumber = i
@@ -187,7 +199,10 @@ def tokenize(self, source_str):
                         continue
                     if next == '/':  # Begin line comment
                         state = TokenizerState.IN_LINECOMMENT
-                        self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
+                        if self.is_identifier(pending):
+                            self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
+                        else:
+                            self.add_pending(tokens, pending, TokenType.SPECIAL_SYMBOL, len_lines, t)
                         pending = ''
                         first_no_space_in_word = ''
                         self.colnumber = i
@@ -196,29 +211,41 @@ def tokenize(self, source_str):
                         continue
                 elif cur == '"':
                     state = TokenizerState.IN_STRING
-                    self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
+                    if self.is_identifier(pending):
+                        self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
+                    else:
+                        self.add_pending(tokens, pending, TokenType.SPECIAL_SYMBOL, len_lines, t)
                     pending = ''
                     first_no_space_in_word = ''
                     self.colnumber = i
                     self.add_pending(tokens, '"', TokenType.SPECIAL_SYMBOL, len_lines, t)
                 elif cur == "'":
                     state = TokenizerState.IN_CHAR
-                    self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
+                    if self.is_identifier(pending):
+                        self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
+                    else:
+                        self.add_pending(tokens, pending, TokenType.SPECIAL_SYMBOL, len_lines, t)
                     pending = ''
                     first_no_space_in_word = ''
                     self.colnumber = i
                     self.add_pending(tokens, "'", TokenType.SPECIAL_SYMBOL, len_lines, t)
                 elif cur == '#' and first_no_space == cur:
                     state = TokenizerState.IN_MACRO
-                    self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
+                    if self.is_identifier(pending):
+                        self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
+                    else:
+                        self.add_pending(tokens, pending, TokenType.SPECIAL_SYMBOL, len_lines, t)
                     pending = ''
                     first_no_space_in_word = ''
                     self.colnumber = i
                     self.add_pending(tokens, '#', TokenType.SPECIAL_SYMBOL, len_lines, t)
                 elif cur >= '0' and cur <= '9':
                     if first_no_space_in_word == cur:
                         state = TokenizerState.IN_NUMBER
-                        self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
+                        if self.is_identifier(pending):
+                            self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
+                        else:
+                            self.add_pending(tokens, pending, TokenType.SPECIAL_SYMBOL, len_lines, t)
                         self.colnumber = i
                         # first_no_space_in_word = ''
                         pending = cur
@@ -227,21 +254,27 @@ def tokenize(self, source_str):
                 elif self.is_alpha(cur):
                     pending += cur
                 elif cur in self.operator_set:  # cur = + - * / , ...
-                    self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
+                    if self.is_identifier(pending):
+                        self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
+                    else:
+                        self.add_pending(tokens, pending, TokenType.SPECIAL_SYMBOL, len_lines, t)
                     pending = cur
                     first_no_space_in_word = cur
                     self.colnumber = i
                 else:  # cur = ;, ', space
-                    self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
+                    if self.is_identifier(pending):
+                        self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
+                    else:
+                        self.add_pending(tokens, pending, TokenType.SPECIAL_SYMBOL, len_lines, t)
                     pending = ''
                     first_no_space_in_word = ''
                     if cur > ' ':
                         self.colnumber = i
                         self.add_pending(tokens, cur, TokenType.SPECIAL_SYMBOL, len_lines, t)
             i += 1
         # is Cpp always ends with } ?
-        if len(cur) > 1 or self.is_alpha(cur):
-            self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
+        if self.is_identifier(pending):
+            self.add_pending(tokens, pending, TokenType.IDENTIFIER, len_lines, t)
         else:
             self.add_pending(tokens, pending, TokenType.SPECIAL_SYMBOL, len_lines, t)
         return tokens
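
Every flush of pending in the hunks above now goes through self.is_identifier(...), whose definition is not part of this diff. As a rough sketch of what such a check could look like (a hypothetical implementation assuming C/C++ identifier syntax, not necessarily the method the class actually uses):

import re

# Hypothetical sketch of an is_identifier() check; the real method is defined
# elsewhere in the tokenizer and may use different rules.
_IDENT_RE = re.compile(r'^[A-Za-z_][A-Za-z0-9_]*$')

def is_identifier(pending: str) -> bool:
    """Return True if `pending` looks like a C/C++ identifier."""
    return bool(_IDENT_RE.match(pending))

# Fragments that fail the test (e.g. '::' or a lone '~') are emitted as
# SPECIAL_SYMBOL instead of IDENTIFIER, which is what the branches above do.
assert is_identifier('colnumber')
assert not is_identifier('::') and not is_identifier('')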