4
4
from sctokenizer .assets .cpp_keywords import cpp_keyword_set
5
5
from sctokenizer .assets .cpp_operators import cpp_operator_set
6
6
from sctokenizer .token import TokenType , Token
7
+ # from icecream import ic
7
8
8
9
class CppTokenizer (Tokenizer ):
9
10
def __init__ (self ):
@@ -21,6 +22,7 @@ def tokenize(self, source_str):
21
22
first_no_space = ''
22
23
last_no_space = ''
23
24
first_no_space_in_macro = ''
25
+ second_no_space_in_macro = ''
24
26
first_no_space_in_word = ''
25
27
cur = ''
26
28
prev = ''
@@ -76,22 +78,26 @@ def tokenize(self, source_str):
76
78
continue
77
79
if cur != ' ' and cur != '\t ' and first_no_space_in_macro == '' :
78
80
first_no_space_in_macro = cur
81
+ second_no_space_in_macro = next
79
82
# Check end of macro
80
83
if cur == '\n ' and last_no_space != '\\ ' :
81
84
state = TokenizerState .REGULAR
82
85
first_no_space_in_macro = ''
86
+ second_no_space_in_macro = ''
83
87
84
88
# Can handle:
85
89
# include <bits/stdc++.h>
86
90
# define circleArea(r) (3.1415*(r)*(r))
87
91
# define PI 3.1415
88
92
# handle #include vs #define, undef, pragma
89
- if first_no_space_in_macro == 'i' :
93
+ if first_no_space_in_macro == 'i' and second_no_space_in_macro == 'n' :
90
94
state = TokenizerState .IN_INCLUDE
91
95
first_no_space_in_macro = ''
96
+ second_no_space_in_macro = ''
92
97
else :
93
98
state = TokenizerState .REGULAR
94
99
first_no_space_in_macro = ''
100
+ second_no_space_in_macro = ''
95
101
if self .is_alpha (cur ):
96
102
pending += cur
97
103
@@ -142,8 +148,10 @@ def tokenize(self, source_str):
142
148
pending += cur
143
149
144
150
elif state == TokenizerState .IN_NUMBER :
145
- if (cur >= '0' and cur <= '9' ) or \
146
- cur == '.' or cur == 'E' or cur == 'e' :
151
+ if (cur >= '0' and cur <= '9' ) or cur == '.' \
152
+ or (cur >= 'A' and cur <= 'F' ) \
153
+ or (cur >= 'a' and cur <= 'f' ) \
154
+ or cur == 'X' or cur == 'x' :
147
155
pending += cur
148
156
i += 1
149
157
continue
@@ -169,11 +177,20 @@ def tokenize(self, source_str):
169
177
pending = ''
170
178
first_no_space_in_word = cur
171
179
self .colnumber = i
180
+
181
+ if len (pending ) == 1 and not self .is_identifier (pending ):
182
+ self .add_pending (tokens , pending , TokenType .SPECIAL_SYMBOL , len_lines , t )
183
+ pending = ''
184
+ first_no_space_in_word = cur
185
+ self .colnumber = i
172
186
173
187
if cur == '/' :
174
188
if next == '*' : # Begin block comments
175
189
state = TokenizerState .IN_COMMENT
176
- self .add_pending (tokens , pending , TokenType .IDENTIFIER , len_lines , t )
190
+ if self .is_identifier (pending ):
191
+ self .add_pending (tokens , pending , TokenType .IDENTIFIER , len_lines , t )
192
+ else :
193
+ self .add_pending (tokens , pending , TokenType .SPECIAL_SYMBOL , len_lines , t )
177
194
pending = ''
178
195
first_no_space_in_word = ''
179
196
self .colnumber = i
@@ -182,7 +199,10 @@ def tokenize(self, source_str):
182
199
continue
183
200
if next == '/' : # Begin line comment
184
201
state = TokenizerState .IN_LINECOMMENT
185
- self .add_pending (tokens , pending , TokenType .IDENTIFIER , len_lines , t )
202
+ if self .is_identifier (pending ):
203
+ self .add_pending (tokens , pending , TokenType .IDENTIFIER , len_lines , t )
204
+ else :
205
+ self .add_pending (tokens , pending , TokenType .SPECIAL_SYMBOL , len_lines , t )
186
206
pending = ''
187
207
first_no_space_in_word = ''
188
208
self .colnumber = i
@@ -191,29 +211,41 @@ def tokenize(self, source_str):
191
211
continue
192
212
elif cur == '"' :
193
213
state = TokenizerState .IN_STRING
194
- self .add_pending (tokens , pending , TokenType .IDENTIFIER , len_lines , t )
214
+ if self .is_identifier (pending ):
215
+ self .add_pending (tokens , pending , TokenType .IDENTIFIER , len_lines , t )
216
+ else :
217
+ self .add_pending (tokens , pending , TokenType .SPECIAL_SYMBOL , len_lines , t )
195
218
pending = ''
196
219
first_no_space_in_word = ''
197
220
self .colnumber = i
198
221
self .add_pending (tokens , '"' , TokenType .SPECIAL_SYMBOL , len_lines , t )
199
222
elif cur == "'" :
200
223
state = TokenizerState .IN_CHAR
201
- self .add_pending (tokens , pending , TokenType .IDENTIFIER , len_lines , t )
224
+ if self .is_identifier (pending ):
225
+ self .add_pending (tokens , pending , TokenType .IDENTIFIER , len_lines , t )
226
+ else :
227
+ self .add_pending (tokens , pending , TokenType .SPECIAL_SYMBOL , len_lines , t )
202
228
pending = ''
203
229
first_no_space_in_word = ''
204
230
self .colnumber = i
205
231
self .add_pending (tokens , "'" , TokenType .SPECIAL_SYMBOL , len_lines , t )
206
232
elif cur == '#' and first_no_space == cur :
207
233
state = TokenizerState .IN_MACRO
208
- self .add_pending (tokens , pending , TokenType .IDENTIFIER , len_lines , t )
234
+ if self .is_identifier (pending ):
235
+ self .add_pending (tokens , pending , TokenType .IDENTIFIER , len_lines , t )
236
+ else :
237
+ self .add_pending (tokens , pending , TokenType .SPECIAL_SYMBOL , len_lines , t )
209
238
pending = ''
210
239
first_no_space_in_word = ''
211
240
self .colnumber = i
212
241
self .add_pending (tokens , '#' , TokenType .SPECIAL_SYMBOL , len_lines , t )
213
242
elif cur >= '0' and cur <= '9' :
214
243
if first_no_space_in_word == cur :
215
244
state = TokenizerState .IN_NUMBER
216
- self .add_pending (tokens , pending , TokenType .IDENTIFIER , len_lines , t )
245
+ if self .is_identifier (pending ):
246
+ self .add_pending (tokens , pending , TokenType .IDENTIFIER , len_lines , t )
247
+ else :
248
+ self .add_pending (tokens , pending , TokenType .SPECIAL_SYMBOL , len_lines , t )
217
249
self .colnumber = i
218
250
# first_no_space_in_word = ''
219
251
pending = cur
@@ -222,23 +254,27 @@ def tokenize(self, source_str):
222
254
elif self .is_alpha (cur ):
223
255
pending += cur
224
256
elif cur in self .operator_set : # cur = + - * / , ...
225
- self .add_pending (tokens , pending , TokenType .IDENTIFIER , len_lines , t )
257
+ if self .is_identifier (pending ):
258
+ self .add_pending (tokens , pending , TokenType .IDENTIFIER , len_lines , t )
259
+ else :
260
+ self .add_pending (tokens , pending , TokenType .SPECIAL_SYMBOL , len_lines , t )
226
261
pending = cur
227
262
first_no_space_in_word = cur
228
263
self .colnumber = i
229
264
else : # cur = ;, ', space
230
- self .add_pending (tokens , pending , TokenType .IDENTIFIER , len_lines , t )
265
+ if self .is_identifier (pending ):
266
+ self .add_pending (tokens , pending , TokenType .IDENTIFIER , len_lines , t )
267
+ else :
268
+ self .add_pending (tokens , pending , TokenType .SPECIAL_SYMBOL , len_lines , t )
231
269
pending = ''
232
270
first_no_space_in_word = ''
233
271
if cur > ' ' :
234
272
self .colnumber = i
235
273
self .add_pending (tokens , cur , TokenType .SPECIAL_SYMBOL , len_lines , t )
236
274
i += 1
237
275
# Does C++ source always end with '}'?
238
- if len ( cur ) > 1 or self .is_alpha ( cur ):
239
- self .add_pending (tokens , pending , TokenType .IDENTIFIER , len_lines , t )
276
+ if self .is_identifier ( pending ):
277
+ self .add_pending (tokens , pending , TokenType .IDENTIFIER , len_lines , t )
240
278
else :
241
279
self .add_pending (tokens , pending , TokenType .SPECIAL_SYMBOL , len_lines , t )
242
- return tokens
243
-
244
-
280
+ return tokens
0 commit comments