1
1
from __future__ import absolute_import
2
2
3
3
from sctokenizer .tokenizer import Tokenizer , TokenizerState
4
- from sctokenizer .assets .cpp_keywords import keyword_set
4
+ from sctokenizer .assets .cpp_keywords import cpp_keyword_set
5
5
from sctokenizer .assets .cpp_operators import cpp_operator_set
6
6
from sctokenizer .token import TokenType , Token
7
7
8
8
class CppTokenizer(Tokenizer):
    """Tokenizer specialized for C++ source code.

    Wires the generic ``Tokenizer`` machinery to the C++-specific
    keyword and operator tables shipped with the package.
    """

    def __init__(self):
        # Initialize shared tokenizer state (line/column tracking, etc.).
        super().__init__()
        # Language-specific lookup tables used during tokenization.
        self.keyword_set = cpp_keyword_set
        self.operator_set = cpp_operator_set
13
14
14
def tokenize (self , source_str ):
@@ -50,6 +50,7 @@ def tokenize(self, source_str):
50
50
if first_no_space_in_word == '' :
51
51
first_no_space_in_word = cur
52
52
self .colnumber = i
53
+
53
54
if state == TokenizerState .IN_COMMENT :
54
55
# Check end of block comment
55
56
if cur == '*' :
@@ -59,10 +60,12 @@ def tokenize(self, source_str):
59
60
i += 1
60
61
state = TokenizerState .REGULAR
61
62
continue
63
+
62
64
elif state == TokenizerState .IN_LINECOMMENT :
63
65
# Check end of line comment
64
66
if cur == '\n ' :
65
67
state = TokenizerState .REGULAR
68
+
66
69
elif state == TokenizerState .IN_MACRO :
67
70
# Get first char after # in marco
68
71
if cur == ' ' or cur == '\t ' :
@@ -88,6 +91,7 @@ def tokenize(self, source_str):
88
91
first_no_space_in_macro = ''
89
92
if self .is_alpha (cur ):
90
93
pending += cur
94
+
91
95
elif state == TokenizerState .IN_INCLUDE :
92
96
if cur == '<' or cur == '"' :
93
97
state = TokenizerState .IN_INCLUDE_HEADER
@@ -98,6 +102,7 @@ def tokenize(self, source_str):
98
102
self .add_pending (tokens , cur , TokenType .SPECIAL_SYMBOL , len_lines , t )
99
103
elif cur != ' ' and cur != '\t ' :
100
104
pending += cur
105
+
101
106
elif state == TokenizerState .IN_INCLUDE_HEADER :
102
107
if cur == '>' or cur == '"' :
103
108
state = TokenizerState .REGULAR
@@ -108,6 +113,7 @@ def tokenize(self, source_str):
108
113
self .add_pending (tokens , cur , TokenType .SPECIAL_SYMBOL , len_lines , t )
109
114
elif cur != ' ' and cur != '\t ' :
110
115
pending += cur
116
+
111
117
elif state == TokenizerState .IN_STRING :
112
118
# Check end of string
113
119
if cur == '"' and prev != '\\ ' :
@@ -119,6 +125,7 @@ def tokenize(self, source_str):
119
125
self .add_pending (tokens , cur , TokenType .SPECIAL_SYMBOL , len_lines , t )
120
126
else :
121
127
pending += cur
128
+
122
129
elif state == TokenizerState .IN_CHAR :
123
130
# Check end of char
124
131
if cur == "'" and prev != '\\ ' :
@@ -130,26 +137,36 @@ def tokenize(self, source_str):
130
137
self .add_pending (tokens , cur , TokenType .SPECIAL_SYMBOL , len_lines , t )
131
138
else :
132
139
pending += cur
140
+
133
141
elif state == TokenizerState .IN_NUMBER :
134
142
if (cur >= '0' and cur <= '9' ) or \
135
143
cur == '.' or cur == 'E' or cur == 'e' :
136
144
pending += cur
137
145
i += 1
138
- # self.colnumber += 1
139
146
continue
140
147
if (cur == '-' or cur == '+' ) and \
141
148
(prev == 'E' or prev == 'e' ):
142
149
pending += cur
143
150
i += 1
144
- # self.colnumber += 1
145
151
continue
146
152
self .add_pending (tokens , pending , TokenType .CONSTANT , len_lines , t )
147
- pending = ''
148
- first_no_space_in_word = ''
153
+ first_no_space_in_word = cur
154
+ pending = cur
149
155
self .colnumber = i
150
- self .add_pending (tokens , cur , TokenType .SPECIAL_SYMBOL , len_lines , t )
151
156
state = TokenizerState .REGULAR
157
+
152
158
elif state == TokenizerState .REGULAR :
159
+ if pending in self .operator_set :
160
+ if (pending + cur ) in self .operator_set :
161
+ pending += cur
162
+ i += 1
163
+ continue
164
+ else :
165
+ self .add_pending (tokens , pending , TokenType .OPERATOR , len_lines , t )
166
+ pending = ''
167
+ first_no_space_in_word = cur
168
+ self .colnumber = i
169
+
153
170
if cur == '/' :
154
171
if next == '*' : # Begin block comments
155
172
state = TokenizerState .IN_COMMENT
@@ -194,27 +211,25 @@ def tokenize(self, source_str):
194
211
if first_no_space_in_word == cur :
195
212
state = TokenizerState .IN_NUMBER
196
213
self .add_pending (tokens , pending , TokenType .IDENTIFIER , len_lines , t )
214
+ self .colnumber = i
197
215
# first_no_space_in_word = ''
198
216
pending = cur
199
217
else :
200
218
pending += cur
201
219
elif self .is_alpha (cur ):
202
220
pending += cur
203
221
elif cur in self .operator_set : # cur = + - * / , ...
204
- if pending in self .operator_set and (pending + cur ) in self .operator_set :
205
- pending += cur
206
- else :
207
- self .add_pending (tokens , pending , TokenType .IDENTIFIER , len_lines , t )
208
- pending = cur
209
- first_no_space_in_word = ''
222
+ self .add_pending (tokens , pending , TokenType .IDENTIFIER , len_lines , t )
223
+ pending = cur
224
+ first_no_space_in_word = cur
225
+ self .colnumber = i
210
226
else : # cur = ;, ', space
211
227
self .add_pending (tokens , pending , TokenType .IDENTIFIER , len_lines , t )
212
228
pending = ''
213
229
first_no_space_in_word = ''
214
230
if cur > ' ' :
215
231
self .colnumber = i
216
232
self .add_pending (tokens , cur , TokenType .SPECIAL_SYMBOL , len_lines , t )
217
- pending = ''
218
233
i += 1
219
234
# is Cpp always ends with } ?
220
235
if len (cur ) > 1 or self .is_alpha (cur ):
0 commit comments