Skip to content

Commit

Permalink
Split scanner off into own module.
Browse files Browse the repository at this point in the history
  • Loading branch information
cpressey committed Feb 7, 2022
1 parent acd1c3c commit ded3519
Show file tree
Hide file tree
Showing 2 changed files with 142 additions and 116 deletions.
137 changes: 21 additions & 116 deletions src/castile/parser.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,12 @@
import re

from castile.ast import AST


class CastileSyntaxError(ValueError):
pass
from castile.scanner import Scanner, CastileSyntaxError


class Parser(object):
"""Parse a Castile program into an AST.
The parser includes the scanner as part of it. (Delegating to an external
scanner is rather verbose ("self.scanner.expect(...)"; inheriting from a
Scanner class, even if it's just a mixin, seems rather weird.)
The parser mainly just constructs the AST. It does few other analyses
or transformations itself. However, there are a few:
Expand All @@ -24,121 +17,33 @@ class Parser(object):
"""
def __init__(self, text):
self.text = text
self.token = None
self.type = None
self.pos = 0
self.scan()
# for parser...
self.scanner = Scanner(text)
self.locals = None

# ### SCANNER ### #
### Delegate to scanner

def near_text(self, length=10):
return self.text[self.pos:self.pos + length]
def consume(self, *args, **kwargs):
return self.scanner.consume(*args, **kwargs)

def scan_pattern(self, pattern, type, token_group=1, rest_group=2):
pattern = r'(' + pattern + r')'
regexp = re.compile(pattern, flags=re.DOTALL)
match = regexp.match(self.text, pos=self.pos)
if not match:
return False
else:
self.type = type
self.token = match.group(token_group)
self.pos += len(match.group(0))
return True

def scan(self):
self.scan_pattern(r'[ \t\n\r]*', 'whitespace')
while self.scan_pattern(r'\/\*.*?\*\/[ \t\n\r]*', 'comment'):
self.scan_pattern(r'[ \t\n\r]*', 'whitespace')
if self.pos >= len(self.text):
self.token = None
self.type = 'EOF'
return
if self.scan_pattern(r'->', 'arrow'):
return
if self.scan_pattern(r'>=|>|<=|<|==|!=', 'relational operator'):
return
if self.scan_pattern(r'\+|\-', 'additive operator'):
return
if self.scan_pattern(r'\*|\/|\|', 'multiplicative operator'):
return
if self.scan_pattern(r'\.|\;|\,|\(|\)|\{|\}|\=', 'punctuation'):
return
if self.scan_pattern(r'string|integer|boolean|function|void|union',
'type name'):
return
if self.scan_pattern(r'and|or', 'boolean operator'):
return
if self.scan_pattern(r'(if|else|while|make|struct|'
r'typecase|is|as|return|break|'
r'true|false|null)(?!\w)',
'keyword', token_group=2, rest_group=3):
return
if self.scan_pattern(r'\d+', 'integer literal'):
return
if self.scan_pattern(r'\"(.*?)\"', 'string literal',
token_group=2, rest_group=3):
return
if self.scan_pattern(r'[a-zA-Z_][a-zA-Z0-9_]*', 'identifier'):
return
if self.scan_pattern(r'.', 'unknown character'):
return
else:
raise ValueError("this should never happen, "
"self.text=(%s)" % self.text)
def consume_type(self, *args, **kwargs):
return self.scanner.consume_type(*args, **kwargs)

def expect(self, token):
if self.token == token:
self.scan()
else:
raise CastileSyntaxError(
"Expected '%s', but found '%s' (near '%s')" % (
token, self.token, self.near_text()
)
)

def expect_type(self, type):
self.check_type(type)
token = self.token
self.scan()
return token

def on(self, token):
return self.token == token

def on_any(self, tokens):
return self.token in tokens

def on_type(self, type):
return self.type == type

def check_type(self, type):
if not self.type == type:
raise CastileSyntaxError(
"Expected %s, but found %s ('%s') (near '%s')" % (
type, self.type, self.token, self.near_text()
)
)

def consume(self, token):
if self.token == token:
self.scan()
return True
else:
return False
def expect(self, *args, **kwargs):
return self.scanner.expect(*args, **kwargs)

def consume_type(self, type):
if self.on_type(type):
token = self.token
self.scan()
return token
else:
return None
def expect_type(self, *args, **kwargs):
return self.scanner.expect_type(*args, **kwargs)

def on(self, *args, **kwargs):
return self.scanner.on(*args, **kwargs)

def on_any(self, *args, **kwargs):
return self.scanner.on_any(*args, **kwargs)

def on_type(self, *args, **kwargs):
return self.scanner.on_type(*args, **kwargs)

# ### PARSER ### #
### Parser proper

def program(self):
defns = []
Expand Down
121 changes: 121 additions & 0 deletions src/castile/scanner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
import re

from castile.ast import AST


class CastileSyntaxError(ValueError):
    """Signals that the program text is not syntactically valid Castile.

    Subclasses ValueError, so handlers written for ValueError also catch it.
    """


class Scanner(object):
    """Split program text into tokens, one token at a time.

    The scanner always holds exactly one "current" token:

    * ``self.token`` -- the text of the current token (``None`` at end of
      input),
    * ``self.type``  -- a string naming its category (``'EOF'`` at end of
      input),
    * ``self.pos``   -- the index in ``self.text`` just past the current
      token.

    Constructing a Scanner immediately scans the first token.
    """

    # Token rules, tried in order; the first one that matches wins.
    # Each entry is (regular expression, token type, token group), where
    # "token group" is the capture group (after scan_pattern() wraps the
    # whole pattern in group 1) whose text becomes the token.
    #
    # Word-like tokens carry a (?!\w) lookahead so that an identifier which
    # merely *starts* with a reserved word (e.g. "order", "integers",
    # "iffy") is scanned as one identifier rather than being split in two.
    _TOKEN_RULES = (
        (r'->', 'arrow', 1),
        (r'>=|>|<=|<|==|!=', 'relational operator', 1),
        (r'\+|\-', 'additive operator', 1),
        (r'\*|\/|\|', 'multiplicative operator', 1),
        (r'\.|\;|\,|\(|\)|\{|\}|\=', 'punctuation', 1),
        (r'(string|integer|boolean|function|void|union)(?!\w)',
         'type name', 2),
        (r'(and|or)(?!\w)', 'boolean operator', 2),
        (r'(if|else|while|make|struct|'
         r'typecase|is|as|return|break|'
         r'true|false|null)(?!\w)',
         'keyword', 2),
        (r'\d+', 'integer literal', 1),
        # Group 2 is the text between the quotes, so the token excludes them.
        (r'\"(.*?)\"', 'string literal', 2),
        (r'[a-zA-Z_][a-zA-Z0-9_]*', 'identifier', 1),
        (r'.', 'unknown character', 1),
    )

    def __init__(self, text):
        """Create a scanner over `text` and scan its first token."""
        self.text = text
        self.token = None
        self.type = None
        self.pos = 0
        self.scan()

    def near_text(self, length=10):
        """Return up to `length` characters of text starting at `self.pos`.

        Used to give error messages some context.
        """
        return self.text[self.pos:self.pos + length]

    def scan_pattern(self, pattern, type, token_group=1, rest_group=2):
        """Try to match `pattern` at the current position.

        `pattern` is wrapped in an extra capture group, so group 1 is the
        whole match and any groups inside `pattern` are numbered from 2.

        On a match, sets `self.type` to `type`, sets `self.token` to the
        text of group `token_group`, advances `self.pos` past the whole
        match and returns True.  Otherwise leaves the scanner untouched and
        returns False.

        `rest_group` is accepted for backward compatibility and is not used.
        """
        # DOTALL lets '.' span newlines, which multi-line comments need.
        regexp = re.compile(r'(' + pattern + r')', flags=re.DOTALL)
        match = regexp.match(self.text, pos=self.pos)
        if not match:
            return False
        self.type = type
        self.token = match.group(token_group)
        self.pos = match.end()
        return True

    def scan(self):
        """Advance to the next token, skipping whitespace and comments.

        At end of input, sets `self.token` to None and `self.type` to
        'EOF'.  A character that fits no other rule becomes a token of type
        'unknown character' rather than raising an error here.
        """
        self.scan_pattern(r'[ \t\n\r]*', 'whitespace')
        while self.scan_pattern(r'\/\*.*?\*\/[ \t\n\r]*', 'comment'):
            self.scan_pattern(r'[ \t\n\r]*', 'whitespace')
        if self.pos >= len(self.text):
            self.token = None
            self.type = 'EOF'
            return
        for pattern, token_type, token_group in self._TOKEN_RULES:
            if self.scan_pattern(pattern, token_type,
                                 token_group=token_group):
                return
        # The final rule matches any single character, so with input
        # remaining this point is not expected to be reached.
        raise ValueError("this should never happen, "
                         "self.text=(%s)" % self.text)

    def expect(self, token):
        """Consume the current token, which must equal `token`.

        Raises CastileSyntaxError if the current token is something else.
        """
        if self.token == token:
            self.scan()
        else:
            raise CastileSyntaxError(
                "Expected '%s', but found '%s' (near '%s')" % (
                    token, self.token, self.near_text()
                )
            )

    def expect_type(self, type):
        """Consume the current token, which must be of type `type`.

        Returns the consumed token's text.  Raises CastileSyntaxError (via
        check_type) if the current token has a different type.
        """
        self.check_type(type)
        token = self.token
        self.scan()
        return token

    def on(self, token):
        """Return True if the current token equals `token`."""
        return self.token == token

    def on_any(self, tokens):
        """Return True if the current token is contained in `tokens`."""
        return self.token in tokens

    def on_type(self, type):
        """Return True if the current token's type equals `type`."""
        return self.type == type

    def check_type(self, type):
        """Raise CastileSyntaxError unless the current token is a `type`.

        Does not consume anything.
        """
        if not self.type == type:
            raise CastileSyntaxError(
                "Expected %s, but found %s ('%s') (near '%s')" % (
                    type, self.type, self.token, self.near_text()
                )
            )

    def consume(self, token):
        """If the current token equals `token`, consume it.

        Returns True if the token was consumed, False otherwise.
        """
        if self.token == token:
            self.scan()
            return True
        else:
            return False

    def consume_type(self, type):
        """If the current token is of type `type`, consume it.

        Returns the consumed token's text, or None if the current token has
        a different type (in which case nothing is consumed).
        """
        if self.on_type(type):
            token = self.token
            self.scan()
            return token
        else:
            return None

0 comments on commit ded3519

Please sign in to comment.