From 329b92c76c8d3da5fbb382f10ad5c854abd8631e Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Sat, 20 Jul 2024 21:27:47 +0200 Subject: [PATCH] sql: clean up parser module --- src/sql/parser/ast.rs | 105 +++++++++++----------- src/sql/parser/lexer.rs | 31 +++---- src/sql/parser/mod.rs | 2 + src/sql/parser/parser.rs | 186 +++++++++++++++++++-------------------- 4 files changed, 159 insertions(+), 165 deletions(-) diff --git a/src/sql/parser/ast.rs b/src/sql/parser/ast.rs index 6a9aa05bd..46185b6c9 100644 --- a/src/sql/parser/ast.rs +++ b/src/sql/parser/ast.rs @@ -2,42 +2,41 @@ use crate::sql::types::DataType; use std::collections::BTreeMap; -/// The statement AST is the root node of the AST tree, which describes the -/// syntactic structure of a SQL query. It is passed to the planner, which -/// validates its contents and converts it into an execution plan. +/// The statement is the root node of the Abstract Syntax Tree, and describes +/// the syntactic structure of a SQL query. It is built from a raw SQL string by +/// the parser, and passed on to the planner which validates it and builds an +/// execution plan from it. #[derive(Debug)] pub enum Statement { - Begin { - read_only: bool, - as_of: Option, - }, + /// Begin a new transaction. + Begin { read_only: bool, as_of: Option }, + /// Commit a transaction. Commit, + /// Roll back a transaction. Rollback, + /// Explain a statement. Explain(Box), - CreateTable { - name: String, - columns: Vec, - }, - DropTable { - name: String, - if_exists: bool, - }, - Delete { - table: String, - r#where: Option, - }, + /// Create a new table. + CreateTable { name: String, columns: Vec }, + /// Drop a table. + DropTable { name: String, if_exists: bool }, + /// Delete matching rows. + Delete { table: String, r#where: Option }, + /// Insert new rows into a table. 
Insert { table: String, - columns: Option>, - values: Vec>, + columns: Option>, // columns given in values, using default for rest + values: Vec>, // rows to insert }, + /// Update rows in a table. Update { table: String, - set: BTreeMap>, // None for DEFAULT value + set: BTreeMap>, // column → value, None for default value r#where: Option, }, + /// Select matching rows. Select { - select: Vec<(Expression, Option)>, + select: Vec<(Expression, Option)>, // optional column aliases from: Vec, r#where: Option, group_by: Vec, @@ -48,10 +47,12 @@ pub enum Statement { }, } -/// A FROM item: a table or join. +/// A FROM item. #[derive(Debug)] pub enum From { + /// A table. Table { name: String, alias: Option }, + /// A join of two or more tables (may be nested). Join { left: Box, right: Box, r#type: JoinType, predicate: Option }, } @@ -78,8 +79,8 @@ pub enum JoinType { } impl JoinType { - // If true, the join is an outer join -- rows with no join match are emitted - // with a NULL match. + // If true, the join is an outer join, where rows with no join matches are + // emitted with a NULL match. pub fn is_outer(&self) -> bool { match self { Self::Left | Self::Right => true, @@ -88,7 +89,7 @@ impl JoinType { } } -/// Sort orders. +/// ORDER BY direction. #[derive(Debug)] pub enum Order { Ascending, @@ -110,7 +111,7 @@ pub enum Expression { Operator(Operator), } -/// Expression literals. +/// Expression literal values. #[derive(Clone, Debug)] pub enum Literal { Null, @@ -120,19 +121,19 @@ pub enum Literal { String(String), } -/// To allow Expressions and Literals in e.g. hashmap lookups, implement simple +/// To allow using Expressions and Literals in e.g. hashmaps, implement simple /// equality and hash for all types, including Null and f64::NAN. This is not /// used for expression evaluation (handled by sql::types::Expression), where -/// these values should not be considered equal, only in lookups. 
+/// these values should not be considered equal to themselves, only in lookups. impl std::cmp::PartialEq for Literal { fn eq(&self, other: &Self) -> bool { match (self, other) { (Self::Boolean(l), Self::Boolean(r)) => l == r, (Self::Integer(l), Self::Integer(r)) => l == r, - // Consider e.g. NaN equal to NaN for comparison purposes. + // Implies NaN == NaN but -NaN != NaN. Similarly with +/-0.0. (Self::Float(l), Self::Float(r)) => l.to_bits() == r.to_bits(), (Self::String(l), Self::String(r)) => l == r, - _ => core::mem::discriminant(self) == core::mem::discriminant(other), + (l, r) => core::mem::discriminant(l) == core::mem::discriminant(r), } } } @@ -159,29 +160,29 @@ impl std::hash::Hash for Literal { /// around this, but we keep it simple. #[derive(Clone, Debug, Eq, Hash, PartialEq)] pub enum Operator { - And(Box, Box), - Not(Box), - Or(Box, Box), + And(Box, Box), // a AND b + Not(Box), // NOT a + Or(Box, Box), // a OR b - Equal(Box, Box), - GreaterThan(Box, Box), - GreaterThanOrEqual(Box, Box), - Is(Box, Literal), // NULL or f64 NAN - LessThan(Box, Box), - LessThanOrEqual(Box, Box), - NotEqual(Box, Box), + Equal(Box, Box), // a = b + GreaterThan(Box, Box), // a > b + GreaterThanOrEqual(Box, Box), // a >= b + Is(Box, Literal), // IS NULL or IS NAN + LessThan(Box, Box), // a < b + LessThanOrEqual(Box, Box), // a <= b + NotEqual(Box, Box), // a != b - Add(Box, Box), - Divide(Box, Box), - Exponentiate(Box, Box), - Factorial(Box), - Identity(Box), - Multiply(Box, Box), - Negate(Box), - Remainder(Box, Box), - Subtract(Box, Box), + Add(Box, Box), // a + b + Divide(Box, Box), // a / b + Exponentiate(Box, Box), // a ^ b + Factorial(Box), // a!
+ Identity(Box), // +a + Multiply(Box, Box), // a * b + Negate(Box), // -a + Remainder(Box, Box), // a % b + Subtract(Box, Box), // a - b - Like(Box, Box), + Like(Box, Box), // a LIKE b } impl Expression { diff --git a/src/sql/parser/lexer.rs b/src/sql/parser/lexer.rs index f7e34e883..00b32392d 100644 --- a/src/sql/parser/lexer.rs +++ b/src/sql/parser/lexer.rs @@ -2,9 +2,9 @@ use crate::errinput; use crate::error::Result; /// The lexer (lexical analyzer) preprocesses raw SQL strings into a sequence of -/// lexical tokens (e.g. keyword, number, string, etc) that are passed onto the -/// SQL parser. In doing so, it strips away basic syntactic noise such as -/// whitespace, case, and quotes, and performs initial validation of symbols. +/// lexical tokens (e.g. keyword, number, string, etc), which are passed on to +/// the SQL parser. In doing so, it strips away basic syntactic noise such as +/// whitespace, case, and quotes, and performs initial symbol validation. pub struct Lexer<'a> { chars: std::iter::Peekable>, } @@ -16,8 +16,8 @@ pub struct Lexer<'a> { /// fine for our purposes here. #[derive(Clone, Debug, PartialEq)] pub enum Token { - /// A number, with digits, decimal points, and/or exponents. Leading signs - /// (e.g. -) are separate tokens. + /// A numeric string, with digits, decimal points, and/or exponents. Leading + /// signs (e.g. -) are separate tokens. Number(String), /// A Unicode string, with quotes stripped and escape sequences resolved. String(String), @@ -156,7 +156,7 @@ pub enum Keyword { } impl TryFrom<&str> for Keyword { - // The error just indicates this isn't a keyword, so use a cheap string. + // Use a cheap static string, since this just indicates it's not a keyword. type Error = &'static str; fn try_from(value: &str) -> std::result::Result { @@ -351,8 +351,9 @@ impl<'a> Lexer<'a> { /// Scans the next token, if any. fn scan(&mut self) -> Result> { - // Ignore whitespace. The first character tells us the token type. + // Ignore whitespace. 
self.skip_whitespace(); + // The first character tells us the token type. match self.chars.peek() { Some('\'') => self.scan_string(), Some('"') => self.scan_ident_quoted(), @@ -378,8 +379,7 @@ impl<'a> Lexer<'a> { } } - /// Scans the next quoted identifier, if any. Case is preserved, keywords - /// are ignored. + /// Scans the next quoted identifier, if any. Case is preserved. fn scan_ident_quoted(&mut self) -> Result> { if !self.next_is('"') { return Ok(None); @@ -405,8 +405,8 @@ impl<'a> Lexer<'a> { number.push(c) } // Scan the fractional part, if any. - if let Some(sep) = self.next_if(|c| c == '.') { - number.push(sep); + if self.next_is('.') { + number.push('.'); while let Some(dec) = self.next_if(|c| c.is_ascii_digit()) { number.push(dec) } @@ -424,7 +424,7 @@ impl<'a> Lexer<'a> { Some(Token::Number(number)) } - /// Scans the next string literal, if any. + /// Scans the next quoted string literal, if any. fn scan_string(&mut self) -> Result> { if !self.next_is('\'') { return Ok(None); @@ -485,9 +485,6 @@ impl<'a> Lexer<'a> { /// Returns true if the entire given string is a single valid identifier. pub fn is_ident(ident: &str) -> bool { let mut lexer = Lexer::new(ident); - let token = lexer.next(); - if lexer.next().is_some() { - return false; // multiple tokens, so not an identifier - } - matches!(token, Some(Ok(Token::Ident(_)))) + let Some(Ok(Token::Ident(_))) = lexer.next() else { return false }; + lexer.next().is_none() // if further tokens, it's not a lone identifier } diff --git a/src/sql/parser/mod.rs b/src/sql/parser/mod.rs index 79dc10862..288f15958 100644 --- a/src/sql/parser/mod.rs +++ b/src/sql/parser/mod.rs @@ -1,3 +1,5 @@ +//! Parses raw SQL strings into a structured Abstract Syntax Tree. 
+ pub mod ast; mod lexer; mod parser; diff --git a/src/sql/parser/parser.rs b/src/sql/parser/parser.rs index 8fa358e58..28a169357 100644 --- a/src/sql/parser/parser.rs +++ b/src/sql/parser/parser.rs @@ -1,29 +1,28 @@ #![allow(clippy::module_inception)] -use super::ast; -use super::{Keyword, Lexer, Token}; +use super::{ast, Keyword, Lexer, Token}; use crate::errinput; use crate::error::Result; use crate::sql::types::DataType; /// The SQL parser takes tokens from the lexer and parses the SQL syntax into an -/// AST (Abstract Syntax Tree). This nested structure represents the semantic -/// components of a SQL query (e.g. the SELECT and FROM clauses, values, -/// arithmetic expressions, etc.), but only makes sure it is well-formed. It -/// does not know e.g. whether a given table or column exists, or which kind of -/// join to use -- that is the job of the planner. +/// Abstract Syntax Tree (AST). This nested structure represents the syntactic +/// structure of a SQL query (e.g. the SELECT and FROM clauses, values, +/// arithmetic expressions, etc.). However, it only ensures the syntax is +/// well-formed, and does not know whether e.g. a given table or column exists +/// or which kind of join to use -- that is the job of the planner. pub struct Parser<'a> { pub lexer: std::iter::Peekable>, } impl<'a> Parser<'a> { - /// Creates a new parser for the given SQL string. + /// Creates a new parser for the given raw SQL string. pub fn new(statement: &str) -> Parser { Parser { lexer: Lexer::new(statement).peekable() } } - /// Parses the input string into an AST statement. We expect to parse the - /// whole string as a single statement, ending with an optional semicolon. + /// Parses the input string into an AST statement. The whole string must be + /// parsed as a single statement, ending with an optional semicolon. 
pub fn parse(&mut self) -> Result { let statement = self.parse_statement()?; self.next_is(Token::Semicolon); @@ -33,7 +32,7 @@ impl<'a> Parser<'a> { Ok(statement) } - /// Fetches the next lexer token, or throws an error if none is found. + /// Fetches the next lexer token, or errors if none is found. fn next(&mut self) -> Result { self.lexer.next().transpose()?.ok_or_else(|| errinput!("unexpected end of input")) } @@ -54,20 +53,20 @@ impl<'a> Parser<'a> { /// Passes the next lexer token through the closure, consuming it if the /// closure returns Some. - fn next_if_map(&mut self, f: impl Fn(&Token) -> Option) -> Result> { - let out = self.peek()?.and_then(f); + fn next_if_map(&mut self, f: impl Fn(&Token) -> Option) -> Option { + let out = self.peek().unwrap_or(None).map(f)?; if out.is_some() { - self.next()?; + self.next().ok(); } - Ok(out) + out } /// Grabs the next keyword if there is one. fn next_if_keyword(&mut self) -> Option { - match self.next_if(|t| matches!(t, Token::Keyword(_))) { - Some(Token::Keyword(keyword)) => Some(keyword), - Some(_) | None => None, - } + self.next_if_map(|token| match token { + Token::Keyword(keyword) => Some(*keyword), + _ => None, + }) } /// Consumes the next lexer token if it is the given token, returning true. @@ -97,22 +96,24 @@ impl<'a> Parser<'a> { /// Parses a SQL statement. fn parse_statement(&mut self) -> Result { - match self.peek()? { - Some(Token::Keyword(Keyword::Begin)) => self.parse_begin(), - Some(Token::Keyword(Keyword::Commit)) => self.parse_commit(), - Some(Token::Keyword(Keyword::Rollback)) => self.parse_rollback(), - Some(Token::Keyword(Keyword::Explain)) => self.parse_explain(), + let Some(token) = self.peek()? 
else { + return errinput!("unexpected end of input"); + }; + match token { + Token::Keyword(Keyword::Begin) => self.parse_begin(), + Token::Keyword(Keyword::Commit) => self.parse_commit(), + Token::Keyword(Keyword::Rollback) => self.parse_rollback(), + Token::Keyword(Keyword::Explain) => self.parse_explain(), - Some(Token::Keyword(Keyword::Create)) => self.parse_create_table(), - Some(Token::Keyword(Keyword::Drop)) => self.parse_drop_table(), + Token::Keyword(Keyword::Create) => self.parse_create_table(), + Token::Keyword(Keyword::Drop) => self.parse_drop_table(), - Some(Token::Keyword(Keyword::Delete)) => self.parse_delete(), - Some(Token::Keyword(Keyword::Insert)) => self.parse_insert(), - Some(Token::Keyword(Keyword::Select)) => self.parse_select(), - Some(Token::Keyword(Keyword::Update)) => self.parse_update(), + Token::Keyword(Keyword::Delete) => self.parse_delete(), + Token::Keyword(Keyword::Insert) => self.parse_insert(), + Token::Keyword(Keyword::Select) => self.parse_select(), + Token::Keyword(Keyword::Update) => self.parse_update(), - Some(token) => errinput!("unexpected token {token}"), - None => errinput!("unexpected end of input"), + token => errinput!("unexpected token {token}"), } } @@ -296,17 +297,13 @@ impl<'a> Parser<'a> { self.expect(Keyword::Update.into())?; let table = self.next_ident()?; self.expect(Keyword::Set.into())?; - let mut set = std::collections::BTreeMap::new(); loop { let column = self.next_ident()?; self.expect(Token::Equal)?; - let expr = if self.next_is(Keyword::Default.into()) { - None - } else { - Some(self.parse_expression()?) - }; - + let expr = (!self.next_is(Keyword::Default.into())) + .then(|| self.parse_expression()) + .transpose()?; if set.contains_key(&column) { return errinput!("column {column} set multiple times"); } @@ -315,7 +312,6 @@ impl<'a> Parser<'a> { break; } } - Ok(ast::Statement::Update { table, set, r#where: self.parse_where_clause()? 
}) } @@ -400,25 +396,28 @@ impl<'a> Parser<'a> { // Parses a FROM JOIN type, if present. fn parse_from_join(&mut self) -> Result> { + if self.next_is(Keyword::Join.into()) { + return Ok(Some(ast::JoinType::Inner)); + } if self.next_is(Keyword::Cross.into()) { self.expect(Keyword::Join.into())?; - Ok(Some(ast::JoinType::Cross)) - } else if self.next_is(Keyword::Inner.into()) { + return Ok(Some(ast::JoinType::Cross)); + } + if self.next_is(Keyword::Inner.into()) { self.expect(Keyword::Join.into())?; - Ok(Some(ast::JoinType::Inner)) - } else if self.next_is(Keyword::Join.into()) { - Ok(Some(ast::JoinType::Inner)) - } else if self.next_is(Keyword::Left.into()) { + return Ok(Some(ast::JoinType::Inner)); + } + if self.next_is(Keyword::Left.into()) { self.skip(Keyword::Outer.into()); self.expect(Keyword::Join.into())?; - Ok(Some(ast::JoinType::Left)) - } else if self.next_is(Keyword::Right.into()) { + return Ok(Some(ast::JoinType::Left)); + } + if self.next_is(Keyword::Right.into()) { self.skip(Keyword::Outer.into()); self.expect(Keyword::Join.into())?; - Ok(Some(ast::JoinType::Right)) - } else { - Ok(None) + return Ok(Some(ast::JoinType::Right)); } + Ok(None) } /// Parses a WHERE clause, if present. 
@@ -462,13 +461,13 @@ impl<'a> Parser<'a> { self.expect(Keyword::By.into())?; loop { let expr = self.parse_expression()?; - let order = if self.next_is(Keyword::Asc.into()) { - ast::Order::Ascending - } else if self.next_is(Keyword::Desc.into()) { - ast::Order::Descending - } else { - ast::Order::Ascending - }; + let order = self + .next_if_map(|token| match token { + Token::Keyword(Keyword::Asc) => Some(ast::Order::Ascending), + Token::Keyword(Keyword::Desc) => Some(ast::Order::Descending), + _ => None, + }) + .unwrap_or(ast::Order::Ascending); order_by.push((expr, order)); if !self.next_is(Token::Comma) { break; @@ -479,6 +478,7 @@ impl<'a> Parser<'a> { /// Parses an expression consisting of at least one atom operated on by any /// number of operators, using the precedence climbing algorithm. + /// /// TODO: write a description of the algorithm. pub fn parse_expression(&mut self) -> Result { self.parse_expression_at(0) @@ -488,7 +488,7 @@ impl<'a> Parser<'a> { fn parse_expression_at(&mut self, min_precedence: Precedence) -> Result { // If there is a prefix operator, parse it and its right-hand operand. // Otherwise, parse the left-hand atom. - let mut lhs = if let Some(prefix) = self.parse_prefix_operator(min_precedence)? { + let mut lhs = if let Some(prefix) = self.parse_prefix_operator(min_precedence) { let at_precedence = prefix.precedence() + prefix.associativity(); prefix.build(self.parse_expression_at(at_precedence)?) } else { @@ -499,7 +499,7 @@ impl<'a> Parser<'a> { lhs = postfix.build(lhs) } // Apply any binary infix operators, parsing the right-hand operand. - while let Some(infix) = self.parse_infix_operator(min_precedence)? 
{ + while let Some(infix) = self.parse_infix_operator(min_precedence) { let at_precedence = infix.precedence() + infix.associativity(); let rhs = self.parse_expression_at(at_precedence)?; lhs = infix.build(lhs, rhs); @@ -564,12 +564,9 @@ impl<'a> Parser<'a> { }) } - /// Parses a prefix operator, if there is one and it's precedence is at - /// least min_precedence. - fn parse_prefix_operator( - &mut self, - min_precedence: Precedence, - ) -> Result> { + /// Parses a prefix operator, if there is one and its precedence is at least + /// min_precedence. + fn parse_prefix_operator(&mut self, min_precedence: Precedence) -> Option { self.next_if_map(|token| { let operator = match token { Token::Keyword(Keyword::Not) => PrefixOperator::Not, @@ -581,12 +578,9 @@ impl<'a> Parser<'a> { }) } - /// Parses an infix operator, if there is one and it's precedence is at - /// least min_precedence. - fn parse_infix_operator( - &mut self, - min_precedence: Precedence, - ) -> Result> { + /// Parses an infix operator, if there is one and its precedence is at least + /// min_precedence. + fn parse_infix_operator(&mut self, min_precedence: Precedence) -> Option { self.next_if_map(|token| { let operator = match token { Token::Asterisk => InfixOperator::Multiply, @@ -611,7 +605,7 @@ impl<'a> Parser<'a> { }) } - /// Parses a postfix operator, if there is one and it's precedence is at + /// Parses a postfix operator, if there is one and its precedence is at /// least min_precedence. fn parse_postfix_operator( &mut self, @@ -638,13 +632,13 @@ impl<'a> Parser<'a> { return Ok(Some(operator)); } - self.next_if_map(|token| { + Ok(self.next_if_map(|token| { let operator = match token { Token::Exclamation => PostfixOperator::Factorial, _ => return None, }; Some(operator).filter(|op| op.precedence() >= min_precedence) - }) + })) } } @@ -656,9 +650,9 @@ const RIGHT_ASSOCIATIVE: Precedence = 0; /// Prefix operators. 
enum PrefixOperator { - Minus, - Not, - Plus, + Minus, // -a + Not, // NOT a + Plus, // +a } impl PrefixOperator { @@ -689,21 +683,21 @@ impl PrefixOperator { /// Infix operators. enum InfixOperator { - Add, - And, - Divide, - Equal, - Exponentiate, - GreaterThan, - GreaterThanOrEqual, - LessThan, - LessThanOrEqual, - Like, - Multiply, - NotEqual, - Or, - Remainder, - Subtract, + Add, // a + b + And, // a AND b + Divide, // a / b + Equal, // a = b + Exponentiate, // a ^ b + GreaterThan, // a > b + GreaterThanOrEqual, // a >= b + LessThan, // a < b + LessThanOrEqual, // a <= b + Like, // a LIKE b + Multiply, // a * b + NotEqual, // a != b + Or, // a OR b + Remainder, // a % b + Subtract, // a - b } impl InfixOperator { @@ -760,9 +754,9 @@ impl InfixOperator { /// Postfix operators. enum PostfixOperator { - Factorial, - Is(ast::Literal), - IsNot(ast::Literal), + Factorial, // a! + Is(ast::Literal), // a IS NULL | NAN + IsNot(ast::Literal), // a IS NOT NULL | NAN } impl PostfixOperator {