From 636840a23b5b1bed5dcab77caa34920b50a52f6f Mon Sep 17 00:00:00 2001 From: Stephen Hurwitz Date: Mon, 16 Oct 2023 17:56:26 -0700 Subject: [PATCH 1/4] Implements Scanner type for tokenizing nginx configs Implemented `crossplane.Scanner` that follows the example of other "scanner" types implemented in the Go stdlib. The existing `Lex` uses concurrency to make tokens available to the caller while managing "state". I think this design cue was taken from Rob Pike's 2011 talk on [Lexical Scanning in Go](https://go.dev/talks/2011/lex.slide). If you look at examples from the Go stdlib-- such as `bufio.Scanner` that `Lex` depends on-- you'd find that this isn't the strategy being employed; instead there is a struct that manages the state of the scanner and a method that is used by the caller to advance the scanner to obtain tokens. After a bit of Internet archeology, I found [this](https://groups.google.com/g/golang-nuts/c/q--5t2cxv78/m/Vkr9bNuhP5sJ) post on `golang-nuts` from Rob Pike himself: > That talk was about a lexer, but the deeper purpose was to demonstrate > how concurrency can make programs nice even without obvious parallelism > in the problem. And like many such uses of concurrency, the code is > pretty but not necessarily fast. > > I think it's a fine approach to a lexer if you don't care about > performance. It is significantly slower than some other approaches but > is very easy to adapt. I used it in ivy, for example, but just so you > know, I'm probably going to replace the one in ivy with a more > traditional model to avoid some issues with the lexer accessing global > state. You don't care about that for your application, I'm sure. > So: It's pretty and nice to work on, but you'd probably not choose that > approach for a production compiler. An implementation of a "scanner" using the more "traditional" model-- much of the logic is the same or very close to `Lex`-- seems to support the above statement. ``` go test -benchmem -run=^$ -bench "^BenchmarkScan|BenchmarkLex$" github.com/nginxinc/nginx-go-crossplane -count=1 -v goos: darwin goarch: arm64 pkg: github.com/nginxinc/nginx-go-crossplane BenchmarkLex BenchmarkLex/simple BenchmarkLex/simple-10 70982 16581 ns/op 102857 B/op 37 allocs/op BenchmarkLex/with-comments BenchmarkLex/with-comments-10 64125 18366 ns/op 102921 B/op 43 allocs/op BenchmarkLex/messy BenchmarkLex/messy-10 28171 42697 ns/op 104208 B/op 166 allocs/op BenchmarkLex/quote-behavior BenchmarkLex/quote-behavior-10 83667 14154 ns/op 102768 B/op 24 allocs/op BenchmarkLex/quoted-right-brace BenchmarkLex/quoted-right-brace-10 48022 24799 ns/op 103369 B/op 52 allocs/op BenchmarkScan BenchmarkScan/simple BenchmarkScan/simple-10 179712 6660 ns/op 4544 B/op 34 allocs/op BenchmarkScan/with-comments BenchmarkScan/with-comments-10 133178 7628 ns/op 4608 B/op 40 allocs/op BenchmarkScan/messy BenchmarkScan/messy-10 49251 24106 ns/op 5896 B/op 163 allocs/op BenchmarkScan/quote-behavior BenchmarkScan/quote-behavior-10 240026 4854 ns/op 4456 B/op 21 allocs/op BenchmarkScan/quoted-right-brace BenchmarkScan/quoted-right-brace-10 87468 13534 ns/op 5056 B/op 49 allocs/op PASS ok github.com/nginxinc/nginx-go-crossplane 13.676s ``` This alternative to `Lex` is probably a micro-optimization for many use cases. Still, as the size and number of NGINX configurations that need to be analyzed grows, the optimization becomes more valuable, as does an API that feels familiar to Go developers who might use this tool for their own purposes. Next steps: - Use `Scanner` to "parse" NGINX configurations. 
I think this should be done in place so that the existing API works as is, but we should also expose a way to allow the caller to provide the scanner. - Deprecate `Lex` in favor of `Scanner`. If we leave `Lex` in place then I don't think we would need a `v2` of the crossplane package (yet). --- lex_test.go | 57 +++++++++--- scanner.go | 230 ++++++++++++++++++++++++++++++++++++++++++++++++ scanner_test.go | 110 +++++++++++++++++++++++ util.go | 2 + 4 files changed, 386 insertions(+), 13 deletions(-) create mode 100644 scanner.go create mode 100644 scanner_test.go diff --git a/lex_test.go b/lex_test.go index cb3c5148..6bfc0bb0 100644 --- a/lex_test.go +++ b/lex_test.go @@ -446,22 +446,53 @@ func TestLex(t *testing.T) { } } -func TestLex_unhappy(t *testing.T) { - t.Parallel() +var lexToken NgxToken //nolint: gochecknoglobals // trying to avoid return value being optimzed away + +func BenchmarkLex(b *testing.B) { + var t NgxToken - testcases := map[string]string{ - "unbalanced open brance": `http {{}`, - "unbalanced closing brace": `http {}}`, - "multiple open braces": `http {{server {}}`, - "multiple closing braces after block end": `http {server {}}}`, - "multiple semicolons": `server { listen 80;; }`, - "semicolon afer closing brace": `server { listen 80; };`, - "open brace after semicolon": `server { listen 80; {}`, - "braces with no directive": `http{}{}`, - "missing final brace": `http{`, + for _, bm := range lexFixtures { + b.Run(bm.name, func(b *testing.B) { + path := getTestConfigPath(bm.name, "nginx.conf") + file, err := os.Open(path) + if err != nil { + b.Fatal(err) + } + defer file.Close() + b.ResetTimer() + + for i := 0; i < b.N; i++ { + if _, err := file.Seek(0, 0); err != nil { + b.Fatal(err) + } + + for tok := range Lex(file) { + t = tok + } + } + }) } - for name, c := range testcases { + lexToken = t +} + +//nolint:gochecknoglobals +var unhappyFixtures = map[string]string{ + "unbalanced open brance": `http {{}`, + "unbalanced closing brace": `http {}}`, + "multiple open braces": `http {{server {}}`, + "multiple closing braces after block end": `http {server {}}}`, + "multiple semicolons": `server { listen 80;; }`, + "semicolon afer closing brace": `server { listen 80; };`, + "open brace after semicolon": `server { listen 80; {}`, + "braces with no directive": `http{}{}`, + "missing final brace": `http{`, +} + +func TestLex_unhappy(t *testing.T) { + t.Parallel() + + for name, c := range unhappyFixtures { c := c t.Run(name, func(t *testing.T) { t.Parallel() diff --git a/scanner.go b/scanner.go new file mode 100644 index 00000000..cae32bf0 --- /dev/null +++ b/scanner.go @@ -0,0 +1,230 @@ +package crossplane + +import ( + "bufio" + "errors" + "fmt" + "io" + "strings" +) + +// Token is a lexical token of the NGINX configuration syntax. +type Token struct { + // Text is the string corresponding to the token. It could be a directive or symbol. The value is the actual token + // sequence in order to support defining directives in modules other than the core NGINX module set. + Text string + // Line is the source starting line number the token within a file. + Line int + // IsQuoted signifies if the token is wrapped by quotes (", '). Quotes are not usually necessary in an NGINX + // configuration and mostly serve to help make the config less ambiguous. 
+ IsQuoted bool +} + +type scannerError struct { + msg string + line int +} + +func (e *scannerError) Error() string { return e.msg } +func (e *scannerError) Line() int { return e.line } + +func newScannerErrf(line int, format string, a ...any) *scannerError { + return &scannerError{line: line, msg: fmt.Sprintf(format, a...)} +} + +// LineNumber reports the line on which the error occurred by finding the first error in +// the errs chain that returns a line number. Otherwise, it returns 0, false. +// +// An error type should provide a Line() int method to return a line number. +func LineNumber(err error) (int, bool) { + var e interface{ Line() int } + if !errors.As(err, &e) { + return 0, false + } + + return e.Line(), true +} + +// Scanner provides an interface for tokenizing an NGINX configuration. Successive calls to the Scane method will step +// through the 'tokens; of an NGINX configuration. +// +// Scanning stops unrecoverably at EOF, the first I/O error, or an unexpected token. +// +// Use NewScanner to construct a Scanner. +type Scanner struct { + scanner *bufio.Scanner + lineno int + tokenStartLine int + tokenDepth int + repeateSpecialChar bool // only '}' can be repeated + prev string +} + +// NewScanner returns a new Scanner to read from r. +func NewScanner(r io.Reader) *Scanner { + s := &Scanner{ + scanner: bufio.NewScanner(r), + lineno: 1, + tokenStartLine: 1, + tokenDepth: 0, + repeateSpecialChar: false, + } + + s.scanner.Split(bufio.ScanRunes) + + return s +} + +// Scan reads the next token from source and returns it.. It returns io.EOF at the end of the source. Scanner errors are +// returned when encountered. +func (s *Scanner) Scan() (Token, error) { //nolint: funlen, gocognit, gocyclo + var tok strings.Builder + + lexState := skipSpace + newToken := false + readNext := true + esc := false + + var r, quote string + + for { + switch { + case s.prev != "": + r = s.prev + s.prev = "" + case readNext: + if !s.scanner.Scan() { + if tok.Len() > 0 { + return Token{Text: tok.String(), Line: s.tokenStartLine, IsQuoted: lexState == inQuote}, nil + } + + if s.tokenDepth > 0 { + return Token{}, &scannerError{line: s.tokenStartLine, msg: "unexpected end of file, expecting }"} + } + + return Token{}, io.EOF + } + + nextRune := s.scanner.Text() + r = nextRune + if isEOL(r) { + s.lineno++ + } + default: + readNext = true + } + + // skip CRs + if r == "\r" || r == "\\\r" { + continue + } + + if r == "\\" && !esc { + esc = true + continue + } + + if esc { + esc = false + r = "\\" + r + } + + switch lexState { + case skipSpace: + if !isSpace(r) { + lexState = inWord + newToken = true + readNext = false // re-eval + s.tokenStartLine = s.lineno + } + continue + + case inWord: + if newToken { + newToken = false + if r == "#" { + tok.WriteString(r) + lexState = inComment + s.tokenStartLine = s.lineno + continue + } + } + + if isSpace(r) { + return Token{Text: tok.String(), Line: s.tokenStartLine}, nil + } + + // parameter expansion syntax (ex: "${var[@]}") + if tok.Len() > 0 && strings.HasSuffix(tok.String(), "$") && r == "{" { + tok.WriteString(r) + lexState = inVar + s.repeateSpecialChar = false + continue + } + + // add entire quoted string to the token buffer + if r == `"` || r == "'" { + if tok.Len() > 0 { + // if a quote is inside a token, treat it like any other char + tok.WriteString(r) + } else { + quote = r + lexState = inQuote + s.tokenStartLine = s.lineno + } + s.repeateSpecialChar = false + continue + } + + // special characters treated as full tokens + if isSpecialChar(r) { + if 
tok.Len() > 0 { + s.prev = r + return Token{Text: tok.String(), Line: s.tokenStartLine}, nil + } + + // only } can be repeated + if s.repeateSpecialChar && r != "}" { + return Token{}, newScannerErrf(s.tokenStartLine, "unxpected %q", r) + } + + s.repeateSpecialChar = true + if r == "{" { + s.tokenDepth++ + } + + if r == "}" { + s.tokenDepth-- + if s.tokenDepth < 0 { + return Token{}, &scannerError{line: s.tokenStartLine, msg: `unexpected "}"`} + } + } + + tok.WriteString(r) + return Token{Text: tok.String(), Line: s.tokenStartLine}, nil + } + + s.repeateSpecialChar = false + tok.WriteString(r) + case inComment: + if isEOL(r) { + return Token{Text: tok.String(), Line: s.tokenStartLine}, nil + } + tok.WriteString(r) + case inVar: + tok.WriteString(r) + if r != "}" && !isSpace(r) { + continue + } + lexState = inWord + case inQuote: + if r == quote { + return Token{Text: tok.String(), Line: s.tokenStartLine}, nil + } + if r == "\\"+quote { + r = quote + } + tok.WriteString(r) + } + } +} diff --git a/scanner_test.go b/scanner_test.go new file mode 100644 index 00000000..30d06bbe --- /dev/null +++ b/scanner_test.go @@ -0,0 +1,110 @@ +package crossplane + +import ( + "io" + "os" + "strings" + "testing" + + "github.com/stretchr/testify/require" +) + +func TestScanner(t *testing.T) { + t.Parallel() + + for _, f := range lexFixtures { + f := f + t.Run(f.name, func(t *testing.T) { + t.Parallel() + + path := getTestConfigPath(f.name, "nginx.conf") + file, err := os.Open(path) + if err != nil { + t.Fatal(err) + } + defer file.Close() + + s := NewScanner(file) + + i := 0 + for { + got, err := s.Scan() + if err == io.EOF { + if i < len(f.tokens)-1 { + t.Fatal("unexpected end of file") + } + return + } + + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + want := f.tokens[i] + require.Equal(t, want.value, got.Text) + require.Equal(t, want.line, got.Line) + i++ + } + }) + } +} + +func TestScanner_unhappy(t *testing.T) { + t.Parallel() + + for name, c := range unhappyFixtures { + c := c + t.Run(name, func(t *testing.T) { + t.Parallel() + + s := NewScanner(strings.NewReader(c)) + for { + _, err := s.Scan() + if err == io.EOF { + t.Fatal("reached end of string") + } + + if err != nil { + t.Logf("got error: %v", err) + return + } + } + }) + } +} + +var t Token //nolint: gochecknoglobals // trying to avoid return value being optimzed away + +func BenchmarkScan(b *testing.B) { + for _, bm := range lexFixtures { + b.Run(bm.name, func(b *testing.B) { + path := getTestConfigPath(bm.name, "nginx.conf") + file, err := os.Open(path) + if err != nil { + b.Fatal(err) + } + defer file.Close() + + b.ResetTimer() + + for i := 0; i < b.N; i++ { + if _, err := file.Seek(0, 0); err != nil { + b.Fatal(err) + } + + s := NewScanner(file) + + for { + tok, err := s.Scan() + if err == io.EOF { + break + } + if err != nil { + b.Fatal(err) + } + t = tok + } + } + }) + } +} diff --git a/util.go b/util.go index d2e84ade..186afeb5 100644 --- a/util.go +++ b/util.go @@ -35,6 +35,8 @@ func isEOL(s string) bool { return strings.HasSuffix(s, "\n") } +func isSpecialChar(s string) bool { return s == "{" || s == "}" || s == ";" } + func repr(s string) string { q := fmt.Sprintf("%q", s) for _, char := range s { From c07b1beddf5e53a407d0cd5e084e88b0b32bc782 Mon Sep 17 00:00:00 2001 From: Stephen Hurwitz Date: Wed, 31 Jan 2024 09:22:17 -0800 Subject: [PATCH 2/4] Adds Err() method to scanner and checks error in Scan() Stores the first error encountered by Scan() and checks it to make sure scanning stops unrecoverably. 
The Err() method can use used to fetch the last non-EOF error. --- scanner.go | 34 ++++++++++++++++++++++++++++------ scanner_test.go | 12 +++++++++++- 2 files changed, 39 insertions(+), 7 deletions(-) diff --git a/scanner.go b/scanner.go index cae32bf0..2a3a25b8 100644 --- a/scanner.go +++ b/scanner.go @@ -58,6 +58,7 @@ type Scanner struct { tokenDepth int repeateSpecialChar bool // only '}' can be repeated prev string + err error } // NewScanner returns a new Scanner to read from r. @@ -75,6 +76,20 @@ func NewScanner(r io.Reader) *Scanner { return s } +// Err returns the first non-EOF error that was encountered by the Scanner. +func (s *Scanner) Err() error { + if s.err == io.EOF { + return nil + } + return s.err +} + +func (s *Scanner) setErr(err error) { + if s.err == nil || s.err != io.EOF { + s.err = err + } +} + // Scan reads the next token from source and returns it.. It returns io.EOF at the end of the source. Scanner errors are // returned when encountered. func (s *Scanner) Scan() (Token, error) { //nolint: funlen, gocognit, gocyclo @@ -88,10 +103,13 @@ func (s *Scanner) Scan() (Token, error) { //nolint: funlen, gocognit, gocyclo var r, quote string for { + if s.err != nil { + return Token{}, s.err + } + switch { case s.prev != "": - r = s.prev - s.prev = "" + r, s.prev = s.prev, "" case readNext: if !s.scanner.Scan() { if tok.Len() > 0 { @@ -99,10 +117,12 @@ func (s *Scanner) Scan() (Token, error) { //nolint: funlen, gocognit, gocyclo } if s.tokenDepth > 0 { - return Token{}, &scannerError{line: s.tokenStartLine, msg: "unexpected end of file, expecting }"} + s.setErr(&scannerError{line: s.tokenStartLine, msg: "unexpected end of file, expecting }"}) + return Token{}, s.err } - return Token{}, io.EOF + s.setErr(io.EOF) + return Token{}, s.err } nextRune := s.scanner.Text() @@ -185,7 +205,8 @@ func (s *Scanner) Scan() (Token, error) { //nolint: funlen, gocognit, gocyclo // only } can be repeated if s.repeateSpecialChar && r != "}" { - return Token{}, newScannerErrf(s.tokenStartLine, "unxpected %q", r) + s.setErr(newScannerErrf(s.tokenStartLine, "unxpected %q", r)) + return Token{}, s.err } s.repeateSpecialChar = true @@ -196,7 +217,8 @@ func (s *Scanner) Scan() (Token, error) { //nolint: funlen, gocognit, gocyclo if r == "}" { s.tokenDepth-- if s.tokenDepth < 0 { - return Token{}, &scannerError{line: s.tokenStartLine, msg: `unexpected "}"`} + s.setErr(&scannerError{line: s.tokenStartLine, msg: `unexpected "}"`}) + return Token{}, s.err } } diff --git a/scanner_test.go b/scanner_test.go index 30d06bbe..0774a50a 100644 --- a/scanner_test.go +++ b/scanner_test.go @@ -1,6 +1,7 @@ package crossplane import ( + "errors" "io" "os" "strings" @@ -66,7 +67,16 @@ func TestScanner_unhappy(t *testing.T) { if err != nil { t.Logf("got error: %v", err) - return + + if gotErr := s.Err(); !errors.Is(gotErr, err) { + t.Fatalf("error do not match: have=%+v, want=%+v", gotErr, err) + } + + if _, gotErr := s.Scan(); !errors.Is(gotErr, err) { + t.Fatalf("error after scan does not match: have=%+v, want=%+v", gotErr, err) + } + + break } } }) From cc657b18c93b73f48cade75c8ce12c6926a057b7 Mon Sep 17 00:00:00 2001 From: Stephen Hurwitz Date: Thu, 4 Apr 2024 10:11:34 -0700 Subject: [PATCH 3/4] Fixes issue parsing comments in args with quote Fixed bug where the quoted token did not have `IsQuoted` set to `true`. I added an additional lex fixture which shows both the existing lexer and new scanner handle the case correctly. 
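For reviewers, here is a minimal caller-side sketch of the behavior this patch fixes. The config literal and printed fields are illustrative only; `NewScanner`, `Scan`, and the `Token` fields are the API added in this series.

```go
package main

import (
	"fmt"
	"io"
	"strings"

	crossplane "github.com/nginxinc/nginx-go-crossplane"
)

func main() {
	// A quoted argument followed by a comment on the same line.
	conf := `log_format main '$remote_addr - $remote_user'; # trailing comment`

	s := crossplane.NewScanner(strings.NewReader(conf))
	for {
		tok, err := s.Scan()
		if err == io.EOF {
			break
		}
		if err != nil {
			panic(err)
		}
		// With this fix the quoted argument is reported with IsQuoted=true,
		// and the trailing comment is still emitted as a single "#..." token.
		fmt.Printf("line=%d quoted=%t text=%q\n", tok.Line, tok.IsQuoted, tok.Text)
	}
}
```
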
--- lex_test.go | 14 ++++++++++++++ scanner.go | 2 +- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/lex_test.go b/lex_test.go index 6bfc0bb0..520be852 100644 --- a/lex_test.go +++ b/lex_test.go @@ -415,6 +415,20 @@ var lexFixtures = []lexFixture{ {"}", 20}, {"}", 21}, }}, + {"comments-between-args", []tokenLine{ + {"http", 1}, + {"{", 1}, + {"#comment 1", 1}, + {"log_format", 2}, + {"#comment 2", 2}, + {"\\#arg\\ 1", 3}, + {"#comment 3", 3}, + {"#arg 2", 4}, + {"#comment 4", 4}, + {"#comment 5", 5}, + {";", 6}, + {"}", 7}, + }}, } func TestLex(t *testing.T) { diff --git a/scanner.go b/scanner.go index 2a3a25b8..683d1cb3 100644 --- a/scanner.go +++ b/scanner.go @@ -241,7 +241,7 @@ func (s *Scanner) Scan() (Token, error) { //nolint: funlen, gocognit, gocyclo lexState = inWord case inQuote: if r == quote { - return Token{Text: tok.String(), Line: s.tokenStartLine}, nil + return Token{Text: tok.String(), Line: s.tokenStartLine, IsQuoted: true}, nil } if r == "\\"+quote { r = quote From fe04b93474273b67c11e57b70f56efeef509f31c Mon Sep 17 00:00:00 2001 From: Stephen Hurwitz Date: Fri, 5 Jul 2024 14:52:46 -0700 Subject: [PATCH 4/4] Updates scanner to support Lua extension MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixed up the Scanner logic to mirror changes made to support Lua extension in Lex. Added a compat layer so that the existing Lua type can be used with `Scanner` vs trying to refactor the implementation to remove the channel. Doing so I think would result in further gains. Benchmarks: ``` ❯ go test -benchmem -run=^$ -bench "^(BenchmarkLex|BenchmarkLexWithLua|BenchmarkScanner|BenchmarkScannerWithLua)$" github.com/nginxinc/nginx-go-crossplane -count=1 goos: darwin goarch: arm64 pkg: github.com/nginxinc/nginx-go-crossplane BenchmarkLex/simple-10 57963 17756 ns/op 103049 B/op 39 allocs/op BenchmarkLex/with-comments-10 60025 20067 ns/op 103112 B/op 45 allocs/op BenchmarkLex/messy-10 26170 47822 ns/op 104400 B/op 168 allocs/op BenchmarkLex/quote-behavior-10 74510 17693 ns/op 102961 B/op 26 allocs/op BenchmarkLex/quoted-right-brace-10 43134 27752 ns/op 103560 B/op 54 allocs/op BenchmarkLex/comments-between-args-10 78271 14866 ns/op 102937 B/op 27 allocs/op BenchmarkLexWithLua/lua-basic-10 46273 26012 ns/op 105499 B/op 53 allocs/op BenchmarkLexWithLua/lua-block-simple-10 22514 54149 ns/op 108556 B/op 143 allocs/op BenchmarkLexWithLua/lua-block-larger-10 25983 46605 ns/op 108403 B/op 59 allocs/op BenchmarkLexWithLua/lua-block-tricky-10 33756 35067 ns/op 106684 B/op 66 allocs/op BenchmarkScanner/simple-10 163138 7084 ns/op 4648 B/op 36 allocs/op BenchmarkScanner/with-comments-10 144558 8100 ns/op 4712 B/op 42 allocs/op BenchmarkScanner/messy-10 47570 25026 ns/op 6000 B/op 165 allocs/op BenchmarkScanner/quote-behavior-10 222280 5083 ns/op 4560 B/op 23 allocs/op BenchmarkScanner/quoted-right-brace-10 82656 14281 ns/op 5160 B/op 51 allocs/op BenchmarkScanner/comments-between-args-10 225475 4872 ns/op 4536 B/op 24 allocs/op BenchmarkScannerWithLua/lua-basic-10 93081 12833 ns/op 7866 B/op 66 allocs/op BenchmarkScannerWithLua/lua-block-simple-10 31426 37989 ns/op 10924 B/op 156 allocs/op BenchmarkScannerWithLua/lua-block-larger-10 37148 30723 ns/op 10770 B/op 72 allocs/op BenchmarkScannerWithLua/lua-block-tricky-10 54890 22383 ns/op 9050 B/op 79 allocs/op PASS ok github.com/nginxinc/nginx-go-crossplane 29.969s ``` --- lex.go | 60 +++++++++++++++++++++-- lex_test.go | 57 ++++++++++++++-------- scanner.go | 125 
++++++++++++++++++++++++++++++++++++++++++------ scanner_test.go | 80 ++++++++++++++++++++----------- 4 files changed, 258 insertions(+), 64 deletions(-) diff --git a/lex.go b/lex.go index 4b69cbea..a39b8a08 100644 --- a/lex.go +++ b/lex.go @@ -65,6 +65,7 @@ type LexOptions struct { // RegisterLexer is an option that cna be used to add a lexer to tokenize external NGINX tokens. type RegisterLexer interface { applyLexOptions(options *LexOptions) + applyScannerOptions(options *scannerOptions) } type registerLexer struct { @@ -82,6 +83,16 @@ func (rl registerLexer) applyLexOptions(o *LexOptions) { } } +func (rl registerLexer) applyScannerOptions(o *scannerOptions) { + if o.extensions == nil { + o.extensions = make(map[string]ScannerExt) + } + + for _, s := range rl.stringTokens { + o.extensions[s] = &LexerScanner{lexer: rl.l} + } +} + // LexWithLexer registers a Lexer that implements tokenization of an NGINX configuration after one of the given // stringTokens is encountered by Lex. func LexWithLexer(l Lexer, stringTokens ...string) RegisterLexer { //nolint:ireturn @@ -106,12 +117,38 @@ func Lex(reader io.Reader) chan NgxToken { // SubScanner provides an interface for scanning alternative grammars within NGINX configuration data. type SubScanner struct { scanner *bufio.Scanner + parent *Scanner tokenLine int } // Scan advances the scanner to the next token which will be available though the Text method. It returns false // when the scan stops by reaching the end of input. func (e *SubScanner) Scan() bool { + if e.scanner != nil { + return e.lexScan() + } + + if e.parent.err != nil { + return false + } + + if !e.parent.scanner.Scan() { + if err := e.parent.scanner.Err(); err != nil { + e.parent.setErr(err) + } + return false + } + + // e.parent.prev = e.parent.scanner.Text() + // if isEOL(e.parent.prev) { + if t := e.parent.scanner.Text(); isEOL(t) { + e.parent.lineno++ + } + + return true +} + +func (e *SubScanner) lexScan() bool { if !e.scanner.Scan() { return false } @@ -122,13 +159,30 @@ func (e *SubScanner) Scan() bool { } // Err returns the fist non-EOF error encountered by the Scanner. -func (e *SubScanner) Err() error { return e.scanner.Err() } +func (e *SubScanner) Err() error { + if e.scanner != nil { + return e.scanner.Err() + } + return e.parent.Err() +} // Text returns the most recent token generated by a call to Scan. -func (e *SubScanner) Text() string { return e.scanner.Text() } +func (e *SubScanner) Text() string { + if e.scanner != nil { + return e.scanner.Text() + } + // return e.parent.prev + return e.parent.scanner.Text() +} // Line returns the line number of the most recent token generated by a call to Scan. 
-func (e *SubScanner) Line() int { return e.tokenLine } +func (e *SubScanner) Line() int { + if e.scanner != nil { + return e.tokenLine + } + + return e.parent.lineno +} //nolint:gocyclo,funlen,gocognit,maintidx func tokenize(reader io.Reader, tokenCh chan NgxToken, options LexOptions) { diff --git a/lex_test.go b/lex_test.go index 520be852..d7348089 100644 --- a/lex_test.go +++ b/lex_test.go @@ -460,34 +460,53 @@ func TestLex(t *testing.T) { } } -var lexToken NgxToken //nolint: gochecknoglobals // trying to avoid return value being optimzed away - -func BenchmarkLex(b *testing.B) { +func benchmarkLex(b *testing.B, path string, options LexOptions) { var t NgxToken + file, err := os.Open(path) + if err != nil { + b.Fatal(err) + } + defer file.Close() + b.ResetTimer() + + for i := 0; i < b.N; i++ { + if _, err := file.Seek(0, 0); err != nil { + b.Fatal(err) + } + + for tok := range LexWithOptions(file, options) { + t = tok + } + } + + _ = t +} + +func BenchmarkLex(b *testing.B) { for _, bm := range lexFixtures { + if strings.HasPrefix(bm.name, "lua") { + continue + } + b.Run(bm.name, func(b *testing.B) { path := getTestConfigPath(bm.name, "nginx.conf") - file, err := os.Open(path) - if err != nil { - b.Fatal(err) - } - defer file.Close() - b.ResetTimer() + benchmarkLex(b, path, LexOptions{}) + }) + } +} - for i := 0; i < b.N; i++ { - if _, err := file.Seek(0, 0); err != nil { - b.Fatal(err) - } +func BenchmarkLexWithLua(b *testing.B) { + for _, bm := range lexFixtures { + if !strings.HasPrefix(bm.name, "lua") { + continue + } - for tok := range Lex(file) { - t = tok - } - } + b.Run(bm.name, func(b *testing.B) { + path := getTestConfigPath(bm.name, "nginx.conf") + benchmarkLex(b, path, LexOptions{Lexers: []RegisterLexer{lua.RegisterLexer()}}) }) } - - lexToken = t } //nolint:gochecknoglobals diff --git a/scanner.go b/scanner.go index 683d1cb3..b688f078 100644 --- a/scanner.go +++ b/scanner.go @@ -8,6 +8,14 @@ import ( "strings" ) +type scannerOptions struct { + extensions map[string]ScannerExt +} + +type ScannerOption interface { + applyScannerOptions(options *scannerOptions) +} + // Token is a lexical token of the NGINX configuration syntax. type Token struct { // Text is the string corresponding to the token. It could be a directive or symbol. The value is the actual token @@ -20,6 +28,8 @@ type Token struct { IsQuoted bool } +func (t Token) String() string { return fmt.Sprintf("{%d, %s, %t}", t.Line, t.Text, t.IsQuoted) } + type scannerError struct { msg string line int @@ -52,23 +62,33 @@ func LineNumber(err error) (int, bool) { // // Use NewScanner to construct a Scanner. type Scanner struct { - scanner *bufio.Scanner - lineno int - tokenStartLine int - tokenDepth int - repeateSpecialChar bool // only '}' can be repeated - prev string - err error + scanner *bufio.Scanner + lineno int + tokenStartLine int + tokenDepth int + repeateSpecialChar bool // only '}' can be repeated + nextTokenIsDirective bool + prev string + err error + options *scannerOptions + ext Tokenizer } // NewScanner returns a new Scanner to read from r. 
-func NewScanner(r io.Reader) *Scanner { +func NewScanner(r io.Reader, options ...ScannerOption) *Scanner { + opts := &scannerOptions{} + for _, opt := range options { + opt.applyScannerOptions(opts) + } + s := &Scanner{ - scanner: bufio.NewScanner(r), - lineno: 1, - tokenStartLine: 1, - tokenDepth: 0, - repeateSpecialChar: false, + scanner: bufio.NewScanner(r), + lineno: 1, + tokenStartLine: 1, + tokenDepth: 0, + repeateSpecialChar: false, + nextTokenIsDirective: true, + options: opts, } s.scanner.Split(bufio.ScanRunes) @@ -92,7 +112,21 @@ func (s *Scanner) setErr(err error) { // Scan reads the next token from source and returns it.. It returns io.EOF at the end of the source. Scanner errors are // returned when encountered. -func (s *Scanner) Scan() (Token, error) { //nolint: funlen, gocognit, gocyclo +func (s *Scanner) Scan() (Token, error) { //nolint: funlen, gocognit, gocyclo, maintidx // sorry + if s.ext != nil { + t, err := s.ext.Next() + if err != nil { + if !errors.Is(err, ErrTokenizerDone) { + s.setErr(err) + return Token{}, s.err + } + + s.ext = nil + } else { + return t, nil + } + } + var tok strings.Builder lexState := skipSpace @@ -129,6 +163,7 @@ func (s *Scanner) Scan() (Token, error) { //nolint: funlen, gocognit, gocyclo r = nextRune if isEOL(r) { s.lineno++ + s.nextTokenIsDirective = true } default: readNext = true @@ -149,6 +184,16 @@ func (s *Scanner) Scan() (Token, error) { //nolint: funlen, gocognit, gocyclo r = "\\" + r } + if tok.Len() > 0 { + t := tok.String() + if s.nextTokenIsDirective { + if ext, ok := s.options.extensions[t]; ok { + s.ext = ext.Tokenizer(&SubScanner{parent: s, tokenLine: s.tokenStartLine}, t) + return Token{Text: t, Line: s.tokenStartLine}, nil + } + } + } + switch lexState { case skipSpace: if !isSpace(r) { @@ -166,11 +211,13 @@ func (s *Scanner) Scan() (Token, error) { //nolint: funlen, gocognit, gocyclo tok.WriteString(r) lexState = inComment s.tokenStartLine = s.lineno + s.nextTokenIsDirective = false continue } } if isSpace(r) { + s.nextTokenIsDirective = false return Token{Text: tok.String(), Line: s.tokenStartLine}, nil } @@ -179,6 +226,7 @@ func (s *Scanner) Scan() (Token, error) { //nolint: funlen, gocognit, gocyclo tok.WriteString(r) lexState = inVar s.repeateSpecialChar = false + s.nextTokenIsDirective = false continue } @@ -223,6 +271,7 @@ func (s *Scanner) Scan() (Token, error) { //nolint: funlen, gocognit, gocyclo } tok.WriteString(r) + s.nextTokenIsDirective = true return Token{Text: tok.String(), Line: s.tokenStartLine}, nil } @@ -250,3 +299,51 @@ func (s *Scanner) Scan() (Token, error) { //nolint: funlen, gocognit, gocyclo } } } + +// ScannerExt is the interface that describes an extension for the [Scanner]. Scanner extensions enable scanning of +// configurations that contain syntaxes that do not follow the usual grammar. +type ScannerExt interface { + Tokenizer(s *SubScanner, matchedToken string) Tokenizer +} + +// ErrTokenizerDone is returned by [Tokenizer] when tokenization is complete. +var ErrTokenizerDone = errors.New("done") + +// Tokenizer is the interface that wraps the Next method. +// +// Next returns the next token scanned from the NGINX configuration or an error if the configuration cannot be +// tokenized. Return the special error, [ErrTokenizerDone] when finished tokenizing. +type Tokenizer interface { + Next() (Token, error) +} + +// LexerScanner is a compatibility layer between Lexers and Scanner. 
+type LexerScanner struct { + lexer Lexer + scanner *SubScanner + matchedToken string + ch <-chan NgxToken +} + +func (s *LexerScanner) Tokenizer(scanner *SubScanner, matchedtoken string) Tokenizer { + s.scanner = scanner + s.matchedToken = matchedtoken + return s +} + +func (s *LexerScanner) Next() (Token, error) { + if s.ch == nil { + s.ch = s.lexer.Lex(s.scanner, s.matchedToken) + } + + ngxTok, ok := <-s.ch + if !ok { + return Token{}, ErrTokenizerDone + } + + if ngxTok.Error != nil { + return Token{}, newScannerErrf(ngxTok.Line, ngxTok.Error.Error()) + } + + return Token{Text: ngxTok.Value, Line: ngxTok.Line, IsQuoted: ngxTok.IsQuoted}, nil +} diff --git a/scanner_test.go b/scanner_test.go index 0774a50a..7f2f5b1e 100644 --- a/scanner_test.go +++ b/scanner_test.go @@ -15,6 +15,7 @@ func TestScanner(t *testing.T) { for _, f := range lexFixtures { f := f + t.Run(f.name, func(t *testing.T) { t.Parallel() @@ -25,7 +26,7 @@ func TestScanner(t *testing.T) { } defer file.Close() - s := NewScanner(file) + s := NewScanner(file, lua.RegisterLexer()) i := 0 for { @@ -42,8 +43,8 @@ func TestScanner(t *testing.T) { } want := f.tokens[i] - require.Equal(t, want.value, got.Text) - require.Equal(t, want.line, got.Line) + require.Equal(t, want.value, got.Text, "got=%s", got) + require.Equal(t, want.line, got.Line, "got=%s", got) i++ } }) @@ -58,7 +59,7 @@ func TestScanner_unhappy(t *testing.T) { t.Run(name, func(t *testing.T) { t.Parallel() - s := NewScanner(strings.NewReader(c)) + s := NewScanner(strings.NewReader(c), lua.RegisterLexer()) for { _, err := s.Scan() if err == io.EOF { @@ -83,38 +84,61 @@ func TestScanner_unhappy(t *testing.T) { } } -var t Token //nolint: gochecknoglobals // trying to avoid return value being optimzed away +func benchmarkScanner(b *testing.B, path string, options ...ScannerOption) { + var t Token -func BenchmarkScan(b *testing.B) { - for _, bm := range lexFixtures { - b.Run(bm.name, func(b *testing.B) { - path := getTestConfigPath(bm.name, "nginx.conf") - file, err := os.Open(path) + file, err := os.Open(path) + if err != nil { + b.Fatal(err) + } + defer file.Close() + + b.ResetTimer() + + for i := 0; i < b.N; i++ { + if _, err := file.Seek(0, 0); err != nil { + b.Fatal(err) + } + + s := NewScanner(file, options...) + + for { + tok, err := s.Scan() + if err == io.EOF { + break + } if err != nil { b.Fatal(err) } - defer file.Close() + t = tok + } + } - b.ResetTimer() + _ = t +} - for i := 0; i < b.N; i++ { - if _, err := file.Seek(0, 0); err != nil { - b.Fatal(err) - } +func BenchmarkScanner(b *testing.B) { + for _, bm := range lexFixtures { + if strings.HasPrefix(bm.name, "lua") { + continue + } - s := NewScanner(file) + b.Run(bm.name, func(b *testing.B) { + path := getTestConfigPath(bm.name, "nginx.conf") + benchmarkScanner(b, path) + }) + } +} - for { - tok, err := s.Scan() - if err == io.EOF { - break - } - if err != nil { - b.Fatal(err) - } - t = tok - } - } +func BenchmarkScannerWithLua(b *testing.B) { + for _, bm := range lexFixtures { + if !strings.HasPrefix(bm.name, "lua") { + continue + } + + b.Run(bm.name, func(b *testing.B) { + path := getTestConfigPath(bm.name, "nginx.conf") + benchmarkScanner(b, path, lua.RegisterLexer()) }) } }
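
As a closing note for reviewers, below is a rough sketch of how the pieces added across this series compose for a caller: `Scan` drives tokenization, `LineNumber` recovers the source line from a scanner error, and `Err` reports the first non-EOF error after the loop ends. The config literal and the error handling shown are illustrative assumptions, not part of the patches themselves.

```go
package main

import (
	"errors"
	"fmt"
	"io"
	"log"
	"strings"

	crossplane "github.com/nginxinc/nginx-go-crossplane"
)

func main() {
	conf := "http { server { listen 80; } }"

	// Patch 4 also allows wiring in external grammars via ScannerOption,
	// e.g. NewScanner(file, lua.RegisterLexer()) as used in scanner_test.go.
	s := crossplane.NewScanner(strings.NewReader(conf))
	for {
		tok, err := s.Scan()
		if errors.Is(err, io.EOF) {
			break
		}
		if err != nil {
			// Scanner errors carry a Line() method; LineNumber finds it in the chain.
			if line, ok := crossplane.LineNumber(err); ok {
				log.Fatalf("%v on line %d", err, line)
			}
			log.Fatal(err)
		}
		fmt.Printf("%d: %s\n", tok.Line, tok.Text)
	}

	// Err returns nil after a clean EOF, otherwise the first non-EOF error.
	if err := s.Err(); err != nil {
		log.Fatal(err)
	}
}
```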