lex.go (18154B)
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package parse

import (
	"fmt"
	"strings"
	"unicode"
	"unicode/utf8"
)

// item represents a token or text string returned from the scanner.
type item struct {
	typ  itemType // The type of this item.
	pos  Pos      // The starting position, in bytes, of this item in the input string.
	val  string   // The value of this item.
	line int      // The line number at the start of this item.
}

func (i item) String() string {
	switch {
	case i.typ == itemEOF:
		return "EOF"
	case i.typ == itemError:
		return i.val
	case i.typ > itemKeyword:
		return fmt.Sprintf("<%s>", i.val)
	case len(i.val) > 10:
		return fmt.Sprintf("%.10q...", i.val)
	}
	return fmt.Sprintf("%q", i.val)
}

// itemType identifies the type of lex items.
type itemType int

const (
	itemError        itemType = iota // error occurred; value is text of error
	itemBool                         // boolean constant
	itemChar                         // printable ASCII character; grab bag for comma etc.
	itemCharConstant                 // character constant
	itemComment                      // comment text
	itemComplex                      // complex constant (1+2i); imaginary is just a number
	itemAssign                       // equals ('=') introducing an assignment
	itemDeclare                      // colon-equals (':=') introducing a declaration
	itemEOF
	itemField      // alphanumeric identifier starting with '.'
	itemIdentifier // alphanumeric identifier not starting with '.'
	itemLeftDelim  // left action delimiter
	itemLeftParen  // '(' inside action
	itemNumber     // simple number, including imaginary
	itemPipe       // pipe symbol
	itemRawString  // raw quoted string (includes quotes)
	itemRightDelim // right action delimiter
	itemRightParen // ')' inside action
	itemSpace      // run of spaces separating arguments
	itemString     // quoted string (includes quotes)
	itemText       // plain text
	itemVariable   // variable starting with '$', such as '$' or '$1' or '$hello'
	// Keywords appear after all the rest.
	itemKeyword  // used only to delimit the keywords
	itemBlock    // block keyword
	itemBreak    // break keyword
	itemContinue // continue keyword
	itemDot      // the cursor, spelled '.'
	itemDefine   // define keyword
	itemElse     // else keyword
	itemEnd      // end keyword
	itemIf       // if keyword
	itemNil      // the untyped nil constant, easiest to treat as a keyword
	itemRange    // range keyword
	itemTemplate // template keyword
	itemWith     // with keyword
)

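// key maps the textual representation of each keyword (and the cursor '.')
// to its item type.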
var key = map[string]itemType{
	".":        itemDot,
	"block":    itemBlock,
	"break":    itemBreak,
	"continue": itemContinue,
	"define":   itemDefine,
	"else":     itemElse,
	"end":      itemEnd,
	"if":       itemIf,
	"range":    itemRange,
	"nil":      itemNil,
	"template": itemTemplate,
	"with":     itemWith,
}

const eof = -1

// Trimming spaces.
// If the action begins "{{- " rather than "{{", then all space/tab/newlines
// preceding the action are trimmed; conversely if it ends " -}}" then all
// space/tab/newlines following the action are trimmed. This is done entirely
// in the lexer; the parser never sees it happen. We require an ASCII space
// character (' ', \t, \r, \n) to be present to avoid ambiguity with things
// like "{{-3}}". It reads better with the space present anyway. For
// simplicity, only ASCII does the job.
const (
	spaceChars    = " \t\r\n"  // These are the space characters defined by Go itself.
	trimMarker    = '-'        // Attached to left/right delimiter, trims trailing spaces from preceding/following text.
	trimMarkerLen = Pos(1 + 1) // marker plus space before or after
)

// stateFn represents the state of the scanner as a function that returns the next state.
type stateFn func(*lexer) stateFn

// lexer holds the state of the scanner.
type lexer struct {
	name        string    // the name of the input; used only for error reports
	input       string    // the string being scanned
	leftDelim   string    // start of action
	rightDelim  string    // end of action
	emitComment bool      // emit itemComment tokens
	pos         Pos       // current position in the input
	start       Pos       // start position of this item
	width       Pos       // width of last rune read from input
	items       chan item // channel of scanned items
	parenDepth  int       // nesting depth of ( ) exprs
	line        int       // 1+number of newlines seen
	startLine   int       // start line of this item
	breakOK     bool      // break keyword allowed
	continueOK  bool      // continue keyword allowed
}

// next returns the next rune in the input.
func (l *lexer) next() rune {
	if int(l.pos) >= len(l.input) {
		l.width = 0
		return eof
	}
	r, w := utf8.DecodeRuneInString(l.input[l.pos:])
	l.width = Pos(w)
	l.pos += l.width
	if r == '\n' {
		l.line++
	}
	return r
}

// peek returns but does not consume the next rune in the input.
func (l *lexer) peek() rune {
	r := l.next()
	l.backup()
	return r
}

// backup steps back one rune. Can only be called once per call of next.
func (l *lexer) backup() {
	l.pos -= l.width
	// Correct newline count.
	if l.width == 1 && l.input[l.pos] == '\n' {
		l.line--
	}
}

// emit passes an item back to the client.
func (l *lexer) emit(t itemType) {
	l.items <- item{t, l.start, l.input[l.start:l.pos], l.startLine}
	l.start = l.pos
	l.startLine = l.line
}

// ignore skips over the pending input before this point.
func (l *lexer) ignore() {
	l.line += strings.Count(l.input[l.start:l.pos], "\n")
	l.start = l.pos
	l.startLine = l.line
}

// accept consumes the next rune if it's from the valid set.
func (l *lexer) accept(valid string) bool {
	if strings.ContainsRune(valid, l.next()) {
		return true
	}
	l.backup()
	return false
}

// acceptRun consumes a run of runes from the valid set.
func (l *lexer) acceptRun(valid string) {
	for strings.ContainsRune(valid, l.next()) {
	}
	l.backup()
}

// errorf returns an error token and terminates the scan by passing
// back a nil pointer that will be the next state, terminating l.nextItem.
func (l *lexer) errorf(format string, args ...any) stateFn {
	l.items <- item{itemError, l.start, fmt.Sprintf(format, args...), l.startLine}
	return nil
}

// nextItem returns the next item from the input.
// Called by the parser, not in the lexing goroutine.
func (l *lexer) nextItem() item {
	return <-l.items
}

// drain drains the output so the lexing goroutine will exit.
// Called by the parser, not in the lexing goroutine.
func (l *lexer) drain() {
	for range l.items {
	}
}

// lex creates a new scanner for the input string.
func lex(name, input, left, right string, emitComment bool) *lexer {
	if left == "" {
		left = leftDelim
	}
	if right == "" {
		right = rightDelim
	}
	l := &lexer{
		name:        name,
		input:       input,
		leftDelim:   left,
		rightDelim:  right,
		emitComment: emitComment,
		items:       make(chan item),
		line:        1,
		startLine:   1,
	}
	go l.run()
	return l
}

// run runs the state machine for the lexer.
func (l *lexer) run() {
	for state := lexText; state != nil; {
		state = state(l)
	}
	close(l.items)
}

// state functions

const (
	leftDelim    = "{{"
	rightDelim   = "}}"
	leftComment  = "/*"
	rightComment = "*/"
)

// lexText scans until an opening action delimiter, "{{".
func lexText(l *lexer) stateFn {
	l.width = 0
	if x := strings.Index(l.input[l.pos:], l.leftDelim); x >= 0 {
		ldn := Pos(len(l.leftDelim))
		l.pos += Pos(x)
		trimLength := Pos(0)
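		// If the upcoming action begins with a left trim marker ("{{- "),
		// drop the trailing whitespace of the text that precedes it.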
		if hasLeftTrimMarker(l.input[l.pos+ldn:]) {
			trimLength = rightTrimLength(l.input[l.start:l.pos])
		}
		l.pos -= trimLength
		if l.pos > l.start {
			l.line += strings.Count(l.input[l.start:l.pos], "\n")
			l.emit(itemText)
		}
		l.pos += trimLength
		l.ignore()
		return lexLeftDelim
	}
	l.pos = Pos(len(l.input))
	// Correctly reached EOF.
	if l.pos > l.start {
		l.line += strings.Count(l.input[l.start:l.pos], "\n")
		l.emit(itemText)
	}
	l.emit(itemEOF)
	return nil
}

// rightTrimLength returns the length of the spaces at the end of the string.
func rightTrimLength(s string) Pos {
	return Pos(len(s) - len(strings.TrimRight(s, spaceChars)))
}

// atRightDelim reports whether the lexer is at a right delimiter, possibly preceded by a trim marker.
func (l *lexer) atRightDelim() (delim, trimSpaces bool) {
	if hasRightTrimMarker(l.input[l.pos:]) && strings.HasPrefix(l.input[l.pos+trimMarkerLen:], l.rightDelim) { // With trim marker.
		return true, true
	}
	if strings.HasPrefix(l.input[l.pos:], l.rightDelim) { // Without trim marker.
		return true, false
	}
	return false, false
}

// leftTrimLength returns the length of the spaces at the beginning of the string.
func leftTrimLength(s string) Pos {
	return Pos(len(s) - len(strings.TrimLeft(s, spaceChars)))
}

// lexLeftDelim scans the left delimiter, which is known to be present, possibly with a trim marker.
func lexLeftDelim(l *lexer) stateFn {
	l.pos += Pos(len(l.leftDelim))
	trimSpace := hasLeftTrimMarker(l.input[l.pos:])
	afterMarker := Pos(0)
	if trimSpace {
		afterMarker = trimMarkerLen
	}
	if strings.HasPrefix(l.input[l.pos+afterMarker:], leftComment) {
		l.pos += afterMarker
		l.ignore()
		return lexComment
	}
	l.emit(itemLeftDelim)
	l.pos += afterMarker
	l.ignore()
	l.parenDepth = 0
	return lexInsideAction
}

// lexComment scans a comment. The left comment marker is known to be present.
func lexComment(l *lexer) stateFn {
	l.pos += Pos(len(leftComment))
	i := strings.Index(l.input[l.pos:], rightComment)
	if i < 0 {
		return l.errorf("unclosed comment")
	}
	l.pos += Pos(i + len(rightComment))
	delim, trimSpace := l.atRightDelim()
	if !delim {
		return l.errorf("comment ends before closing delimiter")
	}
	if l.emitComment {
		l.emit(itemComment)
	}
	if trimSpace {
		l.pos += trimMarkerLen
	}
	l.pos += Pos(len(l.rightDelim))
	if trimSpace {
		l.pos += leftTrimLength(l.input[l.pos:])
	}
	l.ignore()
	return lexText
}

// lexRightDelim scans the right delimiter, which is known to be present, possibly with a trim marker.
func lexRightDelim(l *lexer) stateFn {
	trimSpace := hasRightTrimMarker(l.input[l.pos:])
	if trimSpace {
		l.pos += trimMarkerLen
		l.ignore()
	}
	l.pos += Pos(len(l.rightDelim))
	l.emit(itemRightDelim)
	if trimSpace {
		l.pos += leftTrimLength(l.input[l.pos:])
		l.ignore()
	}
	return lexText
}

// lexInsideAction scans the elements inside action delimiters.
func lexInsideAction(l *lexer) stateFn {
	// Either number, quoted string, or identifier.
	// Spaces separate arguments; runs of spaces turn into itemSpace.
	// Pipe symbols separate and are emitted.
	delim, _ := l.atRightDelim()
	if delim {
		if l.parenDepth == 0 {
			return lexRightDelim
		}
		return l.errorf("unclosed left paren")
	}
	switch r := l.next(); {
	case r == eof:
		return l.errorf("unclosed action")
	case isSpace(r):
		l.backup() // Put space back in case we have " -}}".
		return lexSpace
	case r == '=':
		l.emit(itemAssign)
	case r == ':':
		if l.next() != '=' {
			return l.errorf("expected :=")
		}
		l.emit(itemDeclare)
	case r == '|':
		l.emit(itemPipe)
	case r == '"':
		return lexQuote
	case r == '`':
		return lexRawQuote
	case r == '$':
		return lexVariable
	case r == '\'':
		return lexChar
	case r == '.':
		// special look-ahead for ".field" so we don't break l.backup().
		if l.pos < Pos(len(l.input)) {
			r := l.input[l.pos]
			if r < '0' || '9' < r {
				return lexField
			}
		}
		fallthrough // '.' can start a number.
	case r == '+' || r == '-' || ('0' <= r && r <= '9'):
		l.backup()
		return lexNumber
	case isAlphaNumeric(r):
		l.backup()
		return lexIdentifier
	case r == '(':
		l.emit(itemLeftParen)
		l.parenDepth++
	case r == ')':
		l.emit(itemRightParen)
		l.parenDepth--
		if l.parenDepth < 0 {
			return l.errorf("unexpected right paren %#U", r)
		}
	case r <= unicode.MaxASCII && unicode.IsPrint(r):
		l.emit(itemChar)
	default:
		return l.errorf("unrecognized character in action: %#U", r)
	}
	return lexInsideAction
}

// lexSpace scans a run of space characters.
// We have not consumed the first space, which is known to be present.
// Take care if there is a trim-marked right delimiter, which starts with a space.
func lexSpace(l *lexer) stateFn {
	var r rune
	var numSpaces int
	for {
		r = l.peek()
		if !isSpace(r) {
			break
		}
		l.next()
		numSpaces++
	}
	// Be careful about a trim-marked closing delimiter, which has a minus
	// after a space. We know there is a space, so check for the '-' that might follow.
	if hasRightTrimMarker(l.input[l.pos-1:]) && strings.HasPrefix(l.input[l.pos-1+trimMarkerLen:], l.rightDelim) {
		l.backup() // Before the space.
		if numSpaces == 1 {
			return lexRightDelim // On the delim, so go right to that.
		}
	}
	l.emit(itemSpace)
	return lexInsideAction
}

// lexIdentifier scans an alphanumeric.
func lexIdentifier(l *lexer) stateFn {
Loop:
	for {
		switch r := l.next(); {
		case isAlphaNumeric(r):
			// absorb.
		default:
			l.backup()
			word := l.input[l.start:l.pos]
			if !l.atTerminator() {
				return l.errorf("bad character %#U", r)
			}
			switch {
			case key[word] > itemKeyword:
				item := key[word]
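				// break and continue are keywords only when the lexer has been
				// configured to allow them (breakOK, continueOK); otherwise the
				// word is emitted as a plain identifier.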
				if item == itemBreak && !l.breakOK || item == itemContinue && !l.continueOK {
					l.emit(itemIdentifier)
				} else {
					l.emit(item)
				}
			case word[0] == '.':
				l.emit(itemField)
			case word == "true", word == "false":
				l.emit(itemBool)
			default:
				l.emit(itemIdentifier)
			}
			break Loop
		}
	}
	return lexInsideAction
}

// lexField scans a field: .Alphanumeric.
// The . has been scanned.
func lexField(l *lexer) stateFn {
	return lexFieldOrVariable(l, itemField)
}

// lexVariable scans a Variable: $Alphanumeric.
// The $ has been scanned.
func lexVariable(l *lexer) stateFn {
	if l.atTerminator() { // Nothing interesting follows -> "$".
		l.emit(itemVariable)
		return lexInsideAction
	}
	return lexFieldOrVariable(l, itemVariable)
}

// lexFieldOrVariable scans a field or variable: [.$]Alphanumeric.
// The . or $ has been scanned.
func lexFieldOrVariable(l *lexer, typ itemType) stateFn {
	if l.atTerminator() { // Nothing interesting follows -> "." or "$".
		if typ == itemVariable {
			l.emit(itemVariable)
		} else {
			l.emit(itemDot)
		}
		return lexInsideAction
	}
	var r rune
	for {
		r = l.next()
		if !isAlphaNumeric(r) {
			l.backup()
			break
		}
	}
	if !l.atTerminator() {
		return l.errorf("bad character %#U", r)
	}
	l.emit(typ)
	return lexInsideAction
}

// atTerminator reports whether the input is at a valid termination character
// to appear after an identifier. Breaks .X.Y into two pieces. Also catches cases
// like "$x+2" not being acceptable without a space, in case we decide one
// day to implement arithmetic.
func (l *lexer) atTerminator() bool {
	r := l.peek()
	if isSpace(r) {
		return true
	}
	switch r {
	case eof, '.', ',', '|', ':', ')', '(':
		return true
	}
	// Does r start the delimiter? This can be ambiguous (with delim=="//", $x/2 will
	// succeed but should fail) but only in extremely rare cases caused by willfully
	// bad choice of delimiter.
	if rd, _ := utf8.DecodeRuneInString(l.rightDelim); rd == r {
		return true
	}
	return false
}

// lexChar scans a character constant. The initial quote is already
// scanned. Syntax checking is done by the parser.
func lexChar(l *lexer) stateFn {
Loop:
	for {
		switch l.next() {
		case '\\':
			if r := l.next(); r != eof && r != '\n' {
				break
			}
			fallthrough
		case eof, '\n':
			return l.errorf("unterminated character constant")
		case '\'':
			break Loop
		}
	}
	l.emit(itemCharConstant)
	return lexInsideAction
}

// lexNumber scans a number: decimal, octal, hex, binary, float, or imaginary.
// This isn't a perfect number scanner - for instance it accepts "." and
// "0x0.2" and "089" - but when it's wrong the input is invalid and the parser
// (via strconv) will notice.
func lexNumber(l *lexer) stateFn {
	if !l.scanNumber() {
		return l.errorf("bad number syntax: %q", l.input[l.start:l.pos])
	}
	if sign := l.peek(); sign == '+' || sign == '-' {
		// Complex: 1+2i. No spaces, must end in 'i'.
		if !l.scanNumber() || l.input[l.pos-1] != 'i' {
			return l.errorf("bad number syntax: %q", l.input[l.start:l.pos])
		}
		l.emit(itemComplex)
	} else {
		l.emit(itemNumber)
	}
	return lexInsideAction
}

func (l *lexer) scanNumber() bool {
	// Optional leading sign.
	l.accept("+-")
	// Is it hex?
	digits := "0123456789_"
	if l.accept("0") {
		// Note: Leading 0 does not mean octal in floats.
		if l.accept("xX") {
			digits = "0123456789abcdefABCDEF_"
		} else if l.accept("oO") {
			digits = "01234567_"
		} else if l.accept("bB") {
			digits = "01_"
		}
	}
	l.acceptRun(digits)
	if l.accept(".") {
		l.acceptRun(digits)
	}
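	// len(digits) == 10+1 means the digit set is still "0123456789_",
	// i.e. the number is decimal, so an 'e' exponent is allowed.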
	if len(digits) == 10+1 && l.accept("eE") {
		l.accept("+-")
		l.acceptRun("0123456789_")
	}
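	// len(digits) == 16+6+1 means the digit set is the hexadecimal one,
	// so a 'p' exponent (hex float) is allowed.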
	if len(digits) == 16+6+1 && l.accept("pP") {
		l.accept("+-")
		l.acceptRun("0123456789_")
	}
	// Is it imaginary?
	l.accept("i")
	// Next thing mustn't be alphanumeric.
	if isAlphaNumeric(l.peek()) {
		l.next()
		return false
	}
	return true
}

// lexQuote scans a quoted string.
func lexQuote(l *lexer) stateFn {
Loop:
	for {
		switch l.next() {
		case '\\':
			if r := l.next(); r != eof && r != '\n' {
				break
			}
			fallthrough
		case eof, '\n':
			return l.errorf("unterminated quoted string")
		case '"':
			break Loop
		}
	}
	l.emit(itemString)
	return lexInsideAction
}

// lexRawQuote scans a raw quoted string.
func lexRawQuote(l *lexer) stateFn {
Loop:
	for {
		switch l.next() {
		case eof:
			return l.errorf("unterminated raw quoted string")
		case '`':
			break Loop
		}
	}
	l.emit(itemRawString)
	return lexInsideAction
}

// isSpace reports whether r is a space character.
func isSpace(r rune) bool {
	return r == ' ' || r == '\t' || r == '\r' || r == '\n'
}

// isAlphaNumeric reports whether r is an alphabetic, digit, or underscore.
func isAlphaNumeric(r rune) bool {
	return r == '_' || unicode.IsLetter(r) || unicode.IsDigit(r)
}

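// hasLeftTrimMarker reports whether the text following a left delimiter
// begins with a trim marker: '-' followed by a space character.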
func hasLeftTrimMarker(s string) bool {
	return len(s) >= 2 && s[0] == trimMarker && isSpace(rune(s[1]))
}

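// hasRightTrimMarker reports whether s begins with a trim-marked right
// delimiter prefix: a space character followed by '-'.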
func hasRightTrimMarker(s string) bool {
	return len(s) >= 2 && isSpace(rune(s[0])) && s[1] == trimMarker
}