lex.go
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package parse

import (
	"fmt"
	"strings"
	"unicode"
	"unicode/utf8"
)

// item represents a token or text string returned from the scanner.
type item struct {
	typ  itemType // The type of this item.
	pos  Pos      // The starting position, in bytes, of this item in the input string.
	val  string   // The value of this item.
	line int      // The line number at the start of this item.
}

func (i item) String() string {
	switch {
	case i.typ == itemEOF:
		return "EOF"
	case i.typ == itemError:
		return i.val
	case i.typ > itemKeyword:
		return fmt.Sprintf("<%s>", i.val)
	case len(i.val) > 10:
		return fmt.Sprintf("%.10q...", i.val)
	}
	return fmt.Sprintf("%q", i.val)
}

// itemType identifies the type of lex items.
type itemType int

const (
	itemError        itemType = iota // error occurred; value is text of error
	itemBool                         // boolean constant
	itemChar                         // printable ASCII character; grab bag for comma etc.
	itemCharConstant                 // character constant
	itemComment                      // comment text
	itemComplex                      // complex constant (1+2i); imaginary is just a number
	itemAssign                       // equals ('=') introducing an assignment
	itemDeclare                      // colon-equals (':=') introducing a declaration
	itemEOF
	itemField      // alphanumeric identifier starting with '.'
	itemIdentifier // alphanumeric identifier not starting with '.'
	itemLeftDelim  // left action delimiter
	itemLeftParen  // '(' inside action
	itemNumber     // simple number, including imaginary
	itemPipe       // pipe symbol
	itemRawString  // raw quoted string (includes quotes)
	itemRightDelim // right action delimiter
	itemRightParen // ')' inside action
	itemSpace      // run of spaces separating arguments
	itemString     // quoted string (includes quotes)
	itemText       // plain text
	itemVariable   // variable starting with '$', such as '$' or '$1' or '$hello'
	// Keywords appear after all the rest.
	itemKeyword  // used only to delimit the keywords
	itemBlock    // block keyword
	itemBreak    // break keyword
	itemContinue // continue keyword
	itemDot      // the cursor, spelled '.'
	itemDefine   // define keyword
	itemElse     // else keyword
	itemEnd      // end keyword
	itemIf       // if keyword
	itemNil      // the untyped nil constant, easiest to treat as a keyword
	itemRange    // range keyword
	itemTemplate // template keyword
	itemWith     // with keyword
)

var key = map[string]itemType{
	".":        itemDot,
	"block":    itemBlock,
	"break":    itemBreak,
	"continue": itemContinue,
	"define":   itemDefine,
	"else":     itemElse,
	"end":      itemEnd,
	"if":       itemIf,
	"range":    itemRange,
	"nil":      itemNil,
	"template": itemTemplate,
	"with":     itemWith,
}

const eof = -1

// Trimming spaces.
// If the action begins "{{- " rather than "{{", then all space/tab/newlines
// preceding the action are trimmed; conversely, if it ends " -}}", the
// spaces following the action are trimmed. This is done entirely in the
// lexer; the parser never sees it happen. We require an ASCII space
// (' ', \t, \r, \n) to be present to avoid ambiguity with things like
// "{{-3}}". It reads better with the space present anyway. For simplicity,
// only ASCII spaces do the job.
const (
	spaceChars    = " \t\r\n"  // These are the space characters defined by Go itself.
	trimMarker    = '-'        // Attached to the left/right delimiter, trims spaces from the preceding/following text.
	trimMarkerLen = Pos(1 + 1) // marker plus space before or after
)
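// For instance, with the default "{{"/"}}" delimiters, the input
//
//	hello   {{- 3 -}}   world
//
// is expected to lex as the text "hello", an action containing the number 3,
// and the text "world": the "{{- " marker removes the spaces before the
// action and the " -}}" marker removes the spaces after it.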
// stateFn represents the state of the scanner as a function that returns the next state.
type stateFn func(*lexer) stateFn

// lexer holds the state of the scanner.
type lexer struct {
	name        string    // the name of the input; used only for error reports
	input       string    // the string being scanned
	leftDelim   string    // start of action
	rightDelim  string    // end of action
	emitComment bool      // emit itemComment tokens
	pos         Pos       // current position in the input
	start       Pos       // start position of this item
	width       Pos       // width of last rune read from input
	items       chan item // channel of scanned items
	parenDepth  int       // nesting depth of ( ) exprs
	line        int       // 1+number of newlines seen
	startLine   int       // start line of this item
	breakOK     bool      // break keyword allowed
	continueOK  bool      // continue keyword allowed
}

// next returns the next rune in the input.
func (l *lexer) next() rune {
	if int(l.pos) >= len(l.input) {
		l.width = 0
		return eof
	}
	r, w := utf8.DecodeRuneInString(l.input[l.pos:])
	l.width = Pos(w)
	l.pos += l.width
	if r == '\n' {
		l.line++
	}
	return r
}

// peek returns but does not consume the next rune in the input.
func (l *lexer) peek() rune {
	r := l.next()
	l.backup()
	return r
}

// backup steps back one rune. Can only be called once per call of next.
func (l *lexer) backup() {
	l.pos -= l.width
	// Correct newline count.
	if l.width == 1 && l.input[l.pos] == '\n' {
		l.line--
	}
}

// emit passes an item back to the client.
func (l *lexer) emit(t itemType) {
	l.items <- item{t, l.start, l.input[l.start:l.pos], l.startLine}
	l.start = l.pos
	l.startLine = l.line
}

// ignore skips over the pending input before this point.
func (l *lexer) ignore() {
	l.line += strings.Count(l.input[l.start:l.pos], "\n")
	l.start = l.pos
	l.startLine = l.line
}

// accept consumes the next rune if it's from the valid set.
func (l *lexer) accept(valid string) bool {
	if strings.ContainsRune(valid, l.next()) {
		return true
	}
	l.backup()
	return false
}

// acceptRun consumes a run of runes from the valid set.
func (l *lexer) acceptRun(valid string) {
	for strings.ContainsRune(valid, l.next()) {
	}
	l.backup()
}

// errorf returns an error token and terminates the scan by passing
// back a nil pointer that will be the next state, terminating l.nextItem.
func (l *lexer) errorf(format string, args ...any) stateFn {
	l.items <- item{itemError, l.start, fmt.Sprintf(format, args...), l.startLine}
	return nil
}

// nextItem returns the next item from the input.
// Called by the parser, not in the lexing goroutine.
func (l *lexer) nextItem() item {
	return <-l.items
}

// drain drains the output so the lexing goroutine will exit.
// Called by the parser, not in the lexing goroutine.
func (l *lexer) drain() {
	for range l.items {
	}
}
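// consumeAll is a hypothetical helper, sketched here only to show how a
// caller on the parser side might drive the lexer: read items with nextItem
// until itemEOF or itemError arrives; a caller that stops earlier should
// call drain so the lexing goroutine can exit.
func consumeAll(name, input string) []item {
	l := lex(name, input, "", "", false)
	var items []item
	for {
		it := l.nextItem()
		items = append(items, it)
		if it.typ == itemEOF || it.typ == itemError {
			return items
		}
	}
}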
// lex creates a new scanner for the input string.
func lex(name, input, left, right string, emitComment bool) *lexer {
	if left == "" {
		left = leftDelim
	}
	if right == "" {
		right = rightDelim
	}
	l := &lexer{
		name:        name,
		input:       input,
		leftDelim:   left,
		rightDelim:  right,
		emitComment: emitComment,
		items:       make(chan item),
		line:        1,
		startLine:   1,
	}
	go l.run()
	return l
}

// run runs the state machine for the lexer.
func (l *lexer) run() {
	for state := lexText; state != nil; {
		state = state(l)
	}
	close(l.items)
}

// state functions

const (
	leftDelim    = "{{"
	rightDelim   = "}}"
	leftComment  = "/*"
	rightComment = "*/"
)

// lexText scans until an opening action delimiter, "{{".
func lexText(l *lexer) stateFn {
	l.width = 0
	if x := strings.Index(l.input[l.pos:], l.leftDelim); x >= 0 {
		ldn := Pos(len(l.leftDelim))
		l.pos += Pos(x)
		trimLength := Pos(0)
		if hasLeftTrimMarker(l.input[l.pos+ldn:]) {
			trimLength = rightTrimLength(l.input[l.start:l.pos])
		}
		l.pos -= trimLength
		if l.pos > l.start {
			l.line += strings.Count(l.input[l.start:l.pos], "\n")
			l.emit(itemText)
		}
		l.pos += trimLength
		l.ignore()
		return lexLeftDelim
	}
	l.pos = Pos(len(l.input))
	// Correctly reached EOF.
	if l.pos > l.start {
		l.line += strings.Count(l.input[l.start:l.pos], "\n")
		l.emit(itemText)
	}
	l.emit(itemEOF)
	return nil
}

// rightTrimLength returns the length of the spaces at the end of the string.
func rightTrimLength(s string) Pos {
	return Pos(len(s) - len(strings.TrimRight(s, spaceChars)))
}

// atRightDelim reports whether the lexer is at a right delimiter, possibly preceded by a trim marker.
func (l *lexer) atRightDelim() (delim, trimSpaces bool) {
	if hasRightTrimMarker(l.input[l.pos:]) && strings.HasPrefix(l.input[l.pos+trimMarkerLen:], l.rightDelim) { // With trim marker.
		return true, true
	}
	if strings.HasPrefix(l.input[l.pos:], l.rightDelim) { // Without trim marker.
		return true, false
	}
	return false, false
}

// leftTrimLength returns the length of the spaces at the beginning of the string.
func leftTrimLength(s string) Pos {
	return Pos(len(s) - len(strings.TrimLeft(s, spaceChars)))
}

// lexLeftDelim scans the left delimiter, which is known to be present, possibly with a trim marker.
func lexLeftDelim(l *lexer) stateFn {
	l.pos += Pos(len(l.leftDelim))
	trimSpace := hasLeftTrimMarker(l.input[l.pos:])
	afterMarker := Pos(0)
	if trimSpace {
		afterMarker = trimMarkerLen
	}
	if strings.HasPrefix(l.input[l.pos+afterMarker:], leftComment) {
		l.pos += afterMarker
		l.ignore()
		return lexComment
	}
	l.emit(itemLeftDelim)
	l.pos += afterMarker
	l.ignore()
	l.parenDepth = 0
	return lexInsideAction
}
// lexComment scans a comment. The left comment marker is known to be present.
func lexComment(l *lexer) stateFn {
	l.pos += Pos(len(leftComment))
	i := strings.Index(l.input[l.pos:], rightComment)
	if i < 0 {
		return l.errorf("unclosed comment")
	}
	l.pos += Pos(i + len(rightComment))
	delim, trimSpace := l.atRightDelim()
	if !delim {
		return l.errorf("comment ends before closing delimiter")
	}
	if l.emitComment {
		l.emit(itemComment)
	}
	if trimSpace {
		l.pos += trimMarkerLen
	}
	l.pos += Pos(len(l.rightDelim))
	if trimSpace {
		l.pos += leftTrimLength(l.input[l.pos:])
	}
	l.ignore()
	return lexText
}

// lexRightDelim scans the right delimiter, which is known to be present, possibly with a trim marker.
func lexRightDelim(l *lexer) stateFn {
	trimSpace := hasRightTrimMarker(l.input[l.pos:])
	if trimSpace {
		l.pos += trimMarkerLen
		l.ignore()
	}
	l.pos += Pos(len(l.rightDelim))
	l.emit(itemRightDelim)
	if trimSpace {
		l.pos += leftTrimLength(l.input[l.pos:])
		l.ignore()
	}
	return lexText
}

// lexInsideAction scans the elements inside action delimiters.
func lexInsideAction(l *lexer) stateFn {
	// Either number, quoted string, or identifier.
	// Spaces separate arguments; runs of spaces turn into itemSpace.
	// Pipe symbols separate and are emitted.
	delim, _ := l.atRightDelim()
	if delim {
		if l.parenDepth == 0 {
			return lexRightDelim
		}
		return l.errorf("unclosed left paren")
	}
	switch r := l.next(); {
	case r == eof:
		return l.errorf("unclosed action")
	case isSpace(r):
		l.backup() // Put space back in case we have " -}}".
		return lexSpace
	case r == '=':
		l.emit(itemAssign)
	case r == ':':
		if l.next() != '=' {
			return l.errorf("expected :=")
		}
		l.emit(itemDeclare)
	case r == '|':
		l.emit(itemPipe)
	case r == '"':
		return lexQuote
	case r == '`':
		return lexRawQuote
	case r == '$':
		return lexVariable
	case r == '\'':
		return lexChar
	case r == '.':
		// special look-ahead for ".field" so we don't break l.backup().
		if l.pos < Pos(len(l.input)) {
			r := l.input[l.pos]
			if r < '0' || '9' < r {
				return lexField
			}
		}
		fallthrough // '.' can start a number.
	case r == '+' || r == '-' || ('0' <= r && r <= '9'):
		l.backup()
		return lexNumber
	case isAlphaNumeric(r):
		l.backup()
		return lexIdentifier
	case r == '(':
		l.emit(itemLeftParen)
		l.parenDepth++
	case r == ')':
		l.emit(itemRightParen)
		l.parenDepth--
		if l.parenDepth < 0 {
			return l.errorf("unexpected right paren %#U", r)
		}
	case r <= unicode.MaxASCII && unicode.IsPrint(r):
		l.emit(itemChar)
	default:
		return l.errorf("unrecognized character in action: %#U", r)
	}
	return lexInsideAction
}
// lexSpace scans a run of space characters.
// We have not consumed the first space, which is known to be present.
// Take care if there is a trim-marked right delimiter, which starts with a space.
func lexSpace(l *lexer) stateFn {
	var r rune
	var numSpaces int
	for {
		r = l.peek()
		if !isSpace(r) {
			break
		}
		l.next()
		numSpaces++
	}
	// Be careful about a trim-marked closing delimiter, which has a minus
	// after a space. We know there is a space, so check for the '-' that might follow.
	if hasRightTrimMarker(l.input[l.pos-1:]) && strings.HasPrefix(l.input[l.pos-1+trimMarkerLen:], l.rightDelim) {
		l.backup() // Before the space.
		if numSpaces == 1 {
			return lexRightDelim // On the delim, so go right to that.
		}
	}
	l.emit(itemSpace)
	return lexInsideAction
}

// lexIdentifier scans an alphanumeric.
func lexIdentifier(l *lexer) stateFn {
Loop:
	for {
		switch r := l.next(); {
		case isAlphaNumeric(r):
			// absorb.
		default:
			l.backup()
			word := l.input[l.start:l.pos]
			if !l.atTerminator() {
				return l.errorf("bad character %#U", r)
			}
			switch {
			case key[word] > itemKeyword:
				item := key[word]
				if item == itemBreak && !l.breakOK || item == itemContinue && !l.continueOK {
					l.emit(itemIdentifier)
				} else {
					l.emit(item)
				}
			case word[0] == '.':
				l.emit(itemField)
			case word == "true", word == "false":
				l.emit(itemBool)
			default:
				l.emit(itemIdentifier)
			}
			break Loop
		}
	}
	return lexInsideAction
}

// lexField scans a field: .Alphanumeric.
// The . has been scanned.
func lexField(l *lexer) stateFn {
	return lexFieldOrVariable(l, itemField)
}

// lexVariable scans a variable: $Alphanumeric.
// The $ has been scanned.
func lexVariable(l *lexer) stateFn {
	if l.atTerminator() { // Nothing interesting follows -> "$".
		l.emit(itemVariable)
		return lexInsideAction
	}
	return lexFieldOrVariable(l, itemVariable)
}

// lexFieldOrVariable scans a field or variable: [.$]Alphanumeric.
// The . or $ has been scanned.
func lexFieldOrVariable(l *lexer, typ itemType) stateFn {
	if l.atTerminator() { // Nothing interesting follows -> "." or "$".
		if typ == itemVariable {
			l.emit(itemVariable)
		} else {
			l.emit(itemDot)
		}
		return lexInsideAction
	}
	var r rune
	for {
		r = l.next()
		if !isAlphaNumeric(r) {
			l.backup()
			break
		}
	}
	if !l.atTerminator() {
		return l.errorf("bad character %#U", r)
	}
	l.emit(typ)
	return lexInsideAction
}

// atTerminator reports whether the input is at a valid termination character to
// appear after an identifier. Breaks .X.Y into two pieces. Also catches cases
// like "$x+2" not being acceptable without a space, in case we decide one
// day to implement arithmetic.
func (l *lexer) atTerminator() bool {
	r := l.peek()
	if isSpace(r) {
		return true
	}
	switch r {
	case eof, '.', ',', '|', ':', ')', '(':
		return true
	}
	// Does r start the delimiter? This can be ambiguous (with delim=="//", $x/2 will
	// succeed but should fail) but only in extremely rare cases caused by willfully
	// bad choice of delimiter.
	if rd, _ := utf8.DecodeRuneInString(l.rightDelim); rd == r {
		return true
	}
	return false
}

// lexChar scans a character constant. The initial quote is already
// scanned. Syntax checking is done by the parser.
func lexChar(l *lexer) stateFn {
Loop:
	for {
		switch l.next() {
		case '\\':
			if r := l.next(); r != eof && r != '\n' {
				break
			}
			fallthrough
		case eof, '\n':
			return l.errorf("unterminated character constant")
		case '\'':
			break Loop
		}
	}
	l.emit(itemCharConstant)
	return lexInsideAction
}
// lexNumber scans a number: decimal, octal, hex, float, or imaginary. This
// isn't a perfect number scanner - for instance it accepts "." and "0x0.2"
// and "089" - but when it's wrong the input is invalid and the parser (via
// strconv) will notice.
func lexNumber(l *lexer) stateFn {
	if !l.scanNumber() {
		return l.errorf("bad number syntax: %q", l.input[l.start:l.pos])
	}
	if sign := l.peek(); sign == '+' || sign == '-' {
		// Complex: 1+2i. No spaces, must end in 'i'.
		if !l.scanNumber() || l.input[l.pos-1] != 'i' {
			return l.errorf("bad number syntax: %q", l.input[l.start:l.pos])
		}
		l.emit(itemComplex)
	} else {
		l.emit(itemNumber)
	}
	return lexInsideAction
}

func (l *lexer) scanNumber() bool {
	// Optional leading sign.
	l.accept("+-")
	// Is it hex?
	digits := "0123456789_"
	if l.accept("0") {
		// Note: Leading 0 does not mean octal in floats.
		if l.accept("xX") {
			digits = "0123456789abcdefABCDEF_"
		} else if l.accept("oO") {
			digits = "01234567_"
		} else if l.accept("bB") {
			digits = "01_"
		}
	}
	l.acceptRun(digits)
	if l.accept(".") {
		l.acceptRun(digits)
	}
	if len(digits) == 10+1 && l.accept("eE") {
		l.accept("+-")
		l.acceptRun("0123456789_")
	}
	if len(digits) == 16+6+1 && l.accept("pP") {
		l.accept("+-")
		l.acceptRun("0123456789_")
	}
	// Is it imaginary?
	l.accept("i")
	// Next thing mustn't be alphanumeric.
	if isAlphaNumeric(l.peek()) {
		l.next()
		return false
	}
	return true
}

// lexQuote scans a quoted string.
func lexQuote(l *lexer) stateFn {
Loop:
	for {
		switch l.next() {
		case '\\':
			if r := l.next(); r != eof && r != '\n' {
				break
			}
			fallthrough
		case eof, '\n':
			return l.errorf("unterminated quoted string")
		case '"':
			break Loop
		}
	}
	l.emit(itemString)
	return lexInsideAction
}

// lexRawQuote scans a raw quoted string.
func lexRawQuote(l *lexer) stateFn {
Loop:
	for {
		switch l.next() {
		case eof:
			return l.errorf("unterminated raw quoted string")
		case '`':
			break Loop
		}
	}
	l.emit(itemRawString)
	return lexInsideAction
}

// isSpace reports whether r is a space character.
func isSpace(r rune) bool {
	return r == ' ' || r == '\t' || r == '\r' || r == '\n'
}

// isAlphaNumeric reports whether r is an alphabetic, digit, or underscore.
func isAlphaNumeric(r rune) bool {
	return r == '_' || unicode.IsLetter(r) || unicode.IsDigit(r)
}

func hasLeftTrimMarker(s string) bool {
	return len(s) >= 2 && s[0] == trimMarker && isSpace(rune(s[1]))
}

func hasRightTrimMarker(s string) bool {
	return len(s) >= 2 && isSpace(rune(s[0])) && s[1] == trimMarker
}
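// As a rough end-to-end illustration, the input `Hello, {{.Name}}!` with the
// default delimiters is expected to produce, in order:
//
//	itemText("Hello, "), itemLeftDelim, itemField(".Name"),
//	itemRightDelim, itemText("!"), itemEOF
//
// No itemSpace appears because there are no spaces inside the action.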