pagelexer.go (11496B)
1 // Copyright 2018 The Hugo Authors. All rights reserved. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // http://www.apache.org/licenses/LICENSE-2.0 7 // 8 // Unless required by applicable law or agreed to in writing, software 9 // distributed under the License is distributed on an "AS IS" BASIS, 10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 package pageparser 15 16 import ( 17 "bytes" 18 "fmt" 19 "unicode" 20 "unicode/utf8" 21 ) 22 23 const eof = -1 24 25 // returns the next state in scanner. 26 type stateFunc func(*pageLexer) stateFunc 27 28 type pageLexer struct { 29 input []byte 30 stateStart stateFunc 31 state stateFunc 32 pos int // input position 33 start int // item start position 34 width int // width of last element 35 36 // Contains lexers for shortcodes and other main section 37 // elements. 38 sectionHandlers *sectionHandlers 39 40 cfg Config 41 42 // The summary divider to look for. 43 summaryDivider []byte 44 // Set when we have parsed any summary divider 45 summaryDividerChecked bool 46 // Whether we're in a HTML comment. 47 isInHTMLComment bool 48 49 lexerShortcodeState 50 51 // items delivered to client 52 items Items 53 } 54 55 // Implement the Result interface 56 func (l *pageLexer) Iterator() *Iterator { 57 return l.newIterator() 58 } 59 60 func (l *pageLexer) Input() []byte { 61 return l.input 62 } 63 64 type Config struct { 65 EnableEmoji bool 66 } 67 68 // note: the input position here is normally 0 (start), but 69 // can be set if position of first shortcode is known 70 func newPageLexer(input []byte, stateStart stateFunc, cfg Config) *pageLexer { 71 lexer := &pageLexer{ 72 input: input, 73 stateStart: stateStart, 74 cfg: cfg, 75 lexerShortcodeState: lexerShortcodeState{ 76 currLeftDelimItem: tLeftDelimScNoMarkup, 77 currRightDelimItem: tRightDelimScNoMarkup, 78 openShortcodes: make(map[string]bool), 79 }, 80 items: make([]Item, 0, 5), 81 } 82 83 lexer.sectionHandlers = createSectionHandlers(lexer) 84 85 return lexer 86 } 87 88 func (l *pageLexer) newIterator() *Iterator { 89 return &Iterator{l: l, lastPos: -1} 90 } 91 92 // main loop 93 func (l *pageLexer) run() *pageLexer { 94 for l.state = l.stateStart; l.state != nil; { 95 l.state = l.state(l) 96 } 97 return l 98 } 99 100 // Page syntax 101 var ( 102 byteOrderMark = '\ufeff' 103 summaryDivider = []byte("<!--more-->") 104 summaryDividerOrg = []byte("# more") 105 delimTOML = []byte("+++") 106 delimYAML = []byte("---") 107 delimOrg = []byte("#+") 108 htmlCommentStart = []byte("<!--") 109 htmlCommentEnd = []byte("-->") 110 111 emojiDelim = byte(':') 112 ) 113 114 func (l *pageLexer) next() rune { 115 if l.pos >= len(l.input) { 116 l.width = 0 117 return eof 118 } 119 120 runeValue, runeWidth := utf8.DecodeRune(l.input[l.pos:]) 121 l.width = runeWidth 122 l.pos += l.width 123 124 return runeValue 125 } 126 127 // peek, but no consume 128 func (l *pageLexer) peek() rune { 129 r := l.next() 130 l.backup() 131 return r 132 } 133 134 // steps back one 135 func (l *pageLexer) backup() { 136 l.pos -= l.width 137 } 138 139 // sends an item back to the client. 140 func (l *pageLexer) emit(t ItemType) { 141 defer func() { 142 l.start = l.pos 143 }() 144 145 if t == tText { 146 // Identify any trailing whitespace/intendation. 147 // We currently only care about the last one. 148 for i := l.pos - 1; i >= l.start; i-- { 149 b := l.input[i] 150 if b != ' ' && b != '\t' && b != '\r' && b != '\n' { 151 break 152 } 153 if i == l.start && b != '\n' { 154 l.items = append(l.items, Item{tIndentation, l.start, l.input[l.start:l.pos], false}) 155 return 156 } else if b == '\n' && i < l.pos-1 { 157 l.items = append(l.items, Item{t, l.start, l.input[l.start : i+1], false}) 158 l.items = append(l.items, Item{tIndentation, i + 1, l.input[i+1 : l.pos], false}) 159 return 160 } else if b == '\n' && i == l.pos-1 { 161 break 162 } 163 164 } 165 } 166 167 l.items = append(l.items, Item{t, l.start, l.input[l.start:l.pos], false}) 168 169 } 170 171 // sends a string item back to the client. 172 func (l *pageLexer) emitString(t ItemType) { 173 l.items = append(l.items, Item{t, l.start, l.input[l.start:l.pos], true}) 174 l.start = l.pos 175 } 176 177 func (l *pageLexer) isEOF() bool { 178 return l.pos >= len(l.input) 179 } 180 181 // special case, do not send '\\' back to client 182 func (l *pageLexer) ignoreEscapesAndEmit(t ItemType, isString bool) { 183 val := bytes.Map(func(r rune) rune { 184 if r == '\\' { 185 return -1 186 } 187 return r 188 }, l.input[l.start:l.pos]) 189 l.items = append(l.items, Item{t, l.start, val, isString}) 190 l.start = l.pos 191 } 192 193 // gets the current value (for debugging and error handling) 194 func (l *pageLexer) current() []byte { 195 return l.input[l.start:l.pos] 196 } 197 198 // ignore current element 199 func (l *pageLexer) ignore() { 200 l.start = l.pos 201 } 202 203 var lf = []byte("\n") 204 205 // nil terminates the parser 206 func (l *pageLexer) errorf(format string, args ...any) stateFunc { 207 l.items = append(l.items, Item{tError, l.start, []byte(fmt.Sprintf(format, args...)), true}) 208 return nil 209 } 210 211 func (l *pageLexer) consumeCRLF() bool { 212 var consumed bool 213 for _, r := range crLf { 214 if l.next() != r { 215 l.backup() 216 } else { 217 consumed = true 218 } 219 } 220 return consumed 221 } 222 223 func (l *pageLexer) consumeToNextLine() { 224 for { 225 r := l.next() 226 if r == eof || isEndOfLine(r) { 227 return 228 } 229 } 230 } 231 232 func (l *pageLexer) consumeToSpace() { 233 for { 234 r := l.next() 235 if r == eof || unicode.IsSpace(r) { 236 l.backup() 237 return 238 } 239 } 240 } 241 242 func (l *pageLexer) consumeSpace() { 243 for { 244 r := l.next() 245 if r == eof || !unicode.IsSpace(r) { 246 l.backup() 247 return 248 } 249 } 250 } 251 252 // lex a string starting at ":" 253 func lexEmoji(l *pageLexer) stateFunc { 254 pos := l.pos + 1 255 valid := false 256 257 for i := pos; i < len(l.input); i++ { 258 if i > pos && l.input[i] == emojiDelim { 259 pos = i + 1 260 valid = true 261 break 262 } 263 r, _ := utf8.DecodeRune(l.input[i:]) 264 if !(isAlphaNumericOrHyphen(r) || r == '+') { 265 break 266 } 267 } 268 269 if valid { 270 l.pos = pos 271 l.emit(TypeEmoji) 272 } else { 273 l.pos++ 274 l.emit(tText) 275 } 276 277 return lexMainSection 278 } 279 280 type sectionHandlers struct { 281 l *pageLexer 282 283 // Set when none of the sections are found so we 284 // can safely stop looking and skip to the end. 285 skipAll bool 286 287 handlers []*sectionHandler 288 skipIndexes []int 289 } 290 291 func (s *sectionHandlers) skip() int { 292 if s.skipAll { 293 return -1 294 } 295 296 s.skipIndexes = s.skipIndexes[:0] 297 var shouldSkip bool 298 for _, skipper := range s.handlers { 299 idx := skipper.skip() 300 if idx != -1 { 301 shouldSkip = true 302 s.skipIndexes = append(s.skipIndexes, idx) 303 } 304 } 305 306 if !shouldSkip { 307 s.skipAll = true 308 return -1 309 } 310 311 return minIndex(s.skipIndexes...) 312 } 313 314 func createSectionHandlers(l *pageLexer) *sectionHandlers { 315 shortCodeHandler := §ionHandler{ 316 l: l, 317 skipFunc: func(l *pageLexer) int { 318 return l.index(leftDelimSc) 319 }, 320 lexFunc: func(origin stateFunc, l *pageLexer) (stateFunc, bool) { 321 if !l.isShortCodeStart() { 322 return origin, false 323 } 324 325 if l.isInline { 326 // If we're inside an inline shortcode, the only valid shortcode markup is 327 // the markup which closes it. 328 b := l.input[l.pos+3:] 329 end := indexNonWhiteSpace(b, '/') 330 if end != len(l.input)-1 { 331 b = bytes.TrimSpace(b[end+1:]) 332 if end == -1 || !bytes.HasPrefix(b, []byte(l.currShortcodeName+" ")) { 333 return l.errorf("inline shortcodes do not support nesting"), true 334 } 335 } 336 } 337 338 if l.hasPrefix(leftDelimScWithMarkup) { 339 l.currLeftDelimItem = tLeftDelimScWithMarkup 340 l.currRightDelimItem = tRightDelimScWithMarkup 341 } else { 342 l.currLeftDelimItem = tLeftDelimScNoMarkup 343 l.currRightDelimItem = tRightDelimScNoMarkup 344 } 345 346 return lexShortcodeLeftDelim, true 347 }, 348 } 349 350 summaryDividerHandler := §ionHandler{ 351 l: l, 352 skipFunc: func(l *pageLexer) int { 353 if l.summaryDividerChecked || l.summaryDivider == nil { 354 return -1 355 } 356 return l.index(l.summaryDivider) 357 }, 358 lexFunc: func(origin stateFunc, l *pageLexer) (stateFunc, bool) { 359 if !l.hasPrefix(l.summaryDivider) { 360 return origin, false 361 } 362 363 l.summaryDividerChecked = true 364 l.pos += len(l.summaryDivider) 365 // This makes it a little easier to reason about later. 366 l.consumeSpace() 367 l.emit(TypeLeadSummaryDivider) 368 369 return origin, true 370 }, 371 } 372 373 handlers := []*sectionHandler{shortCodeHandler, summaryDividerHandler} 374 375 if l.cfg.EnableEmoji { 376 emojiHandler := §ionHandler{ 377 l: l, 378 skipFunc: func(l *pageLexer) int { 379 return l.indexByte(emojiDelim) 380 }, 381 lexFunc: func(origin stateFunc, l *pageLexer) (stateFunc, bool) { 382 return lexEmoji, true 383 }, 384 } 385 386 handlers = append(handlers, emojiHandler) 387 } 388 389 return §ionHandlers{ 390 l: l, 391 handlers: handlers, 392 skipIndexes: make([]int, len(handlers)), 393 } 394 } 395 396 func (s *sectionHandlers) lex(origin stateFunc) stateFunc { 397 if s.skipAll { 398 return nil 399 } 400 401 if s.l.pos > s.l.start { 402 s.l.emit(tText) 403 } 404 405 for _, handler := range s.handlers { 406 if handler.skipAll { 407 continue 408 } 409 410 next, handled := handler.lexFunc(origin, handler.l) 411 if next == nil || handled { 412 return next 413 } 414 } 415 416 // Not handled by the above. 417 s.l.pos++ 418 419 return origin 420 } 421 422 type sectionHandler struct { 423 l *pageLexer 424 425 // No more sections of this type. 426 skipAll bool 427 428 // Returns the index of the next match, -1 if none found. 429 skipFunc func(l *pageLexer) int 430 431 // Lex lexes the current section and returns the next state func and 432 // a bool telling if this section was handled. 433 // Note that returning nil as the next state will terminate the 434 // lexer. 435 lexFunc func(origin stateFunc, l *pageLexer) (stateFunc, bool) 436 } 437 438 func (s *sectionHandler) skip() int { 439 if s.skipAll { 440 return -1 441 } 442 443 idx := s.skipFunc(s.l) 444 if idx == -1 { 445 s.skipAll = true 446 } 447 return idx 448 } 449 450 func lexMainSection(l *pageLexer) stateFunc { 451 if l.isEOF() { 452 return lexDone 453 } 454 455 if l.isInHTMLComment { 456 return lexEndFrontMatterHTMLComment 457 } 458 459 // Fast forward as far as possible. 460 skip := l.sectionHandlers.skip() 461 462 if skip == -1 { 463 l.pos = len(l.input) 464 return lexDone 465 } else if skip > 0 { 466 l.pos += skip 467 } 468 469 next := l.sectionHandlers.lex(lexMainSection) 470 if next != nil { 471 return next 472 } 473 474 l.pos = len(l.input) 475 return lexDone 476 } 477 478 func lexDone(l *pageLexer) stateFunc { 479 // Done! 480 if l.pos > l.start { 481 l.emit(tText) 482 } 483 l.emit(tEOF) 484 return nil 485 } 486 487 func (l *pageLexer) printCurrentInput() { 488 fmt.Printf("input[%d:]: %q", l.pos, string(l.input[l.pos:])) 489 } 490 491 // state helpers 492 493 func (l *pageLexer) index(sep []byte) int { 494 return bytes.Index(l.input[l.pos:], sep) 495 } 496 497 func (l *pageLexer) indexByte(sep byte) int { 498 return bytes.IndexByte(l.input[l.pos:], sep) 499 } 500 501 func (l *pageLexer) hasPrefix(prefix []byte) bool { 502 return bytes.HasPrefix(l.input[l.pos:], prefix) 503 } 504 505 // helper functions 506 507 // returns the min index >= 0 508 func minIndex(indices ...int) int { 509 min := -1 510 511 for _, j := range indices { 512 if j < 0 { 513 continue 514 } 515 if min == -1 { 516 min = j 517 } else if j < min { 518 min = j 519 } 520 } 521 return min 522 } 523 524 func indexNonWhiteSpace(s []byte, in rune) int { 525 idx := bytes.IndexFunc(s, func(r rune) bool { 526 return !unicode.IsSpace(r) 527 }) 528 529 if idx == -1 { 530 return -1 531 } 532 533 r, _ := utf8.DecodeRune(s[idx:]) 534 if r == in { 535 return idx 536 } 537 return -1 538 } 539 540 func isSpace(r rune) bool { 541 return r == ' ' || r == '\t' 542 } 543 544 func isAlphaNumericOrHyphen(r rune) bool { 545 // let unquoted YouTube ids as positional params slip through (they contain hyphens) 546 return isAlphaNumeric(r) || r == '-' 547 } 548 549 var crLf = []rune{'\r', '\n'} 550 551 func isEndOfLine(r rune) bool { 552 return r == '\r' || r == '\n' 553 } 554 555 func isAlphaNumeric(r rune) bool { 556 return r == '_' || unicode.IsLetter(r) || unicode.IsDigit(r) 557 }