pagelexer.go (11496B)
1 // Copyright 2018 The Hugo Authors. All rights reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 // http://www.apache.org/licenses/LICENSE-2.0
7 //
8 // Unless required by applicable law or agreed to in writing, software
9 // distributed under the License is distributed on an "AS IS" BASIS,
10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 // See the License for the specific language governing permissions and
12 // limitations under the License.
13
14 package pageparser
15
16 import (
17 "bytes"
18 "fmt"
19 "unicode"
20 "unicode/utf8"
21 )
22
// eof is the sentinel value next returns once the input position has reached
// the end of the input.
const eof = -1
24
// stateFunc is one state of the lexer. It is called with the lexer and
// returns the next state to run; returning nil ends the run loop.
type stateFunc func(*pageLexer) stateFunc
27
// pageLexer holds the scanning state for one input and collects the items
// produced while lexing it.
type pageLexer struct {
	input      []byte    // the bytes being lexed
	stateStart stateFunc // the state run starts in
	state      stateFunc // the current state; run stops when it becomes nil
	pos        int       // input position
	start      int       // item start position
	width      int       // byte width of the last rune read by next (0 at EOF)

	// Contains lexers for shortcodes and other main section
	// elements.
	sectionHandlers *sectionHandlers

	cfg Config

	// The summary divider to look for. The summary divider handler is
	// inactive while this is nil.
	summaryDivider []byte
	// Set when we have parsed any summary divider
	summaryDividerChecked bool
	// Whether we're in a HTML comment.
	isInHTMLComment bool

	// Embedded shortcode lexing state (type declared elsewhere in the package).
	lexerShortcodeState

	// items delivered to client
	items Items
}
54
55 // Implement the Result interface
56 func (l *pageLexer) Iterator() *Iterator {
57 return l.newIterator()
58 }
59
60 func (l *pageLexer) Input() []byte {
61 return l.input
62 }
63
// Config holds the options that control the page lexer.
type Config struct {
	// EnableEmoji registers the emoji section handler, so that ':'-delimited
	// emoji codes are lexed as their own items.
	EnableEmoji bool
}
67
68 // note: the input position here is normally 0 (start), but
69 // can be set if position of first shortcode is known
70 func newPageLexer(input []byte, stateStart stateFunc, cfg Config) *pageLexer {
71 lexer := &pageLexer{
72 input: input,
73 stateStart: stateStart,
74 cfg: cfg,
75 lexerShortcodeState: lexerShortcodeState{
76 currLeftDelimItem: tLeftDelimScNoMarkup,
77 currRightDelimItem: tRightDelimScNoMarkup,
78 openShortcodes: make(map[string]bool),
79 },
80 items: make([]Item, 0, 5),
81 }
82
83 lexer.sectionHandlers = createSectionHandlers(lexer)
84
85 return lexer
86 }
87
88 func (l *pageLexer) newIterator() *Iterator {
89 return &Iterator{l: l, lastPos: -1}
90 }
91
92 // main loop
93 func (l *pageLexer) run() *pageLexer {
94 for l.state = l.stateStart; l.state != nil; {
95 l.state = l.state(l)
96 }
97 return l
98 }
99
// Page syntax
//
// Byte sequences and runes the lexer matches against. Only emojiDelim is
// referenced in this part of the file; the others are presumably used by
// lexing states defined elsewhere in the package.
var (
	byteOrderMark     = '\ufeff' // U+FEFF
	summaryDivider    = []byte("<!--more-->")
	summaryDividerOrg = []byte("# more")
	delimTOML         = []byte("+++")
	delimYAML         = []byte("---")
	delimOrg          = []byte("#+")
	htmlCommentStart  = []byte("<!--")
	htmlCommentEnd    = []byte("-->")

	// emojiDelim both opens and closes an emoji code, see lexEmoji.
	emojiDelim = byte(':')
)
113
114 func (l *pageLexer) next() rune {
115 if l.pos >= len(l.input) {
116 l.width = 0
117 return eof
118 }
119
120 runeValue, runeWidth := utf8.DecodeRune(l.input[l.pos:])
121 l.width = runeWidth
122 l.pos += l.width
123
124 return runeValue
125 }
126
127 // peek, but no consume
128 func (l *pageLexer) peek() rune {
129 r := l.next()
130 l.backup()
131 return r
132 }
133
134 // steps back one
135 func (l *pageLexer) backup() {
136 l.pos -= l.width
137 }
138
139 // sends an item back to the client.
140 func (l *pageLexer) emit(t ItemType) {
141 defer func() {
142 l.start = l.pos
143 }()
144
145 if t == tText {
146 // Identify any trailing whitespace/intendation.
147 // We currently only care about the last one.
148 for i := l.pos - 1; i >= l.start; i-- {
149 b := l.input[i]
150 if b != ' ' && b != '\t' && b != '\r' && b != '\n' {
151 break
152 }
153 if i == l.start && b != '\n' {
154 l.items = append(l.items, Item{tIndentation, l.start, l.input[l.start:l.pos], false})
155 return
156 } else if b == '\n' && i < l.pos-1 {
157 l.items = append(l.items, Item{t, l.start, l.input[l.start : i+1], false})
158 l.items = append(l.items, Item{tIndentation, i + 1, l.input[i+1 : l.pos], false})
159 return
160 } else if b == '\n' && i == l.pos-1 {
161 break
162 }
163
164 }
165 }
166
167 l.items = append(l.items, Item{t, l.start, l.input[l.start:l.pos], false})
168
169 }
170
171 // sends a string item back to the client.
172 func (l *pageLexer) emitString(t ItemType) {
173 l.items = append(l.items, Item{t, l.start, l.input[l.start:l.pos], true})
174 l.start = l.pos
175 }
176
177 func (l *pageLexer) isEOF() bool {
178 return l.pos >= len(l.input)
179 }
180
181 // special case, do not send '\\' back to client
182 func (l *pageLexer) ignoreEscapesAndEmit(t ItemType, isString bool) {
183 val := bytes.Map(func(r rune) rune {
184 if r == '\\' {
185 return -1
186 }
187 return r
188 }, l.input[l.start:l.pos])
189 l.items = append(l.items, Item{t, l.start, val, isString})
190 l.start = l.pos
191 }
192
193 // gets the current value (for debugging and error handling)
194 func (l *pageLexer) current() []byte {
195 return l.input[l.start:l.pos]
196 }
197
// ignore discards the pending input: the item start is moved up to the
// current position without emitting anything.
func (l *pageLexer) ignore() {
	l.start = l.pos
}
202
// lf is a single line feed as a byte slice. It is not referenced in this
// part of the file; presumably it is used elsewhere in the package.
var lf = []byte("\n")
204
205 // nil terminates the parser
206 func (l *pageLexer) errorf(format string, args ...any) stateFunc {
207 l.items = append(l.items, Item{tError, l.start, []byte(fmt.Sprintf(format, args...)), true})
208 return nil
209 }
210
211 func (l *pageLexer) consumeCRLF() bool {
212 var consumed bool
213 for _, r := range crLf {
214 if l.next() != r {
215 l.backup()
216 } else {
217 consumed = true
218 }
219 }
220 return consumed
221 }
222
223 func (l *pageLexer) consumeToNextLine() {
224 for {
225 r := l.next()
226 if r == eof || isEndOfLine(r) {
227 return
228 }
229 }
230 }
231
232 func (l *pageLexer) consumeToSpace() {
233 for {
234 r := l.next()
235 if r == eof || unicode.IsSpace(r) {
236 l.backup()
237 return
238 }
239 }
240 }
241
242 func (l *pageLexer) consumeSpace() {
243 for {
244 r := l.next()
245 if r == eof || !unicode.IsSpace(r) {
246 l.backup()
247 return
248 }
249 }
250 }
251
252 // lex a string starting at ":"
253 func lexEmoji(l *pageLexer) stateFunc {
254 pos := l.pos + 1
255 valid := false
256
257 for i := pos; i < len(l.input); i++ {
258 if i > pos && l.input[i] == emojiDelim {
259 pos = i + 1
260 valid = true
261 break
262 }
263 r, _ := utf8.DecodeRune(l.input[i:])
264 if !(isAlphaNumericOrHyphen(r) || r == '+') {
265 break
266 }
267 }
268
269 if valid {
270 l.pos = pos
271 l.emit(TypeEmoji)
272 } else {
273 l.pos++
274 l.emit(tText)
275 }
276
277 return lexMainSection
278 }
279
// sectionHandlers bundles the section handlers tried by the main section
// lexer, together with the bookkeeping needed to fast-forward to the nearest
// match.
type sectionHandlers struct {
	l *pageLexer

	// Set when none of the sections are found so we
	// can safely stop looking and skip to the end.
	skipAll bool

	// The handlers; lex tries them in slice order.
	handlers []*sectionHandler
	// Scratch slice reused by skip to collect the offsets reported by the
	// handlers.
	skipIndexes []int
}
290
291 func (s *sectionHandlers) skip() int {
292 if s.skipAll {
293 return -1
294 }
295
296 s.skipIndexes = s.skipIndexes[:0]
297 var shouldSkip bool
298 for _, skipper := range s.handlers {
299 idx := skipper.skip()
300 if idx != -1 {
301 shouldSkip = true
302 s.skipIndexes = append(s.skipIndexes, idx)
303 }
304 }
305
306 if !shouldSkip {
307 s.skipAll = true
308 return -1
309 }
310
311 return minIndex(s.skipIndexes...)
312 }
313
314 func createSectionHandlers(l *pageLexer) *sectionHandlers {
315 shortCodeHandler := §ionHandler{
316 l: l,
317 skipFunc: func(l *pageLexer) int {
318 return l.index(leftDelimSc)
319 },
320 lexFunc: func(origin stateFunc, l *pageLexer) (stateFunc, bool) {
321 if !l.isShortCodeStart() {
322 return origin, false
323 }
324
325 if l.isInline {
326 // If we're inside an inline shortcode, the only valid shortcode markup is
327 // the markup which closes it.
328 b := l.input[l.pos+3:]
329 end := indexNonWhiteSpace(b, '/')
330 if end != len(l.input)-1 {
331 b = bytes.TrimSpace(b[end+1:])
332 if end == -1 || !bytes.HasPrefix(b, []byte(l.currShortcodeName+" ")) {
333 return l.errorf("inline shortcodes do not support nesting"), true
334 }
335 }
336 }
337
338 if l.hasPrefix(leftDelimScWithMarkup) {
339 l.currLeftDelimItem = tLeftDelimScWithMarkup
340 l.currRightDelimItem = tRightDelimScWithMarkup
341 } else {
342 l.currLeftDelimItem = tLeftDelimScNoMarkup
343 l.currRightDelimItem = tRightDelimScNoMarkup
344 }
345
346 return lexShortcodeLeftDelim, true
347 },
348 }
349
350 summaryDividerHandler := §ionHandler{
351 l: l,
352 skipFunc: func(l *pageLexer) int {
353 if l.summaryDividerChecked || l.summaryDivider == nil {
354 return -1
355 }
356 return l.index(l.summaryDivider)
357 },
358 lexFunc: func(origin stateFunc, l *pageLexer) (stateFunc, bool) {
359 if !l.hasPrefix(l.summaryDivider) {
360 return origin, false
361 }
362
363 l.summaryDividerChecked = true
364 l.pos += len(l.summaryDivider)
365 // This makes it a little easier to reason about later.
366 l.consumeSpace()
367 l.emit(TypeLeadSummaryDivider)
368
369 return origin, true
370 },
371 }
372
373 handlers := []*sectionHandler{shortCodeHandler, summaryDividerHandler}
374
375 if l.cfg.EnableEmoji {
376 emojiHandler := §ionHandler{
377 l: l,
378 skipFunc: func(l *pageLexer) int {
379 return l.indexByte(emojiDelim)
380 },
381 lexFunc: func(origin stateFunc, l *pageLexer) (stateFunc, bool) {
382 return lexEmoji, true
383 },
384 }
385
386 handlers = append(handlers, emojiHandler)
387 }
388
389 return §ionHandlers{
390 l: l,
391 handlers: handlers,
392 skipIndexes: make([]int, len(handlers)),
393 }
394 }
395
396 func (s *sectionHandlers) lex(origin stateFunc) stateFunc {
397 if s.skipAll {
398 return nil
399 }
400
401 if s.l.pos > s.l.start {
402 s.l.emit(tText)
403 }
404
405 for _, handler := range s.handlers {
406 if handler.skipAll {
407 continue
408 }
409
410 next, handled := handler.lexFunc(origin, handler.l)
411 if next == nil || handled {
412 return next
413 }
414 }
415
416 // Not handled by the above.
417 s.l.pos++
418
419 return origin
420 }
421
// sectionHandler knows how to locate and lex one kind of element in the
// main section.
type sectionHandler struct {
	l *pageLexer

	// No more sections of this type.
	// Set by skip the first time skipFunc returns -1.
	skipAll bool

	// Returns the index of the next match, -1 if none found.
	// The index is relative to the lexer's current position.
	skipFunc func(l *pageLexer) int

	// Lex lexes the current section and returns the next state func and
	// a bool telling if this section was handled.
	// Note that returning nil as the next state will terminate the
	// lexer.
	lexFunc func(origin stateFunc, l *pageLexer) (stateFunc, bool)
}
437
438 func (s *sectionHandler) skip() int {
439 if s.skipAll {
440 return -1
441 }
442
443 idx := s.skipFunc(s.l)
444 if idx == -1 {
445 s.skipAll = true
446 }
447 return idx
448 }
449
450 func lexMainSection(l *pageLexer) stateFunc {
451 if l.isEOF() {
452 return lexDone
453 }
454
455 if l.isInHTMLComment {
456 return lexEndFrontMatterHTMLComment
457 }
458
459 // Fast forward as far as possible.
460 skip := l.sectionHandlers.skip()
461
462 if skip == -1 {
463 l.pos = len(l.input)
464 return lexDone
465 } else if skip > 0 {
466 l.pos += skip
467 }
468
469 next := l.sectionHandlers.lex(lexMainSection)
470 if next != nil {
471 return next
472 }
473
474 l.pos = len(l.input)
475 return lexDone
476 }
477
478 func lexDone(l *pageLexer) stateFunc {
479 // Done!
480 if l.pos > l.start {
481 l.emit(tText)
482 }
483 l.emit(tEOF)
484 return nil
485 }
486
487 func (l *pageLexer) printCurrentInput() {
488 fmt.Printf("input[%d:]: %q", l.pos, string(l.input[l.pos:]))
489 }
490
491 // state helpers
492
493 func (l *pageLexer) index(sep []byte) int {
494 return bytes.Index(l.input[l.pos:], sep)
495 }
496
497 func (l *pageLexer) indexByte(sep byte) int {
498 return bytes.IndexByte(l.input[l.pos:], sep)
499 }
500
501 func (l *pageLexer) hasPrefix(prefix []byte) bool {
502 return bytes.HasPrefix(l.input[l.pos:], prefix)
503 }
504
505 // helper functions
506
// minIndex returns the smallest non-negative value among indices. Negative
// values are ignored; if there is no non-negative value at all, it returns -1.
func minIndex(indices ...int) int {
	lowest := -1

	for _, idx := range indices {
		if idx >= 0 && (lowest == -1 || idx < lowest) {
			lowest = idx
		}
	}

	return lowest
}
523
// indexNonWhiteSpace returns the index of the first non-whitespace rune in s
// if that rune equals in. It returns -1 if the first non-whitespace rune is
// something else, or if s contains only whitespace (or is empty).
func indexNonWhiteSpace(s []byte, in rune) int {
	notSpace := func(r rune) bool {
		return !unicode.IsSpace(r)
	}

	first := bytes.IndexFunc(s, notSpace)
	if first < 0 {
		return -1
	}

	if r, _ := utf8.DecodeRune(s[first:]); r != in {
		return -1
	}
	return first
}
539
// isSpace reports whether r is a space or a tab. Line breaks do not count.
func isSpace(r rune) bool {
	switch r {
	case ' ', '\t':
		return true
	}
	return false
}
543
544 func isAlphaNumericOrHyphen(r rune) bool {
545 // let unquoted YouTube ids as positional params slip through (they contain hyphens)
546 return isAlphaNumeric(r) || r == '-'
547 }
548
// crLf holds the runes of a CRLF line ending in order; consumeCRLF walks it
// to consume an optional '\r' followed by an optional '\n'.
var crLf = []rune{'\r', '\n'}
550
// isEndOfLine reports whether r is a carriage return or a line feed.
func isEndOfLine(r rune) bool {
	switch r {
	case '\r', '\n':
		return true
	}
	return false
}
554
// isAlphaNumeric reports whether r is an underscore, a Unicode letter or a
// Unicode digit.
func isAlphaNumeric(r rune) bool {
	if r == '_' {
		return true
	}
	return unicode.IsLetter(r) || unicode.IsDigit(r)
}