pageparser.go (5057B)
1 // Copyright 2019 The Hugo Authors. All rights reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 // http://www.apache.org/licenses/LICENSE-2.0
7 //
8 // Unless required by applicable law or agreed to in writing, software
9 // distributed under the License is distributed on an "AS IS" BASIS,
10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 // See the License for the specific language governing permissions and
12 // limitations under the License.
13
14 package pageparser
15
16 import (
17 "bytes"
18 "fmt"
19 "io"
20 "io/ioutil"
21
22 "github.com/gohugoio/hugo/parser/metadecoders"
23 )
24
// Result holds the parse result: the lexed items, reachable through an
// Iterator, and the raw bytes that were parsed.
type Result interface {
	// Iterator returns a new Iterator positioned at the beginning of the parse tree.
	Iterator() *Iterator
	// Input returns the input to Parse.
	Input() []byte
}
32
// Compile-time assertion that *pageLexer implements Result.
var _ Result = (*pageLexer)(nil)
34
35 // Parse parses the page in the given reader according to the given Config.
36 // TODO(bep) now that we have improved the "lazy order" init, it *may* be
37 // some potential saving in doing a buffered approach where the first pass does
38 // the frontmatter only.
39 func Parse(r io.Reader, cfg Config) (Result, error) {
40 return parseSection(r, cfg, lexIntroSection)
41 }
42
// ContentFrontMatter is the value returned by ParseFrontMatterAndContent.
type ContentFrontMatter struct {
	// Content is the part of the input that follows the front matter.
	Content []byte
	// FrontMatter is the front matter decoded into a map.
	FrontMatter map[string]any
	// FrontMatterFormat is the format derived from the front matter item type.
	FrontMatterFormat metadecoders.Format
}
48
49 // ParseFrontMatterAndContent is a convenience method to extract front matter
50 // and content from a content page.
51 func ParseFrontMatterAndContent(r io.Reader) (ContentFrontMatter, error) {
52 var cf ContentFrontMatter
53
54 psr, err := Parse(r, Config{})
55 if err != nil {
56 return cf, err
57 }
58
59 var frontMatterSource []byte
60
61 iter := psr.Iterator()
62
63 walkFn := func(item Item) bool {
64 if frontMatterSource != nil {
65 // The rest is content.
66 cf.Content = psr.Input()[item.Pos:]
67 // Done
68 return false
69 } else if item.IsFrontMatter() {
70 cf.FrontMatterFormat = FormatFromFrontMatterType(item.Type)
71 frontMatterSource = item.Val
72 }
73 return true
74 }
75
76 iter.PeekWalk(walkFn)
77
78 cf.FrontMatter, err = metadecoders.Default.UnmarshalToMap(frontMatterSource, cf.FrontMatterFormat)
79 return cf, err
80 }
81
82 func FormatFromFrontMatterType(typ ItemType) metadecoders.Format {
83 switch typ {
84 case TypeFrontMatterJSON:
85 return metadecoders.JSON
86 case TypeFrontMatterORG:
87 return metadecoders.ORG
88 case TypeFrontMatterTOML:
89 return metadecoders.TOML
90 case TypeFrontMatterYAML:
91 return metadecoders.YAML
92 default:
93 return ""
94 }
95 }
96
97 // ParseMain parses starting with the main section. Used in tests.
98 func ParseMain(r io.Reader, cfg Config) (Result, error) {
99 return parseSection(r, cfg, lexMainSection)
100 }
101
102 func parseSection(r io.Reader, cfg Config, start stateFunc) (Result, error) {
103 b, err := ioutil.ReadAll(r)
104 if err != nil {
105 return nil, fmt.Errorf("failed to read page content: %w", err)
106 }
107 return parseBytes(b, cfg, start)
108 }
109
110 func parseBytes(b []byte, cfg Config, start stateFunc) (Result, error) {
111 lexer := newPageLexer(b, start, cfg)
112 lexer.run()
113 return lexer, nil
114 }
115
// An Iterator has methods to iterate a parsed page with support going back
// if needed.
type Iterator struct {
	l       *pageLexer // lexer whose items and input this iterator reads
	lastPos int        // index into l.items of the item most recently returned by Next
}
122
123 // consumes and returns the next item
124 func (t *Iterator) Next() Item {
125 t.lastPos++
126 return t.Current()
127 }
128
129 // Input returns the input source.
130 func (t *Iterator) Input() []byte {
131 return t.l.Input()
132 }
133
// errIndexOutOfBounds is the item Current returns once the iterator has moved
// past the last lexed item.
var errIndexOutOfBounds = Item{tError, 0, []byte("no more tokens"), true}
135
136 // Current will repeatably return the current item.
137 func (t *Iterator) Current() Item {
138 if t.lastPos >= len(t.l.items) {
139 return errIndexOutOfBounds
140 }
141 return t.l.items[t.lastPos]
142 }
143
144 // backs up one token.
145 func (t *Iterator) Backup() {
146 if t.lastPos < 0 {
147 panic("need to go forward before going back")
148 }
149 t.lastPos--
150 }
151
152 // Pos returns the current position in the input.
153 func (t *Iterator) Pos() int {
154 return t.lastPos
155 }
156
157 // check for non-error and non-EOF types coming next
158 func (t *Iterator) IsValueNext() bool {
159 i := t.Peek()
160 return i.Type != tError && i.Type != tEOF
161 }
162
163 // look at, but do not consume, the next item
164 // repeated, sequential calls will return the same item
165 func (t *Iterator) Peek() Item {
166 return t.l.items[t.lastPos+1]
167 }
168
169 // PeekWalk will feed the next items in the iterator to walkFn
170 // until it returns false.
171 func (t *Iterator) PeekWalk(walkFn func(item Item) bool) {
172 for i := t.lastPos + 1; i < len(t.l.items); i++ {
173 item := t.l.items[i]
174 if !walkFn(item) {
175 break
176 }
177 }
178 }
179
180 // Consume is a convenience method to consume the next n tokens,
181 // but back off Errors and EOF.
182 func (t *Iterator) Consume(cnt int) {
183 for i := 0; i < cnt; i++ {
184 token := t.Next()
185 if token.Type == tError || token.Type == tEOF {
186 t.Backup()
187 break
188 }
189 }
190 }
191
192 // LineNumber returns the current line number. Used for logging.
193 func (t *Iterator) LineNumber() int {
194 return bytes.Count(t.l.input[:t.Current().Pos], lf) + 1
195 }