content.go (8032B)
1 // Copyright 2019 The Hugo Authors. All rights reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 // http://www.apache.org/licenses/LICENSE-2.0
7 //
8 // Unless required by applicable law or agreed to in writing, software
9 // distributed under the License is distributed on an "AS IS" BASIS,
10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 // See the License for the specific language governing permissions and
12 // limitations under the License.
13
14 // Package helpers implements general utility functions that work with
15 // and on content. The helper functions defined here lay down the
16 // foundation of how Hugo works with files and filepaths, and perform
17 // string operations on content.
18 package helpers
19
20 import (
21 "bytes"
22 "html/template"
23 "strings"
24 "unicode"
25 "unicode/utf8"
26
27 "github.com/gohugoio/hugo/common/hexec"
28 "github.com/gohugoio/hugo/common/loggers"
29
30 "github.com/spf13/afero"
31
32 "github.com/gohugoio/hugo/markup/converter"
33 "github.com/gohugoio/hugo/markup/converter/hooks"
34
35 "github.com/gohugoio/hugo/markup"
36
37 "github.com/gohugoio/hugo/config"
38 )
39
// Byte sequences that TrimShortHTML looks for in rendered HTML.
var (
	// paragraphIndicator is the start of an opening paragraph tag, with the
	// closing bracket left off so tags carrying attributes match as well.
	paragraphIndicator = []byte("<p")
	// closingIndicator is the start of any closing tag.
	closingIndicator = []byte("</")

	// openingPTag is a paragraph opening tag without attributes.
	openingPTag = []byte("<p>")
	// closingPTag is a paragraph closing tag.
	closingPTag = []byte("</p>")
)
46
// ContentSpec provides functionality to render markdown content.
// Instances are created with NewContentSpec.
type ContentSpec struct {
	// Converters is the converter provider built in NewContentSpec; markup
	// names are resolved against it in ResolveMarkup.
	Converters markup.ConverterProvider
	// anchorNameSanitizer backs SanitizeAnchorName. NewContentSpec takes it
	// from the "markdown" converter, or from "goldmark" when the former does
	// not implement converter.AnchorNameSanitizer.
	anchorNameSanitizer converter.AnchorNameSanitizer
	// getRenderer presumably looks up a render hook by type and id.
	// NOTE(review): it is neither assigned nor called in this part of the
	// file — confirm where it is set.
	getRenderer func(t hooks.RendererType, id any) any

	// summaryLength is the value of the "summaryLength" setting. The
	// truncation methods on ContentSpec use it as their limit.
	summaryLength int

	// BuildFuture, BuildExpired and BuildDrafts hold the values of the
	// "buildFuture", "buildExpired" and "buildDrafts" settings.
	BuildFuture  bool
	BuildExpired bool
	BuildDrafts  bool

	// Cfg is the config.Provider that was passed to NewContentSpec.
	Cfg config.Provider
}
62
63 // NewContentSpec returns a ContentSpec initialized
64 // with the appropriate fields from the given config.Provider.
65 func NewContentSpec(cfg config.Provider, logger loggers.Logger, contentFs afero.Fs, ex *hexec.Exec) (*ContentSpec, error) {
66 spec := &ContentSpec{
67 summaryLength: cfg.GetInt("summaryLength"),
68 BuildFuture: cfg.GetBool("buildFuture"),
69 BuildExpired: cfg.GetBool("buildExpired"),
70 BuildDrafts: cfg.GetBool("buildDrafts"),
71
72 Cfg: cfg,
73 }
74
75 converterProvider, err := markup.NewConverterProvider(converter.ProviderConfig{
76 Cfg: cfg,
77 ContentFs: contentFs,
78 Logger: logger,
79 Exec: ex,
80 })
81 if err != nil {
82 return nil, err
83 }
84
85 spec.Converters = converterProvider
86 p := converterProvider.Get("markdown")
87 conv, err := p.New(converter.DocumentContext{})
88 if err != nil {
89 return nil, err
90 }
91 if as, ok := conv.(converter.AnchorNameSanitizer); ok {
92 spec.anchorNameSanitizer = as
93 } else {
94 // Use Goldmark's sanitizer
95 p := converterProvider.Get("goldmark")
96 conv, err := p.New(converter.DocumentContext{})
97 if err != nil {
98 return nil, err
99 }
100 spec.anchorNameSanitizer = conv.(converter.AnchorNameSanitizer)
101 }
102
103 return spec, nil
104 }
105
// stripEmptyNav returns a copy of in from which every empty nav element
// ("<nav>\n</nav>" followed by a blank line) has been removed.
func stripEmptyNav(in []byte) []byte {
	emptyNav := []byte("<nav>\n</nav>\n\n")
	return bytes.ReplaceAll(in, emptyNav, nil)
}
110
// BytesToHTML returns the bytes in b as a template.HTML value. The content
// is converted as is; nothing is escaped or validated.
func BytesToHTML(b []byte) template.HTML {
	return template.HTML(b)
}
115
116 // ExtractTOC extracts Table of Contents from content.
117 func ExtractTOC(content []byte) (newcontent []byte, toc []byte) {
118 if !bytes.Contains(content, []byte("<nav>")) {
119 return content, nil
120 }
121 origContent := make([]byte, len(content))
122 copy(origContent, content)
123 first := []byte(`<nav>
124 <ul>`)
125
126 last := []byte(`</ul>
127 </nav>`)
128
129 replacement := []byte(`<nav id="TableOfContents">
130 <ul>`)
131
132 startOfTOC := bytes.Index(content, first)
133
134 peekEnd := len(content)
135 if peekEnd > 70+startOfTOC {
136 peekEnd = 70 + startOfTOC
137 }
138
139 if startOfTOC < 0 {
140 return stripEmptyNav(content), toc
141 }
142 // Need to peek ahead to see if this nav element is actually the right one.
143 correctNav := bytes.Index(content[startOfTOC:peekEnd], []byte(`<li><a href="#`))
144 if correctNav < 0 { // no match found
145 return content, toc
146 }
147 lengthOfTOC := bytes.Index(content[startOfTOC:], last) + len(last)
148 endOfTOC := startOfTOC + lengthOfTOC
149
150 newcontent = append(content[:startOfTOC], content[endOfTOC:]...)
151 toc = append(replacement, origContent[startOfTOC+len(first):endOfTOC]...)
152 return
153 }
154
155 func (c *ContentSpec) SanitizeAnchorName(s string) string {
156 return c.anchorNameSanitizer.SanitizeAnchorName(s)
157 }
158
159 func (c *ContentSpec) ResolveMarkup(in string) string {
160 in = strings.ToLower(in)
161 switch in {
162 case "md", "markdown", "mdown":
163 return "markdown"
164 case "html", "htm":
165 return "html"
166 default:
167 if conv := c.Converters.Get(in); conv != nil {
168 return conv.Name()
169 }
170 }
171 return ""
172 }
173
// TotalWords returns the number of words in s, where a word is a maximal run
// of characters that are not white space as defined by unicode.IsSpace.
// This is a cheaper way of word counting than the obvious len(strings.Fields(s)).
func TotalWords(s string) int {
	count := 0
	prevSpace := true // the start of the string acts like preceding white space
	for _, r := range s {
		space := unicode.IsSpace(r)
		if prevSpace && !space {
			// First character of a new word.
			count++
		}
		prevSpace = space
	}
	return count
}
189
190 // TruncateWordsByRune truncates words by runes.
191 func (c *ContentSpec) TruncateWordsByRune(in []string) (string, bool) {
192 words := make([]string, len(in))
193 copy(words, in)
194
195 count := 0
196 for index, word := range words {
197 if count >= c.summaryLength {
198 return strings.Join(words[:index], " "), true
199 }
200 runeCount := utf8.RuneCountInString(word)
201 if len(word) == runeCount {
202 count++
203 } else if count+runeCount < c.summaryLength {
204 count += runeCount
205 } else {
206 for ri := range word {
207 if count >= c.summaryLength {
208 truncatedWords := append(words[:index], word[:ri])
209 return strings.Join(truncatedWords, " "), true
210 }
211 count++
212 }
213 }
214 }
215
216 return strings.Join(words, " "), false
217 }
218
219 // TruncateWordsToWholeSentence takes content and truncates to whole sentence
220 // limited by max number of words. It also returns whether it is truncated.
221 func (c *ContentSpec) TruncateWordsToWholeSentence(s string) (string, bool) {
222 var (
223 wordCount = 0
224 lastWordIndex = -1
225 )
226
227 for i, r := range s {
228 if unicode.IsSpace(r) {
229 wordCount++
230 lastWordIndex = i
231
232 if wordCount >= c.summaryLength {
233 break
234 }
235
236 }
237 }
238
239 if lastWordIndex == -1 {
240 return s, false
241 }
242
243 endIndex := -1
244
245 for j, r := range s[lastWordIndex:] {
246 if isEndOfSentence(r) {
247 endIndex = j + lastWordIndex + utf8.RuneLen(r)
248 break
249 }
250 }
251
252 if endIndex == -1 {
253 return s, false
254 }
255
256 return strings.TrimSpace(s[:endIndex]), endIndex < len(s)
257 }
258
259 // TrimShortHTML removes the <p>/</p> tags from HTML input in the situation
260 // where said tags are the only <p> tags in the input and enclose the content
261 // of the input (whitespace excluded).
262 func (c *ContentSpec) TrimShortHTML(input []byte) []byte {
263 firstOpeningP := bytes.Index(input, paragraphIndicator)
264 lastOpeningP := bytes.LastIndex(input, paragraphIndicator)
265
266 lastClosingP := bytes.LastIndex(input, closingPTag)
267 lastClosing := bytes.LastIndex(input, closingIndicator)
268
269 if firstOpeningP == lastOpeningP && lastClosingP == lastClosing {
270 input = bytes.TrimSpace(input)
271 input = bytes.TrimPrefix(input, openingPTag)
272 input = bytes.TrimSuffix(input, closingPTag)
273 input = bytes.TrimSpace(input)
274 }
275 return input
276 }
277
// isEndOfSentence reports whether r is treated as ending a sentence: a
// period, question mark, exclamation mark, double quote or newline.
func isEndOfSentence(r rune) bool {
	switch r {
	case '.', '?', '!', '"', '\n':
		return true
	}
	return false
}
281
282 // Kept only for benchmark.
283 func (c *ContentSpec) truncateWordsToWholeSentenceOld(content string) (string, bool) {
284 words := strings.Fields(content)
285
286 if c.summaryLength >= len(words) {
287 return strings.Join(words, " "), false
288 }
289
290 for counter, word := range words[c.summaryLength:] {
291 if strings.HasSuffix(word, ".") ||
292 strings.HasSuffix(word, "?") ||
293 strings.HasSuffix(word, ".\"") ||
294 strings.HasSuffix(word, "!") {
295 upper := c.summaryLength + counter + 1
296 return strings.Join(words[:upper], " "), (upper < len(words))
297 }
298 }
299
300 return strings.Join(words[:c.summaryLength], " "), true
301 }