content.go - hugo - Fork of github.com/gohugoio/hugo with reverse pagination support

content.go (8032B)

    1 // Copyright 2019 The Hugo Authors. All rights reserved.
    2 //
    3 // Licensed under the Apache License, Version 2.0 (the "License");
    4 // you may not use this file except in compliance with the License.
    5 // You may obtain a copy of the License at
    6 // http://www.apache.org/licenses/LICENSE-2.0
    7 //
    8 // Unless required by applicable law or agreed to in writing, software
    9 // distributed under the License is distributed on an "AS IS" BASIS,
   10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   11 // See the License for the specific language governing permissions and
   12 // limitations under the License.
   13 
   14 // Package helpers implements general utility functions that work with
   15 // and on content.  The helper functions defined here lay down the
   16 // foundation of how Hugo works with files and filepaths, and perform
   17 // string operations on content.
   18 package helpers
   19 
   20 import (
   21 	"bytes"
   22 	"html/template"
   23 	"strings"
   24 	"unicode"
   25 	"unicode/utf8"
   26 
   27 	"github.com/gohugoio/hugo/common/hexec"
   28 	"github.com/gohugoio/hugo/common/loggers"
   29 
   30 	"github.com/spf13/afero"
   31 
   32 	"github.com/gohugoio/hugo/markup/converter"
   33 	"github.com/gohugoio/hugo/markup/converter/hooks"
   34 
   35 	"github.com/gohugoio/hugo/markup"
   36 
   37 	"github.com/gohugoio/hugo/config"
   38 )
   39 
   40 var (
   41 	openingPTag        = []byte("<p>")
   42 	closingPTag        = []byte("</p>")
   43 	paragraphIndicator = []byte("<p")
   44 	closingIndicator   = []byte("</")
   45 )
   46 
   47 // ContentSpec provides functionality to render markdown content.
   48 type ContentSpec struct {
   49 	Converters          markup.ConverterProvider
   50 	anchorNameSanitizer converter.AnchorNameSanitizer
   51 	getRenderer         func(t hooks.RendererType, id any) any
   52 
   53 	// SummaryLength is the length of the summary that Hugo extracts from a content.
   54 	summaryLength int
   55 
   56 	BuildFuture  bool
   57 	BuildExpired bool
   58 	BuildDrafts  bool
   59 
   60 	Cfg config.Provider
   61 }
   62 
   63 // NewContentSpec returns a ContentSpec initialized
   64 // with the appropriate fields from the given config.Provider.
   65 func NewContentSpec(cfg config.Provider, logger loggers.Logger, contentFs afero.Fs, ex *hexec.Exec) (*ContentSpec, error) {
   66 	spec := &ContentSpec{
   67 		summaryLength: cfg.GetInt("summaryLength"),
   68 		BuildFuture:   cfg.GetBool("buildFuture"),
   69 		BuildExpired:  cfg.GetBool("buildExpired"),
   70 		BuildDrafts:   cfg.GetBool("buildDrafts"),
   71 
   72 		Cfg: cfg,
   73 	}
   74 
   75 	converterProvider, err := markup.NewConverterProvider(converter.ProviderConfig{
   76 		Cfg:       cfg,
   77 		ContentFs: contentFs,
   78 		Logger:    logger,
   79 		Exec:      ex,
   80 	})
   81 	if err != nil {
   82 		return nil, err
   83 	}
   84 
   85 	spec.Converters = converterProvider
   86 	p := converterProvider.Get("markdown")
   87 	conv, err := p.New(converter.DocumentContext{})
   88 	if err != nil {
   89 		return nil, err
   90 	}
   91 	if as, ok := conv.(converter.AnchorNameSanitizer); ok {
   92 		spec.anchorNameSanitizer = as
   93 	} else {
   94 		// Use Goldmark's sanitizer
   95 		p := converterProvider.Get("goldmark")
   96 		conv, err := p.New(converter.DocumentContext{})
   97 		if err != nil {
   98 			return nil, err
   99 		}
  100 		spec.anchorNameSanitizer = conv.(converter.AnchorNameSanitizer)
  101 	}
  102 
  103 	return spec, nil
  104 }
  105 
  106 // stripEmptyNav strips out empty <nav> tags from content.
  107 func stripEmptyNav(in []byte) []byte {
  108 	return bytes.Replace(in, []byte("<nav>\n</nav>\n\n"), []byte(``), -1)
  109 }
  110 
  111 // BytesToHTML converts bytes to type template.HTML.
  112 func BytesToHTML(b []byte) template.HTML {
  113 	return template.HTML(string(b))
  114 }
  115 
  116 // ExtractTOC extracts Table of Contents from content.
  117 func ExtractTOC(content []byte) (newcontent []byte, toc []byte) {
  118 	if !bytes.Contains(content, []byte("<nav>")) {
  119 		return content, nil
  120 	}
  121 	origContent := make([]byte, len(content))
  122 	copy(origContent, content)
  123 	first := []byte(`<nav>
  124 <ul>`)
  125 
  126 	last := []byte(`</ul>
  127 </nav>`)
  128 
  129 	replacement := []byte(`<nav id="TableOfContents">
  130 <ul>`)
  131 
  132 	startOfTOC := bytes.Index(content, first)
  133 
  134 	peekEnd := len(content)
  135 	if peekEnd > 70+startOfTOC {
  136 		peekEnd = 70 + startOfTOC
  137 	}
  138 
  139 	if startOfTOC < 0 {
  140 		return stripEmptyNav(content), toc
  141 	}
  142 	// Need to peek ahead to see if this nav element is actually the right one.
  143 	correctNav := bytes.Index(content[startOfTOC:peekEnd], []byte(`<li><a href="#`))
  144 	if correctNav < 0 { // no match found
  145 		return content, toc
  146 	}
  147 	lengthOfTOC := bytes.Index(content[startOfTOC:], last) + len(last)
  148 	endOfTOC := startOfTOC + lengthOfTOC
  149 
  150 	newcontent = append(content[:startOfTOC], content[endOfTOC:]...)
  151 	toc = append(replacement, origContent[startOfTOC+len(first):endOfTOC]...)
  152 	return
  153 }
  154 
  155 func (c *ContentSpec) SanitizeAnchorName(s string) string {
  156 	return c.anchorNameSanitizer.SanitizeAnchorName(s)
  157 }
  158 
  159 func (c *ContentSpec) ResolveMarkup(in string) string {
  160 	in = strings.ToLower(in)
  161 	switch in {
  162 	case "md", "markdown", "mdown":
  163 		return "markdown"
  164 	case "html", "htm":
  165 		return "html"
  166 	default:
  167 		if conv := c.Converters.Get(in); conv != nil {
  168 			return conv.Name()
  169 		}
  170 	}
  171 	return ""
  172 }
  173 
  174 // TotalWords counts instance of one or more consecutive white space
  175 // characters, as defined by unicode.IsSpace, in s.
  176 // This is a cheaper way of word counting than the obvious len(strings.Fields(s)).
  177 func TotalWords(s string) int {
  178 	n := 0
  179 	inWord := false
  180 	for _, r := range s {
  181 		wasInWord := inWord
  182 		inWord = !unicode.IsSpace(r)
  183 		if inWord && !wasInWord {
  184 			n++
  185 		}
  186 	}
  187 	return n
  188 }
  189 
  190 // TruncateWordsByRune truncates words by runes.
  191 func (c *ContentSpec) TruncateWordsByRune(in []string) (string, bool) {
  192 	words := make([]string, len(in))
  193 	copy(words, in)
  194 
  195 	count := 0
  196 	for index, word := range words {
  197 		if count >= c.summaryLength {
  198 			return strings.Join(words[:index], " "), true
  199 		}
  200 		runeCount := utf8.RuneCountInString(word)
  201 		if len(word) == runeCount {
  202 			count++
  203 		} else if count+runeCount < c.summaryLength {
  204 			count += runeCount
  205 		} else {
  206 			for ri := range word {
  207 				if count >= c.summaryLength {
  208 					truncatedWords := append(words[:index], word[:ri])
  209 					return strings.Join(truncatedWords, " "), true
  210 				}
  211 				count++
  212 			}
  213 		}
  214 	}
  215 
  216 	return strings.Join(words, " "), false
  217 }
  218 
  219 // TruncateWordsToWholeSentence takes content and truncates to whole sentence
  220 // limited by max number of words. It also returns whether it is truncated.
  221 func (c *ContentSpec) TruncateWordsToWholeSentence(s string) (string, bool) {
  222 	var (
  223 		wordCount     = 0
  224 		lastWordIndex = -1
  225 	)
  226 
  227 	for i, r := range s {
  228 		if unicode.IsSpace(r) {
  229 			wordCount++
  230 			lastWordIndex = i
  231 
  232 			if wordCount >= c.summaryLength {
  233 				break
  234 			}
  235 
  236 		}
  237 	}
  238 
  239 	if lastWordIndex == -1 {
  240 		return s, false
  241 	}
  242 
  243 	endIndex := -1
  244 
  245 	for j, r := range s[lastWordIndex:] {
  246 		if isEndOfSentence(r) {
  247 			endIndex = j + lastWordIndex + utf8.RuneLen(r)
  248 			break
  249 		}
  250 	}
  251 
  252 	if endIndex == -1 {
  253 		return s, false
  254 	}
  255 
  256 	return strings.TrimSpace(s[:endIndex]), endIndex < len(s)
  257 }
  258 
  259 // TrimShortHTML removes the <p>/</p> tags from HTML input in the situation
  260 // where said tags are the only <p> tags in the input and enclose the content
  261 // of the input (whitespace excluded).
  262 func (c *ContentSpec) TrimShortHTML(input []byte) []byte {
  263 	firstOpeningP := bytes.Index(input, paragraphIndicator)
  264 	lastOpeningP := bytes.LastIndex(input, paragraphIndicator)
  265 
  266 	lastClosingP := bytes.LastIndex(input, closingPTag)
  267 	lastClosing := bytes.LastIndex(input, closingIndicator)
  268 
  269 	if firstOpeningP == lastOpeningP && lastClosingP == lastClosing {
  270 		input = bytes.TrimSpace(input)
  271 		input = bytes.TrimPrefix(input, openingPTag)
  272 		input = bytes.TrimSuffix(input, closingPTag)
  273 		input = bytes.TrimSpace(input)
  274 	}
  275 	return input
  276 }
  277 
  278 func isEndOfSentence(r rune) bool {
  279 	return r == '.' || r == '?' || r == '!' || r == '"' || r == '\n'
  280 }
  281 
  282 // Kept only for benchmark.
  283 func (c *ContentSpec) truncateWordsToWholeSentenceOld(content string) (string, bool) {
  284 	words := strings.Fields(content)
  285 
  286 	if c.summaryLength >= len(words) {
  287 		return strings.Join(words, " "), false
  288 	}
  289 
  290 	for counter, word := range words[c.summaryLength:] {
  291 		if strings.HasSuffix(word, ".") ||
  292 			strings.HasSuffix(word, "?") ||
  293 			strings.HasSuffix(word, ".\"") ||
  294 			strings.HasSuffix(word, "!") {
  295 			upper := c.summaryLength + counter + 1
  296 			return strings.Join(words[:upper], " "), (upper < len(words))
  297 		}
  298 	}
  299 
  300 	return strings.Join(words[:c.summaryLength], " "), true
  301 }