htmlElementsCollector.go - hugo - Fork of github.com/gohugoio/hugo with reverse pagination support

htmlElementsCollector.go (10233B)

    1 // Copyright 2020 The Hugo Authors. All rights reserved.
    2 //
    3 // Licensed under the Apache License, Version 2.0 (the "License");
    4 // you may not use this file except in compliance with the License.
    5 // You may obtain a copy of the License at
    6 // http://www.apache.org/licenses/LICENSE-2.0
    7 //
    8 // Unless required by applicable law or agreed to in writing, software
    9 // distributed under the License is distributed on an "AS IS" BASIS,
   10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   11 // See the License for the specific language governing permissions and
   12 // limitations under the License.
   13 
   14 package publisher
   15 
   16 import (
   17 	"bytes"
   18 	"regexp"
   19 	"sort"
   20 	"strings"
   21 	"sync"
   22 	"unicode"
   23 	"unicode/utf8"
   24 
   25 	"golang.org/x/net/html"
   26 
   27 	"github.com/gohugoio/hugo/helpers"
   28 )
   29 
   30 const eof = -1
   31 
   32 var (
   33 	htmlJsonFixer = strings.NewReplacer(", ", "\n")
   34 	jsonAttrRe    = regexp.MustCompile(`'?(.*?)'?:.*`)
   35 	classAttrRe   = regexp.MustCompile(`(?i)^class$|transition`)
   36 
   37 	skipInnerElementRe = regexp.MustCompile(`(?i)^(pre|textarea|script|style)`)
   38 	skipAllElementRe   = regexp.MustCompile(`(?i)^!DOCTYPE`)
   39 	endTagRe           = regexp.MustCompile(`(?i)<\/\s*([a-zA-Z]+)\s*>$`)
   40 
   41 	exceptionList = map[string]bool{
   42 		"thead": true,
   43 		"tbody": true,
   44 		"tfoot": true,
   45 		"td":    true,
   46 		"tr":    true,
   47 	}
   48 )
   49 
   50 func newHTMLElementsCollector() *htmlElementsCollector {
   51 	return &htmlElementsCollector{
   52 		elementSet: make(map[string]bool),
   53 	}
   54 }
   55 
   56 func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *htmlElementsCollectorWriter {
   57 	w := &htmlElementsCollectorWriter{
   58 		collector: collector,
   59 		state:     htmlLexStart,
   60 	}
   61 
   62 	w.defaultLexElementInside = w.lexElementInside(htmlLexStart)
   63 
   64 	return w
   65 }
   66 
// HTMLElements holds lists of tags and attribute values for classes and id.
// The struct tags give the lower-case key names used when the value is
// encoded as JSON.
type HTMLElements struct {
	// Tags holds element tag names.
	Tags    []string `json:"tags"`
	// Classes holds class names.
	Classes []string `json:"classes"`
	// IDs holds id attribute values.
	IDs     []string `json:"ids"`
}
   73 
   74 func (h *HTMLElements) Merge(other HTMLElements) {
   75 	h.Tags = append(h.Tags, other.Tags...)
   76 	h.Classes = append(h.Classes, other.Classes...)
   77 	h.IDs = append(h.IDs, other.IDs...)
   78 
   79 	h.Tags = helpers.UniqueStringsReuse(h.Tags)
   80 	h.Classes = helpers.UniqueStringsReuse(h.Classes)
   81 	h.IDs = helpers.UniqueStringsReuse(h.IDs)
   82 }
   83 
   84 func (h *HTMLElements) Sort() {
   85 	sort.Strings(h.Tags)
   86 	sort.Strings(h.Classes)
   87 	sort.Strings(h.IDs)
   88 }
   89 
// htmlElement is the result of parsing a single collected start tag.
type htmlElement struct {
	// Tag is the lower-cased tag name.
	Tag     string
	// Classes holds the class names found on the element.
	Classes []string
	// IDs holds the values of the element's id attribute(s).
	IDs     []string
}
   95 
// htmlElementsCollector accumulates the distinct HTML elements reported by
// the writers that share it.
type htmlElementsCollector struct {
	// Contains the raw HTML string. We will get the same element
	// several times, and want to avoid costly reparsing when this
	// is used for aggregated data only.
	elementSet map[string]bool

	// One parsed entry per raw element string added to elementSet.
	elements []htmlElement

	// Held by the writers while they read or update elementSet and elements.
	mu sync.RWMutex
}
  106 
  107 func (c *htmlElementsCollector) getHTMLElements() HTMLElements {
  108 	var (
  109 		classes []string
  110 		ids     []string
  111 		tags    []string
  112 	)
  113 
  114 	for _, el := range c.elements {
  115 		classes = append(classes, el.Classes...)
  116 		ids = append(ids, el.IDs...)
  117 		tags = append(tags, el.Tag)
  118 	}
  119 
  120 	classes = helpers.UniqueStringsSorted(classes)
  121 	ids = helpers.UniqueStringsSorted(ids)
  122 	tags = helpers.UniqueStringsSorted(tags)
  123 
  124 	els := HTMLElements{
  125 		Classes: classes,
  126 		IDs:     ids,
  127 		Tags:    tags,
  128 	}
  129 
  130 	return els
  131 }
  132 
// htmlElementsCollectorWriter scans the bytes passed to Write one rune at a
// time, driving a state machine of htmlCollectorStateFunc values, and adds
// the start tags it finds to collector. The lexer state is kept between
// calls to Write.
type htmlElementsCollectorWriter struct {
	collector *htmlElementsCollector

	r     rune   // Current rune
	width int    // The width in bytes of r
	input []byte // The current slice written to Write
	pos   int    // The current position in input

	// Set when parsing a collected element fails.
	// NOTE(review): nothing in this file reads it back; confirm whether it
	// is meant to be surfaced to the caller.
	err error

	// The quote rune (' or ") that is currently open inside an element,
	// or 0 when not inside a quoted value.
	inQuote rune

	// Accumulates the text of the construct currently being lexed.
	buff bytes.Buffer

	// Current state
	state htmlCollectorStateFunc

	// Precompiled state funcs
	defaultLexElementInside htmlCollectorStateFunc
}
  153 
  154 // Write collects HTML elements from p, which must contain complete runes.
  155 func (w *htmlElementsCollectorWriter) Write(p []byte) (int, error) {
  156 	if p == nil {
  157 		return 0, nil
  158 	}
  159 
  160 	w.input = p
  161 
  162 	for {
  163 		w.r = w.next()
  164 		if w.r == eof || w.r == utf8.RuneError {
  165 			break
  166 		}
  167 		w.state = w.state(w)
  168 	}
  169 
  170 	w.pos = 0
  171 	w.input = nil
  172 
  173 	return len(p), nil
  174 }
  175 
  176 func (l *htmlElementsCollectorWriter) backup() {
  177 	l.pos -= l.width
  178 	l.r, _ = utf8.DecodeRune(l.input[l.pos:])
  179 }
  180 
  181 func (w *htmlElementsCollectorWriter) consumeBuffUntil(condition func() bool, resolve htmlCollectorStateFunc) htmlCollectorStateFunc {
  182 	var s htmlCollectorStateFunc
  183 	s = func(*htmlElementsCollectorWriter) htmlCollectorStateFunc {
  184 		w.buff.WriteRune(w.r)
  185 		if condition() {
  186 			w.buff.Reset()
  187 			return resolve
  188 		}
  189 		return s
  190 	}
  191 	return s
  192 }
  193 
  194 func (w *htmlElementsCollectorWriter) consumeRuneUntil(condition func(r rune) bool, resolve htmlCollectorStateFunc) htmlCollectorStateFunc {
  195 	var s htmlCollectorStateFunc
  196 	s = func(*htmlElementsCollectorWriter) htmlCollectorStateFunc {
  197 		if condition(w.r) {
  198 			return resolve
  199 		}
  200 		return s
  201 	}
  202 	return s
  203 }
  204 
  205 // Starts with e.g. "<body " or "<div"
  206 func (w *htmlElementsCollectorWriter) lexElementInside(resolve htmlCollectorStateFunc) htmlCollectorStateFunc {
  207 	var s htmlCollectorStateFunc
  208 	s = func(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
  209 		w.buff.WriteRune(w.r)
  210 
  211 		// Skip any text inside a quote.
  212 		if w.r == '\'' || w.r == '"' {
  213 			if w.inQuote == w.r {
  214 				w.inQuote = 0
  215 			} else if w.inQuote == 0 {
  216 				w.inQuote = w.r
  217 			}
  218 		}
  219 
  220 		if w.inQuote != 0 {
  221 			return s
  222 		}
  223 
  224 		if w.r == '>' {
  225 
  226 			// Work with the bytes slice as long as it's practical,
  227 			// to save memory allocations.
  228 			b := w.buff.Bytes()
  229 
  230 			defer func() {
  231 				w.buff.Reset()
  232 			}()
  233 
  234 			// First check if we have processed this element before.
  235 			w.collector.mu.RLock()
  236 
  237 			seen := w.collector.elementSet[string(b)]
  238 			w.collector.mu.RUnlock()
  239 			if seen {
  240 				return resolve
  241 			}
  242 
  243 			s := w.buff.String()
  244 
  245 			if s == "" {
  246 				return resolve
  247 			}
  248 
  249 			// Parse each collected element.
  250 			el, err := parseHTMLElement(s)
  251 			if err != nil {
  252 				w.err = err
  253 				return resolve
  254 			}
  255 
  256 			// Write this tag to the element set.
  257 			w.collector.mu.Lock()
  258 			w.collector.elementSet[s] = true
  259 			w.collector.elements = append(w.collector.elements, el)
  260 			w.collector.mu.Unlock()
  261 
  262 			return resolve
  263 
  264 		}
  265 
  266 		return s
  267 	}
  268 
  269 	return s
  270 }
  271 
  272 func (l *htmlElementsCollectorWriter) next() rune {
  273 	if l.pos >= len(l.input) {
  274 		l.width = 0
  275 		return eof
  276 	}
  277 
  278 	runeValue, runeWidth := utf8.DecodeRune(l.input[l.pos:])
  279 
  280 	l.width = runeWidth
  281 	l.pos += l.width
  282 	return runeValue
  283 }
  284 
// htmlCollectorStateFunc is one state of the HTML element scanner. It is
// called once for every rune (available as the writer's r field) and
// returns the state to use for the next rune.
type htmlCollectorStateFunc func(*htmlElementsCollectorWriter) htmlCollectorStateFunc
  287 
  288 // At "<", buffer empty.
  289 // Potentially starting a HTML element.
  290 func htmlLexElementStart(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
  291 	if w.r == '>' || unicode.IsSpace(w.r) {
  292 		if w.buff.Len() < 2 || bytes.HasPrefix(w.buff.Bytes(), []byte("</")) {
  293 			w.buff.Reset()
  294 			return htmlLexStart
  295 		}
  296 
  297 		tagName := w.buff.Bytes()[1:]
  298 
  299 		switch {
  300 		case skipInnerElementRe.Match(tagName):
  301 			// pre, script etc. We collect classes etc. on the surrounding
  302 			// element, but skip the inner content.
  303 			w.backup()
  304 
  305 			// tagName will be overwritten, so make a copy.
  306 			tagNameCopy := make([]byte, len(tagName))
  307 			copy(tagNameCopy, tagName)
  308 
  309 			return w.lexElementInside(
  310 				w.consumeBuffUntil(
  311 					func() bool {
  312 						if w.r != '>' {
  313 							return false
  314 						}
  315 						m := endTagRe.FindSubmatch(w.buff.Bytes())
  316 						if m == nil {
  317 							return false
  318 						}
  319 						return bytes.EqualFold(m[1], tagNameCopy)
  320 					},
  321 					htmlLexStart,
  322 				))
  323 		case skipAllElementRe.Match(tagName):
  324 			// E.g. "<!DOCTYPE ..."
  325 			w.buff.Reset()
  326 			return w.consumeRuneUntil(func(r rune) bool {
  327 				return r == '>'
  328 			}, htmlLexStart)
  329 		default:
  330 			w.backup()
  331 			return w.defaultLexElementInside
  332 		}
  333 	}
  334 
  335 	w.buff.WriteRune(w.r)
  336 
  337 	// If it's a comment, skip to its end.
  338 	if w.r == '-' && bytes.Equal(w.buff.Bytes(), []byte("<!--")) {
  339 		w.buff.Reset()
  340 		return htmlLexToEndOfComment
  341 	}
  342 
  343 	return htmlLexElementStart
  344 }
  345 
  346 // Entry state func.
  347 // Looks for a opening bracket, '<'.
  348 func htmlLexStart(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
  349 	if w.r == '<' {
  350 		w.backup()
  351 		w.buff.Reset()
  352 		return htmlLexElementStart
  353 	}
  354 
  355 	return htmlLexStart
  356 }
  357 
  358 // After "<!--", buff empty.
  359 func htmlLexToEndOfComment(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
  360 	w.buff.WriteRune(w.r)
  361 
  362 	if w.r == '>' && bytes.HasSuffix(w.buff.Bytes(), []byte("-->")) {
  363 		// Done, start looking for HTML elements again.
  364 		return htmlLexStart
  365 	}
  366 
  367 	return htmlLexToEndOfComment
  368 }
  369 
  370 func parseHTMLElement(elStr string) (el htmlElement, err error) {
  371 
  372 	tagName := parseStartTag(elStr)
  373 
  374 	el.Tag = strings.ToLower(tagName)
  375 	tagNameToParse := el.Tag
  376 
  377 	// The net/html parser does not handle single table elements as input, e.g. tbody.
  378 	// We only care about the element/class/ids, so just store away the original tag name
  379 	// and pretend it's a <div>.
  380 	if exceptionList[el.Tag] {
  381 		elStr = strings.Replace(elStr, tagName, "div", 1)
  382 		tagNameToParse = "div"
  383 	}
  384 
  385 	n, err := html.Parse(strings.NewReader(elStr))
  386 	if err != nil {
  387 		return
  388 	}
  389 
  390 	var walk func(*html.Node)
  391 	walk = func(n *html.Node) {
  392 		if n.Type == html.ElementNode && n.Data == tagNameToParse {
  393 			for _, a := range n.Attr {
  394 				switch {
  395 				case strings.EqualFold(a.Key, "id"):
  396 					// There should be only one, but one never knows...
  397 					el.IDs = append(el.IDs, a.Val)
  398 				default:
  399 					if classAttrRe.MatchString(a.Key) {
  400 						el.Classes = append(el.Classes, strings.Fields(a.Val)...)
  401 					} else {
  402 						key := strings.ToLower(a.Key)
  403 						val := strings.TrimSpace(a.Val)
  404 						if strings.Contains(key, "class") && strings.HasPrefix(val, "{") {
  405 							// This looks like a Vue or AlpineJS class binding.
  406 							val = htmlJsonFixer.Replace(strings.Trim(val, "{}"))
  407 							lines := strings.Split(val, "\n")
  408 							for i, l := range lines {
  409 								lines[i] = strings.TrimSpace(l)
  410 							}
  411 							val = strings.Join(lines, "\n")
  412 							val = jsonAttrRe.ReplaceAllString(val, "$1")
  413 							el.Classes = append(el.Classes, strings.Fields(val)...)
  414 						}
  415 					}
  416 				}
  417 			}
  418 		}
  419 
  420 		for c := n.FirstChild; c != nil; c = c.NextSibling {
  421 			walk(c)
  422 		}
  423 	}
  424 
  425 	walk(n)
  426 
  427 	return
  428 }
  429 
  430 // Variants of s
  431 //    <body class="b a">
  432 //    <div>
  433 func parseStartTag(s string) string {
  434 	spaceIndex := strings.IndexFunc(s, func(r rune) bool {
  435 		return unicode.IsSpace(r)
  436 	})
  437 
  438 	if spaceIndex == -1 {
  439 		return s[1 : len(s)-1]
  440 	}
  441 
  442 	return s[1:spaceIndex]
  443 }