htmlElementsCollector.go (10233B)
1 // Copyright 2020 The Hugo Authors. All rights reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 // http://www.apache.org/licenses/LICENSE-2.0
7 //
8 // Unless required by applicable law or agreed to in writing, software
9 // distributed under the License is distributed on an "AS IS" BASIS,
10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 // See the License for the specific language governing permissions and
12 // limitations under the License.
13
14 package publisher
15
16 import (
17 "bytes"
18 "regexp"
19 "sort"
20 "strings"
21 "sync"
22 "unicode"
23 "unicode/utf8"
24
25 "golang.org/x/net/html"
26
27 "github.com/gohugoio/hugo/helpers"
28 )
29
// eof is the sentinel rune value that next returns once the current input
// slice has been fully consumed.
const eof = -1
31
var (
	// htmlJsonFixer replaces every ", " separator with a line break.
	htmlJsonFixer = strings.NewReplacer(", ", "\n")
	// jsonAttrRe captures the text before a ':' (with optional surrounding
	// single quotes) and matches through the rest of that line.
	jsonAttrRe = regexp.MustCompile(`'?(.*?)'?:.*`)
	// classAttrRe matches, case-insensitively, an attribute named exactly
	// "class" or any attribute name containing "transition".
	classAttrRe = regexp.MustCompile(`(?i)^class$|transition`)

	// skipInnerElementRe matches tag names whose inner content is skipped.
	// NOTE(review): the pattern is anchored at the start only, so it also
	// matches longer names that merely begin with one of these words —
	// confirm that this is intended.
	skipInnerElementRe = regexp.MustCompile(`(?i)^(pre|textarea|script|style)`)
	// skipAllElementRe matches names starting with "!DOCTYPE"; such
	// declarations are skipped up to their closing '>'.
	skipAllElementRe = regexp.MustCompile(`(?i)^!DOCTYPE`)
	// endTagRe matches an end tag such as "</pre>" at the very end of the
	// input and captures its (letters only) name.
	endTagRe = regexp.MustCompile(`(?i)<\/\s*([a-zA-Z]+)\s*>$`)

	// exceptionList holds the table-related tag names that parseHTMLElement
	// replaces with "div" before handing the element to the HTML parser.
	exceptionList = map[string]bool{
		"thead": true,
		"tbody": true,
		"tfoot": true,
		"td": true,
		"tr": true,
	}
)
49
50 func newHTMLElementsCollector() *htmlElementsCollector {
51 return &htmlElementsCollector{
52 elementSet: make(map[string]bool),
53 }
54 }
55
56 func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *htmlElementsCollectorWriter {
57 w := &htmlElementsCollectorWriter{
58 collector: collector,
59 state: htmlLexStart,
60 }
61
62 w.defaultLexElementInside = w.lexElementInside(htmlLexStart)
63
64 return w
65 }
66
// HTMLElements holds lists of tags and attribute values for classes and id.
// The struct tags give the lower-case JSON key used for each list.
type HTMLElements struct {
	// Tags holds element tag names.
	Tags []string `json:"tags"`
	// Classes holds class names.
	Classes []string `json:"classes"`
	// IDs holds id attribute values.
	IDs []string `json:"ids"`
}
73
74 func (h *HTMLElements) Merge(other HTMLElements) {
75 h.Tags = append(h.Tags, other.Tags...)
76 h.Classes = append(h.Classes, other.Classes...)
77 h.IDs = append(h.IDs, other.IDs...)
78
79 h.Tags = helpers.UniqueStringsReuse(h.Tags)
80 h.Classes = helpers.UniqueStringsReuse(h.Classes)
81 h.IDs = helpers.UniqueStringsReuse(h.IDs)
82 }
83
84 func (h *HTMLElements) Sort() {
85 sort.Strings(h.Tags)
86 sort.Strings(h.Classes)
87 sort.Strings(h.IDs)
88 }
89
// htmlElement holds what parseHTMLElement extracts from a single element:
// its lower-cased tag name plus the classes and ids found on it.
type htmlElement struct {
	Tag string
	Classes []string
	IDs []string
}
95
// htmlElementsCollector accumulates the distinct HTML elements reported by
// its writers.
type htmlElementsCollector struct {
	// Contains the raw HTML string. We will get the same element
	// several times, and want to avoid costly reparsing when this
	// is used for aggregated data only.
	elementSet map[string]bool

	// elements holds one parsed entry per raw element added to elementSet.
	elements []htmlElement

	// mu is held by lexElementInside while it reads elementSet and while it
	// updates elementSet and elements.
	mu sync.RWMutex
}
106
107 func (c *htmlElementsCollector) getHTMLElements() HTMLElements {
108 var (
109 classes []string
110 ids []string
111 tags []string
112 )
113
114 for _, el := range c.elements {
115 classes = append(classes, el.Classes...)
116 ids = append(ids, el.IDs...)
117 tags = append(tags, el.Tag)
118 }
119
120 classes = helpers.UniqueStringsSorted(classes)
121 ids = helpers.UniqueStringsSorted(ids)
122 tags = helpers.UniqueStringsSorted(tags)
123
124 els := HTMLElements{
125 Classes: classes,
126 IDs: ids,
127 Tags: tags,
128 }
129
130 return els
131 }
132
// htmlElementsCollectorWriter scans the bytes passed to Write for HTML
// elements and records them in collector. It is a small state machine: each
// rune is handed to the current state func, which returns the next state.
type htmlElementsCollectorWriter struct {
	// collector receives every element not seen before.
	collector *htmlElementsCollector

	r rune // Current rune
	width int // The width in bytes of r
	input []byte // The current slice written to Write
	pos int // The current position in input

	// err holds the error from a failed parseHTMLElement call. Write does
	// not return it.
	err error

	// inQuote is the quote rune (' or ") currently open inside an element,
	// or 0 when not inside a quoted value.
	inQuote rune

	// buff accumulates the text of the element or comment being scanned.
	buff bytes.Buffer

	// Current state
	state htmlCollectorStateFunc

	// Precompiled state funcs
	defaultLexElementInside htmlCollectorStateFunc
}
153
154 // Write collects HTML elements from p, which must contain complete runes.
155 func (w *htmlElementsCollectorWriter) Write(p []byte) (int, error) {
156 if p == nil {
157 return 0, nil
158 }
159
160 w.input = p
161
162 for {
163 w.r = w.next()
164 if w.r == eof || w.r == utf8.RuneError {
165 break
166 }
167 w.state = w.state(w)
168 }
169
170 w.pos = 0
171 w.input = nil
172
173 return len(p), nil
174 }
175
176 func (l *htmlElementsCollectorWriter) backup() {
177 l.pos -= l.width
178 l.r, _ = utf8.DecodeRune(l.input[l.pos:])
179 }
180
181 func (w *htmlElementsCollectorWriter) consumeBuffUntil(condition func() bool, resolve htmlCollectorStateFunc) htmlCollectorStateFunc {
182 var s htmlCollectorStateFunc
183 s = func(*htmlElementsCollectorWriter) htmlCollectorStateFunc {
184 w.buff.WriteRune(w.r)
185 if condition() {
186 w.buff.Reset()
187 return resolve
188 }
189 return s
190 }
191 return s
192 }
193
194 func (w *htmlElementsCollectorWriter) consumeRuneUntil(condition func(r rune) bool, resolve htmlCollectorStateFunc) htmlCollectorStateFunc {
195 var s htmlCollectorStateFunc
196 s = func(*htmlElementsCollectorWriter) htmlCollectorStateFunc {
197 if condition(w.r) {
198 return resolve
199 }
200 return s
201 }
202 return s
203 }
204
205 // Starts with e.g. "<body " or "<div"
206 func (w *htmlElementsCollectorWriter) lexElementInside(resolve htmlCollectorStateFunc) htmlCollectorStateFunc {
207 var s htmlCollectorStateFunc
208 s = func(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
209 w.buff.WriteRune(w.r)
210
211 // Skip any text inside a quote.
212 if w.r == '\'' || w.r == '"' {
213 if w.inQuote == w.r {
214 w.inQuote = 0
215 } else if w.inQuote == 0 {
216 w.inQuote = w.r
217 }
218 }
219
220 if w.inQuote != 0 {
221 return s
222 }
223
224 if w.r == '>' {
225
226 // Work with the bytes slice as long as it's practical,
227 // to save memory allocations.
228 b := w.buff.Bytes()
229
230 defer func() {
231 w.buff.Reset()
232 }()
233
234 // First check if we have processed this element before.
235 w.collector.mu.RLock()
236
237 seen := w.collector.elementSet[string(b)]
238 w.collector.mu.RUnlock()
239 if seen {
240 return resolve
241 }
242
243 s := w.buff.String()
244
245 if s == "" {
246 return resolve
247 }
248
249 // Parse each collected element.
250 el, err := parseHTMLElement(s)
251 if err != nil {
252 w.err = err
253 return resolve
254 }
255
256 // Write this tag to the element set.
257 w.collector.mu.Lock()
258 w.collector.elementSet[s] = true
259 w.collector.elements = append(w.collector.elements, el)
260 w.collector.mu.Unlock()
261
262 return resolve
263
264 }
265
266 return s
267 }
268
269 return s
270 }
271
272 func (l *htmlElementsCollectorWriter) next() rune {
273 if l.pos >= len(l.input) {
274 l.width = 0
275 return eof
276 }
277
278 runeValue, runeWidth := utf8.DecodeRune(l.input[l.pos:])
279
280 l.width = runeWidth
281 l.pos += l.width
282 return runeValue
283 }
284
// htmlCollectorStateFunc is one state of the HTML element scanner. It
// handles the writer's current rune and returns the next state.
type htmlCollectorStateFunc func(*htmlElementsCollectorWriter) htmlCollectorStateFunc
287
288 // At "<", buffer empty.
289 // Potentially starting a HTML element.
290 func htmlLexElementStart(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
291 if w.r == '>' || unicode.IsSpace(w.r) {
292 if w.buff.Len() < 2 || bytes.HasPrefix(w.buff.Bytes(), []byte("</")) {
293 w.buff.Reset()
294 return htmlLexStart
295 }
296
297 tagName := w.buff.Bytes()[1:]
298
299 switch {
300 case skipInnerElementRe.Match(tagName):
301 // pre, script etc. We collect classes etc. on the surrounding
302 // element, but skip the inner content.
303 w.backup()
304
305 // tagName will be overwritten, so make a copy.
306 tagNameCopy := make([]byte, len(tagName))
307 copy(tagNameCopy, tagName)
308
309 return w.lexElementInside(
310 w.consumeBuffUntil(
311 func() bool {
312 if w.r != '>' {
313 return false
314 }
315 m := endTagRe.FindSubmatch(w.buff.Bytes())
316 if m == nil {
317 return false
318 }
319 return bytes.EqualFold(m[1], tagNameCopy)
320 },
321 htmlLexStart,
322 ))
323 case skipAllElementRe.Match(tagName):
324 // E.g. "<!DOCTYPE ..."
325 w.buff.Reset()
326 return w.consumeRuneUntil(func(r rune) bool {
327 return r == '>'
328 }, htmlLexStart)
329 default:
330 w.backup()
331 return w.defaultLexElementInside
332 }
333 }
334
335 w.buff.WriteRune(w.r)
336
337 // If it's a comment, skip to its end.
338 if w.r == '-' && bytes.Equal(w.buff.Bytes(), []byte("<!--")) {
339 w.buff.Reset()
340 return htmlLexToEndOfComment
341 }
342
343 return htmlLexElementStart
344 }
345
346 // Entry state func.
347 // Looks for a opening bracket, '<'.
348 func htmlLexStart(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
349 if w.r == '<' {
350 w.backup()
351 w.buff.Reset()
352 return htmlLexElementStart
353 }
354
355 return htmlLexStart
356 }
357
358 // After "<!--", buff empty.
359 func htmlLexToEndOfComment(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
360 w.buff.WriteRune(w.r)
361
362 if w.r == '>' && bytes.HasSuffix(w.buff.Bytes(), []byte("-->")) {
363 // Done, start looking for HTML elements again.
364 return htmlLexStart
365 }
366
367 return htmlLexToEndOfComment
368 }
369
370 func parseHTMLElement(elStr string) (el htmlElement, err error) {
371
372 tagName := parseStartTag(elStr)
373
374 el.Tag = strings.ToLower(tagName)
375 tagNameToParse := el.Tag
376
377 // The net/html parser does not handle single table elements as input, e.g. tbody.
378 // We only care about the element/class/ids, so just store away the original tag name
379 // and pretend it's a <div>.
380 if exceptionList[el.Tag] {
381 elStr = strings.Replace(elStr, tagName, "div", 1)
382 tagNameToParse = "div"
383 }
384
385 n, err := html.Parse(strings.NewReader(elStr))
386 if err != nil {
387 return
388 }
389
390 var walk func(*html.Node)
391 walk = func(n *html.Node) {
392 if n.Type == html.ElementNode && n.Data == tagNameToParse {
393 for _, a := range n.Attr {
394 switch {
395 case strings.EqualFold(a.Key, "id"):
396 // There should be only one, but one never knows...
397 el.IDs = append(el.IDs, a.Val)
398 default:
399 if classAttrRe.MatchString(a.Key) {
400 el.Classes = append(el.Classes, strings.Fields(a.Val)...)
401 } else {
402 key := strings.ToLower(a.Key)
403 val := strings.TrimSpace(a.Val)
404 if strings.Contains(key, "class") && strings.HasPrefix(val, "{") {
405 // This looks like a Vue or AlpineJS class binding.
406 val = htmlJsonFixer.Replace(strings.Trim(val, "{}"))
407 lines := strings.Split(val, "\n")
408 for i, l := range lines {
409 lines[i] = strings.TrimSpace(l)
410 }
411 val = strings.Join(lines, "\n")
412 val = jsonAttrRe.ReplaceAllString(val, "$1")
413 el.Classes = append(el.Classes, strings.Fields(val)...)
414 }
415 }
416 }
417 }
418 }
419
420 for c := n.FirstChild; c != nil; c = c.NextSibling {
421 walk(c)
422 }
423 }
424
425 walk(n)
426
427 return
428 }
429
// parseStartTag returns the tag name of s, the raw text of a start tag.
//
// Variants of s:
//
//	<body class="b a">
//	<div>
//	<br/>
//
// The name runs from just after the leading '<' up to the first white space
// or, when there is none, up to the final character (the closing '>'). A
// trailing '/' left over from a self-closing tag written without a space is
// dropped, so "<br/>" yields "br" and not "br/". Input that is too short to
// hold a name, or that starts with white space, yields "".
func parseStartTag(s string) string {
	if len(s) < 2 {
		return ""
	}

	end := strings.IndexFunc(s, unicode.IsSpace)
	switch {
	case end == -1:
		// No attributes: the name ends right before the closing '>'.
		end = len(s) - 1
	case end == 0:
		// White space where '<' was expected; there is no name to slice.
		return ""
	}

	return strings.TrimSuffix(s[1:end], "/")
}