htmlElementsCollector.go (10233B)
1 // Copyright 2020 The Hugo Authors. All rights reserved. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // http://www.apache.org/licenses/LICENSE-2.0 7 // 8 // Unless required by applicable law or agreed to in writing, software 9 // distributed under the License is distributed on an "AS IS" BASIS, 10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 package publisher 15 16 import ( 17 "bytes" 18 "regexp" 19 "sort" 20 "strings" 21 "sync" 22 "unicode" 23 "unicode/utf8" 24 25 "golang.org/x/net/html" 26 27 "github.com/gohugoio/hugo/helpers" 28 ) 29 30 const eof = -1 31 32 var ( 33 htmlJsonFixer = strings.NewReplacer(", ", "\n") 34 jsonAttrRe = regexp.MustCompile(`'?(.*?)'?:.*`) 35 classAttrRe = regexp.MustCompile(`(?i)^class$|transition`) 36 37 skipInnerElementRe = regexp.MustCompile(`(?i)^(pre|textarea|script|style)`) 38 skipAllElementRe = regexp.MustCompile(`(?i)^!DOCTYPE`) 39 endTagRe = regexp.MustCompile(`(?i)<\/\s*([a-zA-Z]+)\s*>$`) 40 41 exceptionList = map[string]bool{ 42 "thead": true, 43 "tbody": true, 44 "tfoot": true, 45 "td": true, 46 "tr": true, 47 } 48 ) 49 50 func newHTMLElementsCollector() *htmlElementsCollector { 51 return &htmlElementsCollector{ 52 elementSet: make(map[string]bool), 53 } 54 } 55 56 func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *htmlElementsCollectorWriter { 57 w := &htmlElementsCollectorWriter{ 58 collector: collector, 59 state: htmlLexStart, 60 } 61 62 w.defaultLexElementInside = w.lexElementInside(htmlLexStart) 63 64 return w 65 } 66 67 // HTMLElements holds lists of tags and attribute values for classes and id. 68 type HTMLElements struct { 69 Tags []string `json:"tags"` 70 Classes []string `json:"classes"` 71 IDs []string `json:"ids"` 72 } 73 74 func (h *HTMLElements) Merge(other HTMLElements) { 75 h.Tags = append(h.Tags, other.Tags...) 76 h.Classes = append(h.Classes, other.Classes...) 77 h.IDs = append(h.IDs, other.IDs...) 78 79 h.Tags = helpers.UniqueStringsReuse(h.Tags) 80 h.Classes = helpers.UniqueStringsReuse(h.Classes) 81 h.IDs = helpers.UniqueStringsReuse(h.IDs) 82 } 83 84 func (h *HTMLElements) Sort() { 85 sort.Strings(h.Tags) 86 sort.Strings(h.Classes) 87 sort.Strings(h.IDs) 88 } 89 90 type htmlElement struct { 91 Tag string 92 Classes []string 93 IDs []string 94 } 95 96 type htmlElementsCollector struct { 97 // Contains the raw HTML string. We will get the same element 98 // several times, and want to avoid costly reparsing when this 99 // is used for aggregated data only. 100 elementSet map[string]bool 101 102 elements []htmlElement 103 104 mu sync.RWMutex 105 } 106 107 func (c *htmlElementsCollector) getHTMLElements() HTMLElements { 108 var ( 109 classes []string 110 ids []string 111 tags []string 112 ) 113 114 for _, el := range c.elements { 115 classes = append(classes, el.Classes...) 116 ids = append(ids, el.IDs...) 117 tags = append(tags, el.Tag) 118 } 119 120 classes = helpers.UniqueStringsSorted(classes) 121 ids = helpers.UniqueStringsSorted(ids) 122 tags = helpers.UniqueStringsSorted(tags) 123 124 els := HTMLElements{ 125 Classes: classes, 126 IDs: ids, 127 Tags: tags, 128 } 129 130 return els 131 } 132 133 type htmlElementsCollectorWriter struct { 134 collector *htmlElementsCollector 135 136 r rune // Current rune 137 width int // The width in bytes of r 138 input []byte // The current slice written to Write 139 pos int // The current position in input 140 141 err error 142 143 inQuote rune 144 145 buff bytes.Buffer 146 147 // Current state 148 state htmlCollectorStateFunc 149 150 // Precompiled state funcs 151 defaultLexElementInside htmlCollectorStateFunc 152 } 153 154 // Write collects HTML elements from p, which must contain complete runes. 155 func (w *htmlElementsCollectorWriter) Write(p []byte) (int, error) { 156 if p == nil { 157 return 0, nil 158 } 159 160 w.input = p 161 162 for { 163 w.r = w.next() 164 if w.r == eof || w.r == utf8.RuneError { 165 break 166 } 167 w.state = w.state(w) 168 } 169 170 w.pos = 0 171 w.input = nil 172 173 return len(p), nil 174 } 175 176 func (l *htmlElementsCollectorWriter) backup() { 177 l.pos -= l.width 178 l.r, _ = utf8.DecodeRune(l.input[l.pos:]) 179 } 180 181 func (w *htmlElementsCollectorWriter) consumeBuffUntil(condition func() bool, resolve htmlCollectorStateFunc) htmlCollectorStateFunc { 182 var s htmlCollectorStateFunc 183 s = func(*htmlElementsCollectorWriter) htmlCollectorStateFunc { 184 w.buff.WriteRune(w.r) 185 if condition() { 186 w.buff.Reset() 187 return resolve 188 } 189 return s 190 } 191 return s 192 } 193 194 func (w *htmlElementsCollectorWriter) consumeRuneUntil(condition func(r rune) bool, resolve htmlCollectorStateFunc) htmlCollectorStateFunc { 195 var s htmlCollectorStateFunc 196 s = func(*htmlElementsCollectorWriter) htmlCollectorStateFunc { 197 if condition(w.r) { 198 return resolve 199 } 200 return s 201 } 202 return s 203 } 204 205 // Starts with e.g. "<body " or "<div" 206 func (w *htmlElementsCollectorWriter) lexElementInside(resolve htmlCollectorStateFunc) htmlCollectorStateFunc { 207 var s htmlCollectorStateFunc 208 s = func(w *htmlElementsCollectorWriter) htmlCollectorStateFunc { 209 w.buff.WriteRune(w.r) 210 211 // Skip any text inside a quote. 212 if w.r == '\'' || w.r == '"' { 213 if w.inQuote == w.r { 214 w.inQuote = 0 215 } else if w.inQuote == 0 { 216 w.inQuote = w.r 217 } 218 } 219 220 if w.inQuote != 0 { 221 return s 222 } 223 224 if w.r == '>' { 225 226 // Work with the bytes slice as long as it's practical, 227 // to save memory allocations. 228 b := w.buff.Bytes() 229 230 defer func() { 231 w.buff.Reset() 232 }() 233 234 // First check if we have processed this element before. 235 w.collector.mu.RLock() 236 237 seen := w.collector.elementSet[string(b)] 238 w.collector.mu.RUnlock() 239 if seen { 240 return resolve 241 } 242 243 s := w.buff.String() 244 245 if s == "" { 246 return resolve 247 } 248 249 // Parse each collected element. 250 el, err := parseHTMLElement(s) 251 if err != nil { 252 w.err = err 253 return resolve 254 } 255 256 // Write this tag to the element set. 257 w.collector.mu.Lock() 258 w.collector.elementSet[s] = true 259 w.collector.elements = append(w.collector.elements, el) 260 w.collector.mu.Unlock() 261 262 return resolve 263 264 } 265 266 return s 267 } 268 269 return s 270 } 271 272 func (l *htmlElementsCollectorWriter) next() rune { 273 if l.pos >= len(l.input) { 274 l.width = 0 275 return eof 276 } 277 278 runeValue, runeWidth := utf8.DecodeRune(l.input[l.pos:]) 279 280 l.width = runeWidth 281 l.pos += l.width 282 return runeValue 283 } 284 285 // returns the next state in HTML element scanner. 286 type htmlCollectorStateFunc func(*htmlElementsCollectorWriter) htmlCollectorStateFunc 287 288 // At "<", buffer empty. 289 // Potentially starting a HTML element. 290 func htmlLexElementStart(w *htmlElementsCollectorWriter) htmlCollectorStateFunc { 291 if w.r == '>' || unicode.IsSpace(w.r) { 292 if w.buff.Len() < 2 || bytes.HasPrefix(w.buff.Bytes(), []byte("</")) { 293 w.buff.Reset() 294 return htmlLexStart 295 } 296 297 tagName := w.buff.Bytes()[1:] 298 299 switch { 300 case skipInnerElementRe.Match(tagName): 301 // pre, script etc. We collect classes etc. on the surrounding 302 // element, but skip the inner content. 303 w.backup() 304 305 // tagName will be overwritten, so make a copy. 306 tagNameCopy := make([]byte, len(tagName)) 307 copy(tagNameCopy, tagName) 308 309 return w.lexElementInside( 310 w.consumeBuffUntil( 311 func() bool { 312 if w.r != '>' { 313 return false 314 } 315 m := endTagRe.FindSubmatch(w.buff.Bytes()) 316 if m == nil { 317 return false 318 } 319 return bytes.EqualFold(m[1], tagNameCopy) 320 }, 321 htmlLexStart, 322 )) 323 case skipAllElementRe.Match(tagName): 324 // E.g. "<!DOCTYPE ..." 325 w.buff.Reset() 326 return w.consumeRuneUntil(func(r rune) bool { 327 return r == '>' 328 }, htmlLexStart) 329 default: 330 w.backup() 331 return w.defaultLexElementInside 332 } 333 } 334 335 w.buff.WriteRune(w.r) 336 337 // If it's a comment, skip to its end. 338 if w.r == '-' && bytes.Equal(w.buff.Bytes(), []byte("<!--")) { 339 w.buff.Reset() 340 return htmlLexToEndOfComment 341 } 342 343 return htmlLexElementStart 344 } 345 346 // Entry state func. 347 // Looks for a opening bracket, '<'. 348 func htmlLexStart(w *htmlElementsCollectorWriter) htmlCollectorStateFunc { 349 if w.r == '<' { 350 w.backup() 351 w.buff.Reset() 352 return htmlLexElementStart 353 } 354 355 return htmlLexStart 356 } 357 358 // After "<!--", buff empty. 359 func htmlLexToEndOfComment(w *htmlElementsCollectorWriter) htmlCollectorStateFunc { 360 w.buff.WriteRune(w.r) 361 362 if w.r == '>' && bytes.HasSuffix(w.buff.Bytes(), []byte("-->")) { 363 // Done, start looking for HTML elements again. 364 return htmlLexStart 365 } 366 367 return htmlLexToEndOfComment 368 } 369 370 func parseHTMLElement(elStr string) (el htmlElement, err error) { 371 372 tagName := parseStartTag(elStr) 373 374 el.Tag = strings.ToLower(tagName) 375 tagNameToParse := el.Tag 376 377 // The net/html parser does not handle single table elements as input, e.g. tbody. 378 // We only care about the element/class/ids, so just store away the original tag name 379 // and pretend it's a <div>. 380 if exceptionList[el.Tag] { 381 elStr = strings.Replace(elStr, tagName, "div", 1) 382 tagNameToParse = "div" 383 } 384 385 n, err := html.Parse(strings.NewReader(elStr)) 386 if err != nil { 387 return 388 } 389 390 var walk func(*html.Node) 391 walk = func(n *html.Node) { 392 if n.Type == html.ElementNode && n.Data == tagNameToParse { 393 for _, a := range n.Attr { 394 switch { 395 case strings.EqualFold(a.Key, "id"): 396 // There should be only one, but one never knows... 397 el.IDs = append(el.IDs, a.Val) 398 default: 399 if classAttrRe.MatchString(a.Key) { 400 el.Classes = append(el.Classes, strings.Fields(a.Val)...) 401 } else { 402 key := strings.ToLower(a.Key) 403 val := strings.TrimSpace(a.Val) 404 if strings.Contains(key, "class") && strings.HasPrefix(val, "{") { 405 // This looks like a Vue or AlpineJS class binding. 406 val = htmlJsonFixer.Replace(strings.Trim(val, "{}")) 407 lines := strings.Split(val, "\n") 408 for i, l := range lines { 409 lines[i] = strings.TrimSpace(l) 410 } 411 val = strings.Join(lines, "\n") 412 val = jsonAttrRe.ReplaceAllString(val, "$1") 413 el.Classes = append(el.Classes, strings.Fields(val)...) 414 } 415 } 416 } 417 } 418 } 419 420 for c := n.FirstChild; c != nil; c = c.NextSibling { 421 walk(c) 422 } 423 } 424 425 walk(n) 426 427 return 428 } 429 430 // Variants of s 431 // <body class="b a"> 432 // <div> 433 func parseStartTag(s string) string { 434 spaceIndex := strings.IndexFunc(s, func(r rune) bool { 435 return unicode.IsSpace(r) 436 }) 437 438 if spaceIndex == -1 { 439 return s[1 : len(s)-1] 440 } 441 442 return s[1:spaceIndex] 443 }