hugo

Unnamed repository; edit this file 'description' to name the repository.

git clone git://git.shimmy1996.com/hugo.git
commit 3854a6fa6c323d1c09aa71a0626c9eef62709294
parent cd0112a05a9ddb7043c9808284f93d8099c48473
Author: Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>
Date:   Wed, 25 May 2022 10:56:14 +0200

Fix Plainify edge cases

This commit replaces the main part of `helpers.StripHTML` with Go's implementation in its html/template package.

It's a little slower, but correctness is more important:

```bash
BenchmarkStripHTMLOld-10    	  680316	      1764 ns/op	     728 B/op	       4 allocs/op
BenchmarkStripHTMLNew-10    	  384520	      3099 ns/op	    2089 B/op	      10 allocs/op
```

Fixes #9199
Fixes #9909
Closes #9410

Diffstat:
Mhelpers/content.go | 40----------------------------------------
Mhelpers/content_test.go | 38--------------------------------------
Mhugolib/page__per_output.go | 2+-
Mhugolib/page_test.go | 5+++--
Mtpl/internal/go_templates/htmltemplate/hugo_template.go | 5+++++
Mtpl/strings/strings.go | 7++++---
Mtpl/template.go | 46++++++++++++++++++++++++++++++++++++++++++++++
Mtpl/template_test.go | 41+++++++++++++++++++++++++++++++++++++++++
Mtpl/transform/transform.go | 3++-
Mtpl/transform/transform_test.go | 1+
10 files changed, 103 insertions(+), 85 deletions(-)
diff --git a/helpers/content.go b/helpers/content.go
@@ -34,7 +34,6 @@ import (
 
 	"github.com/gohugoio/hugo/markup"
 
-	bp "github.com/gohugoio/hugo/bufferpool"
 	"github.com/gohugoio/hugo/config"
 )
 
@@ -104,45 +103,6 @@ func NewContentSpec(cfg config.Provider, logger loggers.Logger, contentFs afero.
 	return spec, nil
 }
 
-var stripHTMLReplacer = strings.NewReplacer("\n", " ", "</p>", "\n", "<br>", "\n", "<br />", "\n")
-
-// StripHTML accepts a string, strips out all HTML tags and returns it.
-func StripHTML(s string) string {
-	// Shortcut strings with no tags in them
-	if !strings.ContainsAny(s, "<>") {
-		return s
-	}
-	s = stripHTMLReplacer.Replace(s)
-
-	// Walk through the string removing all tags
-	b := bp.GetBuffer()
-	defer bp.PutBuffer(b)
-	var inTag, isSpace, wasSpace bool
-	for _, r := range s {
-		if !inTag {
-			isSpace = false
-		}
-
-		switch {
-		case r == '<':
-			inTag = true
-		case r == '>':
-			inTag = false
-		case unicode.IsSpace(r):
-			isSpace = true
-			fallthrough
-		default:
-			if !inTag && (!isSpace || (isSpace && !wasSpace)) {
-				b.WriteRune(r)
-			}
-		}
-
-		wasSpace = isSpace
-
-	}
-	return b.String()
-}
-
 // stripEmptyNav strips out empty <nav> tags from content.
 func stripEmptyNav(in []byte) []byte {
 	return bytes.Replace(in, []byte("<nav>\n</nav>\n\n"), []byte(``), -1)
diff --git a/helpers/content_test.go b/helpers/content_test.go
@@ -52,44 +52,6 @@ func TestTrimShortHTML(t *testing.T) {
 	}
 }
 
-func TestStripHTML(t *testing.T) {
-	type test struct {
-		input, expected string
-	}
-	data := []test{
-		{"<h1>strip h1 tag <h1>", "strip h1 tag "},
-		{"<p> strip p tag </p>", " strip p tag "},
-		{"</br> strip br<br>", " strip br\n"},
-		{"</br> strip br2<br />", " strip br2\n"},
-		{"This <strong>is</strong> a\nnewline", "This is a newline"},
-		{"No Tags", "No Tags"},
-		{`<p>Summary Next Line.
-<figure >
-
-        <img src="/not/real" />
-
-
-</figure>
-.
-More text here.</p>
-
-<p>Some more text</p>`, "Summary Next Line.  . More text here.\nSome more text\n"},
-	}
-	for i, d := range data {
-		output := StripHTML(d.input)
-		if d.expected != output {
-			t.Errorf("Test %d failed. Expected %q got %q", i, d.expected, output)
-		}
-	}
-}
-
-func BenchmarkStripHTML(b *testing.B) {
-	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
-		StripHTML(tstHTMLContent)
-	}
-}
-
 func TestStripEmptyNav(t *testing.T) {
 	c := qt.New(t)
 	cleaned := stripEmptyNav([]byte("do<nav>\n</nav>\n\nbedobedo"))
diff --git a/hugolib/page__per_output.go b/hugolib/page__per_output.go
@@ -201,7 +201,7 @@ func newPageContentOutput(p *pageState, po *pageOutput) (*pageContentOutput, err
 	})
 
 	cp.initPlain = cp.initMain.Branch(func() (any, error) {
-		cp.plain = helpers.StripHTML(string(cp.content))
+		cp.plain = tpl.StripHTML(string(cp.content))
 		cp.plainWords = strings.Fields(cp.plain)
 		cp.setWordCounts(p.m.isCJKLanguage)
 
diff --git a/hugolib/page_test.go b/hugolib/page_test.go
@@ -26,6 +26,7 @@ import (
 	"github.com/gohugoio/hugo/htesting"
 	"github.com/gohugoio/hugo/markup/asciidocext"
 	"github.com/gohugoio/hugo/markup/rst"
+	"github.com/gohugoio/hugo/tpl"
 
 	"github.com/gohugoio/hugo/config"
 
@@ -40,7 +41,6 @@ import (
 
 	qt "github.com/frankban/quicktest"
 	"github.com/gohugoio/hugo/deps"
-	"github.com/gohugoio/hugo/helpers"
 )
 
 const (
@@ -351,7 +351,7 @@ func normalizeExpected(ext, str string) string {
 	default:
 		return str
 	case "html":
-		return strings.Trim(helpers.StripHTML(str), " ")
+		return strings.Trim(tpl.StripHTML(str), " ")
 	case "ad":
 		paragraphs := strings.Split(str, "</p>")
 		expected := ""
@@ -1736,6 +1736,7 @@ Len Summary: {{ len .Summary }}
 Len Content: {{ len .Content }}
 
 SUMMARY:{{ .Summary }}:{{ len .Summary }}:END
+
 `}
 
 	b := newTestSitesBuilder(t)
diff --git a/tpl/internal/go_templates/htmltemplate/hugo_template.go b/tpl/internal/go_templates/htmltemplate/hugo_template.go
@@ -34,3 +34,8 @@ func (t *Template) Prepare() (*template.Template, error) {
 	}
 	return t.text, nil
 }
+
+// See https://github.com/golang/go/issues/5884
+func StripTags(html string) string {
+	return stripTags(html)
+}
diff --git a/tpl/strings/strings.go b/tpl/strings/strings.go
@@ -25,6 +25,7 @@ import (
 	"github.com/gohugoio/hugo/common/text"
 	"github.com/gohugoio/hugo/deps"
 	"github.com/gohugoio/hugo/helpers"
+	"github.com/gohugoio/hugo/tpl"
 
 	"github.com/spf13/cast"
 )
@@ -52,7 +53,7 @@ func (ns *Namespace) CountRunes(s any) (int, error) {
 	}
 
 	counter := 0
-	for _, r := range helpers.StripHTML(ss) {
+	for _, r := range tpl.StripHTML(ss) {
 		if !helpers.IsWhitespace(r) {
 			counter++
 		}
@@ -83,11 +84,11 @@ func (ns *Namespace) CountWords(s any) (int, error) {
 	}
 
 	if !isCJKLanguage {
-		return len(strings.Fields(helpers.StripHTML((ss)))), nil
+		return len(strings.Fields(tpl.StripHTML(ss))), nil
 	}
 
 	counter := 0
-	for _, word := range strings.Fields(helpers.StripHTML(ss)) {
+	for _, word := range strings.Fields(tpl.StripHTML(ss)) {
 		runeCount := utf8.RuneCountInString(word)
 		if len(word) == runeCount {
 			counter++
diff --git a/tpl/template.go b/tpl/template.go
@@ -18,9 +18,14 @@ import (
 	"io"
 	"reflect"
 	"regexp"
+	"strings"
+	"unicode"
+
+	bp "github.com/gohugoio/hugo/bufferpool"
 
 	"github.com/gohugoio/hugo/output"
 
+	htmltemplate "github.com/gohugoio/hugo/tpl/internal/go_templates/htmltemplate"
 	texttemplate "github.com/gohugoio/hugo/tpl/internal/go_templates/texttemplate"
 )
 
@@ -163,3 +168,44 @@ func GetHasLockFromContext(ctx context.Context) bool {
 func SetHasLockInContext(ctx context.Context, hasLock bool) context.Context {
 	return context.WithValue(ctx, texttemplate.HasLockContextKey, hasLock)
 }
+
+const hugoNewLinePlaceholder = "___hugonl_"
+
+var (
+	stripHTMLReplacerPre = strings.NewReplacer("\n", " ", "</p>", hugoNewLinePlaceholder, "<br>", hugoNewLinePlaceholder, "<br />", hugoNewLinePlaceholder)
+	whitespaceRe         = regexp.MustCompile(`\s+`)
+)
+
+// StripHTML strips out all HTML tags in s.
+func StripHTML(s string) string {
+	// Shortcut strings with no tags in them
+	if !strings.ContainsAny(s, "<>") {
+		return s
+	}
+
+	pre := stripHTMLReplacerPre.Replace(s)
+	preReplaced := pre != s
+
+	s = htmltemplate.StripTags(pre)
+
+	if preReplaced {
+		s = strings.ReplaceAll(s, hugoNewLinePlaceholder, "\n")
+	}
+
+	var wasSpace bool
+	b := bp.GetBuffer()
+	defer bp.PutBuffer(b)
+	for _, r := range s {
+		isSpace := unicode.IsSpace(r)
+		if !(isSpace && wasSpace) {
+			b.WriteRune(r)
+		}
+		wasSpace = isSpace
+	}
+
+	if b.Len() > 0 {
+		s = b.String()
+	}
+
+	return s
+}
diff --git a/tpl/template_test.go b/tpl/template_test.go
@@ -28,3 +28,44 @@ func TestExtractBaseof(t *testing.T) {
 	c.Assert(extractBaseOf("not baseof for you"), qt.Equals, "")
 	c.Assert(extractBaseOf("template: blog/baseof.html:23:11:"), qt.Equals, "blog/baseof.html")
 }
+
+func TestStripHTML(t *testing.T) {
+	type test struct {
+		input, expected string
+	}
+	data := []test{
+		{"<h1>strip h1 tag <h1>", "strip h1 tag "},
+		{"<p> strip p tag </p>", " strip p tag "},
+		{"</br> strip br<br>", " strip br\n"},
+		{"</br> strip br2<br />", " strip br2\n"},
+		{"This <strong>is</strong> a\nnewline", "This is a newline"},
+		{"No Tags", "No Tags"},
+		{`<p>Summary Next Line.
+<figure >
+
+        <img src="/not/real" />
+
+
+</figure>
+.
+More text here.</p>
+
+<p>Some more text</p>`, "Summary Next Line. . More text here.\nSome more text\n"},
+
+		// Issue 9199
+		{"<div data-action='click->my-controller#doThing'>qwe</div>", "qwe"},
+		{"Hello, World!", "Hello, World!"},
+		{"foo&amp;bar", "foo&amp;bar"},
+		{`Hello <a href="www.example.com/">World</a>!`, "Hello World!"},
+		{"Foo <textarea>Bar</textarea> Baz", "Foo Bar Baz"},
+		{"Foo <!-- Bar --> Baz", "Foo Baz"},
+	}
+	for i, d := range data {
+		output := StripHTML(d.input)
+		if d.expected != output {
+			t.Errorf("Test %d failed. Expected %q got %q", i, d.expected, output)
+		}
+	}
+}
+
+const tstHTMLContent = "<!DOCTYPE html><html><head><script src=\"http://two/foobar.js\"></script></head><body><nav><ul><li hugo-nav=\"section_0\"></li><li hugo-nav=\"section_1\"></li></ul></nav><article>content <a href=\"http://two/foobar\">foobar</a>. Follow up</article><p>This is some text.<br>And some more.</p></body></html>"
diff --git a/tpl/transform/transform.go b/tpl/transform/transform.go
@@ -22,6 +22,7 @@ import (
 	"github.com/gohugoio/hugo/cache/namedmemcache"
 	"github.com/gohugoio/hugo/markup/converter/hooks"
 	"github.com/gohugoio/hugo/markup/highlight"
+	"github.com/gohugoio/hugo/tpl"
 
 	"github.com/gohugoio/hugo/deps"
 	"github.com/gohugoio/hugo/helpers"
@@ -141,7 +142,7 @@ func (ns *Namespace) Plainify(s any) (string, error) {
 		return "", err
 	}
 
-	return helpers.StripHTML(ss), nil
+	return tpl.StripHTML(ss), nil
 }
 
 // For internal use.
diff --git a/tpl/transform/transform_test.go b/tpl/transform/transform_test.go
@@ -237,6 +237,7 @@ func TestPlainify(t *testing.T) {
 		expect any
 	}{
 		{"<em>Note:</em> blah <b>blah</b>", "Note: blah blah"},
+		{"<div data-action='click->my-controller#doThing'>qwe</div>", "qwe"},
 		// errors
 		{tstNoStringer{}, false},
 	} {