hugo

Fork of github.com/gohugoio/hugo with reverse pagination support

git clone git://git.shimmy1996.com/hugo.git

inverted_index.go (11382B)

    1 // Copyright 2019 The Hugo Authors. All rights reserved.
    2 //
    3 // Licensed under the Apache License, Version 2.0 (the "License");
    4 // you may not use this file except in compliance with the License.
    5 // You may obtain a copy of the License at
    6 // http://www.apache.org/licenses/LICENSE-2.0
    7 //
    8 // Unless required by applicable law or agreed to in writing, software
    9 // distributed under the License is distributed on an "AS IS" BASIS,
   10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   11 // See the License for the specific language governing permissions and
   12 // limitations under the License.
   13 
   14 // Package related holds code to help finding related content.
   15 package related
   16 
   17 import (
   18 	"errors"
   19 	"fmt"
   20 	"math"
   21 	"sort"
   22 	"strings"
   23 	"time"
   24 
   25 	"github.com/gohugoio/hugo/common/maps"
   26 
   27 	"github.com/gohugoio/hugo/common/types"
   28 	"github.com/mitchellh/mapstructure"
   29 )
   30 
   31 var (
   32 	_        Keyword = (*StringKeyword)(nil)
   33 	zeroDate         = time.Time{}
   34 
   35 	// DefaultConfig is the default related config.
   36 	DefaultConfig = Config{
   37 		Threshold: 80,
   38 		Indices: IndexConfigs{
   39 			IndexConfig{Name: "keywords", Weight: 100},
   40 			IndexConfig{Name: "date", Weight: 10},
   41 		},
   42 	}
   43 )
   44 
   45 /*
   46 Config is the top level configuration element used to configure how to retrieve
   47 related content in Hugo.
   48 
   49 An example site config.toml:
   50 
   51 	[related]
   52 	threshold = 1
   53 	[[related.indices]]
   54 	name = "keywords"
   55 	weight = 200
   56 	[[related.indices]]
   57 	name  = "tags"
   58 	weight = 100
   59 	[[related.indices]]
   60 	name  = "date"
   61 	weight = 1
   62 	pattern = "2006"
   63 */
   64 type Config struct {
   65 	// Only include matches >= threshold, a normalized rank between 0 and 100.
   66 	Threshold int
   67 
   68 	// To get stable "See also" sections we, by default, exclude newer related pages.
   69 	IncludeNewer bool
   70 
   71 	// Will lower case all string values and queries to the indices.
   72 	// May get better results, but at a slight performance cost.
   73 	ToLower bool
   74 
   75 	Indices IndexConfigs
   76 }
   77 
   78 // Add adds a given index.
   79 func (c *Config) Add(index IndexConfig) {
   80 	if c.ToLower {
   81 		index.ToLower = true
   82 	}
   83 	c.Indices = append(c.Indices, index)
   84 }
   85 
   86 // IndexConfigs holds a set of index configurations.
   87 type IndexConfigs []IndexConfig
   88 
   89 // IndexConfig configures an index.
   90 type IndexConfig struct {
   91 	// The index name. This directly maps to a field or Param name.
   92 	Name string
   93 
   94 	// Contextual pattern used to convert the Param value into a string.
   95 	// Currently only used for dates. Can be used to, say, bump posts in the same
   96 	// time frame when searching for related documents.
   97 	// For dates it follows Go's time.Format patterns, i.e.
   98 	// "2006" for YYYY and "200601" for YYYYMM.
   99 	Pattern string
  100 
  101 	// This field's weight when doing multi-index searches. Higher is "better".
  102 	Weight int
  103 
  104 	// Will lower case all string values in and queries tothis index.
  105 	// May get better accurate results, but at a slight performance cost.
  106 	ToLower bool
  107 }
  108 
  109 // Document is the interface an indexable document in Hugo must fulfill.
  110 type Document interface {
  111 	// RelatedKeywords returns a list of keywords for the given index config.
  112 	RelatedKeywords(cfg IndexConfig) ([]Keyword, error)
  113 
  114 	// When this document was or will be published.
  115 	PublishDate() time.Time
  116 
  117 	// Name is used as an tiebreaker if both Weight and PublishDate are
  118 	// the same.
  119 	Name() string
  120 }
  121 
  122 // InvertedIndex holds an inverted index, also sometimes named posting list, which
  123 // lists, for every possible search term, the documents that contain that term.
  124 type InvertedIndex struct {
  125 	cfg   Config
  126 	index map[string]map[Keyword][]Document
  127 
  128 	minWeight int
  129 	maxWeight int
  130 }
  131 
  132 func (idx *InvertedIndex) getIndexCfg(name string) (IndexConfig, bool) {
  133 	for _, conf := range idx.cfg.Indices {
  134 		if conf.Name == name {
  135 			return conf, true
  136 		}
  137 	}
  138 
  139 	return IndexConfig{}, false
  140 }
  141 
  142 // NewInvertedIndex creates a new InvertedIndex.
  143 // Documents to index must be added in Add.
  144 func NewInvertedIndex(cfg Config) *InvertedIndex {
  145 	idx := &InvertedIndex{index: make(map[string]map[Keyword][]Document), cfg: cfg}
  146 	for _, conf := range cfg.Indices {
  147 		idx.index[conf.Name] = make(map[Keyword][]Document)
  148 		if conf.Weight < idx.minWeight {
  149 			// By default, the weight scale starts at 0, but we allow
  150 			// negative weights.
  151 			idx.minWeight = conf.Weight
  152 		}
  153 		if conf.Weight > idx.maxWeight {
  154 			idx.maxWeight = conf.Weight
  155 		}
  156 	}
  157 	return idx
  158 }
  159 
  160 // Add documents to the inverted index.
  161 // The value must support == and !=.
  162 func (idx *InvertedIndex) Add(docs ...Document) error {
  163 	var err error
  164 	for _, config := range idx.cfg.Indices {
  165 		if config.Weight == 0 {
  166 			// Disabled
  167 			continue
  168 		}
  169 		setm := idx.index[config.Name]
  170 
  171 		for _, doc := range docs {
  172 			var words []Keyword
  173 			words, err = doc.RelatedKeywords(config)
  174 			if err != nil {
  175 				continue
  176 			}
  177 
  178 			for _, keyword := range words {
  179 				setm[keyword] = append(setm[keyword], doc)
  180 			}
  181 		}
  182 	}
  183 
  184 	return err
  185 }
  186 
  187 // queryElement holds the index name and keywords that can be used to compose a
  188 // search for related content.
  189 type queryElement struct {
  190 	Index    string
  191 	Keywords []Keyword
  192 }
  193 
  194 func newQueryElement(index string, keywords ...Keyword) queryElement {
  195 	return queryElement{Index: index, Keywords: keywords}
  196 }
  197 
  198 type ranks []*rank
  199 
  200 type rank struct {
  201 	Doc     Document
  202 	Weight  int
  203 	Matches int
  204 }
  205 
  206 func (r *rank) addWeight(w int) {
  207 	r.Weight += w
  208 	r.Matches++
  209 }
  210 
  211 func newRank(doc Document, weight int) *rank {
  212 	return &rank{Doc: doc, Weight: weight, Matches: 1}
  213 }
  214 
  215 func (r ranks) Len() int      { return len(r) }
  216 func (r ranks) Swap(i, j int) { r[i], r[j] = r[j], r[i] }
  217 func (r ranks) Less(i, j int) bool {
  218 	if r[i].Weight == r[j].Weight {
  219 		if r[i].Doc.PublishDate() == r[j].Doc.PublishDate() {
  220 			return r[i].Doc.Name() < r[j].Doc.Name()
  221 		}
  222 		return r[i].Doc.PublishDate().After(r[j].Doc.PublishDate())
  223 	}
  224 	return r[i].Weight > r[j].Weight
  225 }
  226 
  227 // SearchDoc finds the documents matching any of the keywords in the given indices
  228 // against the given document.
  229 // The resulting document set will be sorted according to number of matches
  230 // and the index weights, and any matches with a rank below the configured
  231 // threshold (normalize to 0..100) will be removed.
  232 // If an index name is provided, only that index will be queried.
  233 func (idx *InvertedIndex) SearchDoc(doc Document, indices ...string) ([]Document, error) {
  234 	var q []queryElement
  235 
  236 	var configs IndexConfigs
  237 
  238 	if len(indices) == 0 {
  239 		configs = idx.cfg.Indices
  240 	} else {
  241 		configs = make(IndexConfigs, len(indices))
  242 		for i, indexName := range indices {
  243 			cfg, found := idx.getIndexCfg(indexName)
  244 			if !found {
  245 				return nil, fmt.Errorf("index %q not found", indexName)
  246 			}
  247 			configs[i] = cfg
  248 		}
  249 	}
  250 
  251 	for _, cfg := range configs {
  252 		keywords, err := doc.RelatedKeywords(cfg)
  253 		if err != nil {
  254 			return nil, err
  255 		}
  256 
  257 		q = append(q, newQueryElement(cfg.Name, keywords...))
  258 
  259 	}
  260 
  261 	return idx.searchDate(doc.PublishDate(), q...)
  262 }
  263 
  264 // ToKeywords returns a Keyword slice of the given input.
  265 func (cfg IndexConfig) ToKeywords(v any) ([]Keyword, error) {
  266 	var (
  267 		keywords []Keyword
  268 		toLower  = cfg.ToLower
  269 	)
  270 	switch vv := v.(type) {
  271 	case string:
  272 		if toLower {
  273 			vv = strings.ToLower(vv)
  274 		}
  275 		keywords = append(keywords, StringKeyword(vv))
  276 	case []string:
  277 		if toLower {
  278 			vc := make([]string, len(vv))
  279 			copy(vc, vv)
  280 			for i := 0; i < len(vc); i++ {
  281 				vc[i] = strings.ToLower(vc[i])
  282 			}
  283 			vv = vc
  284 		}
  285 		keywords = append(keywords, StringsToKeywords(vv...)...)
  286 	case time.Time:
  287 		layout := "2006"
  288 		if cfg.Pattern != "" {
  289 			layout = cfg.Pattern
  290 		}
  291 		keywords = append(keywords, StringKeyword(vv.Format(layout)))
  292 	case nil:
  293 		return keywords, nil
  294 	default:
  295 		return keywords, fmt.Errorf("indexing currently not supported for index %q and type %T", cfg.Name, vv)
  296 	}
  297 
  298 	return keywords, nil
  299 }
  300 
  301 // SearchKeyValues finds the documents matching any of the keywords in the given indices.
  302 // The resulting document set will be sorted according to number of matches
  303 // and the index weights, and any matches with a rank below the configured
  304 // threshold (normalize to 0..100) will be removed.
  305 func (idx *InvertedIndex) SearchKeyValues(args ...types.KeyValues) ([]Document, error) {
  306 	q := make([]queryElement, len(args))
  307 
  308 	for i, arg := range args {
  309 		var keywords []Keyword
  310 		key := arg.KeyString()
  311 		if key == "" {
  312 			return nil, fmt.Errorf("index %q not valid", arg.Key)
  313 		}
  314 		conf, found := idx.getIndexCfg(key)
  315 		if !found {
  316 			return nil, fmt.Errorf("index %q not found", key)
  317 		}
  318 
  319 		for _, val := range arg.Values {
  320 			k, err := conf.ToKeywords(val)
  321 			if err != nil {
  322 				return nil, err
  323 			}
  324 			keywords = append(keywords, k...)
  325 		}
  326 
  327 		q[i] = newQueryElement(conf.Name, keywords...)
  328 
  329 	}
  330 
  331 	return idx.search(q...)
  332 }
  333 
  334 func (idx *InvertedIndex) search(query ...queryElement) ([]Document, error) {
  335 	return idx.searchDate(zeroDate, query...)
  336 }
  337 
  338 func (idx *InvertedIndex) searchDate(upperDate time.Time, query ...queryElement) ([]Document, error) {
  339 	matchm := make(map[Document]*rank, 200)
  340 	applyDateFilter := !idx.cfg.IncludeNewer && !upperDate.IsZero()
  341 
  342 	for _, el := range query {
  343 		setm, found := idx.index[el.Index]
  344 		if !found {
  345 			return []Document{}, fmt.Errorf("index for %q not found", el.Index)
  346 		}
  347 
  348 		config, found := idx.getIndexCfg(el.Index)
  349 		if !found {
  350 			return []Document{}, fmt.Errorf("index config for %q not found", el.Index)
  351 		}
  352 
  353 		for _, kw := range el.Keywords {
  354 			if docs, found := setm[kw]; found {
  355 				for _, doc := range docs {
  356 					if applyDateFilter {
  357 						// Exclude newer than the limit given
  358 						if doc.PublishDate().After(upperDate) {
  359 							continue
  360 						}
  361 					}
  362 					r, found := matchm[doc]
  363 					if !found {
  364 						matchm[doc] = newRank(doc, config.Weight)
  365 					} else {
  366 						r.addWeight(config.Weight)
  367 					}
  368 				}
  369 			}
  370 		}
  371 	}
  372 
  373 	if len(matchm) == 0 {
  374 		return []Document{}, nil
  375 	}
  376 
  377 	matches := make(ranks, 0, 100)
  378 
  379 	for _, v := range matchm {
  380 		avgWeight := v.Weight / v.Matches
  381 		weight := norm(avgWeight, idx.minWeight, idx.maxWeight)
  382 		threshold := idx.cfg.Threshold / v.Matches
  383 
  384 		if weight >= threshold {
  385 			matches = append(matches, v)
  386 		}
  387 	}
  388 
  389 	sort.Stable(matches)
  390 
  391 	result := make([]Document, len(matches))
  392 
  393 	for i, m := range matches {
  394 		result[i] = m.Doc
  395 	}
  396 
  397 	return result, nil
  398 }
  399 
  400 // normalizes num to a number between 0 and 100.
  401 func norm(num, min, max int) int {
  402 	if min > max {
  403 		panic("min > max")
  404 	}
  405 	return int(math.Floor((float64(num-min) / float64(max-min) * 100) + 0.5))
  406 }
  407 
  408 // DecodeConfig decodes a slice of map into Config.
  409 func DecodeConfig(m maps.Params) (Config, error) {
  410 	if m == nil {
  411 		return Config{}, errors.New("no related config provided")
  412 	}
  413 
  414 	if len(m) == 0 {
  415 		return Config{}, errors.New("empty related config provided")
  416 	}
  417 
  418 	var c Config
  419 
  420 	if err := mapstructure.WeakDecode(m, &c); err != nil {
  421 		return c, err
  422 	}
  423 
  424 	if c.Threshold < 0 || c.Threshold > 100 {
  425 		return Config{}, errors.New("related threshold must be between 0 and 100")
  426 	}
  427 
  428 	if c.ToLower {
  429 		for i := range c.Indices {
  430 			c.Indices[i].ToLower = true
  431 		}
  432 	}
  433 
  434 	return c, nil
  435 }
  436 
  437 // StringKeyword is a string search keyword.
  438 type StringKeyword string
  439 
  440 func (s StringKeyword) String() string {
  441 	return string(s)
  442 }
  443 
  444 // Keyword is the interface a keyword in the search index must implement.
  445 type Keyword interface {
  446 	String() string
  447 }
  448 
  449 // StringsToKeywords converts the given slice of strings to a slice of Keyword.
  450 func StringsToKeywords(s ...string) []Keyword {
  451 	kw := make([]Keyword, len(s))
  452 
  453 	for i := 0; i < len(s); i++ {
  454 		kw[i] = StringKeyword(s[i])
  455 	}
  456 
  457 	return kw
  458 }