inverted_index.go (11382B)
1 // Copyright 2019 The Hugo Authors. All rights reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 // http://www.apache.org/licenses/LICENSE-2.0
7 //
8 // Unless required by applicable law or agreed to in writing, software
9 // distributed under the License is distributed on an "AS IS" BASIS,
10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 // See the License for the specific language governing permissions and
12 // limitations under the License.
13
14 // Package related holds code to help finding related content.
15 package related
16
17 import (
18 "errors"
19 "fmt"
20 "math"
21 "sort"
22 "strings"
23 "time"
24
25 "github.com/gohugoio/hugo/common/maps"
26
27 "github.com/gohugoio/hugo/common/types"
28 "github.com/mitchellh/mapstructure"
29 )
30
31 var (
32 _ Keyword = (*StringKeyword)(nil)
33 zeroDate = time.Time{}
34
35 // DefaultConfig is the default related config.
36 DefaultConfig = Config{
37 Threshold: 80,
38 Indices: IndexConfigs{
39 IndexConfig{Name: "keywords", Weight: 100},
40 IndexConfig{Name: "date", Weight: 10},
41 },
42 }
43 )
44
45 /*
46 Config is the top level configuration element used to configure how to retrieve
47 related content in Hugo.
48
49 An example site config.toml:
50
51 [related]
52 threshold = 1
53 [[related.indices]]
54 name = "keywords"
55 weight = 200
56 [[related.indices]]
57 name = "tags"
58 weight = 100
59 [[related.indices]]
60 name = "date"
61 weight = 1
62 pattern = "2006"
63 */
64 type Config struct {
65 // Only include matches >= threshold, a normalized rank between 0 and 100.
66 Threshold int
67
68 // To get stable "See also" sections we, by default, exclude newer related pages.
69 IncludeNewer bool
70
71 // Will lower case all string values and queries to the indices.
72 // May get better results, but at a slight performance cost.
73 ToLower bool
74
75 Indices IndexConfigs
76 }
77
78 // Add adds a given index.
79 func (c *Config) Add(index IndexConfig) {
80 if c.ToLower {
81 index.ToLower = true
82 }
83 c.Indices = append(c.Indices, index)
84 }
85
86 // IndexConfigs holds a set of index configurations.
87 type IndexConfigs []IndexConfig
88
89 // IndexConfig configures an index.
90 type IndexConfig struct {
91 // The index name. This directly maps to a field or Param name.
92 Name string
93
94 // Contextual pattern used to convert the Param value into a string.
95 // Currently only used for dates. Can be used to, say, bump posts in the same
96 // time frame when searching for related documents.
97 // For dates it follows Go's time.Format patterns, i.e.
98 // "2006" for YYYY and "200601" for YYYYMM.
99 Pattern string
100
101 // This field's weight when doing multi-index searches. Higher is "better".
102 Weight int
103
104 // Will lower case all string values in and queries tothis index.
105 // May get better accurate results, but at a slight performance cost.
106 ToLower bool
107 }
108
109 // Document is the interface an indexable document in Hugo must fulfill.
110 type Document interface {
111 // RelatedKeywords returns a list of keywords for the given index config.
112 RelatedKeywords(cfg IndexConfig) ([]Keyword, error)
113
114 // When this document was or will be published.
115 PublishDate() time.Time
116
117 // Name is used as an tiebreaker if both Weight and PublishDate are
118 // the same.
119 Name() string
120 }
121
122 // InvertedIndex holds an inverted index, also sometimes named posting list, which
123 // lists, for every possible search term, the documents that contain that term.
124 type InvertedIndex struct {
125 cfg Config
126 index map[string]map[Keyword][]Document
127
128 minWeight int
129 maxWeight int
130 }
131
132 func (idx *InvertedIndex) getIndexCfg(name string) (IndexConfig, bool) {
133 for _, conf := range idx.cfg.Indices {
134 if conf.Name == name {
135 return conf, true
136 }
137 }
138
139 return IndexConfig{}, false
140 }
141
142 // NewInvertedIndex creates a new InvertedIndex.
143 // Documents to index must be added in Add.
144 func NewInvertedIndex(cfg Config) *InvertedIndex {
145 idx := &InvertedIndex{index: make(map[string]map[Keyword][]Document), cfg: cfg}
146 for _, conf := range cfg.Indices {
147 idx.index[conf.Name] = make(map[Keyword][]Document)
148 if conf.Weight < idx.minWeight {
149 // By default, the weight scale starts at 0, but we allow
150 // negative weights.
151 idx.minWeight = conf.Weight
152 }
153 if conf.Weight > idx.maxWeight {
154 idx.maxWeight = conf.Weight
155 }
156 }
157 return idx
158 }
159
160 // Add documents to the inverted index.
161 // The value must support == and !=.
162 func (idx *InvertedIndex) Add(docs ...Document) error {
163 var err error
164 for _, config := range idx.cfg.Indices {
165 if config.Weight == 0 {
166 // Disabled
167 continue
168 }
169 setm := idx.index[config.Name]
170
171 for _, doc := range docs {
172 var words []Keyword
173 words, err = doc.RelatedKeywords(config)
174 if err != nil {
175 continue
176 }
177
178 for _, keyword := range words {
179 setm[keyword] = append(setm[keyword], doc)
180 }
181 }
182 }
183
184 return err
185 }
186
187 // queryElement holds the index name and keywords that can be used to compose a
188 // search for related content.
189 type queryElement struct {
190 Index string
191 Keywords []Keyword
192 }
193
194 func newQueryElement(index string, keywords ...Keyword) queryElement {
195 return queryElement{Index: index, Keywords: keywords}
196 }
197
198 type ranks []*rank
199
200 type rank struct {
201 Doc Document
202 Weight int
203 Matches int
204 }
205
206 func (r *rank) addWeight(w int) {
207 r.Weight += w
208 r.Matches++
209 }
210
211 func newRank(doc Document, weight int) *rank {
212 return &rank{Doc: doc, Weight: weight, Matches: 1}
213 }
214
215 func (r ranks) Len() int { return len(r) }
216 func (r ranks) Swap(i, j int) { r[i], r[j] = r[j], r[i] }
217 func (r ranks) Less(i, j int) bool {
218 if r[i].Weight == r[j].Weight {
219 if r[i].Doc.PublishDate() == r[j].Doc.PublishDate() {
220 return r[i].Doc.Name() < r[j].Doc.Name()
221 }
222 return r[i].Doc.PublishDate().After(r[j].Doc.PublishDate())
223 }
224 return r[i].Weight > r[j].Weight
225 }
226
227 // SearchDoc finds the documents matching any of the keywords in the given indices
228 // against the given document.
229 // The resulting document set will be sorted according to number of matches
230 // and the index weights, and any matches with a rank below the configured
231 // threshold (normalize to 0..100) will be removed.
232 // If an index name is provided, only that index will be queried.
233 func (idx *InvertedIndex) SearchDoc(doc Document, indices ...string) ([]Document, error) {
234 var q []queryElement
235
236 var configs IndexConfigs
237
238 if len(indices) == 0 {
239 configs = idx.cfg.Indices
240 } else {
241 configs = make(IndexConfigs, len(indices))
242 for i, indexName := range indices {
243 cfg, found := idx.getIndexCfg(indexName)
244 if !found {
245 return nil, fmt.Errorf("index %q not found", indexName)
246 }
247 configs[i] = cfg
248 }
249 }
250
251 for _, cfg := range configs {
252 keywords, err := doc.RelatedKeywords(cfg)
253 if err != nil {
254 return nil, err
255 }
256
257 q = append(q, newQueryElement(cfg.Name, keywords...))
258
259 }
260
261 return idx.searchDate(doc.PublishDate(), q...)
262 }
263
264 // ToKeywords returns a Keyword slice of the given input.
265 func (cfg IndexConfig) ToKeywords(v any) ([]Keyword, error) {
266 var (
267 keywords []Keyword
268 toLower = cfg.ToLower
269 )
270 switch vv := v.(type) {
271 case string:
272 if toLower {
273 vv = strings.ToLower(vv)
274 }
275 keywords = append(keywords, StringKeyword(vv))
276 case []string:
277 if toLower {
278 vc := make([]string, len(vv))
279 copy(vc, vv)
280 for i := 0; i < len(vc); i++ {
281 vc[i] = strings.ToLower(vc[i])
282 }
283 vv = vc
284 }
285 keywords = append(keywords, StringsToKeywords(vv...)...)
286 case time.Time:
287 layout := "2006"
288 if cfg.Pattern != "" {
289 layout = cfg.Pattern
290 }
291 keywords = append(keywords, StringKeyword(vv.Format(layout)))
292 case nil:
293 return keywords, nil
294 default:
295 return keywords, fmt.Errorf("indexing currently not supported for index %q and type %T", cfg.Name, vv)
296 }
297
298 return keywords, nil
299 }
300
301 // SearchKeyValues finds the documents matching any of the keywords in the given indices.
302 // The resulting document set will be sorted according to number of matches
303 // and the index weights, and any matches with a rank below the configured
304 // threshold (normalize to 0..100) will be removed.
305 func (idx *InvertedIndex) SearchKeyValues(args ...types.KeyValues) ([]Document, error) {
306 q := make([]queryElement, len(args))
307
308 for i, arg := range args {
309 var keywords []Keyword
310 key := arg.KeyString()
311 if key == "" {
312 return nil, fmt.Errorf("index %q not valid", arg.Key)
313 }
314 conf, found := idx.getIndexCfg(key)
315 if !found {
316 return nil, fmt.Errorf("index %q not found", key)
317 }
318
319 for _, val := range arg.Values {
320 k, err := conf.ToKeywords(val)
321 if err != nil {
322 return nil, err
323 }
324 keywords = append(keywords, k...)
325 }
326
327 q[i] = newQueryElement(conf.Name, keywords...)
328
329 }
330
331 return idx.search(q...)
332 }
333
334 func (idx *InvertedIndex) search(query ...queryElement) ([]Document, error) {
335 return idx.searchDate(zeroDate, query...)
336 }
337
338 func (idx *InvertedIndex) searchDate(upperDate time.Time, query ...queryElement) ([]Document, error) {
339 matchm := make(map[Document]*rank, 200)
340 applyDateFilter := !idx.cfg.IncludeNewer && !upperDate.IsZero()
341
342 for _, el := range query {
343 setm, found := idx.index[el.Index]
344 if !found {
345 return []Document{}, fmt.Errorf("index for %q not found", el.Index)
346 }
347
348 config, found := idx.getIndexCfg(el.Index)
349 if !found {
350 return []Document{}, fmt.Errorf("index config for %q not found", el.Index)
351 }
352
353 for _, kw := range el.Keywords {
354 if docs, found := setm[kw]; found {
355 for _, doc := range docs {
356 if applyDateFilter {
357 // Exclude newer than the limit given
358 if doc.PublishDate().After(upperDate) {
359 continue
360 }
361 }
362 r, found := matchm[doc]
363 if !found {
364 matchm[doc] = newRank(doc, config.Weight)
365 } else {
366 r.addWeight(config.Weight)
367 }
368 }
369 }
370 }
371 }
372
373 if len(matchm) == 0 {
374 return []Document{}, nil
375 }
376
377 matches := make(ranks, 0, 100)
378
379 for _, v := range matchm {
380 avgWeight := v.Weight / v.Matches
381 weight := norm(avgWeight, idx.minWeight, idx.maxWeight)
382 threshold := idx.cfg.Threshold / v.Matches
383
384 if weight >= threshold {
385 matches = append(matches, v)
386 }
387 }
388
389 sort.Stable(matches)
390
391 result := make([]Document, len(matches))
392
393 for i, m := range matches {
394 result[i] = m.Doc
395 }
396
397 return result, nil
398 }
399
400 // normalizes num to a number between 0 and 100.
401 func norm(num, min, max int) int {
402 if min > max {
403 panic("min > max")
404 }
405 return int(math.Floor((float64(num-min) / float64(max-min) * 100) + 0.5))
406 }
407
408 // DecodeConfig decodes a slice of map into Config.
409 func DecodeConfig(m maps.Params) (Config, error) {
410 if m == nil {
411 return Config{}, errors.New("no related config provided")
412 }
413
414 if len(m) == 0 {
415 return Config{}, errors.New("empty related config provided")
416 }
417
418 var c Config
419
420 if err := mapstructure.WeakDecode(m, &c); err != nil {
421 return c, err
422 }
423
424 if c.Threshold < 0 || c.Threshold > 100 {
425 return Config{}, errors.New("related threshold must be between 0 and 100")
426 }
427
428 if c.ToLower {
429 for i := range c.Indices {
430 c.Indices[i].ToLower = true
431 }
432 }
433
434 return c, nil
435 }
436
437 // StringKeyword is a string search keyword.
438 type StringKeyword string
439
440 func (s StringKeyword) String() string {
441 return string(s)
442 }
443
444 // Keyword is the interface a keyword in the search index must implement.
445 type Keyword interface {
446 String() string
447 }
448
449 // StringsToKeywords converts the given slice of strings to a slice of Keyword.
450 func StringsToKeywords(s ...string) []Keyword {
451 kw := make([]Keyword, len(s))
452
453 for i := 0; i < len(s); i++ {
454 kw[i] = StringKeyword(s[i])
455 }
456
457 return kw
458 }