inverted_index.go (11382B)
1 // Copyright 2019 The Hugo Authors. All rights reserved. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // http://www.apache.org/licenses/LICENSE-2.0 7 // 8 // Unless required by applicable law or agreed to in writing, software 9 // distributed under the License is distributed on an "AS IS" BASIS, 10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 // Package related holds code to help finding related content. 15 package related 16 17 import ( 18 "errors" 19 "fmt" 20 "math" 21 "sort" 22 "strings" 23 "time" 24 25 "github.com/gohugoio/hugo/common/maps" 26 27 "github.com/gohugoio/hugo/common/types" 28 "github.com/mitchellh/mapstructure" 29 ) 30 31 var ( 32 _ Keyword = (*StringKeyword)(nil) 33 zeroDate = time.Time{} 34 35 // DefaultConfig is the default related config. 36 DefaultConfig = Config{ 37 Threshold: 80, 38 Indices: IndexConfigs{ 39 IndexConfig{Name: "keywords", Weight: 100}, 40 IndexConfig{Name: "date", Weight: 10}, 41 }, 42 } 43 ) 44 45 /* 46 Config is the top level configuration element used to configure how to retrieve 47 related content in Hugo. 48 49 An example site config.toml: 50 51 [related] 52 threshold = 1 53 [[related.indices]] 54 name = "keywords" 55 weight = 200 56 [[related.indices]] 57 name = "tags" 58 weight = 100 59 [[related.indices]] 60 name = "date" 61 weight = 1 62 pattern = "2006" 63 */ 64 type Config struct { 65 // Only include matches >= threshold, a normalized rank between 0 and 100. 66 Threshold int 67 68 // To get stable "See also" sections we, by default, exclude newer related pages. 69 IncludeNewer bool 70 71 // Will lower case all string values and queries to the indices. 72 // May get better results, but at a slight performance cost. 73 ToLower bool 74 75 Indices IndexConfigs 76 } 77 78 // Add adds a given index. 79 func (c *Config) Add(index IndexConfig) { 80 if c.ToLower { 81 index.ToLower = true 82 } 83 c.Indices = append(c.Indices, index) 84 } 85 86 // IndexConfigs holds a set of index configurations. 87 type IndexConfigs []IndexConfig 88 89 // IndexConfig configures an index. 90 type IndexConfig struct { 91 // The index name. This directly maps to a field or Param name. 92 Name string 93 94 // Contextual pattern used to convert the Param value into a string. 95 // Currently only used for dates. Can be used to, say, bump posts in the same 96 // time frame when searching for related documents. 97 // For dates it follows Go's time.Format patterns, i.e. 98 // "2006" for YYYY and "200601" for YYYYMM. 99 Pattern string 100 101 // This field's weight when doing multi-index searches. Higher is "better". 102 Weight int 103 104 // Will lower case all string values in and queries tothis index. 105 // May get better accurate results, but at a slight performance cost. 106 ToLower bool 107 } 108 109 // Document is the interface an indexable document in Hugo must fulfill. 110 type Document interface { 111 // RelatedKeywords returns a list of keywords for the given index config. 112 RelatedKeywords(cfg IndexConfig) ([]Keyword, error) 113 114 // When this document was or will be published. 115 PublishDate() time.Time 116 117 // Name is used as an tiebreaker if both Weight and PublishDate are 118 // the same. 119 Name() string 120 } 121 122 // InvertedIndex holds an inverted index, also sometimes named posting list, which 123 // lists, for every possible search term, the documents that contain that term. 124 type InvertedIndex struct { 125 cfg Config 126 index map[string]map[Keyword][]Document 127 128 minWeight int 129 maxWeight int 130 } 131 132 func (idx *InvertedIndex) getIndexCfg(name string) (IndexConfig, bool) { 133 for _, conf := range idx.cfg.Indices { 134 if conf.Name == name { 135 return conf, true 136 } 137 } 138 139 return IndexConfig{}, false 140 } 141 142 // NewInvertedIndex creates a new InvertedIndex. 143 // Documents to index must be added in Add. 144 func NewInvertedIndex(cfg Config) *InvertedIndex { 145 idx := &InvertedIndex{index: make(map[string]map[Keyword][]Document), cfg: cfg} 146 for _, conf := range cfg.Indices { 147 idx.index[conf.Name] = make(map[Keyword][]Document) 148 if conf.Weight < idx.minWeight { 149 // By default, the weight scale starts at 0, but we allow 150 // negative weights. 151 idx.minWeight = conf.Weight 152 } 153 if conf.Weight > idx.maxWeight { 154 idx.maxWeight = conf.Weight 155 } 156 } 157 return idx 158 } 159 160 // Add documents to the inverted index. 161 // The value must support == and !=. 162 func (idx *InvertedIndex) Add(docs ...Document) error { 163 var err error 164 for _, config := range idx.cfg.Indices { 165 if config.Weight == 0 { 166 // Disabled 167 continue 168 } 169 setm := idx.index[config.Name] 170 171 for _, doc := range docs { 172 var words []Keyword 173 words, err = doc.RelatedKeywords(config) 174 if err != nil { 175 continue 176 } 177 178 for _, keyword := range words { 179 setm[keyword] = append(setm[keyword], doc) 180 } 181 } 182 } 183 184 return err 185 } 186 187 // queryElement holds the index name and keywords that can be used to compose a 188 // search for related content. 189 type queryElement struct { 190 Index string 191 Keywords []Keyword 192 } 193 194 func newQueryElement(index string, keywords ...Keyword) queryElement { 195 return queryElement{Index: index, Keywords: keywords} 196 } 197 198 type ranks []*rank 199 200 type rank struct { 201 Doc Document 202 Weight int 203 Matches int 204 } 205 206 func (r *rank) addWeight(w int) { 207 r.Weight += w 208 r.Matches++ 209 } 210 211 func newRank(doc Document, weight int) *rank { 212 return &rank{Doc: doc, Weight: weight, Matches: 1} 213 } 214 215 func (r ranks) Len() int { return len(r) } 216 func (r ranks) Swap(i, j int) { r[i], r[j] = r[j], r[i] } 217 func (r ranks) Less(i, j int) bool { 218 if r[i].Weight == r[j].Weight { 219 if r[i].Doc.PublishDate() == r[j].Doc.PublishDate() { 220 return r[i].Doc.Name() < r[j].Doc.Name() 221 } 222 return r[i].Doc.PublishDate().After(r[j].Doc.PublishDate()) 223 } 224 return r[i].Weight > r[j].Weight 225 } 226 227 // SearchDoc finds the documents matching any of the keywords in the given indices 228 // against the given document. 229 // The resulting document set will be sorted according to number of matches 230 // and the index weights, and any matches with a rank below the configured 231 // threshold (normalize to 0..100) will be removed. 232 // If an index name is provided, only that index will be queried. 233 func (idx *InvertedIndex) SearchDoc(doc Document, indices ...string) ([]Document, error) { 234 var q []queryElement 235 236 var configs IndexConfigs 237 238 if len(indices) == 0 { 239 configs = idx.cfg.Indices 240 } else { 241 configs = make(IndexConfigs, len(indices)) 242 for i, indexName := range indices { 243 cfg, found := idx.getIndexCfg(indexName) 244 if !found { 245 return nil, fmt.Errorf("index %q not found", indexName) 246 } 247 configs[i] = cfg 248 } 249 } 250 251 for _, cfg := range configs { 252 keywords, err := doc.RelatedKeywords(cfg) 253 if err != nil { 254 return nil, err 255 } 256 257 q = append(q, newQueryElement(cfg.Name, keywords...)) 258 259 } 260 261 return idx.searchDate(doc.PublishDate(), q...) 262 } 263 264 // ToKeywords returns a Keyword slice of the given input. 265 func (cfg IndexConfig) ToKeywords(v any) ([]Keyword, error) { 266 var ( 267 keywords []Keyword 268 toLower = cfg.ToLower 269 ) 270 switch vv := v.(type) { 271 case string: 272 if toLower { 273 vv = strings.ToLower(vv) 274 } 275 keywords = append(keywords, StringKeyword(vv)) 276 case []string: 277 if toLower { 278 vc := make([]string, len(vv)) 279 copy(vc, vv) 280 for i := 0; i < len(vc); i++ { 281 vc[i] = strings.ToLower(vc[i]) 282 } 283 vv = vc 284 } 285 keywords = append(keywords, StringsToKeywords(vv...)...) 286 case time.Time: 287 layout := "2006" 288 if cfg.Pattern != "" { 289 layout = cfg.Pattern 290 } 291 keywords = append(keywords, StringKeyword(vv.Format(layout))) 292 case nil: 293 return keywords, nil 294 default: 295 return keywords, fmt.Errorf("indexing currently not supported for index %q and type %T", cfg.Name, vv) 296 } 297 298 return keywords, nil 299 } 300 301 // SearchKeyValues finds the documents matching any of the keywords in the given indices. 302 // The resulting document set will be sorted according to number of matches 303 // and the index weights, and any matches with a rank below the configured 304 // threshold (normalize to 0..100) will be removed. 305 func (idx *InvertedIndex) SearchKeyValues(args ...types.KeyValues) ([]Document, error) { 306 q := make([]queryElement, len(args)) 307 308 for i, arg := range args { 309 var keywords []Keyword 310 key := arg.KeyString() 311 if key == "" { 312 return nil, fmt.Errorf("index %q not valid", arg.Key) 313 } 314 conf, found := idx.getIndexCfg(key) 315 if !found { 316 return nil, fmt.Errorf("index %q not found", key) 317 } 318 319 for _, val := range arg.Values { 320 k, err := conf.ToKeywords(val) 321 if err != nil { 322 return nil, err 323 } 324 keywords = append(keywords, k...) 325 } 326 327 q[i] = newQueryElement(conf.Name, keywords...) 328 329 } 330 331 return idx.search(q...) 332 } 333 334 func (idx *InvertedIndex) search(query ...queryElement) ([]Document, error) { 335 return idx.searchDate(zeroDate, query...) 336 } 337 338 func (idx *InvertedIndex) searchDate(upperDate time.Time, query ...queryElement) ([]Document, error) { 339 matchm := make(map[Document]*rank, 200) 340 applyDateFilter := !idx.cfg.IncludeNewer && !upperDate.IsZero() 341 342 for _, el := range query { 343 setm, found := idx.index[el.Index] 344 if !found { 345 return []Document{}, fmt.Errorf("index for %q not found", el.Index) 346 } 347 348 config, found := idx.getIndexCfg(el.Index) 349 if !found { 350 return []Document{}, fmt.Errorf("index config for %q not found", el.Index) 351 } 352 353 for _, kw := range el.Keywords { 354 if docs, found := setm[kw]; found { 355 for _, doc := range docs { 356 if applyDateFilter { 357 // Exclude newer than the limit given 358 if doc.PublishDate().After(upperDate) { 359 continue 360 } 361 } 362 r, found := matchm[doc] 363 if !found { 364 matchm[doc] = newRank(doc, config.Weight) 365 } else { 366 r.addWeight(config.Weight) 367 } 368 } 369 } 370 } 371 } 372 373 if len(matchm) == 0 { 374 return []Document{}, nil 375 } 376 377 matches := make(ranks, 0, 100) 378 379 for _, v := range matchm { 380 avgWeight := v.Weight / v.Matches 381 weight := norm(avgWeight, idx.minWeight, idx.maxWeight) 382 threshold := idx.cfg.Threshold / v.Matches 383 384 if weight >= threshold { 385 matches = append(matches, v) 386 } 387 } 388 389 sort.Stable(matches) 390 391 result := make([]Document, len(matches)) 392 393 for i, m := range matches { 394 result[i] = m.Doc 395 } 396 397 return result, nil 398 } 399 400 // normalizes num to a number between 0 and 100. 401 func norm(num, min, max int) int { 402 if min > max { 403 panic("min > max") 404 } 405 return int(math.Floor((float64(num-min) / float64(max-min) * 100) + 0.5)) 406 } 407 408 // DecodeConfig decodes a slice of map into Config. 409 func DecodeConfig(m maps.Params) (Config, error) { 410 if m == nil { 411 return Config{}, errors.New("no related config provided") 412 } 413 414 if len(m) == 0 { 415 return Config{}, errors.New("empty related config provided") 416 } 417 418 var c Config 419 420 if err := mapstructure.WeakDecode(m, &c); err != nil { 421 return c, err 422 } 423 424 if c.Threshold < 0 || c.Threshold > 100 { 425 return Config{}, errors.New("related threshold must be between 0 and 100") 426 } 427 428 if c.ToLower { 429 for i := range c.Indices { 430 c.Indices[i].ToLower = true 431 } 432 } 433 434 return c, nil 435 } 436 437 // StringKeyword is a string search keyword. 438 type StringKeyword string 439 440 func (s StringKeyword) String() string { 441 return string(s) 442 } 443 444 // Keyword is the interface a keyword in the search index must implement. 445 type Keyword interface { 446 String() string 447 } 448 449 // StringsToKeywords converts the given slice of strings to a slice of Keyword. 450 func StringsToKeywords(s ...string) []Keyword { 451 kw := make([]Keyword, len(s)) 452 453 for i := 0; i < len(s); i++ { 454 kw[i] = StringKeyword(s[i]) 455 } 456 457 return kw 458 }