• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2021 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef ICING_SCORING_BM25F_CALCULATOR_H_
16 #define ICING_SCORING_BM25F_CALCULATOR_H_
17 
18 #include <cstdint>
19 #include <string>
20 #include <unordered_set>
21 #include <vector>
22 
23 #include "icing/index/iterator/doc-hit-info-iterator.h"
24 #include "icing/legacy/index/icing-bit-util.h"
25 #include "icing/scoring/section-weights.h"
26 #include "icing/store/corpus-id.h"
27 #include "icing/store/document-store.h"
28 
29 namespace icing {
30 namespace lib {
31 
32 // Bm25fCalculator encapsulates the logic to compute BM25F term-weight based
33 // ranking function.
34 //
35 // The formula to compute BM25F is as follows:
36 //
37 // BM25F = \sum_i IDF(q_i) * tf(q_i, D)
38 //
39 // where IDF(q_i) is the Inverse Document Frequency (IDF) weight of the query
40 // term q_i in the corpus with document D, and tf(q_i, D) is the weighted and
41 // normalized term frequency of query term q_i in the document D.
42 //
43 // IDF(q_i) is computed as follows:
44 //
45 //                     N - n(q_i) + 0.5
46 // IDF(q_i) = log(1 + ------------------)
47 //                       n(q_i) + 0.5
48 //
49 // where N is the number of documents in the corpus, and n(q_i) is the number
50 // of documents in the corpus containing the query term q_i.
51 //
52 // Lastly, tf(q_i, D) is computed as follows:
53 //
54 //                            f(q_i, D) * (k1 + 1)
55 // Normalized TF = --------------------------------------------
56 //                 f(q_i, D) + k1 * (1 - b + b * |D| / avgdl)
57 //
58 // where f(q_i, D) is the frequency of query term q_i in document D,
59 // |D| is the #tokens in D, avgdl is the average document length in the corpus,
60 // k1 and b are smoothing parameters.
61 //
62 // see: go/icing-bm25f
63 // see: glossary/bm25
64 class Bm25fCalculator {
65  public:
66   explicit Bm25fCalculator(const DocumentStore *document_store,
67                            SectionWeights *section_weights,
68                            int64_t current_time_ms);
69 
70   // Precompute and cache statistics relevant to BM25F.
71   // Populates term_id_map_ and corpus_nqi_map_ for use while scoring other
72   // results.
73   // The query_term_iterators map is used to build the
74   // std::unordered_map<std::string_view, TermId> term_id_map_. It must
75   // outlive the bm25f-calculator otherwise the string_view key in term_id_map_,
76   // used later to compute a document score, will be meaningless.
77   void PrepareToScore(
78       std::unordered_map<std::string, std::unique_ptr<DocHitInfoIterator>>
79           *query_term_iterators);
80 
81   // Compute the BM25F relevance score for the given hit, represented by
82   // DocHitInfo.
83   // The default score will be returned only when the scorer fails to find or
84   // calculate a score for the document.
85   float ComputeScore(const DocHitInfoIterator *query_it,
86                      const DocHitInfo &hit_info, double default_score);
87 
88  private:
89   // Compact ID for each query term.
90   using TermId = uint16_t;
91 
92   // Compact representation of <CorpusId, TermId> for use as a key in a
93   // hash_map.
94   struct CorpusTermInfo {
95     // Layout bits: 16 bit CorpusId + 16 bit TermId
96     using Value = uint32_t;
97 
98     Value value;
99 
100     static constexpr int kCorpusIdBits = sizeof(CorpusId);
101     static constexpr int kTermIdBits = sizeof(TermId);
102 
CorpusTermInfoCorpusTermInfo103     explicit CorpusTermInfo(CorpusId corpus_id, TermId term_id) : value(0) {
104       BITFIELD_OR(value, kTermIdBits, kCorpusIdBits,
105                   static_cast<uint64_t>(corpus_id));
106       BITFIELD_OR(value, 0, kTermIdBits, term_id);
107     }
108 
109     bool operator==(const CorpusTermInfo &other) const {
110       return value == other.value;
111     }
112   };
113 
114   // Returns idf weight for the term and provided corpus.
115   float GetCorpusIdfWeightForTerm(std::string_view term, CorpusId corpus_id);
116 
117   // Returns the average document length for the corpus. The average is
118   // calculated as the sum of tokens in the corpus' documents over the total
119   // number of documents plus one.
120   float GetCorpusAvgDocLength(CorpusId corpus_id);
121 
122   // Returns the normalized term frequency for the term match and document hit.
123   // This normalizes the term frequency by applying smoothing parameters and
124   // factoring document length.
125   float ComputedNormalizedTermFrequency(
126       const TermMatchInfo &term_match_info, const DocHitInfo &hit_info,
127       const DocumentAssociatedScoreData &data);
128 
129   // Returns the weighted term frequency for the term match and document. For
130   // each section the term is present, we scale the term frequency by its
131   // section weight. We return the sum of the weighted term frequencies over all
132   // sections.
133   float ComputeTermFrequencyForMatchedSections(
134       CorpusId corpus_id, const TermMatchInfo &term_match_info,
135       DocumentId document_id) const;
136 
137   // Returns the schema type id for the document by retrieving it from the
138   // DocumentFilterData.
139   SchemaTypeId GetSchemaTypeId(DocumentId document_id) const;
140 
141   // Clears cached scoring data and prepares the calculator for a new scoring
142   // run.
143   void Clear();
144 
145   const DocumentStore *document_store_;  // Does not own.
146 
147   // Used for accessing normalized section weights when computing the weighted
148   // term frequency.
149   SectionWeights &section_weights_;
150 
151   // Map from query term to compact term ID.
152   // Necessary as a key to the other maps.
153   // The use of the string_view as key here means that the query_term_iterators
154   // map must outlive the bm25f
155   std::unordered_map<std::string_view, TermId> term_id_map_;
156 
157   // Map from corpus ID to average document length (avgdl).
158   // Necessary to calculate the normalized term frequency.
159   // This information is cached in the DocumentStore::CorpusScoreCache
160   std::unordered_map<CorpusId, float> corpus_avgdl_map_;
161   // Map from <corpus ID, term ID> to number of documents containing term q_i,
162   // called n(q_i).
163   // Necessary to calculate IDF(q_i) (inverse document frequency).
164   // This information must be calculated by iterating through the hits for these
165   // terms.
166   std::unordered_map<CorpusTermInfo::Value, uint32_t> corpus_nqi_map_;
167 
168   // Map from <corpus ID, term ID> to IDF(q_i) (inverse document frequency).
169   std::unordered_map<CorpusTermInfo::Value, float> corpus_idf_map_;
170 
171   int64_t current_time_ms_;
172 };
173 
174 }  // namespace lib
175 }  // namespace icing
176 
177 #endif  // ICING_SCORING_BM25F_CALCULATOR_H_
178