1 // Copyright (C) 2021 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_SCORING_BM25F_CALCULATOR_H_ 16 #define ICING_SCORING_BM25F_CALCULATOR_H_ 17 18 #include <cstdint> 19 #include <string> 20 #include <unordered_set> 21 #include <vector> 22 23 #include "icing/index/iterator/doc-hit-info-iterator.h" 24 #include "icing/legacy/index/icing-bit-util.h" 25 #include "icing/scoring/section-weights.h" 26 #include "icing/store/corpus-id.h" 27 #include "icing/store/document-store.h" 28 29 namespace icing { 30 namespace lib { 31 32 // Bm25fCalculator encapsulates the logic to compute BM25F term-weight based 33 // ranking function. 34 // 35 // The formula to compute BM25F is as follows: 36 // 37 // BM25F = \sum_i IDF(q_i) * tf(q_i, D) 38 // 39 // where IDF(q_i) is the Inverse Document Frequency (IDF) weight of the query 40 // term q_i in the corpus with document D, and tf(q_i, D) is the weighted and 41 // normalized term frequency of query term q_i in the document D. 42 // 43 // IDF(q_i) is computed as follows: 44 // 45 // N - n(q_i) + 0.5 46 // IDF(q_i) = log(1 + ------------------) 47 // n(q_i) + 0.5 48 // 49 // where N is the number of documents in the corpus, and n(q_i) is the number 50 // of documents in the corpus containing the query term q_i. 51 // 52 // Lastly, tf(q_i, D) is computed as follows: 53 // 54 // f(q_i, D) * (k1 + 1) 55 // Normalized TF = -------------------------------------------- 56 // f(q_i, D) + k1 * (1 - b + b * |D| / avgdl) 57 // 58 // where f(q_i, D) is the frequency of query term q_i in document D, 59 // |D| is the #tokens in D, avgdl is the average document length in the corpus, 60 // k1 and b are smoothing parameters. 61 // 62 // see: go/icing-bm25f 63 // see: glossary/bm25 64 class Bm25fCalculator { 65 public: 66 explicit Bm25fCalculator(const DocumentStore *document_store, 67 SectionWeights *section_weights, 68 int64_t current_time_ms); 69 70 // Precompute and cache statistics relevant to BM25F. 71 // Populates term_id_map_ and corpus_nqi_map_ for use while scoring other 72 // results. 73 // The query_term_iterators map is used to build the 74 // std::unordered_map<std::string_view, TermId> term_id_map_. It must 75 // outlive the bm25f-calculator otherwise the string_view key in term_id_map_, 76 // used later to compute a document score, will be meaningless. 77 void PrepareToScore( 78 std::unordered_map<std::string, std::unique_ptr<DocHitInfoIterator>> 79 *query_term_iterators); 80 81 // Compute the BM25F relevance score for the given hit, represented by 82 // DocHitInfo. 83 // The default score will be returned only when the scorer fails to find or 84 // calculate a score for the document. 85 float ComputeScore(const DocHitInfoIterator *query_it, 86 const DocHitInfo &hit_info, double default_score); 87 88 private: 89 // Compact ID for each query term. 90 using TermId = uint16_t; 91 92 // Compact representation of <CorpusId, TermId> for use as a key in a 93 // hash_map. 94 struct CorpusTermInfo { 95 // Layout bits: 16 bit CorpusId + 16 bit TermId 96 using Value = uint32_t; 97 98 Value value; 99 100 static constexpr int kCorpusIdBits = sizeof(CorpusId); 101 static constexpr int kTermIdBits = sizeof(TermId); 102 CorpusTermInfoCorpusTermInfo103 explicit CorpusTermInfo(CorpusId corpus_id, TermId term_id) : value(0) { 104 BITFIELD_OR(value, kTermIdBits, kCorpusIdBits, 105 static_cast<uint64_t>(corpus_id)); 106 BITFIELD_OR(value, 0, kTermIdBits, term_id); 107 } 108 109 bool operator==(const CorpusTermInfo &other) const { 110 return value == other.value; 111 } 112 }; 113 114 // Returns idf weight for the term and provided corpus. 115 float GetCorpusIdfWeightForTerm(std::string_view term, CorpusId corpus_id); 116 117 // Returns the average document length for the corpus. The average is 118 // calculated as the sum of tokens in the corpus' documents over the total 119 // number of documents plus one. 120 float GetCorpusAvgDocLength(CorpusId corpus_id); 121 122 // Returns the normalized term frequency for the term match and document hit. 123 // This normalizes the term frequency by applying smoothing parameters and 124 // factoring document length. 125 float ComputedNormalizedTermFrequency( 126 const TermMatchInfo &term_match_info, const DocHitInfo &hit_info, 127 const DocumentAssociatedScoreData &data); 128 129 // Returns the weighted term frequency for the term match and document. For 130 // each section the term is present, we scale the term frequency by its 131 // section weight. We return the sum of the weighted term frequencies over all 132 // sections. 133 float ComputeTermFrequencyForMatchedSections( 134 CorpusId corpus_id, const TermMatchInfo &term_match_info, 135 DocumentId document_id) const; 136 137 // Returns the schema type id for the document by retrieving it from the 138 // DocumentFilterData. 139 SchemaTypeId GetSchemaTypeId(DocumentId document_id) const; 140 141 // Clears cached scoring data and prepares the calculator for a new scoring 142 // run. 143 void Clear(); 144 145 const DocumentStore *document_store_; // Does not own. 146 147 // Used for accessing normalized section weights when computing the weighted 148 // term frequency. 149 SectionWeights §ion_weights_; 150 151 // Map from query term to compact term ID. 152 // Necessary as a key to the other maps. 153 // The use of the string_view as key here means that the query_term_iterators 154 // map must outlive the bm25f 155 std::unordered_map<std::string_view, TermId> term_id_map_; 156 157 // Map from corpus ID to average document length (avgdl). 158 // Necessary to calculate the normalized term frequency. 159 // This information is cached in the DocumentStore::CorpusScoreCache 160 std::unordered_map<CorpusId, float> corpus_avgdl_map_; 161 // Map from <corpus ID, term ID> to number of documents containing term q_i, 162 // called n(q_i). 163 // Necessary to calculate IDF(q_i) (inverse document frequency). 164 // This information must be calculated by iterating through the hits for these 165 // terms. 166 std::unordered_map<CorpusTermInfo::Value, uint32_t> corpus_nqi_map_; 167 168 // Map from <corpus ID, term ID> to IDF(q_i) (inverse document frequency). 169 std::unordered_map<CorpusTermInfo::Value, float> corpus_idf_map_; 170 171 int64_t current_time_ms_; 172 }; 173 174 } // namespace lib 175 } // namespace icing 176 177 #endif // ICING_SCORING_BM25F_CALCULATOR_H_ 178