1 // Copyright 2017 The Chromium OS Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "bsdiff/suffix_array_index.h"
6
7 #include <limits>
8 #include <vector>
9
10 #include <divsufsort.h>
11 #include <divsufsort64.h>
12
13 #include "bsdiff/logging.h"
14
15 namespace {
16
17 // libdivsufsort C++ overloaded functions used to allow calling the right
18 // implementation based on the pointer size.
CallDivSufSort(const uint8_t * text,saidx_t * sa,size_t n)19 int CallDivSufSort(const uint8_t* text, saidx_t* sa, size_t n) {
20 return divsufsort(text, sa, n);
21 }
CallDivSufSort(const uint8_t * text,saidx64_t * sa,size_t n)22 int CallDivSufSort(const uint8_t* text, saidx64_t* sa, size_t n) {
23 return divsufsort64(text, sa, n);
24 }
25
CallSaSearch(const uint8_t * text,size_t text_size,const uint8_t * pattern,size_t pattern_size,const saidx_t * sa,size_t sa_size,saidx_t * left)26 saidx_t CallSaSearch(const uint8_t* text,
27 size_t text_size,
28 const uint8_t* pattern,
29 size_t pattern_size,
30 const saidx_t* sa,
31 size_t sa_size,
32 saidx_t* left) {
33 return sa_search(text, text_size, pattern, pattern_size, sa, sa_size, left);
34 }
CallSaSearch(const uint8_t * text,size_t text_size,const uint8_t * pattern,size_t pattern_size,const saidx64_t * sa,size_t sa_size,saidx64_t * left)35 saidx64_t CallSaSearch(const uint8_t* text,
36 size_t text_size,
37 const uint8_t* pattern,
38 size_t pattern_size,
39 const saidx64_t* sa,
40 size_t sa_size,
41 saidx64_t* left) {
42 return sa_search64(text, text_size, pattern, pattern_size, sa, sa_size, left);
43 }
44
45 } // namespace
46
47 namespace bsdiff {
48
49 // The SAIDX template type must be either saidx_t or saidx64_t, which will
50 // depend on the maximum size of the input text needed.
51 template <typename SAIDX>
52 class SuffixArrayIndex : public SuffixArrayIndexInterface {
53 public:
54 SuffixArrayIndex() = default;
55
56 // Initialize and construct the suffix array of the |text| string of length
57 // |n|. The memory pointed by |text| must be kept alive. Returns whether the
58 // construction succeeded.
59 bool Init(const uint8_t* text, size_t n);
60
61 // SuffixArrayIndexInterface overrides.
62 void SearchPrefix(const uint8_t* target,
63 size_t length,
64 size_t* out_length,
65 uint64_t* out_pos) const override;
66
67 private:
68 const uint8_t* text_{nullptr}; // Owned by the caller.
69 size_t n_{0};
70
71 std::vector<SAIDX> sa_;
72 };
73
74 template <typename SAIDX>
Init(const uint8_t * text,size_t n)75 bool SuffixArrayIndex<SAIDX>::Init(const uint8_t* text, size_t n) {
76 if (!sa_.empty()) {
77 // Already initialized.
78 LOG(ERROR) << "SuffixArray already initialized";
79 return false;
80 }
81 if (static_cast<uint64_t>(n) >
82 static_cast<uint64_t>(std::numeric_limits<SAIDX>::max())) {
83 LOG(ERROR) << "Input too big (" << n << ") for this implementation";
84 return false;
85 }
86 text_ = text;
87 n_ = n;
88 sa_.resize(n + 1);
89
90 if (n > 0 && CallDivSufSort(text_, sa_.data(), n) != 0) {
91 LOG(ERROR) << "divsufsrot() failed";
92 return false;
93 }
94
95 return true;
96 }
97
98 template <typename SAIDX>
SearchPrefix(const uint8_t * target,size_t length,size_t * out_length,uint64_t * out_pos) const99 void SuffixArrayIndex<SAIDX>::SearchPrefix(const uint8_t* target,
100 size_t length,
101 size_t* out_length,
102 uint64_t* out_pos) const {
103 SAIDX suf_left;
104 SAIDX count =
105 CallSaSearch(text_, n_, target, length, sa_.data(), n_, &suf_left);
106 if (count > 0) {
107 // This is the simple case where we found the whole |target| string was
108 // found.
109 *out_pos = sa_[suf_left];
110 *out_length = length;
111 return;
112 }
113 // In this case, |suf_left| points to the first suffix array position such
114 // that the suffix at that position is lexicographically larger than |target|.
115 // We only need to check whether the previous entry or the current entry is a
116 // longer match.
117 size_t prev_suffix_len = 0;
118 if (suf_left > 0) {
119 const size_t prev_max_len =
120 std::min(n_ - static_cast<size_t>(sa_[suf_left - 1]), length);
121 const uint8_t* prev_suffix = text_ + sa_[suf_left - 1];
122 prev_suffix_len =
123 std::mismatch(target, target + prev_max_len, prev_suffix).first -
124 target;
125 }
126
127 size_t next_suffix_len = 0;
128 if (static_cast<size_t>(suf_left) < n_) {
129 const uint8_t* next_suffix = text_ + sa_[suf_left];
130 const size_t next_max_len =
131 std::min(n_ - static_cast<size_t>(sa_[suf_left]), length);
132 next_suffix_len =
133 std::mismatch(target, target + next_max_len, next_suffix).first -
134 target;
135 }
136
137 *out_length = std::max(next_suffix_len, prev_suffix_len);
138 if (!*out_length) {
139 *out_pos = 0;
140 } else if (next_suffix_len >= prev_suffix_len) {
141 *out_pos = sa_[suf_left];
142 } else {
143 *out_pos = sa_[suf_left - 1];
144 }
145 }
146
CreateSuffixArrayIndex(const uint8_t * text,size_t n)147 std::unique_ptr<SuffixArrayIndexInterface> CreateSuffixArrayIndex(
148 const uint8_t* text,
149 size_t n) {
150 // The maximum supported size when using the suffix array based on the 32-bit
151 // saidx_t type. We limit this to something a bit smaller (16 bytes smaller)
152 // than the maximum representable number so references like "n + 1" are don't
153 // overflow.
154 const size_t kMaxSaidxSize = std::numeric_limits<saidx_t>::max() - 16;
155 std::unique_ptr<SuffixArrayIndexInterface> ret;
156
157 if (n > kMaxSaidxSize) {
158 SuffixArrayIndex<saidx64_t>* sa_ptr = new SuffixArrayIndex<saidx64_t>();
159 ret.reset(sa_ptr);
160 if (!sa_ptr->Init(text, n))
161 return nullptr;
162 } else {
163 SuffixArrayIndex<saidx_t>* sa_ptr = new SuffixArrayIndex<saidx_t>();
164 ret.reset(sa_ptr);
165 if (!sa_ptr->Init(text, n))
166 return nullptr;
167 }
168 return ret;
169 }
170
171
172 } // namespace bsdiff
173