1 // Copyright 2017 The Chromium OS Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "bsdiff/suffix_array_index.h"
6
7 #include <algorithm>
8 #include <limits>
9 #include <vector>
10
11 #include <divsufsort.h>
12 #include <divsufsort64.h>
13
14 #include "bsdiff/logging.h"
15
16 namespace {
17
18 // libdivsufsort C++ overloaded functions used to allow calling the right
19 // implementation based on the pointer size.
CallDivSufSort(const uint8_t * text,saidx_t * sa,size_t n)20 int CallDivSufSort(const uint8_t* text, saidx_t* sa, size_t n) {
21 return divsufsort(text, sa, n);
22 }
CallDivSufSort(const uint8_t * text,saidx64_t * sa,size_t n)23 int CallDivSufSort(const uint8_t* text, saidx64_t* sa, size_t n) {
24 return divsufsort64(text, sa, n);
25 }
26
CallSaSearch(const uint8_t * text,size_t text_size,const uint8_t * pattern,size_t pattern_size,const saidx_t * sa,size_t sa_size,saidx_t * left)27 saidx_t CallSaSearch(const uint8_t* text,
28 size_t text_size,
29 const uint8_t* pattern,
30 size_t pattern_size,
31 const saidx_t* sa,
32 size_t sa_size,
33 saidx_t* left) {
34 return sa_search(text, text_size, pattern, pattern_size, sa, sa_size, left);
35 }
CallSaSearch(const uint8_t * text,size_t text_size,const uint8_t * pattern,size_t pattern_size,const saidx64_t * sa,size_t sa_size,saidx64_t * left)36 saidx64_t CallSaSearch(const uint8_t* text,
37 size_t text_size,
38 const uint8_t* pattern,
39 size_t pattern_size,
40 const saidx64_t* sa,
41 size_t sa_size,
42 saidx64_t* left) {
43 return sa_search64(text, text_size, pattern, pattern_size, sa, sa_size, left);
44 }
45
46 } // namespace
47
48 namespace bsdiff {
49
50 // The SAIDX template type must be either saidx_t or saidx64_t, which will
51 // depend on the maximum size of the input text needed.
52 template <typename SAIDX>
53 class SuffixArrayIndex : public SuffixArrayIndexInterface {
54 public:
55 SuffixArrayIndex() = default;
56
57 // Initialize and construct the suffix array of the |text| string of length
58 // |n|. The memory pointed by |text| must be kept alive. Returns whether the
59 // construction succeeded.
60 bool Init(const uint8_t* text, size_t n);
61
62 // SuffixArrayIndexInterface overrides.
63 void SearchPrefix(const uint8_t* target,
64 size_t length,
65 size_t* out_length,
66 uint64_t* out_pos) const override;
67
68 private:
69 const uint8_t* text_{nullptr}; // Owned by the caller.
70 size_t n_{0};
71
72 std::vector<SAIDX> sa_;
73 };
74
75 template <typename SAIDX>
Init(const uint8_t * text,size_t n)76 bool SuffixArrayIndex<SAIDX>::Init(const uint8_t* text, size_t n) {
77 if (!sa_.empty()) {
78 // Already initialized.
79 LOG(ERROR) << "SuffixArray already initialized";
80 return false;
81 }
82 if (static_cast<uint64_t>(n) >
83 static_cast<uint64_t>(std::numeric_limits<SAIDX>::max())) {
84 LOG(ERROR) << "Input too big (" << n << ") for this implementation";
85 return false;
86 }
87 text_ = text;
88 n_ = n;
89 sa_.resize(n + 1);
90
91 if (n > 0 && CallDivSufSort(text_, sa_.data(), n) != 0) {
92 LOG(ERROR) << "divsufsrot() failed";
93 return false;
94 }
95
96 return true;
97 }
98
99 template <typename SAIDX>
SearchPrefix(const uint8_t * target,size_t length,size_t * out_length,uint64_t * out_pos) const100 void SuffixArrayIndex<SAIDX>::SearchPrefix(const uint8_t* target,
101 size_t length,
102 size_t* out_length,
103 uint64_t* out_pos) const {
104 SAIDX suf_left;
105 SAIDX count =
106 CallSaSearch(text_, n_, target, length, sa_.data(), n_, &suf_left);
107 if (count > 0) {
108 // This is the simple case where we found the whole |target| string was
109 // found.
110 *out_pos = sa_[suf_left];
111 *out_length = length;
112 return;
113 }
114 // In this case, |suf_left| points to the first suffix array position such
115 // that the suffix at that position is lexicographically larger than |target|.
116 // We only need to check whether the previous entry or the current entry is a
117 // longer match.
118 size_t prev_suffix_len = 0;
119 if (suf_left > 0) {
120 const size_t prev_max_len =
121 std::min(n_ - static_cast<size_t>(sa_[suf_left - 1]), length);
122 const uint8_t* prev_suffix = text_ + sa_[suf_left - 1];
123 prev_suffix_len =
124 std::mismatch(target, target + prev_max_len, prev_suffix).first -
125 target;
126 }
127
128 size_t next_suffix_len = 0;
129 if (static_cast<size_t>(suf_left) < n_) {
130 const uint8_t* next_suffix = text_ + sa_[suf_left];
131 const size_t next_max_len =
132 std::min(n_ - static_cast<size_t>(sa_[suf_left]), length);
133 next_suffix_len =
134 std::mismatch(target, target + next_max_len, next_suffix).first -
135 target;
136 }
137
138 *out_length = std::max(next_suffix_len, prev_suffix_len);
139 if (!*out_length) {
140 *out_pos = 0;
141 } else if (next_suffix_len >= prev_suffix_len) {
142 *out_pos = sa_[suf_left];
143 } else {
144 *out_pos = sa_[suf_left - 1];
145 }
146 }
147
CreateSuffixArrayIndex(const uint8_t * text,size_t n)148 std::unique_ptr<SuffixArrayIndexInterface> CreateSuffixArrayIndex(
149 const uint8_t* text,
150 size_t n) {
151 // The maximum supported size when using the suffix array based on the 32-bit
152 // saidx_t type. We limit this to something a bit smaller (16 bytes smaller)
153 // than the maximum representable number so references like "n + 1" are don't
154 // overflow.
155 const size_t kMaxSaidxSize = std::numeric_limits<saidx_t>::max() - 16;
156 std::unique_ptr<SuffixArrayIndexInterface> ret;
157
158 if (n > kMaxSaidxSize) {
159 SuffixArrayIndex<saidx64_t>* sa_ptr = new SuffixArrayIndex<saidx64_t>();
160 ret.reset(sa_ptr);
161 if (!sa_ptr->Init(text, n))
162 return nullptr;
163 } else {
164 SuffixArrayIndex<saidx_t>* sa_ptr = new SuffixArrayIndex<saidx_t>();
165 ret.reset(sa_ptr);
166 if (!sa_ptr->Init(text, n))
167 return nullptr;
168 }
169 return ret;
170 }
171
172
173 } // namespace bsdiff
174