1 /*
2 * utils.c for libdivsufsort
3 * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person
6 * obtaining a copy of this software and associated documentation
7 * files (the "Software"), to deal in the Software without
8 * restriction, including without limitation the rights to use,
9 * copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following
12 * conditions:
13 *
14 * The above copyright notice and this permission notice shall be
15 * included in all copies or substantial portions of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
19 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
21 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
22 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
24 * OTHER DEALINGS IN THE SOFTWARE.
25 */
26
27 #include "divsufsort_private.h"
28
29
30 /*- Private Function -*/
31
32 /* Binary search for inverse bwt. */
33 static
34 saidx_t
binarysearch_lower(const saidx_t * A,saidx_t size,saidx_t value)35 binarysearch_lower(const saidx_t *A, saidx_t size, saidx_t value) {
36 saidx_t half, i;
37 for(i = 0, half = size >> 1;
38 0 < size;
39 size = half, half >>= 1) {
40 if(A[i + half] < value) {
41 i += half + 1;
42 half -= (size & 1) ^ 1;
43 }
44 }
45 return i;
46 }
47
48
49 /*- Functions -*/
50
51 /* Burrows-Wheeler transform. */
52 saint_t
bw_transform(const sauchar_t * T,sauchar_t * U,saidx_t * SA,saidx_t n,saidx_t * idx)53 bw_transform(const sauchar_t *T, sauchar_t *U, saidx_t *SA,
54 saidx_t n, saidx_t *idx) {
55 saidx_t *A, i, j, p, t;
56 saint_t c;
57
58 /* Check arguments. */
59 if((T == NULL) || (U == NULL) || (n < 0) || (idx == NULL)) { return -1; }
60 if(n <= 1) {
61 if(n == 1) { U[0] = T[0]; }
62 *idx = n;
63 return 0;
64 }
65
66 if((A = SA) == NULL) {
67 i = divbwt(T, U, NULL, n);
68 if(0 <= i) { *idx = i; i = 0; }
69 return (saint_t)i;
70 }
71
72 /* BW transform. */
73 if(T == U) {
74 t = n;
75 for(i = 0, j = 0; i < n; ++i) {
76 p = t - 1;
77 t = A[i];
78 if(0 <= p) {
79 c = T[j];
80 U[j] = (j <= p) ? T[p] : (sauchar_t)A[p];
81 A[j] = c;
82 j++;
83 } else {
84 *idx = i;
85 }
86 }
87 p = t - 1;
88 if(0 <= p) {
89 c = T[j];
90 U[j] = (j <= p) ? T[p] : (sauchar_t)A[p];
91 A[j] = c;
92 } else {
93 *idx = i;
94 }
95 } else {
96 U[0] = T[n - 1];
97 for(i = 0; A[i] != 0; ++i) { U[i + 1] = T[A[i] - 1]; }
98 *idx = i + 1;
99 for(++i; i < n; ++i) { U[i] = T[A[i] - 1]; }
100 }
101
102 if(SA == NULL) {
103 /* Deallocate memory. */
104 free(A);
105 }
106
107 return 0;
108 }
109
110 /* Inverse Burrows-Wheeler transform. */
111 saint_t
inverse_bw_transform(const sauchar_t * T,sauchar_t * U,saidx_t * A,saidx_t n,saidx_t idx)112 inverse_bw_transform(const sauchar_t *T, sauchar_t *U, saidx_t *A,
113 saidx_t n, saidx_t idx) {
114 saidx_t C[ALPHABET_SIZE];
115 sauchar_t D[ALPHABET_SIZE];
116 saidx_t *B;
117 saidx_t i, p;
118 saint_t c, d;
119
120 /* Check arguments. */
121 if((T == NULL) || (U == NULL) || (n < 0) || (idx < 0) ||
122 (n < idx) || ((0 < n) && (idx == 0))) {
123 return -1;
124 }
125 if(n <= 1) { return 0; }
126
127 if((B = A) == NULL) {
128 /* Allocate n*sizeof(saidx_t) bytes of memory. */
129 if((B = (saidx_t *)malloc((size_t)n * sizeof(saidx_t))) == NULL) { return -2; }
130 }
131
132 /* Inverse BW transform. */
133 for(c = 0; c < ALPHABET_SIZE; ++c) { C[c] = 0; }
134 for(i = 0; i < n; ++i) { ++C[T[i]]; }
135 for(c = 0, d = 0, i = 0; c < ALPHABET_SIZE; ++c) {
136 p = C[c];
137 if(0 < p) {
138 C[c] = i;
139 D[d++] = (sauchar_t)c;
140 i += p;
141 }
142 }
143 for(i = 0; i < idx; ++i) { B[C[T[i]]++] = i; }
144 for( ; i < n; ++i) { B[C[T[i]]++] = i + 1; }
145 for(c = 0; c < d; ++c) { C[c] = C[D[c]]; }
146 for(i = 0, p = idx; i < n; ++i) {
147 U[i] = D[binarysearch_lower(C, d, p)];
148 p = B[p - 1];
149 }
150
151 if(A == NULL) {
152 /* Deallocate memory. */
153 free(B);
154 }
155
156 return 0;
157 }
158
159 /* Checks the suffix array SA of the string T. */
160 saint_t
sufcheck(const sauchar_t * T,const saidx_t * SA,saidx_t n,saint_t verbose)161 sufcheck(const sauchar_t *T, const saidx_t *SA,
162 saidx_t n, saint_t verbose) {
163 saidx_t C[ALPHABET_SIZE];
164 saidx_t i, p, q, t;
165 saint_t c;
166
167 if(verbose) { fprintf(stderr, "sufcheck: "); }
168
169 /* Check arguments. */
170 if((T == NULL) || (SA == NULL) || (n < 0)) {
171 if(verbose) { fprintf(stderr, "Invalid arguments.\n"); }
172 return -1;
173 }
174 if(n == 0) {
175 if(verbose) { fprintf(stderr, "Done.\n"); }
176 return 0;
177 }
178
179 /* check range: [0..n-1] */
180 for(i = 0; i < n; ++i) {
181 if((SA[i] < 0) || (n <= SA[i])) {
182 if(verbose) {
183 fprintf(stderr, "Out of the range [0,%" PRIdSAIDX_T "].\n"
184 " SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T "\n",
185 n - 1, i, SA[i]);
186 }
187 return -2;
188 }
189 }
190
191 /* check first characters. */
192 for(i = 1; i < n; ++i) {
193 if(T[SA[i - 1]] > T[SA[i]]) {
194 if(verbose) {
195 fprintf(stderr, "Suffixes in wrong order.\n"
196 " T[SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T "]=%d"
197 " > T[SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T "]=%d\n",
198 i - 1, SA[i - 1], T[SA[i - 1]], i, SA[i], T[SA[i]]);
199 }
200 return -3;
201 }
202 }
203
204 /* check suffixes. */
205 for(i = 0; i < ALPHABET_SIZE; ++i) { C[i] = 0; }
206 for(i = 0; i < n; ++i) { ++C[T[i]]; }
207 for(i = 0, p = 0; i < ALPHABET_SIZE; ++i) {
208 t = C[i];
209 C[i] = p;
210 p += t;
211 }
212
213 q = C[T[n - 1]];
214 C[T[n - 1]] += 1;
215 for(i = 0; i < n; ++i) {
216 p = SA[i];
217 if(0 < p) {
218 c = T[--p];
219 t = C[c];
220 } else {
221 c = T[p = n - 1];
222 t = q;
223 }
224 if((t < 0) || (p != SA[t])) {
225 if(verbose) {
226 fprintf(stderr, "Suffix in wrong position.\n"
227 " SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T " or\n"
228 " SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T "\n",
229 t, (0 <= t) ? SA[t] : -1, i, SA[i]);
230 }
231 return -4;
232 }
233 if(t != q) {
234 ++C[c];
235 if((n <= C[c]) || (T[SA[C[c]]] != c)) { C[c] = -1; }
236 }
237 }
238
239 if(1 <= verbose) { fprintf(stderr, "Done.\n"); }
240 return 0;
241 }
242
243
244 static
245 int
_compare(const sauchar_t * T,saidx_t Tsize,const sauchar_t * P,saidx_t Psize,saidx_t suf,saidx_t * match)246 _compare(const sauchar_t *T, saidx_t Tsize,
247 const sauchar_t *P, saidx_t Psize,
248 saidx_t suf, saidx_t *match) {
249 saidx_t i, j;
250 saint_t r;
251 for(i = suf + *match, j = *match, r = 0;
252 (i < Tsize) && (j < Psize) && ((r = T[i] - P[j]) == 0); ++i, ++j) { }
253 *match = j;
254 return (r == 0) ? -(j != Psize) : r;
255 }
256
257 /* Search for the pattern P in the string T. */
258 saidx_t
sa_search(const sauchar_t * T,saidx_t Tsize,const sauchar_t * P,saidx_t Psize,const saidx_t * SA,saidx_t SAsize,saidx_t * idx)259 sa_search(const sauchar_t *T, saidx_t Tsize,
260 const sauchar_t *P, saidx_t Psize,
261 const saidx_t *SA, saidx_t SAsize,
262 saidx_t *idx) {
263 saidx_t size, lsize, rsize, half;
264 saidx_t match, lmatch, rmatch;
265 saidx_t llmatch, lrmatch, rlmatch, rrmatch;
266 saidx_t i, j, k;
267 saint_t r;
268
269 if(idx != NULL) { *idx = -1; }
270 if((T == NULL) || (P == NULL) || (SA == NULL) ||
271 (Tsize < 0) || (Psize < 0) || (SAsize < 0)) { return -1; }
272 if((Tsize == 0) || (SAsize == 0)) { return 0; }
273 if(Psize == 0) { if(idx != NULL) { *idx = 0; } return SAsize; }
274
275 for(i = j = k = 0, lmatch = rmatch = 0, size = SAsize, half = size >> 1;
276 0 < size;
277 size = half, half >>= 1) {
278 match = MIN(lmatch, rmatch);
279 r = _compare(T, Tsize, P, Psize, SA[i + half], &match);
280 if(r < 0) {
281 i += half + 1;
282 half -= (size & 1) ^ 1;
283 lmatch = match;
284 } else if(r > 0) {
285 rmatch = match;
286 } else {
287 lsize = half, j = i, rsize = size - half - 1, k = i + half + 1;
288
289 /* left part */
290 for(llmatch = lmatch, lrmatch = match, half = lsize >> 1;
291 0 < lsize;
292 lsize = half, half >>= 1) {
293 lmatch = MIN(llmatch, lrmatch);
294 r = _compare(T, Tsize, P, Psize, SA[j + half], &lmatch);
295 if(r < 0) {
296 j += half + 1;
297 half -= (lsize & 1) ^ 1;
298 llmatch = lmatch;
299 } else {
300 lrmatch = lmatch;
301 }
302 }
303
304 /* right part */
305 for(rlmatch = match, rrmatch = rmatch, half = rsize >> 1;
306 0 < rsize;
307 rsize = half, half >>= 1) {
308 rmatch = MIN(rlmatch, rrmatch);
309 r = _compare(T, Tsize, P, Psize, SA[k + half], &rmatch);
310 if(r <= 0) {
311 k += half + 1;
312 half -= (rsize & 1) ^ 1;
313 rlmatch = rmatch;
314 } else {
315 rrmatch = rmatch;
316 }
317 }
318
319 break;
320 }
321 }
322
323 if(idx != NULL) { *idx = (0 < (k - j)) ? j : i; }
324 return k - j;
325 }
326
327 /* Search for the character c in the string T. */
328 saidx_t
sa_simplesearch(const sauchar_t * T,saidx_t Tsize,const saidx_t * SA,saidx_t SAsize,saint_t c,saidx_t * idx)329 sa_simplesearch(const sauchar_t *T, saidx_t Tsize,
330 const saidx_t *SA, saidx_t SAsize,
331 saint_t c, saidx_t *idx) {
332 saidx_t size, lsize, rsize, half;
333 saidx_t i, j, k, p;
334 saint_t r;
335
336 if(idx != NULL) { *idx = -1; }
337 if((T == NULL) || (SA == NULL) || (Tsize < 0) || (SAsize < 0)) { return -1; }
338 if((Tsize == 0) || (SAsize == 0)) { return 0; }
339
340 for(i = j = k = 0, size = SAsize, half = size >> 1;
341 0 < size;
342 size = half, half >>= 1) {
343 p = SA[i + half];
344 r = (p < Tsize) ? T[p] - c : -1;
345 if(r < 0) {
346 i += half + 1;
347 half -= (size & 1) ^ 1;
348 } else if(r == 0) {
349 lsize = half, j = i, rsize = size - half - 1, k = i + half + 1;
350
351 /* left part */
352 for(half = lsize >> 1;
353 0 < lsize;
354 lsize = half, half >>= 1) {
355 p = SA[j + half];
356 r = (p < Tsize) ? T[p] - c : -1;
357 if(r < 0) {
358 j += half + 1;
359 half -= (lsize & 1) ^ 1;
360 }
361 }
362
363 /* right part */
364 for(half = rsize >> 1;
365 0 < rsize;
366 rsize = half, half >>= 1) {
367 p = SA[k + half];
368 r = (p < Tsize) ? T[p] - c : -1;
369 if(r <= 0) {
370 k += half + 1;
371 half -= (rsize & 1) ^ 1;
372 }
373 }
374
375 break;
376 }
377 }
378
379 if(idx != NULL) { *idx = (0 < (k - j)) ? j : i; }
380 return k - j;
381 }
382