• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1'use strict';
2/*
3  Based heavily on the Streaming Boyer-Moore-Horspool C++ implementation
4  by Hongli Lai at: https://github.com/FooBarWidget/boyer-moore-horspool
5*/
6function memcmp(buf1, pos1, buf2, pos2, num) {
7  for (let i = 0; i < num; ++i) {
8    if (buf1[pos1 + i] !== buf2[pos2 + i])
9      return false;
10  }
11  return true;
12}
13
14class SBMH {
15  constructor(needle, cb) {
16    if (typeof cb !== 'function')
17      throw new Error('Missing match callback');
18
19    if (typeof needle === 'string')
20      needle = Buffer.from(needle);
21    else if (!Buffer.isBuffer(needle))
22      throw new Error(`Expected Buffer for needle, got ${typeof needle}`);
23
24    const needleLen = needle.length;
25
26    this.maxMatches = Infinity;
27    this.matches = 0;
28
29    this._cb = cb;
30    this._lookbehindSize = 0;
31    this._needle = needle;
32    this._bufPos = 0;
33
34    this._lookbehind = Buffer.allocUnsafe(needleLen);
35
36    // Initialize occurrence table.
37    this._occ = [
38      needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
39      needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
40      needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
41      needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
42      needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
43      needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
44      needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
45      needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
46      needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
47      needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
48      needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
49      needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
50      needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
51      needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
52      needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
53      needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
54      needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
55      needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
56      needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
57      needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
58      needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
59      needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
60      needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
61      needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
62      needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
63      needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
64      needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
65      needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
66      needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
67      needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
68      needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
69      needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
70      needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
71      needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
72      needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
73      needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
74      needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
75      needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
76      needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
77      needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
78      needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
79      needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
80      needleLen, needleLen, needleLen, needleLen
81    ];
82
83    // Populate occurrence table with analysis of the needle, ignoring the last
84    // letter.
85    if (needleLen > 1) {
86      for (let i = 0; i < needleLen - 1; ++i)
87        this._occ[needle[i]] = needleLen - 1 - i;
88    }
89  }
90
91  reset() {
92    this.matches = 0;
93    this._lookbehindSize = 0;
94    this._bufPos = 0;
95  }
96
97  push(chunk, pos) {
98    let result;
99    if (!Buffer.isBuffer(chunk))
100      chunk = Buffer.from(chunk, 'latin1');
101    const chunkLen = chunk.length;
102    this._bufPos = pos || 0;
103    while (result !== chunkLen && this.matches < this.maxMatches)
104      result = feed(this, chunk);
105    return result;
106  }
107
108  destroy() {
109    const lbSize = this._lookbehindSize;
110    if (lbSize)
111      this._cb(false, this._lookbehind, 0, lbSize, false);
112    this.reset();
113  }
114}
115
116function feed(self, data) {
117  const len = data.length;
118  const needle = self._needle;
119  const needleLen = needle.length;
120
121  // Positive: points to a position in `data`
122  //           pos == 3 points to data[3]
123  // Negative: points to a position in the lookbehind buffer
124  //           pos == -2 points to lookbehind[lookbehindSize - 2]
125  let pos = -self._lookbehindSize;
126  const lastNeedleCharPos = needleLen - 1;
127  const lastNeedleChar = needle[lastNeedleCharPos];
128  const end = len - needleLen;
129  const occ = self._occ;
130  const lookbehind = self._lookbehind;
131
132  if (pos < 0) {
133    // Lookbehind buffer is not empty. Perform Boyer-Moore-Horspool
134    // search with character lookup code that considers both the
135    // lookbehind buffer and the current round's haystack data.
136    //
137    // Loop until
138    //   there is a match.
139    // or until
140    //   we've moved past the position that requires the
141    //   lookbehind buffer. In this case we switch to the
142    //   optimized loop.
143    // or until
144    //   the character to look at lies outside the haystack.
145    while (pos < 0 && pos <= end) {
146      const nextPos = pos + lastNeedleCharPos;
147      const ch = (nextPos < 0
148                  ? lookbehind[self._lookbehindSize + nextPos]
149                  : data[nextPos]);
150
151      if (ch === lastNeedleChar
152          && matchNeedle(self, data, pos, lastNeedleCharPos)) {
153        self._lookbehindSize = 0;
154        ++self.matches;
155        if (pos > -self._lookbehindSize)
156          self._cb(true, lookbehind, 0, self._lookbehindSize + pos, false);
157        else
158          self._cb(true, undefined, 0, 0, true);
159
160        return (self._bufPos = pos + needleLen);
161      }
162
163      pos += occ[ch];
164    }
165
166    // No match.
167
168    // There's too few data for Boyer-Moore-Horspool to run,
169    // so let's use a different algorithm to skip as much as
170    // we can.
171    // Forward pos until
172    //   the trailing part of lookbehind + data
173    //   looks like the beginning of the needle
174    // or until
175    //   pos == 0
176    while (pos < 0 && !matchNeedle(self, data, pos, len - pos))
177      ++pos;
178
179    if (pos < 0) {
180      // Cut off part of the lookbehind buffer that has
181      // been processed and append the entire haystack
182      // into it.
183      const bytesToCutOff = self._lookbehindSize + pos;
184
185      if (bytesToCutOff > 0) {
186        // The cut off data is guaranteed not to contain the needle.
187        self._cb(false, lookbehind, 0, bytesToCutOff, false);
188      }
189
190      self._lookbehindSize -= bytesToCutOff;
191      lookbehind.copy(lookbehind, 0, bytesToCutOff, self._lookbehindSize);
192      lookbehind.set(data, self._lookbehindSize);
193      self._lookbehindSize += len;
194
195      self._bufPos = len;
196      return len;
197    }
198
199    // Discard lookbehind buffer.
200    self._cb(false, lookbehind, 0, self._lookbehindSize, false);
201    self._lookbehindSize = 0;
202  }
203
204  pos += self._bufPos;
205
206  const firstNeedleChar = needle[0];
207
208  // Lookbehind buffer is now empty. Perform Boyer-Moore-Horspool
209  // search with optimized character lookup code that only considers
210  // the current round's haystack data.
211  while (pos <= end) {
212    const ch = data[pos + lastNeedleCharPos];
213
214    if (ch === lastNeedleChar
215        && data[pos] === firstNeedleChar
216        && memcmp(needle, 0, data, pos, lastNeedleCharPos)) {
217      ++self.matches;
218      if (pos > 0)
219        self._cb(true, data, self._bufPos, pos, true);
220      else
221        self._cb(true, undefined, 0, 0, true);
222
223      return (self._bufPos = pos + needleLen);
224    }
225
226    pos += occ[ch];
227  }
228
229  // There was no match. If there's trailing haystack data that we cannot
230  // match yet using the Boyer-Moore-Horspool algorithm (because the trailing
231  // data is less than the needle size) then match using a modified
232  // algorithm that starts matching from the beginning instead of the end.
233  // Whatever trailing data is left after running this algorithm is added to
234  // the lookbehind buffer.
235  while (pos < len) {
236    if (data[pos] !== firstNeedleChar
237        || !memcmp(data, pos, needle, 0, len - pos)) {
238      ++pos;
239      continue;
240    }
241    data.copy(lookbehind, 0, pos, len);
242    self._lookbehindSize = len - pos;
243    break;
244  }
245
246  // Everything until `pos` is guaranteed not to contain needle data.
247  if (pos > 0)
248    self._cb(false, data, self._bufPos, pos < len ? pos : len, true);
249
250  self._bufPos = len;
251  return len;
252}
253
254function matchNeedle(self, data, pos, len) {
255  const lb = self._lookbehind;
256  const lbSize = self._lookbehindSize;
257  const needle = self._needle;
258
259  for (let i = 0; i < len; ++i, ++pos) {
260    const ch = (pos < 0 ? lb[lbSize + pos] : data[pos]);
261    if (ch !== needle[i])
262      return false;
263  }
264  return true;
265}
266
267module.exports = SBMH;
268