1'use strict'; 2/* 3 Based heavily on the Streaming Boyer-Moore-Horspool C++ implementation 4 by Hongli Lai at: https://github.com/FooBarWidget/boyer-moore-horspool 5*/ 6function memcmp(buf1, pos1, buf2, pos2, num) { 7 for (let i = 0; i < num; ++i) { 8 if (buf1[pos1 + i] !== buf2[pos2 + i]) 9 return false; 10 } 11 return true; 12} 13 14class SBMH { 15 constructor(needle, cb) { 16 if (typeof cb !== 'function') 17 throw new Error('Missing match callback'); 18 19 if (typeof needle === 'string') 20 needle = Buffer.from(needle); 21 else if (!Buffer.isBuffer(needle)) 22 throw new Error(`Expected Buffer for needle, got ${typeof needle}`); 23 24 const needleLen = needle.length; 25 26 this.maxMatches = Infinity; 27 this.matches = 0; 28 29 this._cb = cb; 30 this._lookbehindSize = 0; 31 this._needle = needle; 32 this._bufPos = 0; 33 34 this._lookbehind = Buffer.allocUnsafe(needleLen); 35 36 // Initialize occurrence table. 37 this._occ = [ 38 needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 39 needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 40 needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 41 needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 42 needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 43 needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 44 needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 45 needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 46 needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 47 needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 48 needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 49 needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 50 needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 51 needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 52 needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 53 needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 54 needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 55 needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 56 needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 57 needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 58 needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 59 needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 60 needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 61 needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 62 needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 63 needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 64 needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 65 needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 66 needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 67 needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 68 needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 69 needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 70 needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 71 needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 72 needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 73 needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 74 needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 75 needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 76 needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 77 needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 78 needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 79 needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 80 needleLen, needleLen, needleLen, needleLen 81 ]; 82 83 // Populate occurrence table with analysis of the needle, ignoring the last 84 // letter. 85 if (needleLen > 1) { 86 for (let i = 0; i < needleLen - 1; ++i) 87 this._occ[needle[i]] = needleLen - 1 - i; 88 } 89 } 90 91 reset() { 92 this.matches = 0; 93 this._lookbehindSize = 0; 94 this._bufPos = 0; 95 } 96 97 push(chunk, pos) { 98 let result; 99 if (!Buffer.isBuffer(chunk)) 100 chunk = Buffer.from(chunk, 'latin1'); 101 const chunkLen = chunk.length; 102 this._bufPos = pos || 0; 103 while (result !== chunkLen && this.matches < this.maxMatches) 104 result = feed(this, chunk); 105 return result; 106 } 107 108 destroy() { 109 const lbSize = this._lookbehindSize; 110 if (lbSize) 111 this._cb(false, this._lookbehind, 0, lbSize, false); 112 this.reset(); 113 } 114} 115 116function feed(self, data) { 117 const len = data.length; 118 const needle = self._needle; 119 const needleLen = needle.length; 120 121 // Positive: points to a position in `data` 122 // pos == 3 points to data[3] 123 // Negative: points to a position in the lookbehind buffer 124 // pos == -2 points to lookbehind[lookbehindSize - 2] 125 let pos = -self._lookbehindSize; 126 const lastNeedleCharPos = needleLen - 1; 127 const lastNeedleChar = needle[lastNeedleCharPos]; 128 const end = len - needleLen; 129 const occ = self._occ; 130 const lookbehind = self._lookbehind; 131 132 if (pos < 0) { 133 // Lookbehind buffer is not empty. Perform Boyer-Moore-Horspool 134 // search with character lookup code that considers both the 135 // lookbehind buffer and the current round's haystack data. 136 // 137 // Loop until 138 // there is a match. 139 // or until 140 // we've moved past the position that requires the 141 // lookbehind buffer. In this case we switch to the 142 // optimized loop. 143 // or until 144 // the character to look at lies outside the haystack. 145 while (pos < 0 && pos <= end) { 146 const nextPos = pos + lastNeedleCharPos; 147 const ch = (nextPos < 0 148 ? lookbehind[self._lookbehindSize + nextPos] 149 : data[nextPos]); 150 151 if (ch === lastNeedleChar 152 && matchNeedle(self, data, pos, lastNeedleCharPos)) { 153 self._lookbehindSize = 0; 154 ++self.matches; 155 if (pos > -self._lookbehindSize) 156 self._cb(true, lookbehind, 0, self._lookbehindSize + pos, false); 157 else 158 self._cb(true, undefined, 0, 0, true); 159 160 return (self._bufPos = pos + needleLen); 161 } 162 163 pos += occ[ch]; 164 } 165 166 // No match. 167 168 // There's too few data for Boyer-Moore-Horspool to run, 169 // so let's use a different algorithm to skip as much as 170 // we can. 171 // Forward pos until 172 // the trailing part of lookbehind + data 173 // looks like the beginning of the needle 174 // or until 175 // pos == 0 176 while (pos < 0 && !matchNeedle(self, data, pos, len - pos)) 177 ++pos; 178 179 if (pos < 0) { 180 // Cut off part of the lookbehind buffer that has 181 // been processed and append the entire haystack 182 // into it. 183 const bytesToCutOff = self._lookbehindSize + pos; 184 185 if (bytesToCutOff > 0) { 186 // The cut off data is guaranteed not to contain the needle. 187 self._cb(false, lookbehind, 0, bytesToCutOff, false); 188 } 189 190 self._lookbehindSize -= bytesToCutOff; 191 lookbehind.copy(lookbehind, 0, bytesToCutOff, self._lookbehindSize); 192 lookbehind.set(data, self._lookbehindSize); 193 self._lookbehindSize += len; 194 195 self._bufPos = len; 196 return len; 197 } 198 199 // Discard lookbehind buffer. 200 self._cb(false, lookbehind, 0, self._lookbehindSize, false); 201 self._lookbehindSize = 0; 202 } 203 204 pos += self._bufPos; 205 206 const firstNeedleChar = needle[0]; 207 208 // Lookbehind buffer is now empty. Perform Boyer-Moore-Horspool 209 // search with optimized character lookup code that only considers 210 // the current round's haystack data. 211 while (pos <= end) { 212 const ch = data[pos + lastNeedleCharPos]; 213 214 if (ch === lastNeedleChar 215 && data[pos] === firstNeedleChar 216 && memcmp(needle, 0, data, pos, lastNeedleCharPos)) { 217 ++self.matches; 218 if (pos > 0) 219 self._cb(true, data, self._bufPos, pos, true); 220 else 221 self._cb(true, undefined, 0, 0, true); 222 223 return (self._bufPos = pos + needleLen); 224 } 225 226 pos += occ[ch]; 227 } 228 229 // There was no match. If there's trailing haystack data that we cannot 230 // match yet using the Boyer-Moore-Horspool algorithm (because the trailing 231 // data is less than the needle size) then match using a modified 232 // algorithm that starts matching from the beginning instead of the end. 233 // Whatever trailing data is left after running this algorithm is added to 234 // the lookbehind buffer. 235 while (pos < len) { 236 if (data[pos] !== firstNeedleChar 237 || !memcmp(data, pos, needle, 0, len - pos)) { 238 ++pos; 239 continue; 240 } 241 data.copy(lookbehind, 0, pos, len); 242 self._lookbehindSize = len - pos; 243 break; 244 } 245 246 // Everything until `pos` is guaranteed not to contain needle data. 247 if (pos > 0) 248 self._cb(false, data, self._bufPos, pos < len ? pos : len, true); 249 250 self._bufPos = len; 251 return len; 252} 253 254function matchNeedle(self, data, pos, len) { 255 const lb = self._lookbehind; 256 const lbSize = self._lookbehindSize; 257 const needle = self._needle; 258 259 for (let i = 0; i < len; ++i, ++pos) { 260 const ch = (pos < 0 ? lb[lbSize + pos] : data[pos]); 261 if (ch !== needle[i]) 262 return false; 263 } 264 return true; 265} 266 267module.exports = SBMH; 268