229 lines
7.0 KiB
JavaScript
229 lines
7.0 KiB
JavaScript
|
'use strict'
|
||
|
|
||
|
/**
|
||
|
* Copyright Brian White. All rights reserved.
|
||
|
*
|
||
|
* @see https://github.com/mscdex/streamsearch
|
||
|
*
|
||
|
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||
|
* of this software and associated documentation files (the "Software"), to
|
||
|
* deal in the Software without restriction, including without limitation the
|
||
|
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||
|
* sell copies of the Software, and to permit persons to whom the Software is
|
||
|
* furnished to do so, subject to the following conditions:
|
||
|
*
|
||
|
* The above copyright notice and this permission notice shall be included in
|
||
|
* all copies or substantial portions of the Software.
|
||
|
*
|
||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||
|
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||
|
* IN THE SOFTWARE.
|
||
|
*
|
||
|
* Based heavily on the Streaming Boyer-Moore-Horspool C++ implementation
|
||
|
* by Hongli Lai at: https://github.com/FooBarWidget/boyer-moore-horspool
|
||
|
*/
|
||
|
const EventEmitter = require('node:events').EventEmitter
|
||
|
const inherits = require('node:util').inherits
|
||
|
|
||
|
function SBMH (needle) {
|
||
|
if (typeof needle === 'string') {
|
||
|
needle = Buffer.from(needle)
|
||
|
}
|
||
|
|
||
|
if (!Buffer.isBuffer(needle)) {
|
||
|
throw new TypeError('The needle has to be a String or a Buffer.')
|
||
|
}
|
||
|
|
||
|
const needleLength = needle.length
|
||
|
|
||
|
if (needleLength === 0) {
|
||
|
throw new Error('The needle cannot be an empty String/Buffer.')
|
||
|
}
|
||
|
|
||
|
if (needleLength > 256) {
|
||
|
throw new Error('The needle cannot have a length bigger than 256.')
|
||
|
}
|
||
|
|
||
|
this.maxMatches = Infinity
|
||
|
this.matches = 0
|
||
|
|
||
|
this._occ = new Array(256)
|
||
|
.fill(needleLength) // Initialize occurrence table.
|
||
|
this._lookbehind_size = 0
|
||
|
this._needle = needle
|
||
|
this._bufpos = 0
|
||
|
|
||
|
this._lookbehind = Buffer.alloc(needleLength)
|
||
|
|
||
|
// Populate occurrence table with analysis of the needle,
|
||
|
// ignoring last letter.
|
||
|
for (var i = 0; i < needleLength - 1; ++i) { // eslint-disable-line no-var
|
||
|
this._occ[needle[i]] = needleLength - 1 - i
|
||
|
}
|
||
|
}
|
||
|
inherits(SBMH, EventEmitter)
|
||
|
|
||
|
SBMH.prototype.reset = function () {
|
||
|
this._lookbehind_size = 0
|
||
|
this.matches = 0
|
||
|
this._bufpos = 0
|
||
|
}
|
||
|
|
||
|
SBMH.prototype.push = function (chunk, pos) {
|
||
|
if (!Buffer.isBuffer(chunk)) {
|
||
|
chunk = Buffer.from(chunk, 'binary')
|
||
|
}
|
||
|
const chlen = chunk.length
|
||
|
this._bufpos = pos || 0
|
||
|
let r
|
||
|
while (r !== chlen && this.matches < this.maxMatches) { r = this._sbmh_feed(chunk) }
|
||
|
return r
|
||
|
}
|
||
|
|
||
|
SBMH.prototype._sbmh_feed = function (data) {
|
||
|
const len = data.length
|
||
|
const needle = this._needle
|
||
|
const needleLength = needle.length
|
||
|
const lastNeedleChar = needle[needleLength - 1]
|
||
|
|
||
|
// Positive: points to a position in `data`
|
||
|
// pos == 3 points to data[3]
|
||
|
// Negative: points to a position in the lookbehind buffer
|
||
|
// pos == -2 points to lookbehind[lookbehind_size - 2]
|
||
|
let pos = -this._lookbehind_size
|
||
|
let ch
|
||
|
|
||
|
if (pos < 0) {
|
||
|
// Lookbehind buffer is not empty. Perform Boyer-Moore-Horspool
|
||
|
// search with character lookup code that considers both the
|
||
|
// lookbehind buffer and the current round's haystack data.
|
||
|
//
|
||
|
// Loop until
|
||
|
// there is a match.
|
||
|
// or until
|
||
|
// we've moved past the position that requires the
|
||
|
// lookbehind buffer. In this case we switch to the
|
||
|
// optimized loop.
|
||
|
// or until
|
||
|
// the character to look at lies outside the haystack.
|
||
|
while (pos < 0 && pos <= len - needleLength) {
|
||
|
ch = this._sbmh_lookup_char(data, pos + needleLength - 1)
|
||
|
|
||
|
if (
|
||
|
ch === lastNeedleChar &&
|
||
|
this._sbmh_memcmp(data, pos, needleLength - 1)
|
||
|
) {
|
||
|
this._lookbehind_size = 0
|
||
|
++this.matches
|
||
|
this.emit('info', true)
|
||
|
|
||
|
return (this._bufpos = pos + needleLength)
|
||
|
}
|
||
|
pos += this._occ[ch]
|
||
|
}
|
||
|
|
||
|
// No match.
|
||
|
|
||
|
if (pos < 0) {
|
||
|
// There's too few data for Boyer-Moore-Horspool to run,
|
||
|
// so let's use a different algorithm to skip as much as
|
||
|
// we can.
|
||
|
// Forward pos until
|
||
|
// the trailing part of lookbehind + data
|
||
|
// looks like the beginning of the needle
|
||
|
// or until
|
||
|
// pos == 0
|
||
|
while (pos < 0 && !this._sbmh_memcmp(data, pos, len - pos)) { ++pos }
|
||
|
}
|
||
|
|
||
|
if (pos >= 0) {
|
||
|
// Discard lookbehind buffer.
|
||
|
this.emit('info', false, this._lookbehind, 0, this._lookbehind_size)
|
||
|
this._lookbehind_size = 0
|
||
|
} else {
|
||
|
// Cut off part of the lookbehind buffer that has
|
||
|
// been processed and append the entire haystack
|
||
|
// into it.
|
||
|
const bytesToCutOff = this._lookbehind_size + pos
|
||
|
if (bytesToCutOff > 0) {
|
||
|
// The cut off data is guaranteed not to contain the needle.
|
||
|
this.emit('info', false, this._lookbehind, 0, bytesToCutOff)
|
||
|
}
|
||
|
|
||
|
this._lookbehind.copy(this._lookbehind, 0, bytesToCutOff,
|
||
|
this._lookbehind_size - bytesToCutOff)
|
||
|
this._lookbehind_size -= bytesToCutOff
|
||
|
|
||
|
data.copy(this._lookbehind, this._lookbehind_size)
|
||
|
this._lookbehind_size += len
|
||
|
|
||
|
this._bufpos = len
|
||
|
return len
|
||
|
}
|
||
|
}
|
||
|
|
||
|
pos += (pos >= 0) * this._bufpos
|
||
|
|
||
|
// Lookbehind buffer is now empty. We only need to check if the
|
||
|
// needle is in the haystack.
|
||
|
if (data.indexOf(needle, pos) !== -1) {
|
||
|
pos = data.indexOf(needle, pos)
|
||
|
++this.matches
|
||
|
if (pos > 0) { this.emit('info', true, data, this._bufpos, pos) } else { this.emit('info', true) }
|
||
|
|
||
|
return (this._bufpos = pos + needleLength)
|
||
|
} else {
|
||
|
pos = len - needleLength
|
||
|
}
|
||
|
|
||
|
// There was no match. If there's trailing haystack data that we cannot
|
||
|
// match yet using the Boyer-Moore-Horspool algorithm (because the trailing
|
||
|
// data is less than the needle size) then match using a modified
|
||
|
// algorithm that starts matching from the beginning instead of the end.
|
||
|
// Whatever trailing data is left after running this algorithm is added to
|
||
|
// the lookbehind buffer.
|
||
|
while (
|
||
|
pos < len &&
|
||
|
(
|
||
|
data[pos] !== needle[0] ||
|
||
|
(
|
||
|
(Buffer.compare(
|
||
|
data.subarray(pos, pos + len - pos),
|
||
|
needle.subarray(0, len - pos)
|
||
|
) !== 0)
|
||
|
)
|
||
|
)
|
||
|
) {
|
||
|
++pos
|
||
|
}
|
||
|
if (pos < len) {
|
||
|
data.copy(this._lookbehind, 0, pos, pos + (len - pos))
|
||
|
this._lookbehind_size = len - pos
|
||
|
}
|
||
|
|
||
|
// Everything until pos is guaranteed not to contain needle data.
|
||
|
if (pos > 0) { this.emit('info', false, data, this._bufpos, pos < len ? pos : len) }
|
||
|
|
||
|
this._bufpos = len
|
||
|
return len
|
||
|
}
|
||
|
|
||
|
SBMH.prototype._sbmh_lookup_char = function (data, pos) {
|
||
|
return (pos < 0)
|
||
|
? this._lookbehind[this._lookbehind_size + pos]
|
||
|
: data[pos]
|
||
|
}
|
||
|
|
||
|
SBMH.prototype._sbmh_memcmp = function (data, pos, len) {
|
||
|
for (var i = 0; i < len; ++i) { // eslint-disable-line no-var
|
||
|
if (this._sbmh_lookup_char(data, pos + i) !== this._needle[i]) { return false }
|
||
|
}
|
||
|
return true
|
||
|
}
|
||
|
|
||
|
module.exports = SBMH
|