|
|
|
|
|
|
|
|
|
/**
 * Accumulates streamed text and emits it to registered callbacks in chunks
 * that end on sentence-like boundaries and, where possible, stay within
 * `chunkLength` characters.
 */
export class SentenceChunker {
  /**
   * @param {object} [options]
   * @param {number} [options.chunkLength=128] - Target maximum chunk size in
   *   UTF-16 code units. Falsy values (including 0) fall back to 128.
   * @param {boolean} [options.emitParagraphs=true] - When not exactly
   *   `false`, the buffer is emitted at every run of newlines instead of
   *   being carried across it.
   */
  constructor(options = {}) {
    this.buffer = "";
    this.chunkLength = options.chunkLength || 128;
    this.emitParagraphs = options.emitParagraphs !== false;
    this.callbacks = [];
  }

  /**
   * Passes `output` to every registered callback, in registration order.
   *
   * @param {string} output - The chunk to deliver.
   */
  emit(output) {
    this.callbacks.forEach((callback) => callback(output));
  }

  /**
   * Registers a listener that is invoked with each emitted chunk.
   *
   * @param {function(string): void} callback
   */
  onChunk(callback) {
    this.callbacks.push(callback);
  }

  /**
   * Feeds more text into the chunker. Complete chunks are emitted
   * immediately; the remainder stays in `this.buffer` until more text
   * arrives or `flush()` is called.
   *
   * A single sentence longer than `chunkLength` is never cut; it becomes an
   * oversized chunk of its own.
   *
   * @param {string} data - Text to append. `null`/`undefined` are ignored.
   */
  push(data) {
    if (data == null) {
      return;
    }

    // ASCII punctuation only counts as a boundary when whitespace follows it,
    // so tokens such as "3.14", "10:30" or "1,000" stay in one piece.
    // Fullwidth CJK punctuation (； ： ， 。 ！ ？) is a boundary on its own.
    // It is written with \u escapes so that Unicode normalization of this
    // file cannot fold it into the ASCII set.
    const sentenceBoundary =
      /(?<=[;:,.!?]\s+)|(?<=[\uFF1B\uFF1A\uFF0C\u3002\uFF01\uFF1F])/;

    // The capture group keeps each newline run as its own array element, so
    // newlines are passed through to the output rather than discarded.
    const paragraphs = data.split(/(\n+)/);
    const lastIndex = paragraphs.length - 1;

    for (let i = 0; i <= lastIndex; i++) {
      const paragraph = paragraphs[i];
      if (!paragraph) {
        continue;
      }

      for (const sentence of paragraph.split(sentenceBoundary)) {
        if (sentence.length === 0) {
          continue;
        }

        if (this.buffer.length + sentence.length <= this.chunkLength) {
          this.buffer += sentence;
          continue;
        }

        // The sentence does not fit: hand over what has been collected so
        // far and start the next chunk with this sentence.
        if (this.buffer.length > 0) {
          this.emit(this.buffer);
        }
        this.buffer = sentence;
      }

      // Every element except the last is followed by a paragraph break
      // (or is the break itself), so it can be emitted right away.
      if (this.emitParagraphs && i < lastIndex) {
        this.emit(this.buffer);
        this.buffer = "";
      }
    }
  }

  /**
   * Emits whatever is still buffered (if anything) and clears the buffer.
   */
  flush() {
    if (this.buffer.length > 0) {
      this.emit(this.buffer);
      this.buffer = "";
    }
  }
}
|
|
|
|
|
|
|
|
|
|
|
/**
 * Variant of SentenceChunker whose `push` receives text that includes what
 * was already pushed: only the part beyond the length seen so far is
 * forwarded to the base chunker.
 *
 * The prefix is skipped purely by length; `push` does not verify that the
 * new text actually starts with the text pushed before.
 */
export class GrowingSentenceChunker extends SentenceChunker {
  /**
   * @param {object} [options] - Forwarded unchanged to SentenceChunker.
   */
  constructor(options = {}) {
    super(options);
    // Everything forwarded to the base chunker since the last flush.
    this.partialSentence = "";
  }

  /**
   * Forwards the portion of `data` that lies beyond the text already seen.
   *
   * @param {string} data - The text so far, including earlier pushes.
   */
  push(data) {
    const seenLength = this.partialSentence.length;
    const delta = data.substring(seenLength);
    // Record the new tail before forwarding it to the base chunker.
    this.partialSentence = this.partialSentence + delta;
    super.push(delta);
  }

  /**
   * Flushes the base chunker, then forgets the text seen so far so that the
   * next `push` is forwarded from its first character.
   */
  flush() {
    super.flush();
    this.partialSentence = "";
  }
}
|
|