mirror of
https://github.com/SillyTavern/SillyTavern.git
synced 2025-02-09 08:38:53 +01:00
Add file chunks overlap control
This commit is contained in:
parent
671b7ef7cb
commit
39721b6a8f
@ -22,7 +22,7 @@ import {
|
||||
import { collapseNewlines } from '../../power-user.js';
|
||||
import { SECRET_KEYS, secret_state, writeSecret } from '../../secrets.js';
|
||||
import { getDataBankAttachments, getDataBankAttachmentsForSource, getFileAttachment } from '../../chats.js';
|
||||
import { debounce, getStringHash as calculateHash, waitUntilCondition, onlyUnique, splitRecursive } from '../../utils.js';
|
||||
import { debounce, getStringHash as calculateHash, waitUntilCondition, onlyUnique, splitRecursive, trimToStartSentence, trimToEndSentence } from '../../utils.js';
|
||||
import { debounce_timeout } from '../../constants.js';
|
||||
import { getSortedEntries } from '../../world-info.js';
|
||||
import { textgen_types, textgenerationwebui_settings } from '../../textgen-settings.js';
|
||||
@ -66,11 +66,13 @@ const settings = {
|
||||
size_threshold: 10,
|
||||
chunk_size: 5000,
|
||||
chunk_count: 2,
|
||||
overlap_percent: 0,
|
||||
|
||||
// For Data Bank
|
||||
size_threshold_db: 5,
|
||||
chunk_size_db: 2500,
|
||||
chunk_count_db: 5,
|
||||
overlap_percent_db: 0,
|
||||
file_template_db: 'Related information:\n{{text}}',
|
||||
file_position_db: extension_prompt_types.IN_PROMPT,
|
||||
file_depth_db: 4,
|
||||
@ -369,7 +371,7 @@ async function processFiles(chat) {
|
||||
|
||||
// File is already in the collection
|
||||
if (!hashesInCollection.length) {
|
||||
await vectorizeFile(fileText, fileName, collectionId, settings.chunk_size);
|
||||
await vectorizeFile(fileText, fileName, collectionId, settings.chunk_size, settings.overlap_percent);
|
||||
}
|
||||
|
||||
const queryText = await getQueryText(chat);
|
||||
@ -409,7 +411,7 @@ async function ingestDataBankAttachments(source) {
|
||||
const thresholdLength = settings.size_threshold_db * 1024;
|
||||
// Use chunk size from settings if file is larger than threshold
|
||||
const chunkSize = file.size > thresholdLength ? settings.chunk_size_db : -1;
|
||||
await vectorizeFile(file.text, file.name, collectionId, chunkSize);
|
||||
await vectorizeFile(file.text, file.name, collectionId, chunkSize, settings.overlap_percent_db);
|
||||
}
|
||||
|
||||
return dataBankCollectionIds;
|
||||
@ -467,9 +469,10 @@ async function retrieveFileChunks(queryText, collectionId) {
|
||||
* @param {string} fileName File name
|
||||
* @param {string} collectionId File collection ID
|
||||
* @param {number} chunkSize Chunk size
|
||||
* @param {number} overlapPercent Overlap size (in %)
|
||||
* @returns {Promise<boolean>} True if successful, false if not
|
||||
*/
|
||||
async function vectorizeFile(fileText, fileName, collectionId, chunkSize) {
|
||||
async function vectorizeFile(fileText, fileName, collectionId, chunkSize, overlapPercent) {
|
||||
try {
|
||||
if (settings.translate_files && typeof window['translate'] === 'function') {
|
||||
console.log(`Vectors: Translating file ${fileName} to English...`);
|
||||
@ -478,8 +481,11 @@ async function vectorizeFile(fileText, fileName, collectionId, chunkSize) {
|
||||
}
|
||||
|
||||
const toast = toastr.info('Vectorization may take some time, please wait...', `Ingesting file ${fileName}`);
|
||||
const chunks = splitRecursive(fileText, chunkSize);
|
||||
console.debug(`Vectors: Split file ${fileName} into ${chunks.length} chunks`, chunks);
|
||||
const overlapSize = Math.round(chunkSize * overlapPercent / 100);
|
||||
// Overlap should not be included in chunk size. It will be later compensated by overlapChunks
|
||||
chunkSize = overlapSize > 0 ? (chunkSize - overlapSize) : chunkSize;
|
||||
const chunks = splitRecursive(fileText, chunkSize).map((x, y, z) => overlapSize > 0 ? overlapChunks(x, y, z, overlapSize) : x);
|
||||
console.debug(`Vectors: Split file ${fileName} into ${chunks.length} chunks with ${overlapPercent}% overlap`, chunks);
|
||||
|
||||
const items = chunks.map((chunk, index) => ({ hash: getStringHash(chunk), text: chunk, index: index }));
|
||||
await insertVectorItems(collectionId, items);
|
||||
@ -588,6 +594,25 @@ function getPromptText(queriedMessages) {
|
||||
return substituteParams(settings.template.replace(/{{text}}/i, queriedText));
|
||||
}
|
||||
|
||||
/**
|
||||
* Modifies text chunks to include overlap with adjacent chunks.
|
||||
* @param {string} chunk Current item
|
||||
* @param {number} index Current index
|
||||
* @param {string[]} chunks List of chunks
|
||||
* @param {number} overlapSize Size of the overlap
|
||||
* @returns {string} Overlapped chunks, with overlap trimmed to sentence boundaries
|
||||
*/
|
||||
function overlapChunks(chunk, index, chunks, overlapSize) {
|
||||
const nextChunk = chunks[index + 1];
|
||||
const prevChunk = chunks[index - 1];
|
||||
|
||||
const nextOverlap = trimToEndSentence(nextChunk?.substring(0, overlapSize)) || '';
|
||||
const prevOverlap = trimToStartSentence(prevChunk?.substring(prevChunk.length - overlapSize)) || '';
|
||||
const overlappedChunk = [prevOverlap, chunk, nextOverlap].filter(x => x).join(' ');
|
||||
|
||||
return overlappedChunk;
|
||||
}
|
||||
|
||||
window['vectors_rearrangeChat'] = rearrangeChat;
|
||||
|
||||
const onChatEvent = debounce(async () => await moduleWorker.update(), debounce_timeout.relaxed);
|
||||
@ -969,8 +994,9 @@ async function onViewStatsClick() {
|
||||
toastr.info(`Total hashes: <b>${totalHashes}</b><br>
|
||||
Unique hashes: <b>${uniqueHashes}</b><br><br>
|
||||
I'll mark collected messages with a green circle.`,
|
||||
`Stats for chat ${chatId}`,
|
||||
{ timeOut: 10000, escapeHtml: false });
|
||||
`Stats for chat ${chatId}`,
|
||||
{ timeOut: 10000, escapeHtml: false },
|
||||
);
|
||||
|
||||
const chat = getContext().chat;
|
||||
for (const message of chat) {
|
||||
@ -1010,6 +1036,23 @@ async function onVectorizeAllFilesClick() {
|
||||
return -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the overlap percent for a file attachment.
|
||||
* @param file {import('../../chats.js').FileAttachment} File attachment
|
||||
* @returns {number} Overlap percent for the file
|
||||
*/
|
||||
function getOverlapPercent(file) {
|
||||
if (chatAttachments.includes(file)) {
|
||||
return settings.overlap_percent;
|
||||
}
|
||||
|
||||
if (dataBank.includes(file)) {
|
||||
return settings.overlap_percent_db;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
let allSuccess = true;
|
||||
|
||||
for (const file of allFiles) {
|
||||
@ -1023,7 +1066,8 @@ async function onVectorizeAllFilesClick() {
|
||||
}
|
||||
|
||||
const chunkSize = getChunkSize(file);
|
||||
const result = await vectorizeFile(text, file.name, collectionId, chunkSize);
|
||||
const overlapPercent = getOverlapPercent(file);
|
||||
const result = await vectorizeFile(text, file.name, collectionId, chunkSize, overlapPercent);
|
||||
|
||||
if (!result) {
|
||||
allSuccess = false;
|
||||
@ -1343,6 +1387,18 @@ jQuery(async () => {
|
||||
saveSettingsDebounced();
|
||||
});
|
||||
|
||||
$('#vectors_overlap_percent').val(settings.overlap_percent).on('input', () => {
|
||||
settings.overlap_percent = Number($('#vectors_overlap_percent').val());
|
||||
Object.assign(extension_settings.vectors, settings);
|
||||
saveSettingsDebounced();
|
||||
});
|
||||
|
||||
$('#vectors_overlap_percent_db').val(settings.overlap_percent_db).on('input', () => {
|
||||
settings.overlap_percent_db = Number($('#vectors_overlap_percent_db').val());
|
||||
Object.assign(extension_settings.vectors, settings);
|
||||
saveSettingsDebounced();
|
||||
});
|
||||
|
||||
$('#vectors_file_template_db').val(settings.file_template_db).on('input', () => {
|
||||
settings.file_template_db = String($('#vectors_file_template_db').val());
|
||||
Object.assign(extension_settings.vectors, settings);
|
||||
|
@ -193,19 +193,25 @@
|
||||
<label for="vectors_size_threshold">
|
||||
<small>Size threshold (KB)</small>
|
||||
</label>
|
||||
<input id="vectors_size_threshold" type="number" class="text_pole widthUnset" min="1" max="99999" />
|
||||
<input id="vectors_size_threshold" type="number" class="text_pole" min="1" max="99999" />
|
||||
</div>
|
||||
<div class="flex1" title="Chunk size for file splitting.">
|
||||
<label for="vectors_chunk_size">
|
||||
<small>Chunk size (chars)</small>
|
||||
</label>
|
||||
<input id="vectors_chunk_size" type="number" class="text_pole widthUnset" min="1" max="99999" />
|
||||
<input id="vectors_chunk_size" type="number" class="text_pole" min="1" max="99999" />
|
||||
</div>
|
||||
<div class="flex1" title="The overlap between adjacent chunks in % from chunk size. The overlap text is trimmed to sentence boundaries. 0 = disabled.">
|
||||
<label for="vectors_overlap_percent">
|
||||
<small>Chunk overlap (%)</small>
|
||||
</label>
|
||||
<input id="vectors_overlap_percent" type="number" class="text_pole" min="0" max="99" step="1" />
|
||||
</div>
|
||||
<div class="flex1" title="How many chunks to retrieve when querying.">
|
||||
<label for="vectors_chunk_count">
|
||||
<small>Retrieve chunks</small>
|
||||
</label>
|
||||
<input id="vectors_chunk_count" type="number" class="text_pole widthUnset" min="1" max="99999" />
|
||||
<input id="vectors_chunk_count" type="number" class="text_pole" min="1" max="99999" />
|
||||
</div>
|
||||
</div>
|
||||
<div class="flex justifyCenter" title="These settings apply to files stored in the Data Bank.">
|
||||
@ -216,19 +222,25 @@
|
||||
<label for="vectors_size_threshold_db">
|
||||
<small>Size threshold (KB)</small>
|
||||
</label>
|
||||
<input id="vectors_size_threshold_db" type="number" class="text_pole widthUnset" min="1" max="99999" />
|
||||
<input id="vectors_size_threshold_db" type="number" class="text_pole" min="1" max="99999" />
|
||||
</div>
|
||||
<div class="flex1" title="Chunk size for file splitting.">
|
||||
<label for="vectors_chunk_size_db">
|
||||
<small>Chunk size (chars)</small>
|
||||
</label>
|
||||
<input id="vectors_chunk_size_db" type="number" class="text_pole widthUnset" min="1" max="99999" />
|
||||
<input id="vectors_chunk_size_db" type="number" class="text_pole" min="1" max="99999" />
|
||||
</div>
|
||||
<div class="flex1" title="The overlap between adjacent chunks in % from chunk size. The overlap text is trimmed to sentence boundaries. 0 = disabled.">
|
||||
<label for="vectors_overlap_percent_db">
|
||||
<small>Chunk overlap (%)</small>
|
||||
</label>
|
||||
<input id="vectors_overlap_percent_db" type="number" class="text_pole" min="0" max="99" step="1" />
|
||||
</div>
|
||||
<div class="flex1" title="How many chunks to retrieve when querying.">
|
||||
<label for="vectors_chunk_count_db">
|
||||
<small>Retrieve chunks</small>
|
||||
</label>
|
||||
<input id="vectors_chunk_count_db" type="number" class="text_pole widthUnset" min="1" max="99999" />
|
||||
<input id="vectors_chunk_count_db" type="number" class="text_pole" min="1" max="99999" />
|
||||
</div>
|
||||
</div>
|
||||
<div class="flex-container flexFlowColumn">
|
||||
|
@ -477,6 +477,10 @@ export function sortByCssOrder(a, b) {
|
||||
* trimToEndSentence('Hello, world! I am from'); // 'Hello, world!'
|
||||
*/
|
||||
export function trimToEndSentence(input, include_newline = false) {
|
||||
if (!input) {
|
||||
return '';
|
||||
}
|
||||
|
||||
const punctuation = new Set(['.', '!', '?', '*', '"', ')', '}', '`', ']', '$', '。', '!', '?', '”', ')', '】', '’', '」', '_']); // extend this as you see fit
|
||||
let last = -1;
|
||||
|
||||
@ -506,6 +510,10 @@ export function trimToEndSentence(input, include_newline = false) {
|
||||
}
|
||||
|
||||
export function trimToStartSentence(input) {
|
||||
if (!input) {
|
||||
return '';
|
||||
}
|
||||
|
||||
let p1 = input.indexOf('.');
|
||||
let p2 = input.indexOf('!');
|
||||
let p3 = input.indexOf('?');
|
||||
|
Loading…
x
Reference in New Issue
Block a user