mirror of
https://github.com/SillyTavern/SillyTavern.git
synced 2025-02-10 17:10:45 +01:00
Add file chunks overlap control
This commit is contained in:
parent
671b7ef7cb
commit
39721b6a8f
@ -22,7 +22,7 @@ import {
|
|||||||
import { collapseNewlines } from '../../power-user.js';
|
import { collapseNewlines } from '../../power-user.js';
|
||||||
import { SECRET_KEYS, secret_state, writeSecret } from '../../secrets.js';
|
import { SECRET_KEYS, secret_state, writeSecret } from '../../secrets.js';
|
||||||
import { getDataBankAttachments, getDataBankAttachmentsForSource, getFileAttachment } from '../../chats.js';
|
import { getDataBankAttachments, getDataBankAttachmentsForSource, getFileAttachment } from '../../chats.js';
|
||||||
import { debounce, getStringHash as calculateHash, waitUntilCondition, onlyUnique, splitRecursive } from '../../utils.js';
|
import { debounce, getStringHash as calculateHash, waitUntilCondition, onlyUnique, splitRecursive, trimToStartSentence, trimToEndSentence } from '../../utils.js';
|
||||||
import { debounce_timeout } from '../../constants.js';
|
import { debounce_timeout } from '../../constants.js';
|
||||||
import { getSortedEntries } from '../../world-info.js';
|
import { getSortedEntries } from '../../world-info.js';
|
||||||
import { textgen_types, textgenerationwebui_settings } from '../../textgen-settings.js';
|
import { textgen_types, textgenerationwebui_settings } from '../../textgen-settings.js';
|
||||||
@ -66,11 +66,13 @@ const settings = {
|
|||||||
size_threshold: 10,
|
size_threshold: 10,
|
||||||
chunk_size: 5000,
|
chunk_size: 5000,
|
||||||
chunk_count: 2,
|
chunk_count: 2,
|
||||||
|
overlap_percent: 0,
|
||||||
|
|
||||||
// For Data Bank
|
// For Data Bank
|
||||||
size_threshold_db: 5,
|
size_threshold_db: 5,
|
||||||
chunk_size_db: 2500,
|
chunk_size_db: 2500,
|
||||||
chunk_count_db: 5,
|
chunk_count_db: 5,
|
||||||
|
overlap_percent_db: 0,
|
||||||
file_template_db: 'Related information:\n{{text}}',
|
file_template_db: 'Related information:\n{{text}}',
|
||||||
file_position_db: extension_prompt_types.IN_PROMPT,
|
file_position_db: extension_prompt_types.IN_PROMPT,
|
||||||
file_depth_db: 4,
|
file_depth_db: 4,
|
||||||
@ -369,7 +371,7 @@ async function processFiles(chat) {
|
|||||||
|
|
||||||
// File is already in the collection
|
// File is already in the collection
|
||||||
if (!hashesInCollection.length) {
|
if (!hashesInCollection.length) {
|
||||||
await vectorizeFile(fileText, fileName, collectionId, settings.chunk_size);
|
await vectorizeFile(fileText, fileName, collectionId, settings.chunk_size, settings.overlap_percent);
|
||||||
}
|
}
|
||||||
|
|
||||||
const queryText = await getQueryText(chat);
|
const queryText = await getQueryText(chat);
|
||||||
@ -409,7 +411,7 @@ async function ingestDataBankAttachments(source) {
|
|||||||
const thresholdLength = settings.size_threshold_db * 1024;
|
const thresholdLength = settings.size_threshold_db * 1024;
|
||||||
// Use chunk size from settings if file is larger than threshold
|
// Use chunk size from settings if file is larger than threshold
|
||||||
const chunkSize = file.size > thresholdLength ? settings.chunk_size_db : -1;
|
const chunkSize = file.size > thresholdLength ? settings.chunk_size_db : -1;
|
||||||
await vectorizeFile(file.text, file.name, collectionId, chunkSize);
|
await vectorizeFile(file.text, file.name, collectionId, chunkSize, settings.overlap_percent_db);
|
||||||
}
|
}
|
||||||
|
|
||||||
return dataBankCollectionIds;
|
return dataBankCollectionIds;
|
||||||
@ -467,9 +469,10 @@ async function retrieveFileChunks(queryText, collectionId) {
|
|||||||
* @param {string} fileName File name
|
* @param {string} fileName File name
|
||||||
* @param {string} collectionId File collection ID
|
* @param {string} collectionId File collection ID
|
||||||
* @param {number} chunkSize Chunk size
|
* @param {number} chunkSize Chunk size
|
||||||
|
* @param {number} overlapPercent Overlap size (in %)
|
||||||
* @returns {Promise<boolean>} True if successful, false if not
|
* @returns {Promise<boolean>} True if successful, false if not
|
||||||
*/
|
*/
|
||||||
async function vectorizeFile(fileText, fileName, collectionId, chunkSize) {
|
async function vectorizeFile(fileText, fileName, collectionId, chunkSize, overlapPercent) {
|
||||||
try {
|
try {
|
||||||
if (settings.translate_files && typeof window['translate'] === 'function') {
|
if (settings.translate_files && typeof window['translate'] === 'function') {
|
||||||
console.log(`Vectors: Translating file ${fileName} to English...`);
|
console.log(`Vectors: Translating file ${fileName} to English...`);
|
||||||
@ -478,8 +481,11 @@ async function vectorizeFile(fileText, fileName, collectionId, chunkSize) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const toast = toastr.info('Vectorization may take some time, please wait...', `Ingesting file ${fileName}`);
|
const toast = toastr.info('Vectorization may take some time, please wait...', `Ingesting file ${fileName}`);
|
||||||
const chunks = splitRecursive(fileText, chunkSize);
|
const overlapSize = Math.round(chunkSize * overlapPercent / 100);
|
||||||
console.debug(`Vectors: Split file ${fileName} into ${chunks.length} chunks`, chunks);
|
// Overlap should not be included in chunk size. It will be later compensated by overlapChunks
|
||||||
|
chunkSize = overlapSize > 0 ? (chunkSize - overlapSize) : chunkSize;
|
||||||
|
const chunks = splitRecursive(fileText, chunkSize).map((x, y, z) => overlapSize > 0 ? overlapChunks(x, y, z, overlapSize) : x);
|
||||||
|
console.debug(`Vectors: Split file ${fileName} into ${chunks.length} chunks with ${overlapPercent}% overlap`, chunks);
|
||||||
|
|
||||||
const items = chunks.map((chunk, index) => ({ hash: getStringHash(chunk), text: chunk, index: index }));
|
const items = chunks.map((chunk, index) => ({ hash: getStringHash(chunk), text: chunk, index: index }));
|
||||||
await insertVectorItems(collectionId, items);
|
await insertVectorItems(collectionId, items);
|
||||||
@ -588,6 +594,25 @@ function getPromptText(queriedMessages) {
|
|||||||
return substituteParams(settings.template.replace(/{{text}}/i, queriedText));
|
return substituteParams(settings.template.replace(/{{text}}/i, queriedText));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Extends a text chunk with overlap borrowed from its neighbouring chunks.
 * Up to `overlapSize` characters are taken from the start of the next chunk and
 * from the end of the previous chunk. Each borrowed piece is passed through
 * trimToEndSentence / trimToStartSentence respectively before being attached.
 * The signature matches an Array.prototype.map callback plus one extra argument.
 * @param {string} chunk Current item
 * @param {number} index Current index
 * @param {string[]} chunks List of chunks
 * @param {number} overlapSize Size of the overlap
 * @returns {string} Previous overlap, chunk and next overlap joined by single spaces (empty parts are omitted)
 */
function overlapChunks(chunk, index, chunks, overlapSize) {
    const following = chunks[index + 1];
    const preceding = chunks[index - 1];

    // A missing neighbour (first or last chunk) produces undefined here,
    // which falls back to an empty string below.
    const head = following?.substring(0, overlapSize);
    const trailing = trimToEndSentence(head) || '';

    const tail = preceding?.substring(preceding.length - overlapSize);
    const leading = trimToStartSentence(tail) || '';

    // Keep only the non-empty parts so no stray separators are produced.
    const parts = [];
    for (const part of [leading, chunk, trailing]) {
        if (part) {
            parts.push(part);
        }
    }

    return parts.join(' ');
}
|
||||||
|
|
||||||
window['vectors_rearrangeChat'] = rearrangeChat;
|
window['vectors_rearrangeChat'] = rearrangeChat;
|
||||||
|
|
||||||
const onChatEvent = debounce(async () => await moduleWorker.update(), debounce_timeout.relaxed);
|
const onChatEvent = debounce(async () => await moduleWorker.update(), debounce_timeout.relaxed);
|
||||||
@ -970,7 +995,8 @@ async function onViewStatsClick() {
|
|||||||
Unique hashes: <b>${uniqueHashes}</b><br><br>
|
Unique hashes: <b>${uniqueHashes}</b><br><br>
|
||||||
I'll mark collected messages with a green circle.`,
|
I'll mark collected messages with a green circle.`,
|
||||||
`Stats for chat ${chatId}`,
|
`Stats for chat ${chatId}`,
|
||||||
{ timeOut: 10000, escapeHtml: false });
|
{ timeOut: 10000, escapeHtml: false },
|
||||||
|
);
|
||||||
|
|
||||||
const chat = getContext().chat;
|
const chat = getContext().chat;
|
||||||
for (const message of chat) {
|
for (const message of chat) {
|
||||||
@ -1010,6 +1036,23 @@ async function onVectorizeAllFilesClick() {
|
|||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Resolves which overlap setting applies to a file attachment.
 * Chat attachments are checked first, then Data Bank files.
 * @param {import('../../chats.js').FileAttachment} file File attachment
 * @returns {number} `settings.overlap_percent` for chat attachments,
 * `settings.overlap_percent_db` for Data Bank files, 0 for anything else
 */
function getOverlapPercent(file) {
    // Membership in the chat attachments list takes priority over the Data Bank list.
    const isChatAttachment = chatAttachments.includes(file);
    if (isChatAttachment) {
        return settings.overlap_percent;
    }

    // Files found in neither list get no overlap.
    return dataBank.includes(file) ? settings.overlap_percent_db : 0;
}
|
||||||
|
|
||||||
let allSuccess = true;
|
let allSuccess = true;
|
||||||
|
|
||||||
for (const file of allFiles) {
|
for (const file of allFiles) {
|
||||||
@ -1023,7 +1066,8 @@ async function onVectorizeAllFilesClick() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const chunkSize = getChunkSize(file);
|
const chunkSize = getChunkSize(file);
|
||||||
const result = await vectorizeFile(text, file.name, collectionId, chunkSize);
|
const overlapPercent = getOverlapPercent(file);
|
||||||
|
const result = await vectorizeFile(text, file.name, collectionId, chunkSize, overlapPercent);
|
||||||
|
|
||||||
if (!result) {
|
if (!result) {
|
||||||
allSuccess = false;
|
allSuccess = false;
|
||||||
@ -1343,6 +1387,18 @@ jQuery(async () => {
|
|||||||
saveSettingsDebounced();
|
saveSettingsDebounced();
|
||||||
});
|
});
|
||||||
|
|
||||||
|
$('#vectors_overlap_percent').val(settings.overlap_percent).on('input', () => {
|
||||||
|
settings.overlap_percent = Number($('#vectors_overlap_percent').val());
|
||||||
|
Object.assign(extension_settings.vectors, settings);
|
||||||
|
saveSettingsDebounced();
|
||||||
|
});
|
||||||
|
|
||||||
|
$('#vectors_overlap_percent_db').val(settings.overlap_percent_db).on('input', () => {
|
||||||
|
settings.overlap_percent_db = Number($('#vectors_overlap_percent_db').val());
|
||||||
|
Object.assign(extension_settings.vectors, settings);
|
||||||
|
saveSettingsDebounced();
|
||||||
|
});
|
||||||
|
|
||||||
$('#vectors_file_template_db').val(settings.file_template_db).on('input', () => {
|
$('#vectors_file_template_db').val(settings.file_template_db).on('input', () => {
|
||||||
settings.file_template_db = String($('#vectors_file_template_db').val());
|
settings.file_template_db = String($('#vectors_file_template_db').val());
|
||||||
Object.assign(extension_settings.vectors, settings);
|
Object.assign(extension_settings.vectors, settings);
|
||||||
|
@ -193,19 +193,25 @@
|
|||||||
<label for="vectors_size_threshold">
|
<label for="vectors_size_threshold">
|
||||||
<small>Size threshold (KB)</small>
|
<small>Size threshold (KB)</small>
|
||||||
</label>
|
</label>
|
||||||
<input id="vectors_size_threshold" type="number" class="text_pole widthUnset" min="1" max="99999" />
|
<input id="vectors_size_threshold" type="number" class="text_pole" min="1" max="99999" />
|
||||||
</div>
|
</div>
|
||||||
<div class="flex1" title="Chunk size for file splitting.">
|
<div class="flex1" title="Chunk size for file splitting.">
|
||||||
<label for="vectors_chunk_size">
|
<label for="vectors_chunk_size">
|
||||||
<small>Chunk size (chars)</small>
|
<small>Chunk size (chars)</small>
|
||||||
</label>
|
</label>
|
||||||
<input id="vectors_chunk_size" type="number" class="text_pole widthUnset" min="1" max="99999" />
|
<input id="vectors_chunk_size" type="number" class="text_pole" min="1" max="99999" />
|
||||||
|
</div>
|
||||||
|
<div class="flex1" title="The overlap between adjacent chunks in % from chunk size. The overlap text is trimmed to sentence boundaries. 0 = disabled.">
|
||||||
|
<label for="vectors_overlap_percent">
|
||||||
|
<small>Chunk overlap (%)</small>
|
||||||
|
</label>
|
||||||
|
<input id="vectors_overlap_percent" type="number" class="text_pole" min="0" max="99" step="1" />
|
||||||
</div>
|
</div>
|
||||||
<div class="flex1" title="How many chunks to retrieve when querying.">
|
<div class="flex1" title="How many chunks to retrieve when querying.">
|
||||||
<label for="vectors_chunk_count">
|
<label for="vectors_chunk_count">
|
||||||
<small>Retrieve chunks</small>
|
<small>Retrieve chunks</small>
|
||||||
</label>
|
</label>
|
||||||
<input id="vectors_chunk_count" type="number" class="text_pole widthUnset" min="1" max="99999" />
|
<input id="vectors_chunk_count" type="number" class="text_pole" min="1" max="99999" />
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
<div class="flex justifyCenter" title="These settings apply to files stored in the Data Bank.">
|
<div class="flex justifyCenter" title="These settings apply to files stored in the Data Bank.">
|
||||||
@ -216,19 +222,25 @@
|
|||||||
<label for="vectors_size_threshold_db">
|
<label for="vectors_size_threshold_db">
|
||||||
<small>Size threshold (KB)</small>
|
<small>Size threshold (KB)</small>
|
||||||
</label>
|
</label>
|
||||||
<input id="vectors_size_threshold_db" type="number" class="text_pole widthUnset" min="1" max="99999" />
|
<input id="vectors_size_threshold_db" type="number" class="text_pole" min="1" max="99999" />
|
||||||
</div>
|
</div>
|
||||||
<div class="flex1" title="Chunk size for file splitting.">
|
<div class="flex1" title="Chunk size for file splitting.">
|
||||||
<label for="vectors_chunk_size_db">
|
<label for="vectors_chunk_size_db">
|
||||||
<small>Chunk size (chars)</small>
|
<small>Chunk size (chars)</small>
|
||||||
</label>
|
</label>
|
||||||
<input id="vectors_chunk_size_db" type="number" class="text_pole widthUnset" min="1" max="99999" />
|
<input id="vectors_chunk_size_db" type="number" class="text_pole" min="1" max="99999" />
|
||||||
|
</div>
|
||||||
|
<div class="flex1" title="The overlap between adjacent chunks in % from chunk size. The overlap text is trimmed to sentence boundaries. 0 = disabled.">
|
||||||
|
<label for="vectors_overlap_percent_db">
|
||||||
|
<small>Chunk overlap (%)</small>
|
||||||
|
</label>
|
||||||
|
<input id="vectors_overlap_percent_db" type="number" class="text_pole" min="0" max="99" step="1" />
|
||||||
</div>
|
</div>
|
||||||
<div class="flex1" title="How many chunks to retrieve when querying.">
|
<div class="flex1" title="How many chunks to retrieve when querying.">
|
||||||
<label for="vectors_chunk_count_db">
|
<label for="vectors_chunk_count_db">
|
||||||
<small>Retrieve chunks</small>
|
<small>Retrieve chunks</small>
|
||||||
</label>
|
</label>
|
||||||
<input id="vectors_chunk_count_db" type="number" class="text_pole widthUnset" min="1" max="99999" />
|
<input id="vectors_chunk_count_db" type="number" class="text_pole" min="1" max="99999" />
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
<div class="flex-container flexFlowColumn">
|
<div class="flex-container flexFlowColumn">
|
||||||
|
@ -477,6 +477,10 @@ export function sortByCssOrder(a, b) {
|
|||||||
* trimToEndSentence('Hello, world! I am from'); // 'Hello, world!'
|
* trimToEndSentence('Hello, world! I am from'); // 'Hello, world!'
|
||||||
*/
|
*/
|
||||||
export function trimToEndSentence(input, include_newline = false) {
|
export function trimToEndSentence(input, include_newline = false) {
|
||||||
|
if (!input) {
|
||||||
|
return '';
|
||||||
|
}
|
||||||
|
|
||||||
const punctuation = new Set(['.', '!', '?', '*', '"', ')', '}', '`', ']', '$', '。', '!', '?', '”', ')', '】', '’', '」', '_']); // extend this as you see fit
|
const punctuation = new Set(['.', '!', '?', '*', '"', ')', '}', '`', ']', '$', '。', '!', '?', '”', ')', '】', '’', '」', '_']); // extend this as you see fit
|
||||||
let last = -1;
|
let last = -1;
|
||||||
|
|
||||||
@ -506,6 +510,10 @@ export function trimToEndSentence(input, include_newline = false) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
export function trimToStartSentence(input) {
|
export function trimToStartSentence(input) {
|
||||||
|
if (!input) {
|
||||||
|
return '';
|
||||||
|
}
|
||||||
|
|
||||||
let p1 = input.indexOf('.');
|
let p1 = input.indexOf('.');
|
||||||
let p2 = input.indexOf('!');
|
let p2 = input.indexOf('!');
|
||||||
let p3 = input.indexOf('?');
|
let p3 = input.indexOf('?');
|
||||||
|
Loading…
x
Reference in New Issue
Block a user