Add chunking of vector storage messages

This commit is contained in:
Cohee 2023-12-31 04:00:04 +02:00
parent 060bc60794
commit a2e4dc2950
4 changed files with 96 additions and 10 deletions

View File

@ -1,6 +1,6 @@
import { eventSource, event_types, extension_prompt_types, getCurrentChatId, getRequestHeaders, is_send_press, saveSettingsDebounced, setExtensionPrompt, substituteParams } from '../../../script.js';
import { ModuleWorkerWrapper, extension_settings, getContext, renderExtensionTemplate } from '../../extensions.js';
import { collapseNewlines, power_user, ui_mode } from '../../power-user.js';
import { collapseNewlines } from '../../power-user.js';
import { SECRET_KEYS, secret_state } from '../../secrets.js';
import { debounce, getStringHash as calculateHash, waitUntilCondition, onlyUnique, splitRecursive } from '../../utils.js';
@ -21,6 +21,7 @@ const settings = {
protect: 5,
insert: 3,
query: 2,
message_chunk_size: 400,
// For files
enabled_files: false,
@ -87,6 +88,29 @@ async function onVectorizeAllClick() {
let syncBlocked = false;
/**
* Splits messages into chunks before inserting them into the vector index.
* @param {object[]} items Array of vector items
* @returns {object[]} Array of vector items (possibly chunked)
*/
function splitByChunks(items) {
if (settings.message_chunk_size <= 0) {
return items;
}
const chunkedItems = [];
for (const item of items) {
const chunks = splitRecursive(item.text, settings.message_chunk_size);
for (const chunk of chunks) {
const chunkedItem = { ...item, text: chunk };
chunkedItems.push(chunkedItem);
}
}
return chunkedItems;
}
async function synchronizeChat(batchSize = 5) {
if (!settings.enabled_chats) {
return -1;
@ -116,8 +140,9 @@ async function synchronizeChat(batchSize = 5) {
const deletedHashes = hashesInCollection.filter(x => !hashedMessages.some(y => y.hash === x));
if (newVectorItems.length > 0) {
const chunkedBatch = splitByChunks(newVectorItems.slice(0, batchSize));
console.log(`Vectors: Found ${newVectorItems.length} new items. Processing ${batchSize}...`);
await insertVectorItems(chatId, newVectorItems.slice(0, batchSize));
await insertVectorItems(chatId, chunkedBatch);
}
if (deletedHashes.length > 0) {
@ -492,6 +517,43 @@ function toggleSettings() {
$('#vectors_chats_settings').toggle(!!settings.enabled_chats);
}
async function onPurgeClick() {
const chatId = getCurrentChatId();
if (!chatId) {
toastr.info('No chat selected', 'Purge aborted');
return;
}
await purgeVectorIndex(chatId);
toastr.success('Vector index purged', 'Purge successful');
}
async function onViewStatsClick() {
const chatId = getCurrentChatId();
if (!chatId) {
toastr.info('No chat selected');
return;
}
const hashesInCollection = await getSavedHashes(chatId);
const totalHashes = hashesInCollection.length;
const uniqueHashes = hashesInCollection.filter(onlyUnique).length;
toastr.info(`Total hashes: <b>${totalHashes}</b><br>
Unique hashes: <b>${uniqueHashes}</b><br><br>
I'll mark collected messages with a green circle.`,
`Stats for chat ${chatId}`,
{ timeOut: 10000, escapeHtml: false });
const chat = getContext().chat;
for (const message of chat) {
if (hashesInCollection.includes(getStringHash(message.mes))) {
const messageElement = $(`.mes[mesid="${chat.indexOf(message)}"]`);
messageElement.addClass('vectorized');
}
}
}
jQuery(async () => {
if (!extension_settings.vectors) {
extension_settings.vectors = settings;
@ -554,9 +616,9 @@ jQuery(async () => {
Object.assign(extension_settings.vectors, settings);
saveSettingsDebounced();
});
$('#vectors_advanced_settings').toggleClass('displayNone', power_user.ui_mode === ui_mode.SIMPLE);
$('#vectors_vectorize_all').on('click', onVectorizeAllClick);
$('#vectors_purge').on('click', onPurgeClick);
$('#vectors_view_stats').on('click', onViewStatsClick);
$('#vectors_size_threshold').val(settings.size_threshold).on('input', () => {
settings.size_threshold = Number($('#vectors_size_threshold').val());
@ -582,6 +644,12 @@ jQuery(async () => {
saveSettingsDebounced();
});
$('#vectors_message_chunk_size').val(settings.message_chunk_size).on('input', () => {
settings.message_chunk_size = Number($('#vectors_message_chunk_size').val());
Object.assign(extension_settings.vectors, settings);
saveSettingsDebounced();
});
toggleSettings();
eventSource.on(event_types.MESSAGE_DELETED, onChatEvent);
eventSource.on(event_types.MESSAGE_EDITED, onChatEvent);

View File

@ -5,7 +5,7 @@
"optional": [],
"generate_interceptor": "vectors_rearrangeChat",
"js": "index.js",
"css": "",
"css": "style.css",
"author": "Cohee#1207",
"version": "1.0.0",
"homePage": "https://github.com/SillyTavern/SillyTavern"

View File

@ -75,7 +75,7 @@
</label>
<div id="vectors_chats_settings">
<div id="vectors_advanced_settings" data-newbie-hidden>
<div id="vectors_advanced_settings">
<label for="vectors_template">
Insertion Template
</label>
@ -97,17 +97,23 @@
</label>
</div>
<div class="flex-container">
<div class="flex1" title="Can increase the retrieval quality for the cost of processing. 0 = disabled.">
<label for="vectors_message_chunk_size">
<small>Chunk size (chars)</small>
</label>
<input id="vectors_message_chunk_size" type="number" class="text_pole widthUnset" min="0" max="9999" />
</div>
<div class="flex1" title="Prevents last N messages from being placed out of order.">
<label for="vectors_protect">
<small>Retain#</small>
</label>
<input type="number" id="vectors_protect" class="text_pole widthUnset" min="1" max="99" />
<input type="number" id="vectors_protect" class="text_pole widthUnset" min="1" max="9999" />
</div>
<div class="flex1" title="How many past messages to insert as memories.">
<label for="vectors_insert">
<small>Insert#</small>
</label>
<input type="number" id="vectors_insert" class="text_pole widthUnset" min="1" max="99" />
<input type="number" id="vectors_insert" class="text_pole widthUnset" min="1" max="9999" />
</div>
</div>
</div>
@ -115,8 +121,16 @@
Old messages are vectorized gradually as you chat.
To process all previous messages, click the button below.
</small>
<div id="vectors_vectorize_all" class="menu_button menu_button_icon">
Vectorize All
<div class="flex-container">
<div id="vectors_vectorize_all" class="menu_button menu_button_icon">
Vectorize All
</div>
<div id="vectors_purge" class="menu_button menu_button_icon">
Purge Vectors
</div>
<div id="vectors_view_stats" class="menu_button menu_button_icon">
View Stats
</div>
</div>
<div id="vectorize_progress" style="display: none;">
<small>

View File

@ -0,0 +1,4 @@
.mes.vectorized .name_text::after {
content: '🟢';
margin-left: 5px;
}