From bba16f52639ada2abeb4d8422e938ba754f3cc74 Mon Sep 17 00:00:00 2001 From: Cohee <18619528+Cohee1207@users.noreply.github.com> Date: Sun, 16 Jun 2024 02:16:27 +0300 Subject: [PATCH] Custom vector chunk boundary --- public/scripts/extensions/vectors/index.js | 26 +++++++++++++++++-- .../scripts/extensions/vectors/settings.html | 14 +++++++--- 2 files changed, 34 insertions(+), 6 deletions(-) diff --git a/public/scripts/extensions/vectors/index.js b/public/scripts/extensions/vectors/index.js index 60fab346b..41afb1846 100644 --- a/public/scripts/extensions/vectors/index.js +++ b/public/scripts/extensions/vectors/index.js @@ -50,6 +50,7 @@ const settings = { summarize_sent: false, summary_source: 'main', summary_prompt: 'Pause your roleplay. Summarize the most important parts of the message. Limit yourself to 250 words or less. Your response should include nothing but the summary.', + force_chunk_delimiter: '', // For chats enabled_chats: false, @@ -153,6 +154,20 @@ async function onVectorizeAllClick() { let syncBlocked = false; +/** + * Gets the chunk delimiters for splitting text: the built-in defaults, with the user-defined `settings.force_chunk_delimiter` (when non-empty) placed in front of them. + * @returns {string[]} Newly created array of chunk delimiters; the custom delimiter, if any, is the first element + */ +function getChunkDelimiters() { + const delimiters = ['\n\n', '\n', ' ', '']; + + if (settings.force_chunk_delimiter) { + delimiters.unshift(settings.force_chunk_delimiter); + } + + return delimiters; +} + /** * Splits messages into chunks before inserting them into the vector index. 
* @param {object[]} items Array of vector items @@ -166,7 +181,7 @@ function splitByChunks(items) { const chunkedItems = []; for (const item of items) { - const chunks = splitRecursive(item.text, settings.message_chunk_size); + const chunks = splitRecursive(item.text, settings.message_chunk_size, getChunkDelimiters()); for (const chunk of chunks) { const chunkedItem = { ...item, text: chunk }; chunkedItems.push(chunkedItem); @@ -484,9 +499,10 @@ async function vectorizeFile(fileText, fileName, collectionId, chunkSize, overla const toast = toastr.info('Vectorization may take some time, please wait...', `Ingesting file ${fileName}`); const overlapSize = Math.round(chunkSize * overlapPercent / 100); + const delimiters = getChunkDelimiters(); // Overlap should not be included in chunk size. It will be later compensated by overlapChunks chunkSize = overlapSize > 0 ? (chunkSize - overlapSize) : chunkSize; - const chunks = splitRecursive(fileText, chunkSize).map((x, y, z) => overlapSize > 0 ? overlapChunks(x, y, z, overlapSize) : x); + const chunks = splitRecursive(fileText, chunkSize, delimiters).map((x, y, z) => overlapSize > 0 ? overlapChunks(x, y, z, overlapSize) : x); console.debug(`Vectors: Split file ${fileName} into ${chunks.length} chunks with ${overlapPercent}% overlap`, chunks); const items = chunks.map((chunk, index) => ({ hash: getStringHash(chunk), text: chunk, index: index })); @@ -1480,6 +1496,12 @@ jQuery(async () => { saveSettingsDebounced(); }); + $('#vectors_force_chunk_delimiter').val(settings.force_chunk_delimiter).on('input', () => { + settings.force_chunk_delimiter = String($('#vectors_force_chunk_delimiter').val()); + Object.assign(extension_settings.vectors, settings); + saveSettingsDebounced(); + }); + const validSecret = !!secret_state[SECRET_KEYS.NOMICAI]; const placeholder = validSecret ? 
'✔️ Key saved' : '❌ Missing key'; $('#api_key_nomicai').attr('placeholder', placeholder); diff --git a/public/scripts/extensions/vectors/settings.html b/public/scripts/extensions/vectors/settings.html index 81247e92a..b4a413e83 100644 --- a/public/scripts/extensions/vectors/settings.html +++ b/public/scripts/extensions/vectors/settings.html @@ -117,15 +117,21 @@
- +
- + +
+
+ +