mirror of
https://github.com/SillyTavern/SillyTavern.git
synced 2025-06-05 21:59:27 +02:00
Custom vector chunk boundary
This commit is contained in:
@ -50,6 +50,7 @@ const settings = {
|
|||||||
summarize_sent: false,
|
summarize_sent: false,
|
||||||
summary_source: 'main',
|
summary_source: 'main',
|
||||||
summary_prompt: 'Pause your roleplay. Summarize the most important parts of the message. Limit yourself to 250 words or less. Your response should include nothing but the summary.',
|
summary_prompt: 'Pause your roleplay. Summarize the most important parts of the message. Limit yourself to 250 words or less. Your response should include nothing but the summary.',
|
||||||
|
force_chunk_delimiter: '',
|
||||||
|
|
||||||
// For chats
|
// For chats
|
||||||
enabled_chats: false,
|
enabled_chats: false,
|
||||||
@ -153,6 +154,20 @@ async function onVectorizeAllClick() {
|
|||||||
|
|
||||||
let syncBlocked = false;
|
let syncBlocked = false;
|
||||||
|
|
||||||
|
/**
 * Gets the chunk delimiters for splitting text.
 *
 * The default delimiters are tried from coarsest to finest: paragraph break,
 * line break, space, and finally the empty string. When the user has set a
 * custom chunk boundary (`settings.force_chunk_delimiter`), it is placed in
 * front of the defaults so that it takes priority over them.
 *
 * A new array is built on every call, so callers may modify the result freely.
 * The result is meant to be passed as the delimiter list to `splitRecursive`.
 * @returns {string[]} Array of chunk delimiters, highest priority first
 */
function getChunkDelimiters() {
    const defaultDelimiters = ['\n\n', '\n', ' ', ''];
    const customDelimiter = settings.force_chunk_delimiter;

    // An empty (or unset) custom delimiter means "no preference": use the defaults only.
    if (!customDelimiter) {
        return defaultDelimiters;
    }

    return [customDelimiter, ...defaultDelimiters];
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Splits messages into chunks before inserting them into the vector index.
|
* Splits messages into chunks before inserting them into the vector index.
|
||||||
* @param {object[]} items Array of vector items
|
* @param {object[]} items Array of vector items
|
||||||
@ -166,7 +181,7 @@ function splitByChunks(items) {
|
|||||||
const chunkedItems = [];
|
const chunkedItems = [];
|
||||||
|
|
||||||
for (const item of items) {
|
for (const item of items) {
|
||||||
const chunks = splitRecursive(item.text, settings.message_chunk_size);
|
const chunks = splitRecursive(item.text, settings.message_chunk_size, getChunkDelimiters());
|
||||||
for (const chunk of chunks) {
|
for (const chunk of chunks) {
|
||||||
const chunkedItem = { ...item, text: chunk };
|
const chunkedItem = { ...item, text: chunk };
|
||||||
chunkedItems.push(chunkedItem);
|
chunkedItems.push(chunkedItem);
|
||||||
@ -484,9 +499,10 @@ async function vectorizeFile(fileText, fileName, collectionId, chunkSize, overla
|
|||||||
|
|
||||||
const toast = toastr.info('Vectorization may take some time, please wait...', `Ingesting file ${fileName}`);
|
const toast = toastr.info('Vectorization may take some time, please wait...', `Ingesting file ${fileName}`);
|
||||||
const overlapSize = Math.round(chunkSize * overlapPercent / 100);
|
const overlapSize = Math.round(chunkSize * overlapPercent / 100);
|
||||||
|
const delimiters = getChunkDelimiters();
|
||||||
// Overlap should not be included in chunk size. It will be later compensated by overlapChunks
|
// Overlap should not be included in chunk size. It will be later compensated by overlapChunks
|
||||||
chunkSize = overlapSize > 0 ? (chunkSize - overlapSize) : chunkSize;
|
chunkSize = overlapSize > 0 ? (chunkSize - overlapSize) : chunkSize;
|
||||||
const chunks = splitRecursive(fileText, chunkSize).map((x, y, z) => overlapSize > 0 ? overlapChunks(x, y, z, overlapSize) : x);
|
const chunks = splitRecursive(fileText, chunkSize, delimiters).map((x, y, z) => overlapSize > 0 ? overlapChunks(x, y, z, overlapSize) : x);
|
||||||
console.debug(`Vectors: Split file ${fileName} into ${chunks.length} chunks with ${overlapPercent}% overlap`, chunks);
|
console.debug(`Vectors: Split file ${fileName} into ${chunks.length} chunks with ${overlapPercent}% overlap`, chunks);
|
||||||
|
|
||||||
const items = chunks.map((chunk, index) => ({ hash: getStringHash(chunk), text: chunk, index: index }));
|
const items = chunks.map((chunk, index) => ({ hash: getStringHash(chunk), text: chunk, index: index }));
|
||||||
@ -1480,6 +1496,12 @@ jQuery(async () => {
|
|||||||
saveSettingsDebounced();
|
saveSettingsDebounced();
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Custom chunk boundary: show the saved delimiter in the textarea and persist edits.
// The control is a <textarea>, so its content is restored with .val();
// .prop('checked', ...) only applies to checkboxes and would leave the field empty.
$('#vectors_force_chunk_delimiter').val(settings.force_chunk_delimiter).on('input', () => {
    // .val() may not return a string, so normalise before storing.
    settings.force_chunk_delimiter = String($('#vectors_force_chunk_delimiter').val());
    // Copy the local settings into the persisted extension settings, then schedule a save.
    Object.assign(extension_settings.vectors, settings);
    saveSettingsDebounced();
});
|
||||||
|
|
||||||
const validSecret = !!secret_state[SECRET_KEYS.NOMICAI];
|
const validSecret = !!secret_state[SECRET_KEYS.NOMICAI];
|
||||||
const placeholder = validSecret ? '✔️ Key saved' : '❌ Missing key';
|
const placeholder = validSecret ? '✔️ Key saved' : '❌ Missing key';
|
||||||
$('#api_key_nomicai').attr('placeholder', placeholder);
|
$('#api_key_nomicai').attr('placeholder', placeholder);
|
||||||
|
@ -117,15 +117,21 @@
|
|||||||
<div class="flex-container marginTopBot5">
|
<div class="flex-container marginTopBot5">
|
||||||
<div class="flex-container flex1 flexFlowColumn" title="How many last messages will be matched for relevance.">
|
<div class="flex-container flex1 flexFlowColumn" title="How many last messages will be matched for relevance.">
|
||||||
<label for="vectors_query">
|
<label for="vectors_query">
|
||||||
<span data-i18n="Query messages">Query messages</span>
|
<small data-i18n="Query messages">Query messages</small>
|
||||||
</label>
|
</label>
|
||||||
<input type="number" id="vectors_query" class="text_pole widthUnset" min="1" max="99" />
|
<input type="number" id="vectors_query" class="text_pole" min="1" max="99" />
|
||||||
</div>
|
</div>
|
||||||
<div class="flex-container flex1 flexFlowColumn" title="Cut-off score for relevance. Helps to filter out irrelevant data.">
|
<div class="flex-container flex1 flexFlowColumn" title="Cut-off score for relevance. Helps to filter out irrelevant data.">
|
||||||
<label for="vectors_score_threshold">
|
<label for="vectors_score_threshold">
|
||||||
<span data-i18n="Score threshold">Score threshold</span>
|
<small data-i18n="Score threshold">Score threshold</small>
|
||||||
</label>
|
</label>
|
||||||
<input type="number" id="vectors_score_threshold" class="text_pole widthUnset" min="0" max="1" step="0.05" />
|
<input type="number" id="vectors_score_threshold" class="text_pole" min="0" max="1" step="0.05" />
|
||||||
|
</div>
|
||||||
|
<div class="flex-container flex1 flexFlowColumn" title="Prioritize chunking on the preferred delimiter.">
|
||||||
|
<label for="vectors_force_chunk_delimiter">
|
||||||
|
<small data-i18n="Chunk boundary">Chunk boundary</small>
|
||||||
|
</label>
|
||||||
|
<textarea id="vectors_force_chunk_delimiter" class="text_pole" rows="1" placeholder="(None)"></textarea>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user