mirror of
https://github.com/SillyTavern/SillyTavern.git
synced 2025-06-05 21:59:27 +02:00
Vectors: Add only custom boundary chunking
This commit is contained in:
@@ -82,6 +82,7 @@ const settings = {
|
|||||||
chunk_size: 5000,
|
chunk_size: 5000,
|
||||||
chunk_count: 2,
|
chunk_count: 2,
|
||||||
overlap_percent: 0,
|
overlap_percent: 0,
|
||||||
|
only_custom_boundary: false,
|
||||||
|
|
||||||
// For Data Bank
|
// For Data Bank
|
||||||
size_threshold_db: 5,
|
size_threshold_db: 5,
|
||||||
@@ -571,7 +572,9 @@ async function vectorizeFile(fileText, fileName, collectionId, chunkSize, overla
|
|||||||
const delimiters = getChunkDelimiters();
|
const delimiters = getChunkDelimiters();
|
||||||
// Overlap should not be included in chunk size. It will be later compensated by overlapChunks
|
// Overlap should not be included in chunk size. It will be later compensated by overlapChunks
|
||||||
chunkSize = overlapSize > 0 ? (chunkSize - overlapSize) : chunkSize;
|
chunkSize = overlapSize > 0 ? (chunkSize - overlapSize) : chunkSize;
|
||||||
const chunks = splitRecursive(fileText, chunkSize, delimiters).map((x, y, z) => overlapSize > 0 ? overlapChunks(x, y, z, overlapSize) : x);
|
const chunks = settings.only_custom_boundary && settings.force_chunk_delimiter
|
||||||
|
? fileText.split(settings.force_chunk_delimiter)
|
||||||
|
: splitRecursive(fileText, chunkSize, delimiters).map((x, y, z) => overlapSize > 0 ? overlapChunks(x, y, z, overlapSize) : x);
|
||||||
console.debug(`Vectors: Split file ${fileName} into ${chunks.length} chunks with ${overlapPercent}% overlap`, chunks);
|
console.debug(`Vectors: Split file ${fileName} into ${chunks.length} chunks with ${overlapPercent}% overlap`, chunks);
|
||||||
|
|
||||||
const items = chunks.map((chunk, index) => ({ hash: getStringHash(chunk), text: chunk, index: index }));
|
const items = chunks.map((chunk, index) => ({ hash: getStringHash(chunk), text: chunk, index: index }));
|
||||||
@@ -1555,6 +1558,12 @@ jQuery(async () => {
|
|||||||
saveSettingsDebounced();
|
saveSettingsDebounced();
|
||||||
});
|
});
|
||||||
|
|
||||||
|
$('#vectors_only_custom_boundary').prop('checked', settings.only_custom_boundary).on('input', () => {
|
||||||
|
settings.only_custom_boundary = !!$('#vectors_only_custom_boundary').prop('checked');
|
||||||
|
Object.assign(extension_settings.vectors, settings);
|
||||||
|
saveSettingsDebounced();
|
||||||
|
});
|
||||||
|
|
||||||
$('#vectors_ollama_pull').on('click', (e) => {
|
$('#vectors_ollama_pull').on('click', (e) => {
|
||||||
const presetModel = extension_settings.vectors.ollama_model || '';
|
const presetModel = extension_settings.vectors.ollama_model || '';
|
||||||
e.preventDefault();
|
e.preventDefault();
|
||||||
|
@@ -186,6 +186,12 @@
|
|||||||
</label>
|
</label>
|
||||||
|
|
||||||
<div id="vectors_files_settings" class="marginTopBot5">
|
<div id="vectors_files_settings" class="marginTopBot5">
|
||||||
|
<label class="checkbox_label" for="vectors_only_custom_boundary" title="Create a chunk for every custom Chunk boundary detected in the file text. Don't chunk on default boundaries.">
|
||||||
|
<input id="vectors_only_custom_boundary" type="checkbox" class="checkbox">
|
||||||
|
<span data-i18n="Only chunk on custom boundary">
|
||||||
|
Only chunk on custom boundary
|
||||||
|
</span>
|
||||||
|
</label>
|
||||||
<label class="checkbox_label" for="vectors_translate_files" title="This can help with retrieval accuracy if using embedding models that are trained on English data. Uses the selected API from Chat Translation extension settings.">
|
<label class="checkbox_label" for="vectors_translate_files" title="This can help with retrieval accuracy if using embedding models that are trained on English data. Uses the selected API from Chat Translation extension settings.">
|
||||||
<input id="vectors_translate_files" type="checkbox" class="checkbox">
|
<input id="vectors_translate_files" type="checkbox" class="checkbox">
|
||||||
<span data-i18n="Translate files into English before processing">
|
<span data-i18n="Translate files into English before processing">
|
||||||
|
Reference in New Issue
Block a user