Compare commits
14 Commits
b414ceba2f
...
35c4062786
Author | SHA1 | Date |
---|---|---|
Juha Jeronen | 35c4062786 | |
Cohee | 694cf6f762 | |
Cohee | 83c77c1f18 | |
Cohee | d54ccece5c | |
kingbri | 96506947cb | |
Juha Jeronen | 54c1c5912f | |
Juha Jeronen | 6874e361ae | |
Juha Jeronen | 820dbc97a5 | |
Juha Jeronen | 7c0b944d81 | |
Juha Jeronen | e59a337303 | |
Juha Jeronen | 4102705a8e | |
Juha Jeronen | 7432e00bb6 | |
Juha Jeronen | 82dbfa9ac5 | |
Juha Jeronen | 7672b5260c |
|
@ -360,6 +360,14 @@
|
|||
flex: 2 !important;
|
||||
}
|
||||
|
||||
.flex3 {
|
||||
flex: 3;
|
||||
}
|
||||
|
||||
.flex4 {
|
||||
flex: 4;
|
||||
}
|
||||
|
||||
.flexFlowColumn {
|
||||
flex-flow: column;
|
||||
}
|
||||
|
@ -563,4 +571,4 @@ textarea:disabled {
|
|||
height: 30px;
|
||||
text-align: center;
|
||||
padding: 5px;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -5197,7 +5197,7 @@
|
|||
</div>
|
||||
</div>
|
||||
<div class="flex-container wide100p flexGap10">
|
||||
<div class="flex1 flex-container flexFlowColumn flexNoGap">
|
||||
<div class="flex4 flex-container flexFlowColumn flexNoGap">
|
||||
<div class="flex-container justifySpaceBetween">
|
||||
<small for="characterFilter" data-i18n="Filter to Character(s)">
|
||||
Filter to Character(s)
|
||||
|
@ -5217,7 +5217,7 @@
|
|||
</select>
|
||||
</div>
|
||||
</div>
|
||||
<div class="flex1 flex-container flexFlowColumn flexNoGap">
|
||||
<div class="flex3 flex-container flexFlowColumn flexNoGap">
|
||||
<div class="flex-container justifySpaceBetween">
|
||||
<small for="group" data-i18n="Inclusion Group">
|
||||
Inclusion Group
|
||||
|
@ -5239,6 +5239,16 @@
|
|||
<input type="text" class="text_pole margin0" name="group" rows="1" data-i18n="[placeholder]Only one entry with the same label will be activated" placeholder="Only one entry with the same label will be activated">
|
||||
</div>
|
||||
</div>
|
||||
<div class="flex1 flex-container flexFlowColumn flexNoGap" data-i18n="[title]A relative likelihood of entry activation within the group" title="A relative likelihood of entry activation within the group">
|
||||
<div class="flex-container justifySpaceBetween marginBot5">
|
||||
<small for="groupWeight" data-i18n="Group Weight">
|
||||
Group Weight
|
||||
</small>
|
||||
</div>
|
||||
<div class="range-block-range">
|
||||
<input type="number" class="text_pole margin0" name="groupWeight" rows="1" placeholder="100" min="0" max="999999">
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<div name="WIEntryBottomControls" class="flex-container flex1 justifySpaceBetween world_entry_form_horizontal">
|
||||
<div class="flex-container flexFlowColumn flexNoGap wi-enter-footer-text ">
|
||||
|
|
|
@ -4387,7 +4387,8 @@ function formatMessageHistoryItem(chatItem, isInstruct, forceOutputSequence) {
|
|||
const itemName = chatItem.is_user ? chatItem['name'] : characterName;
|
||||
const shouldPrependName = !isNarratorType;
|
||||
|
||||
let textResult = shouldPrependName ? `${itemName}: ${chatItem.mes}\n` : `${chatItem.mes}\n`;
|
||||
// Don't include a name if it's empty
|
||||
let textResult = chatItem?.name && shouldPrependName ? `${itemName}: ${chatItem.mes}\n` : `${chatItem.mes}\n`;
|
||||
|
||||
if (isInstruct) {
|
||||
textResult = formatInstructModeChat(itemName, chatItem.mes, chatItem.is_user, isNarratorType, chatItem.force_avatar, name1, name2, forceOutputSequence);
|
||||
|
|
|
@ -55,6 +55,7 @@ const settings = {
|
|||
|
||||
// For files
|
||||
enabled_files: false,
|
||||
science_mode: false,
|
||||
translate_files: false,
|
||||
size_threshold: 10,
|
||||
chunk_size: 5000,
|
||||
|
@ -95,7 +96,7 @@ async function onVectorizeAllClick() {
|
|||
const chatId = getCurrentChatId();
|
||||
|
||||
if (!chatId) {
|
||||
toastr.info('No chat selected', 'Vectorization aborted');
|
||||
toastr.info('No chat selected. Vectorization aborted.', 'Vector Storage');
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -108,7 +109,7 @@ async function onVectorizeAllClick() {
|
|||
|
||||
while (!finished) {
|
||||
if (is_send_press) {
|
||||
toastr.info('Message generation is in progress.', 'Vectorization aborted');
|
||||
toastr.info('Message generation is in progress. Vectorization aborted.', 'Vector Storage');
|
||||
throw new Error('Message generation is in progress.');
|
||||
}
|
||||
|
||||
|
@ -135,6 +136,7 @@ async function onVectorizeAllClick() {
|
|||
}
|
||||
} catch (error) {
|
||||
console.error('Vectors: Failed to vectorize all', error);
|
||||
toastr.error(`Vectorize all failed. ${new String(error)}`, 'Vector Storage')
|
||||
} finally {
|
||||
$('#vectorize_progress').hide();
|
||||
}
|
||||
|
@ -274,14 +276,14 @@ async function synchronizeChat(batchSize = 5) {
|
|||
case 'extras_module_missing':
|
||||
return 'Extras API must provide an "embeddings" module.';
|
||||
default:
|
||||
return 'Check server console for more details';
|
||||
return 'Check server console for more details.';
|
||||
}
|
||||
}
|
||||
|
||||
console.error('Vectors: Failed to synchronize chat', error);
|
||||
|
||||
const message = getErrorMessage(error.cause);
|
||||
toastr.error(message, 'Vectorization failed', { preventDuplicates: true });
|
||||
toastr.error(`Vectorization failed. ${message}`, 'Vector Storage', { preventDuplicates: true });
|
||||
return -1;
|
||||
} finally {
|
||||
syncBlocked = false;
|
||||
|
@ -357,6 +359,7 @@ async function processFiles(chat) {
|
|||
if (!message?.extra?.file) {
|
||||
continue;
|
||||
}
|
||||
console.debug(`Vectors: processFiles: message ${message.index}: has a file attachment, processing.`)
|
||||
|
||||
// Trim file inserted by the script
|
||||
const fileText = String(message.mes)
|
||||
|
@ -367,6 +370,7 @@ async function processFiles(chat) {
|
|||
|
||||
// File is too small
|
||||
if (fileText.length < thresholdLength) {
|
||||
console.debug(`Vectors: processFiles: message ${message.index}: text of file "${message.extra.file.name}" shorter than vectorization threshold (${fileText.length} < ${thresholdLength} chars), keeping inlined.`)
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -379,11 +383,16 @@ async function processFiles(chat) {
|
|||
|
||||
// File is already in the collection
|
||||
if (!hashesInCollection.length) {
|
||||
console.debug(`Vectors: processFiles: message ${message.index}: file "${fileName}" not yet in collection, vectorizing.`)
|
||||
await vectorizeFile(fileText, fileName, collectionId, settings.chunk_size);
|
||||
} else {
|
||||
console.debug(`Vectors: processFiles: message ${message.index}: file "${fileName}" found in collection.`)
|
||||
}
|
||||
|
||||
console.debug(`Vectors: processFiles: message ${message.index}: querying vector DB.`)
|
||||
const queryText = await getQueryText(chat);
|
||||
const fileChunks = await retrieveFileChunks(queryText, collectionId);
|
||||
console.debug(`Vectors: processFiles: message ${message.index}: retrieved ${fileChunks.length} chars.`);
|
||||
|
||||
message.mes = `${fileChunks}\n\n${message.mes}`;
|
||||
}
|
||||
|
@ -438,6 +447,62 @@ async function retrieveFileChunks(queryText, collectionId) {
|
|||
return fileText;
|
||||
}
|
||||
|
||||
/**
 * Sanitizes the text content of a scientific paper to obtain higher-quality text for vectorization.
 *
 * This is a really simplistic, classical regex-based algorithm. An LLM could likely do better, but that would be slow.
 * We hope to get a result that's not horribly broken and that won't include irrelevant RAG query poisoning stuff.
 *
 * Currently, we:
 *
 *   - F IX H EADINGS T HAT L OOK L IKE T HIS.
 *
 *     This is a rather common issue in text extraction from a PDF (small caps). Each run of
 *     whitespace-separated uppercase fragments is re-joined pairwise starting from its END, so that
 *     "O F M ODELS" becomes "OF MODELS", and a leftover leading letter is kept as a separate token:
 *     "C H EADING" (appendix section C) becomes "C HEADING".
 *
 *     A lone "H eading" on its own line becomes "Heading". Only spaces/tabs on that line are touched,
 *     so blank lines around the heading are preserved.
 *
 *   - Strip the reference list.
 *
 *     The reference list contains the highest concentration of keywords of any kind (in the titles of the cited studies),
 *     so it usually poisons RAG queries so that no matter what you search for, you'll only get chunks of the reference list.
 *     Omitting the reference list from the text to be vectorized, RAG will look for matches in the paper content only.
 *
 *     Appendices are kept only when their heading is found AFTER the start of the reference list;
 *     they are then re-attached to the main text, separated by a blank line. An appendix heading (or an
 *     all-caps mention of "APPENDIX") that occurs before the reference list is simply part of the retained text.
 *
 * All of this is heuristic: ordinary text such as "I AM" is also joined ("IAM").
 *
 * @param {string} fileText The text to sanitize
 * @returns {string} The sanitized text
 */
function sanitizeScientificInput(fileText) {
    const originalLength = fileText.length;

    // Fix section headings
    //
    // A run is: one or more standalone capital letters, followed by a final uppercase fragment,
    // e.g. "H EADING", "C H EADING", "O F M ODELS". No lookbehind is used, so the pattern also
    // parses on engines that lack lookbehind support.
    const brokenUppercaseRunFinder = /\b[A-Z](?:\s+[A-Z])*\s+[A-Z]+\b/g;
    fileText = fileText.replaceAll(brokenUppercaseRunFinder, (run) => {
        // Even indices hold the letter fragments, odd indices hold the whitespace between them.
        const pieces = run.split(/(\s+)/);
        const lastSeparatorIndex = pieces.length - 2;
        // Pair fragments from the right: drop the last separator, keep the one before it, drop the next, ...
        // With an odd number of fragments the first one stays on its own (appendix letter, or the article "A").
        return pieces
            .filter((_, index) => index % 2 === 0 || (lastSeparatorIndex - index) % 4 !== 0)
            .join('');
    });

    // "H eading", on its own line. [^\S\r\n] is "whitespace except line breaks": with the `m` flag,
    // plain \s would reach across line breaks and swallow the blank lines around the heading.
    const brokenHeadingsFinder = /^[^\S\r\n]*([A-Z])[^\S\r\n]+([a-z]+)[^\S\r\n]*$/mg;
    fileText = fileText.replaceAll(brokenHeadingsFinder, '$1$2');

    // Strip reference list (easier now that the headings are already fixed).
    //
    // Linefeeds are sometimes lost, so the references may begin in the middle of a line.
    // Since we can't trigger on any random mention of the word "References", we trigger in the middle of a line
    // only for an all-uppercase "REFERENCES".
    //
    const referencesFinder = /(^\s*References\s*$|^\s*REFERENCES\s*$|\bREFERENCES\s*)/mg;
    const referencesMatch = referencesFinder.exec(fileText);
    if (referencesMatch !== null) { // Detected a reference list
        const mainText = fileText.substring(0, referencesMatch.index).trim();

        // Some documents just start appendices like "A Some stuff..." without a heading, but there's not much we can do about that.
        // In those cases, we will simply ignore the appendices.
        const appendixFinder = /(^\s*Appendi(x|ces)\s*$|^\s*A\s*PPENDI(X|CES)\s*$|\bAPPENDI(X|CES)\s*)/mg;
        // Only look for appendices that FOLLOW the reference list heading. An earlier match is already
        // part of `mainText`; re-attaching from there would duplicate text and bring the references back.
        appendixFinder.lastIndex = referencesMatch.index + referencesMatch[0].length;
        const appendixMatch = appendixFinder.exec(fileText);
        const appendices = appendixMatch !== null ? fileText.substring(appendixMatch.index).trim() : '';

        // Keep a paragraph break between the main text and the appendices.
        fileText = [mainText, appendices].filter((part) => part.length > 0).join('\n\n');
    }

    // Log a summary only; dumping the whole document would flood the console.
    console.debug(`Vectors: sanitizeScientificInput: ${originalLength} chars in, ${fileText.length} chars out.`);
    return fileText;
}
|
||||
|
||||
/**
|
||||
* Vectorizes a file and inserts it into the vector index.
|
||||
* @param {string} fileText File text
|
||||
|
@ -454,12 +519,19 @@ async function vectorizeFile(fileText, fileName, collectionId, chunkSize) {
|
|||
fileText = translatedText;
|
||||
}
|
||||
|
||||
const toast = toastr.info('Vectorization may take some time, please wait...', `Ingesting file ${fileName}`);
|
||||
const chunks = splitRecursive(fileText, chunkSize);
|
||||
const toast = toastr.info(`Ingesting file ${fileName}. Vectorization may take some time, please wait...`, 'Vector Storage');
|
||||
|
||||
if (settings.science_mode) {
|
||||
console.debug(`Vectors: Science mode is enabled. Sanitizing input ${fileName}.`);
|
||||
fileText = sanitizeScientificInput(fileText);
|
||||
}
|
||||
|
||||
const chunks = splitRecursive(fileText, settings.chunk_size);
|
||||
console.debug(`Vectors: Split file ${fileName} into ${chunks.length} chunks`, chunks);
|
||||
|
||||
const items = chunks.map((chunk, index) => ({ hash: getStringHash(chunk), text: chunk, index: index }));
|
||||
await insertVectorItems(collectionId, items);
|
||||
toastr.info(`Vectorization complete for ${fileName}.`, `Vector Storage`);
|
||||
|
||||
toastr.clear(toast);
|
||||
console.log(`Vectors: Inserted ${chunks.length} vector items for file ${fileName} into ${collectionId}`);
|
||||
|
@ -467,6 +539,7 @@ async function vectorizeFile(fileText, fileName, collectionId, chunkSize) {
|
|||
} catch (error) {
|
||||
toastr.error(String(error), 'Failed to vectorize file', { preventDuplicates: true });
|
||||
console.error('Vectors: Failed to vectorize file', error);
|
||||
toastr.error(`Vectorization failed for ${fileName}. ${new String(error)}`, 'Vector Storage');
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
@ -873,20 +946,20 @@ function toggleSettings() {
|
|||
async function onPurgeClick() {
|
||||
const chatId = getCurrentChatId();
|
||||
if (!chatId) {
|
||||
toastr.info('No chat selected', 'Purge aborted');
|
||||
toastr.info('No chat selected. Purge aborted.', 'Vector Storage');
|
||||
return;
|
||||
}
|
||||
if (await purgeVectorIndex(chatId)) {
|
||||
toastr.success('Vector index purged', 'Purge successful');
|
||||
toastr.success('Vector index purged successfully.', 'Vector Storage');
|
||||
} else {
|
||||
toastr.error('Failed to purge vector index', 'Purge failed');
|
||||
toastr.error('Failed to purge vector index', 'Vector Storage');
|
||||
}
|
||||
}
|
||||
|
||||
async function onViewStatsClick() {
|
||||
const chatId = getCurrentChatId();
|
||||
if (!chatId) {
|
||||
toastr.info('No chat selected');
|
||||
toastr.info('No chat selected', 'Vector Storage');
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -1097,6 +1170,11 @@ jQuery(async () => {
|
|||
saveSettingsDebounced();
|
||||
toggleSettings();
|
||||
});
|
||||
$('#vectors_science_mode').prop('checked', settings.science_mode).on('input', () => {
|
||||
settings.science_mode = $('#vectors_science_mode').prop('checked');
|
||||
Object.assign(extension_settings.vectors, settings);
|
||||
saveSettingsDebounced();
|
||||
});
|
||||
$('#vectors_source').val(settings.source).on('change', () => {
|
||||
settings.source = String($('#vectors_source').val());
|
||||
Object.assign(extension_settings.vectors, settings);
|
||||
|
|
|
@ -200,6 +200,10 @@
|
|||
<input id="vectors_chunk_count_db" type="number" class="text_pole widthUnset" min="1" max="99999" />
|
||||
</div>
|
||||
</div>
|
||||
<label class="checkbox_label" for="vectors_science_mode" title="Sanitize input text to improve retrieval quality for scientific paper inputs.">
|
||||
<input id="vectors_science_mode" type="checkbox" class="checkbox">
|
||||
Science mode
|
||||
</label>
|
||||
<div class="flex-container flexFlowColumn">
|
||||
<label for="vectors_file_template_db">
|
||||
<span>Injection Template</span>
|
||||
|
|
|
@ -340,8 +340,11 @@ export function formatInstructModeChat(name, mes, isUser, isNarrator, forceAvata
|
|||
}
|
||||
|
||||
const separator = power_user.instruct.wrap ? '\n' : '';
|
||||
const textArray = includeNames ? [prefix, `${name}: ${mes}` + suffix] : [prefix, mes + suffix];
|
||||
|
||||
// Don't include the name if it's empty
|
||||
const textArray = includeNames && name ? [prefix, `${name}: ${mes}` + suffix] : [prefix, mes + suffix];
|
||||
const text = textArray.filter(x => x).join(separator);
|
||||
|
||||
return text;
|
||||
}
|
||||
|
||||
|
|
|
@ -74,6 +74,7 @@ const SORT_ORDER_KEY = 'world_info_sort_order';
|
|||
const METADATA_KEY = 'world_info';
|
||||
|
||||
const DEFAULT_DEPTH = 4;
|
||||
const DEFAULT_WEIGHT = 100;
|
||||
const MAX_SCAN_DEPTH = 1000;
|
||||
|
||||
/**
|
||||
|
@ -1093,6 +1094,7 @@ const originalDataKeyMap = {
|
|||
'automationId': 'extensions.automation_id',
|
||||
'vectorized': 'extensions.vectorized',
|
||||
'groupOverride': 'extensions.group_override',
|
||||
'groupWeight': 'extensions.group_weight',
|
||||
};
|
||||
|
||||
/** Checks the state of the current search, and adds/removes the search sorting option accordingly */
|
||||
|
@ -1466,11 +1468,24 @@ function getWorldEntry(name, data, entry) {
|
|||
const uid = $(this).data('uid');
|
||||
const value = $(this).prop('checked');
|
||||
data.entries[uid].groupOverride = value;
|
||||
setOriginalDataValue(data, uid, 'extensions.groupOverride', data.entries[uid].groupOverride);
|
||||
setOriginalDataValue(data, uid, 'extensions.group_override', data.entries[uid].groupOverride);
|
||||
saveWorldInfo(name, data);
|
||||
});
|
||||
groupOverrideInput.prop('checked', entry.groupOverride).trigger('input');
|
||||
|
||||
// group weight
|
||||
const groupWeightInput = template.find('input[name="groupWeight"]');
|
||||
groupWeightInput.data('uid', entry.uid);
|
||||
groupWeightInput.on('input', function () {
|
||||
const uid = $(this).data('uid');
|
||||
const value = Number($(this).val());
|
||||
|
||||
data.entries[uid].groupWeight = !isNaN(value) ? Math.abs(value) : 0;
|
||||
setOriginalDataValue(data, uid, 'extensions.group_weight', data.entries[uid].groupWeight);
|
||||
saveWorldInfo(name, data);
|
||||
});
|
||||
groupWeightInput.val(entry.groupWeight).trigger('input');
|
||||
|
||||
// probability
|
||||
if (entry.probability === undefined) {
|
||||
entry.probability = null;
|
||||
|
@ -1971,6 +1986,7 @@ const newEntryTemplate = {
|
|||
depth: DEFAULT_DEPTH,
|
||||
group: '',
|
||||
groupOverride: false,
|
||||
groupWeight: DEFAULT_WEIGHT,
|
||||
scanDepth: null,
|
||||
caseSensitive: null,
|
||||
matchWholeWords: null,
|
||||
|
@ -2424,7 +2440,7 @@ async function checkWorldInfo(chat, maxContext) {
|
|||
for (const entry of newEntries) {
|
||||
const rollValue = Math.random() * 100;
|
||||
|
||||
if (!entry.group && entry.useProbability && rollValue > entry.probability) {
|
||||
if (entry.useProbability && rollValue > entry.probability) {
|
||||
console.debug(`WI entry ${entry.uid} ${entry.key} failed probability check, skipping`);
|
||||
failedProbabilityChecks.add(entry);
|
||||
continue;
|
||||
|
@ -2633,14 +2649,14 @@ function filterByInclusionGroups(newEntries, allActivatedEntries, buffer) {
|
|||
continue;
|
||||
}
|
||||
|
||||
// Do weighted random using probability of entry as weight
|
||||
const totalWeight = group.reduce((acc, item) => acc + item.probability, 0);
|
||||
// Do weighted random using entry's weight
|
||||
const totalWeight = group.reduce((acc, item) => acc + (item.groupWeight ?? DEFAULT_WEIGHT), 0);
|
||||
const rollValue = Math.random() * totalWeight;
|
||||
let currentWeight = 0;
|
||||
let winner = null;
|
||||
|
||||
for (const entry of group) {
|
||||
currentWeight += entry.probability;
|
||||
currentWeight += (entry.groupWeight ?? DEFAULT_WEIGHT);
|
||||
|
||||
if (rollValue <= currentWeight) {
|
||||
console.debug(`Activated inclusion group '${key}' with roll winner entry '${entry.uid}'`, entry);
|
||||
|
@ -2684,6 +2700,7 @@ function convertAgnaiMemoryBook(inputObj) {
|
|||
useProbability: false,
|
||||
group: '',
|
||||
groupOverride: false,
|
||||
groupWeight: DEFAULT_WEIGHT,
|
||||
scanDepth: null,
|
||||
caseSensitive: null,
|
||||
matchWholeWords: null,
|
||||
|
@ -2721,6 +2738,7 @@ function convertRisuLorebook(inputObj) {
|
|||
useProbability: entry.activationPercent ?? false,
|
||||
group: '',
|
||||
groupOverride: false,
|
||||
groupWeight: DEFAULT_WEIGHT,
|
||||
scanDepth: null,
|
||||
caseSensitive: null,
|
||||
matchWholeWords: null,
|
||||
|
@ -2763,6 +2781,7 @@ function convertNovelLorebook(inputObj) {
|
|||
useProbability: false,
|
||||
group: '',
|
||||
groupOverride: false,
|
||||
groupWeight: DEFAULT_WEIGHT,
|
||||
scanDepth: null,
|
||||
caseSensitive: null,
|
||||
matchWholeWords: null,
|
||||
|
@ -2806,6 +2825,7 @@ function convertCharacterBook(characterBook) {
|
|||
selectiveLogic: entry.extensions?.selectiveLogic ?? world_info_logic.AND_ANY,
|
||||
group: entry.extensions?.group ?? '',
|
||||
groupOverride: entry.extensions?.group_override ?? false,
|
||||
groupWeight: entry.extensions?.group_weight ?? DEFAULT_WEIGHT,
|
||||
scanDepth: entry.extensions?.scan_depth ?? null,
|
||||
caseSensitive: entry.extensions?.case_sensitive ?? null,
|
||||
matchWholeWords: entry.extensions?.match_whole_words ?? null,
|
||||
|
|
|
@ -434,6 +434,7 @@ function convertWorldInfoToCharacterBook(name, entries) {
|
|||
selectiveLogic: entry.selectiveLogic ?? 0,
|
||||
group: entry.group ?? '',
|
||||
group_override: entry.groupOverride ?? false,
|
||||
group_weight: entry.groupWeight ?? null,
|
||||
prevent_recursion: entry.preventRecursion ?? false,
|
||||
scan_depth: entry.scanDepth ?? null,
|
||||
match_whole_words: entry.matchWholeWords ?? null,
|
||||
|
|
Loading…
Reference in New Issue