|
|
|
@ -55,6 +55,7 @@ const settings = {
|
|
|
|
|
|
|
|
|
|
// For files
|
|
|
|
|
enabled_files: false,
|
|
|
|
|
science_mode: false,
|
|
|
|
|
translate_files: false,
|
|
|
|
|
size_threshold: 10,
|
|
|
|
|
chunk_size: 5000,
|
|
|
|
@ -95,7 +96,7 @@ async function onVectorizeAllClick() {
|
|
|
|
|
const chatId = getCurrentChatId();
|
|
|
|
|
|
|
|
|
|
if (!chatId) {
|
|
|
|
|
toastr.info('No chat selected', 'Vectorization aborted');
|
|
|
|
|
toastr.info('No chat selected. Vectorization aborted.', 'Vector Storage');
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
@ -108,7 +109,7 @@ async function onVectorizeAllClick() {
|
|
|
|
|
|
|
|
|
|
while (!finished) {
|
|
|
|
|
if (is_send_press) {
|
|
|
|
|
toastr.info('Message generation is in progress.', 'Vectorization aborted');
|
|
|
|
|
toastr.info('Message generation is in progress. Vectorization aborted.', 'Vector Storage');
|
|
|
|
|
throw new Error('Message generation is in progress.');
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
@ -135,6 +136,7 @@ async function onVectorizeAllClick() {
|
|
|
|
|
}
|
|
|
|
|
} catch (error) {
|
|
|
|
|
console.error('Vectors: Failed to vectorize all', error);
|
|
|
|
|
toastr.error(`Vectorize all failed. ${new String(error)}`, 'Vector Storage')
|
|
|
|
|
} finally {
|
|
|
|
|
$('#vectorize_progress').hide();
|
|
|
|
|
}
|
|
|
|
@ -274,14 +276,14 @@ async function synchronizeChat(batchSize = 5) {
|
|
|
|
|
case 'extras_module_missing':
|
|
|
|
|
return 'Extras API must provide an "embeddings" module.';
|
|
|
|
|
default:
|
|
|
|
|
return 'Check server console for more details';
|
|
|
|
|
return 'Check server console for more details.';
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
console.error('Vectors: Failed to synchronize chat', error);
|
|
|
|
|
|
|
|
|
|
const message = getErrorMessage(error.cause);
|
|
|
|
|
toastr.error(message, 'Vectorization failed', { preventDuplicates: true });
|
|
|
|
|
toastr.error(`Vectorization failed. ${message}`, 'Vector Storage', { preventDuplicates: true });
|
|
|
|
|
return -1;
|
|
|
|
|
} finally {
|
|
|
|
|
syncBlocked = false;
|
|
|
|
@ -357,6 +359,7 @@ async function processFiles(chat) {
|
|
|
|
|
if (!message?.extra?.file) {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
console.debug(`Vectors: processFiles: message ${message.index}: has a file attachment, processing.`)
|
|
|
|
|
|
|
|
|
|
// Trim file inserted by the script
|
|
|
|
|
const fileText = String(message.mes)
|
|
|
|
@ -367,6 +370,7 @@ async function processFiles(chat) {
|
|
|
|
|
|
|
|
|
|
// File is too small
|
|
|
|
|
if (fileText.length < thresholdLength) {
|
|
|
|
|
console.debug(`Vectors: processFiles: message ${message.index}: text of file "${message.extra.file.name}" shorter than vectorization threshold (${fileText.length} < ${thresholdLength} chars), keeping inlined.`)
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
@ -379,11 +383,16 @@ async function processFiles(chat) {
|
|
|
|
|
|
|
|
|
|
// File is already in the collection
|
|
|
|
|
if (!hashesInCollection.length) {
|
|
|
|
|
console.debug(`Vectors: processFiles: message ${message.index}: file "${fileName}" not yet in collection, vectorizing.`)
|
|
|
|
|
await vectorizeFile(fileText, fileName, collectionId, settings.chunk_size);
|
|
|
|
|
} else {
|
|
|
|
|
console.debug(`Vectors: processFiles: message ${message.index}: file "${fileName}" found in collection.`)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
console.debug(`Vectors: processFiles: message ${message.index}: querying vector DB.`)
|
|
|
|
|
const queryText = await getQueryText(chat);
|
|
|
|
|
const fileChunks = await retrieveFileChunks(queryText, collectionId);
|
|
|
|
|
console.debug(`Vectors: processFiles: message ${message.index}: retrieved ${fileChunks.length} chars.`);
|
|
|
|
|
|
|
|
|
|
message.mes = `${fileChunks}\n\n${message.mes}`;
|
|
|
|
|
}
|
|
|
|
@ -438,6 +447,62 @@ async function retrieveFileChunks(queryText, collectionId) {
|
|
|
|
|
return fileText;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Sanitizes the text content of a scientific paper to obtain higher-quality text for vectorization.
|
|
|
|
|
*
|
|
|
|
|
* This is a really simplistic, classical regex-based algorithm. An LLM could likely do better, but that would be slow.
|
|
|
|
|
* We hope to get a result that's not horribly broken and that won't include irrelevant RAG query poisoning stuff.
|
|
|
|
|
*
|
|
|
|
|
* Currently, we:
|
|
|
|
|
*
|
|
|
|
|
* - Strip the reference list.
|
|
|
|
|
*
|
|
|
|
|
* The reference list contains the highest concentration of keywords of any kind (in the titles of the cited studies),
|
|
|
|
|
* so it usually poisons RAG queries so that no matter what you search for, you'll only get chunks of the reference list.
|
|
|
|
|
* Omitting the reference list from the text to be vectorized, RAG will look for matches in the paper content only.
|
|
|
|
|
*
|
|
|
|
|
* - F IX H EADINGS T HAT L OOK L IKE T HIS.
|
|
|
|
|
*
|
|
|
|
|
* This is a rather common issue in text extraction from a PDF.
|
|
|
|
|
*
|
|
|
|
|
* @param {string} fileText The text to sanitize
|
|
|
|
|
* @returns {string} The sanitized text
|
|
|
|
|
*/
|
|
|
|
|
function sanitizeScientificInput(fileText) {
|
|
|
|
|
// Fix section headings
|
|
|
|
|
//
|
|
|
|
|
const brokenUppercaseWordsFinder = new RegExp(/(?<!\b[A-Z]\s+)\b([A-Z])\s+([A-Z]+)\b/, 'g'); // "H EADING", but not "C H EADING" (appendix section)
|
|
|
|
|
fileText = fileText.replaceAll(brokenUppercaseWordsFinder, '$1$2');
|
|
|
|
|
const brokenAppendixHeadingFinder = new RegExp(/([A-Z])\s+([A-Z])\s+([A-Z]+)\b/, 'g'); // "C H EADING"
|
|
|
|
|
fileText = fileText.replaceAll(brokenAppendixHeadingFinder, '$1 $2$3'); // -> "C HEADING"
|
|
|
|
|
|
|
|
|
|
const brokenHeadingsFinder = new RegExp(/^\s*([A-Z])\s+([a-z]+)\s*$/, 'mg'); // "H eading", on its own line
|
|
|
|
|
fileText = fileText.replaceAll(brokenHeadingsFinder, '$1$2');
|
|
|
|
|
|
|
|
|
|
// Strip reference list (easier now that the headings are already fixed).
|
|
|
|
|
//
|
|
|
|
|
// Linefeeds are sometimes lost, so the references may begin in the middle of a line.
|
|
|
|
|
// Since we can't trigger on any random mention of the word "References", we trigger in the middle of a line
|
|
|
|
|
// only for an all-uppercase "REFERENCES".
|
|
|
|
|
//
|
|
|
|
|
const referencesFinder = new RegExp(/(^\s*References\s*$|^\s*REFERENCES\s*$|\bREFERENCES\s*)/, 'mg');
|
|
|
|
|
const referencesMatches = [...fileText.matchAll(referencesFinder)];
|
|
|
|
|
if (referencesMatches.length > 0) { // Detected a reference list
|
|
|
|
|
const appendixFinder = new RegExp(/(^\s*Appendi(x|ces)\s*$|^\s*A\s*PPENDI(X|CES)\s*$|\bAPPENDI(X|CES)\s*)/, 'mg');
|
|
|
|
|
// Some documents just start appendices like "A Some stuff..." without a heading, but there's not much we can do about that.
|
|
|
|
|
// In those cases, we will simply ignore the appendices.
|
|
|
|
|
const appendixMatches = [...fileText.matchAll(appendixFinder)];
|
|
|
|
|
if (appendixMatches.length > 0) { // Detected both a reference list and appendices
|
|
|
|
|
fileText = fileText.substring(0, referencesMatches[0].index).trim() + fileText.substring(appendixMatches[0].index);
|
|
|
|
|
} else { // Detected only a reference list, no appendices
|
|
|
|
|
fileText = fileText.substring(0, referencesMatches[0].index).trim();
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
console.debug(fileText);
|
|
|
|
|
return fileText;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Vectorizes a file and inserts it into the vector index.
|
|
|
|
|
* @param {string} fileText File text
|
|
|
|
@ -454,12 +519,19 @@ async function vectorizeFile(fileText, fileName, collectionId, chunkSize) {
|
|
|
|
|
fileText = translatedText;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
const toast = toastr.info('Vectorization may take some time, please wait...', `Ingesting file ${fileName}`);
|
|
|
|
|
const chunks = splitRecursive(fileText, chunkSize);
|
|
|
|
|
const toast = toastr.info(`Ingesting file ${fileName}. Vectorization may take some time, please wait...`, 'Vector Storage');
|
|
|
|
|
|
|
|
|
|
if (settings.science_mode) {
|
|
|
|
|
console.debug(`Vectors: Science mode is enabled. Sanitizing input ${fileName}.`);
|
|
|
|
|
fileText = sanitizeScientificInput(fileText);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
const chunks = splitRecursive(fileText, settings.chunk_size);
|
|
|
|
|
console.debug(`Vectors: Split file ${fileName} into ${chunks.length} chunks`, chunks);
|
|
|
|
|
|
|
|
|
|
const items = chunks.map((chunk, index) => ({ hash: getStringHash(chunk), text: chunk, index: index }));
|
|
|
|
|
await insertVectorItems(collectionId, items);
|
|
|
|
|
toastr.info(`Vectorization complete for ${fileName}.`, `Vector Storage`);
|
|
|
|
|
|
|
|
|
|
toastr.clear(toast);
|
|
|
|
|
console.log(`Vectors: Inserted ${chunks.length} vector items for file ${fileName} into ${collectionId}`);
|
|
|
|
@ -467,6 +539,7 @@ async function vectorizeFile(fileText, fileName, collectionId, chunkSize) {
|
|
|
|
|
} catch (error) {
|
|
|
|
|
toastr.error(String(error), 'Failed to vectorize file', { preventDuplicates: true });
|
|
|
|
|
console.error('Vectors: Failed to vectorize file', error);
|
|
|
|
|
toastr.error(`Vectorization failed for ${fileName}. ${new String(error)}`, 'Vector Storage');
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
@ -873,20 +946,20 @@ function toggleSettings() {
|
|
|
|
|
async function onPurgeClick() {
|
|
|
|
|
const chatId = getCurrentChatId();
|
|
|
|
|
if (!chatId) {
|
|
|
|
|
toastr.info('No chat selected', 'Purge aborted');
|
|
|
|
|
toastr.info('No chat selected. Purge aborted.', 'Vector Storage');
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
if (await purgeVectorIndex(chatId)) {
|
|
|
|
|
toastr.success('Vector index purged', 'Purge successful');
|
|
|
|
|
toastr.success('Vector index purged successfully.', 'Vector Storage');
|
|
|
|
|
} else {
|
|
|
|
|
toastr.error('Failed to purge vector index', 'Purge failed');
|
|
|
|
|
toastr.error('Failed to purge vector index', 'Vector Storage');
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
async function onViewStatsClick() {
|
|
|
|
|
const chatId = getCurrentChatId();
|
|
|
|
|
if (!chatId) {
|
|
|
|
|
toastr.info('No chat selected');
|
|
|
|
|
toastr.info('No chat selected', 'Vector Storage');
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
@ -1097,6 +1170,11 @@ jQuery(async () => {
|
|
|
|
|
saveSettingsDebounced();
|
|
|
|
|
toggleSettings();
|
|
|
|
|
});
|
|
|
|
|
$('#vectors_science_mode').prop('checked', settings.science_mode).on('input', () => {
|
|
|
|
|
settings.science_mode = $('#vectors_science_mode').prop('checked');
|
|
|
|
|
Object.assign(extension_settings.vectors, settings);
|
|
|
|
|
saveSettingsDebounced();
|
|
|
|
|
});
|
|
|
|
|
$('#vectors_source').val(settings.source).on('change', () => {
|
|
|
|
|
settings.source = String($('#vectors_source').val());
|
|
|
|
|
Object.assign(extension_settings.vectors, settings);
|
|
|
|
|