Add Cohere as embedding source

This commit is contained in:
Cohee 2024-04-19 00:07:12 +03:00
parent b69493d252
commit 25cb598694
9 changed files with 147 additions and 32 deletions

View File

@ -35,6 +35,7 @@ const settings = {
include_wi: false, include_wi: false,
togetherai_model: 'togethercomputer/m2-bert-80M-32k-retrieval', togetherai_model: 'togethercomputer/m2-bert-80M-32k-retrieval',
openai_model: 'text-embedding-ada-002', openai_model: 'text-embedding-ada-002',
cohere_model: 'embed-english-v3.0',
summarize: false, summarize: false,
summarize_sent: false, summarize_sent: false,
summary_source: 'main', summary_source: 'main',
@ -598,6 +599,9 @@ function getVectorHeaders() {
case 'openai': case 'openai':
addOpenAiHeaders(headers); addOpenAiHeaders(headers);
break; break;
case 'cohere':
addCohereHeaders(headers);
break;
default: default:
break; break;
} }
@ -636,6 +640,16 @@ function addOpenAiHeaders(headers) {
}); });
} }
/**
 * Attaches the Cohere-specific request headers.
 * The backend reads this header to pick the embedding model.
 * @param {object} headers Header object to extend in place
 */
function addCohereHeaders(headers) {
    headers['X-Cohere-Model'] = extension_settings.vectors.cohere_model;
}
/** /**
* Inserts vector items into a collection * Inserts vector items into a collection
* @param {string} collectionId - The collection to insert into * @param {string} collectionId - The collection to insert into
@ -647,7 +661,8 @@ async function insertVectorItems(collectionId, items) {
settings.source === 'palm' && !secret_state[SECRET_KEYS.MAKERSUITE] || settings.source === 'palm' && !secret_state[SECRET_KEYS.MAKERSUITE] ||
settings.source === 'mistral' && !secret_state[SECRET_KEYS.MISTRALAI] || settings.source === 'mistral' && !secret_state[SECRET_KEYS.MISTRALAI] ||
settings.source === 'togetherai' && !secret_state[SECRET_KEYS.TOGETHERAI] || settings.source === 'togetherai' && !secret_state[SECRET_KEYS.TOGETHERAI] ||
settings.source === 'nomicai' && !secret_state[SECRET_KEYS.NOMICAI]) { settings.source === 'nomicai' && !secret_state[SECRET_KEYS.NOMICAI] ||
settings.source === 'cohere' && !secret_state[SECRET_KEYS.COHERE]) {
throw new Error('Vectors: API key missing', { cause: 'api_key_missing' }); throw new Error('Vectors: API key missing', { cause: 'api_key_missing' });
} }
@ -816,6 +831,7 @@ function toggleSettings() {
$('#vectors_chats_settings').toggle(!!settings.enabled_chats); $('#vectors_chats_settings').toggle(!!settings.enabled_chats);
$('#together_vectorsModel').toggle(settings.source === 'togetherai'); $('#together_vectorsModel').toggle(settings.source === 'togetherai');
$('#openai_vectorsModel').toggle(settings.source === 'openai'); $('#openai_vectorsModel').toggle(settings.source === 'openai');
$('#cohere_vectorsModel').toggle(settings.source === 'cohere');
$('#nomicai_apiKey').toggle(settings.source === 'nomicai'); $('#nomicai_apiKey').toggle(settings.source === 'nomicai');
} }
@ -913,6 +929,12 @@ jQuery(async () => {
Object.assign(extension_settings.vectors, settings); Object.assign(extension_settings.vectors, settings);
saveSettingsDebounced(); saveSettingsDebounced();
}); });
// Initialize the Cohere model dropdown from saved settings and persist any change.
$('#vectors_cohere_model').val(settings.cohere_model).on('change', () => {
// Changing the embedding model makes previously stored vectors stale, so show the warning.
$('#vectors_modelWarning').show();
settings.cohere_model = String($('#vectors_cohere_model').val());
Object.assign(extension_settings.vectors, settings);
saveSettingsDebounced();
});
$('#vectors_template').val(settings.template).on('input', () => { $('#vectors_template').val(settings.template).on('input', () => {
settings.template = String($('#vectors_template').val()); settings.template = String($('#vectors_template').val());
Object.assign(extension_settings.vectors, settings); Object.assign(extension_settings.vectors, settings);

View File

@ -10,13 +10,14 @@
Vectorization Source Vectorization Source
</label> </label>
<select id="vectors_source" class="text_pole"> <select id="vectors_source" class="text_pole">
<option value="transformers">Local (Transformers)</option> <option value="cohere">Cohere</option>
<option value="extras">Extras</option> <option value="extras">Extras</option>
<option value="openai">OpenAI</option>
<option value="palm">Google MakerSuite (PaLM)</option> <option value="palm">Google MakerSuite (PaLM)</option>
<option value="transformers">Local (Transformers)</option>
<option value="mistral">MistralAI</option> <option value="mistral">MistralAI</option>
<option value="togetherai">TogetherAI</option>
<option value="nomicai">NomicAI</option> <option value="nomicai">NomicAI</option>
<option value="openai">OpenAI</option>
<option value="togetherai">TogetherAI</option>
</select> </select>
</div> </div>
<div class="flex-container flexFlowColumn" id="openai_vectorsModel"> <div class="flex-container flexFlowColumn" id="openai_vectorsModel">
@ -29,6 +30,20 @@
<option value="text-embedding-3-large">text-embedding-3-large</option> <option value="text-embedding-3-large">text-embedding-3-large</option>
</select> </select>
</div> </div>
<!-- Cohere embedding model picker; toggled visible only when the vectorization source is 'cohere' (see toggleSettings) -->
<div class="flex-container flexFlowColumn" id="cohere_vectorsModel">
<label for="vectors_cohere_model">
Vectorization Model
</label>
<select id="vectors_cohere_model" class="text_pole">
<option value="embed-english-v3.0">embed-english-v3.0</option>
<option value="embed-multilingual-v3.0">embed-multilingual-v3.0</option>
<option value="embed-english-light-v3.0">embed-english-light-v3.0</option>
<option value="embed-multilingual-light-v3.0">embed-multilingual-light-v3.0</option>
<option value="embed-english-v2.0">embed-english-v2.0</option>
<option value="embed-english-light-v2.0">embed-english-light-v2.0</option>
<option value="embed-multilingual-v2.0">embed-multilingual-v2.0</option>
</select>
</div>
<div class="flex-container flexFlowColumn" id="together_vectorsModel"> <div class="flex-container flexFlowColumn" id="together_vectorsModel">
<label for="vectors_togetherai_model"> <label for="vectors_togetherai_model">
Vectorization Model Vectorization Model

View File

@ -12,23 +12,26 @@ const SOURCES = ['transformers', 'mistral', 'openai', 'extras', 'palm', 'togethe
* @param {string} source - The source of the vector * @param {string} source - The source of the vector
* @param {Object} sourceSettings - Settings for the source, if it needs any * @param {Object} sourceSettings - Settings for the source, if it needs any
* @param {string} text - The text to get the vector for * @param {string} text - The text to get the vector for
* @param {boolean} isQuery - If the text is a query for embedding search
* @param {import('../users').UserDirectoryList} directories - The directories object for the user * @param {import('../users').UserDirectoryList} directories - The directories object for the user
* @returns {Promise<number[]>} - The vector for the text * @returns {Promise<number[]>} - The vector for the text
*/ */
async function getVector(source, sourceSettings, text, directories) { async function getVector(source, sourceSettings, text, isQuery, directories) {
switch (source) { switch (source) {
case 'nomicai': case 'nomicai':
return require('../nomicai-vectors').getNomicAIVector(text, source, directories); return require('../vectors/nomicai-vectors').getNomicAIVector(text, source, directories);
case 'togetherai': case 'togetherai':
case 'mistral': case 'mistral':
case 'openai': case 'openai':
return require('../openai-vectors').getOpenAIVector(text, source, directories, sourceSettings.model); return require('../vectors/openai-vectors').getOpenAIVector(text, source, directories, sourceSettings.model);
case 'transformers': case 'transformers':
return require('../embedding').getTransformersVector(text); return require('../vectors/embedding').getTransformersVector(text);
case 'extras': case 'extras':
return require('../extras-vectors').getExtrasVector(text, sourceSettings.extrasUrl, sourceSettings.extrasKey); return require('../vectors/extras-vectors').getExtrasVector(text, sourceSettings.extrasUrl, sourceSettings.extrasKey);
case 'palm': case 'palm':
return require('../makersuite-vectors').getMakerSuiteVector(text, directories); return require('../vectors/makersuite-vectors').getMakerSuiteVector(text, directories);
case 'cohere':
return require('../vectors/cohere-vectors').getCohereVector(text, isQuery, directories, sourceSettings.model);
} }
throw new Error(`Unknown vector source ${source}`); throw new Error(`Unknown vector source ${source}`);
@ -39,10 +42,11 @@ async function getVector(source, sourceSettings, text, directories) {
* @param {string} source - The source of the vector * @param {string} source - The source of the vector
* @param {Object} sourceSettings - Settings for the source, if it needs any * @param {Object} sourceSettings - Settings for the source, if it needs any
* @param {string[]} texts - The array of texts to get the vector for * @param {string[]} texts - The array of texts to get the vector for
* @param {boolean} isQuery - If the text is a query for embedding search
* @param {import('../users').UserDirectoryList} directories - The directories object for the user * @param {import('../users').UserDirectoryList} directories - The directories object for the user
* @returns {Promise<number[][]>} - The array of vectors for the texts * @returns {Promise<number[][]>} - The array of vectors for the texts
*/ */
async function getBatchVector(source, sourceSettings, texts, directories) { async function getBatchVector(source, sourceSettings, texts, isQuery, directories) {
const batchSize = 10; const batchSize = 10;
const batches = Array(Math.ceil(texts.length / batchSize)).fill(undefined).map((_, i) => texts.slice(i * batchSize, i * batchSize + batchSize)); const batches = Array(Math.ceil(texts.length / batchSize)).fill(undefined).map((_, i) => texts.slice(i * batchSize, i * batchSize + batchSize));
@ -50,21 +54,24 @@ async function getBatchVector(source, sourceSettings, texts, directories) {
for (let batch of batches) { for (let batch of batches) {
switch (source) { switch (source) {
case 'nomicai': case 'nomicai':
results.push(...await require('../nomicai-vectors').getNomicAIBatchVector(batch, source, directories)); results.push(...await require('../vectors/nomicai-vectors').getNomicAIBatchVector(batch, source, directories));
break; break;
case 'togetherai': case 'togetherai':
case 'mistral': case 'mistral':
case 'openai': case 'openai':
results.push(...await require('../openai-vectors').getOpenAIBatchVector(batch, source, directories, sourceSettings.model)); results.push(...await require('../vectors/openai-vectors').getOpenAIBatchVector(batch, source, directories, sourceSettings.model));
break; break;
case 'transformers': case 'transformers':
results.push(...await require('../embedding').getTransformersBatchVector(batch)); results.push(...await require('../vectors/embedding').getTransformersBatchVector(batch));
break; break;
case 'extras': case 'extras':
results.push(...await require('../extras-vectors').getExtrasBatchVector(batch, sourceSettings.extrasUrl, sourceSettings.extrasKey)); results.push(...await require('../vectors/extras-vectors').getExtrasBatchVector(batch, sourceSettings.extrasUrl, sourceSettings.extrasKey));
break; break;
case 'palm': case 'palm':
results.push(...await require('../makersuite-vectors').getMakerSuiteBatchVector(batch, directories)); results.push(...await require('../vectors/makersuite-vectors').getMakerSuiteBatchVector(batch, directories));
break;
case 'cohere':
results.push(...await require('../vectors/cohere-vectors').getCohereBatchVector(batch, isQuery, directories, sourceSettings.model));
break; break;
default: default:
throw new Error(`Unknown vector source ${source}`); throw new Error(`Unknown vector source ${source}`);
@ -106,7 +113,7 @@ async function insertVectorItems(directories, collectionId, source, sourceSettin
await store.beginUpdate(); await store.beginUpdate();
const vectors = await getBatchVector(source, sourceSettings, items.map(x => x.text), directories); const vectors = await getBatchVector(source, sourceSettings, items.map(x => x.text), false, directories);
for (let i = 0; i < items.length; i++) { for (let i = 0; i < items.length; i++) {
const item = items[i]; const item = items[i];
@ -165,7 +172,7 @@ async function deleteVectorItems(directories, collectionId, source, hashes) {
*/ */
async function queryCollection(directories, collectionId, source, sourceSettings, searchText, topK) { async function queryCollection(directories, collectionId, source, sourceSettings, searchText, topK) {
const store = await getIndex(directories, collectionId, source); const store = await getIndex(directories, collectionId, source);
const vector = await getVector(source, sourceSettings, searchText, directories); const vector = await getVector(source, sourceSettings, searchText, true, directories);
const result = await store.queryItems(vector, topK); const result = await store.queryItems(vector, topK);
const metadata = result.map(x => x.item.metadata); const metadata = result.map(x => x.item.metadata);
@ -184,7 +191,7 @@ async function queryCollection(directories, collectionId, source, sourceSettings
* @returns {Promise<Record<string, { hashes: number[], metadata: object[] }>>} - The top K results from each collection * @returns {Promise<Record<string, { hashes: number[], metadata: object[] }>>} - The top K results from each collection
*/ */
async function multiQueryCollection(directories, collectionIds, source, sourceSettings, searchText, topK) { async function multiQueryCollection(directories, collectionIds, source, sourceSettings, searchText, topK) {
const vector = await getVector(source, sourceSettings, searchText, directories); const vector = await getVector(source, sourceSettings, searchText, true, directories);
const results = []; const results = [];
for (const collectionId of collectionIds) { for (const collectionId of collectionIds) {
@ -223,13 +230,19 @@ async function multiQueryCollection(directories, collectionIds, source, sourceSe
*/ */
function getSourceSettings(source, request) { function getSourceSettings(source, request) {
if (source === 'togetherai') { if (source === 'togetherai') {
let model = String(request.headers['x-togetherai-model']); const model = String(request.headers['x-togetherai-model']);
return { return {
model: model, model: model,
}; };
} else if (source === 'openai') { } else if (source === 'openai') {
let model = String(request.headers['x-openai-model']); const model = String(request.headers['x-openai-model']);
return {
model: model,
};
} else if (source === 'cohere') {
const model = String(request.headers['x-cohere-model']);
return { return {
model: model, model: model,

View File

@ -0,0 +1,65 @@
const fetch = require('node-fetch').default;
const { SECRET_KEYS, readSecret } = require('../endpoints/secrets');
/**
 * Gets the vectors for the given text batch from the Cohere embed API.
 * @param {string[]} texts - The array of texts to get vectors for
 * @param {boolean} isQuery - If the texts are queries for embedding search (selects Cohere input_type)
 * @param {import('../users').UserDirectoryList} directories - The directories object for the user
 * @param {string} model - The Cohere embedding model to use
 * @returns {Promise<number[][]>} - The array of vectors for the texts
 * @throws {Error} If the API key is missing, the request fails, or the response is malformed
 */
async function getCohereBatchVector(texts, isQuery, directories, model) {
    const key = readSecret(directories, SECRET_KEYS.COHERE);

    if (!key) {
        console.error('No API key found');
        throw new Error('No API key found');
    }

    const response = await fetch('https://api.cohere.ai/v1/embed', {
        method: 'POST',
        headers: {
            'Content-Type': 'application/json',
            Authorization: `Bearer ${key}`,
        },
        body: JSON.stringify({
            texts: texts,
            model: model,
            // Queries and documents are embedded differently for retrieval.
            input_type: isQuery ? 'search_query' : 'search_document',
            // Truncate over-length inputs at the end instead of erroring.
            truncate: 'END',
        }),
    });

    if (!response.ok) {
        const text = await response.text();
        console.error('API request failed', response.status, response.statusText, text);
        throw new Error('API request failed');
    }

    const data = await response.json();

    if (!Array.isArray(data?.embeddings)) {
        console.error('API response was not an array');
        throw new Error('API response was not an array');
    }

    return data.embeddings;
}
/**
 * Gets the vector for the given text from the Cohere embed API.
 * Convenience wrapper around the batch endpoint for a single text.
 * @param {string} text - The text to get the vector for
 * @param {boolean} isQuery - If the text is a query for embedding search
 * @param {import('../users').UserDirectoryList} directories - The directories object for the user
 * @param {string} model - The Cohere embedding model to use
 * @returns {Promise<number[]>} - The vector for the text
 */
async function getCohereVector(text, isQuery, directories, model) {
    const [vector] = await getCohereBatchVector([text], isQuery, directories, model);
    return vector;
}
// Public API of this module.
module.exports = {
getCohereBatchVector,
getCohereVector,
};

View File

@ -6,7 +6,7 @@ const TASK = 'feature-extraction';
* @returns {Promise<number[]>} - The vectorized text in form of an array of numbers * @returns {Promise<number[]>} - The vectorized text in form of an array of numbers
*/ */
async function getTransformersVector(text) { async function getTransformersVector(text) {
const module = await import('./transformers.mjs'); const module = await import('../transformers.mjs');
const pipe = await module.default.getPipeline(TASK); const pipe = await module.default.getPipeline(TASK);
const result = await pipe(text, { pooling: 'mean', normalize: true }); const result = await pipe(text, { pooling: 'mean', normalize: true });
const vector = Array.from(result.data); const vector = Array.from(result.data);

View File

@ -1,10 +1,10 @@
const fetch = require('node-fetch').default; const fetch = require('node-fetch').default;
const { SECRET_KEYS, readSecret } = require('./endpoints/secrets'); const { SECRET_KEYS, readSecret } = require('../endpoints/secrets');
/** /**
* Gets the vector for the given text from gecko model * Gets the vector for the given text from gecko model
* @param {string[]} texts - The array of texts to get the vector for * @param {string[]} texts - The array of texts to get the vector for
* @param {import('./users').UserDirectoryList} directories - The directories object for the user * @param {import('../users').UserDirectoryList} directories - The directories object for the user
* @returns {Promise<number[][]>} - The array of vectors for the texts * @returns {Promise<number[][]>} - The array of vectors for the texts
*/ */
async function getMakerSuiteBatchVector(texts, directories) { async function getMakerSuiteBatchVector(texts, directories) {
@ -16,7 +16,7 @@ async function getMakerSuiteBatchVector(texts, directories) {
/** /**
* Gets the vector for the given text from PaLM gecko model * Gets the vector for the given text from PaLM gecko model
* @param {string} text - The text to get the vector for * @param {string} text - The text to get the vector for
* @param {import('./users').UserDirectoryList} directories - The directories object for the user * @param {import('../users').UserDirectoryList} directories - The directories object for the user
* @returns {Promise<number[]>} - The vector for the text * @returns {Promise<number[]>} - The vector for the text
*/ */
async function getMakerSuiteVector(text, directories) { async function getMakerSuiteVector(text, directories) {

View File

@ -1,5 +1,5 @@
const fetch = require('node-fetch').default; const fetch = require('node-fetch').default;
const { SECRET_KEYS, readSecret } = require('./endpoints/secrets'); const { SECRET_KEYS, readSecret } = require('../endpoints/secrets');
const SOURCES = { const SOURCES = {
'nomicai': { 'nomicai': {
@ -13,7 +13,7 @@ const SOURCES = {
* Gets the vector for the given text batch from an OpenAI compatible endpoint. * Gets the vector for the given text batch from an OpenAI compatible endpoint.
* @param {string[]} texts - The array of texts to get the vector for * @param {string[]} texts - The array of texts to get the vector for
* @param {string} source - The source of the vector * @param {string} source - The source of the vector
* @param {import('./users').UserDirectoryList} directories - The directories object for the user * @param {import('../users').UserDirectoryList} directories - The directories object for the user
* @returns {Promise<number[][]>} - The array of vectors for the texts * @returns {Promise<number[][]>} - The array of vectors for the texts
*/ */
async function getNomicAIBatchVector(texts, source, directories) { async function getNomicAIBatchVector(texts, source, directories) {
@ -64,7 +64,7 @@ async function getNomicAIBatchVector(texts, source, directories) {
* Gets the vector for the given text from an OpenAI compatible endpoint. * Gets the vector for the given text from an OpenAI compatible endpoint.
* @param {string} text - The text to get the vector for * @param {string} text - The text to get the vector for
* @param {string} source - The source of the vector * @param {string} source - The source of the vector
* @param {import('./users').UserDirectoryList} directories - The directories object for the user * @param {import('../users').UserDirectoryList} directories - The directories object for the user
* @returns {Promise<number[]>} - The vector for the text * @returns {Promise<number[]>} - The vector for the text
*/ */
async function getNomicAIVector(text, source, directories) { async function getNomicAIVector(text, source, directories) {

View File

@ -1,5 +1,5 @@
const fetch = require('node-fetch').default; const fetch = require('node-fetch').default;
const { SECRET_KEYS, readSecret } = require('./endpoints/secrets'); const { SECRET_KEYS, readSecret } = require('../endpoints/secrets');
const SOURCES = { const SOURCES = {
'togetherai': { 'togetherai': {
@ -23,7 +23,7 @@ const SOURCES = {
* Gets the vector for the given text batch from an OpenAI compatible endpoint. * Gets the vector for the given text batch from an OpenAI compatible endpoint.
* @param {string[]} texts - The array of texts to get the vector for * @param {string[]} texts - The array of texts to get the vector for
* @param {string} source - The source of the vector * @param {string} source - The source of the vector
* @param {import('./users').UserDirectoryList} directories - The directories object for the user * @param {import('../users').UserDirectoryList} directories - The directories object for the user
* @param {string} model - The model to use for the embedding * @param {string} model - The model to use for the embedding
* @returns {Promise<number[][]>} - The array of vectors for the texts * @returns {Promise<number[][]>} - The array of vectors for the texts
*/ */
@ -79,7 +79,7 @@ async function getOpenAIBatchVector(texts, source, directories, model = '') {
* Gets the vector for the given text from an OpenAI compatible endpoint. * Gets the vector for the given text from an OpenAI compatible endpoint.
* @param {string} text - The text to get the vector for * @param {string} text - The text to get the vector for
* @param {string} source - The source of the vector * @param {string} source - The source of the vector
* @param {import('./users').UserDirectoryList} directories - The directories object for the user * @param {import('../users').UserDirectoryList} directories - The directories object for the user
* @param {string} model - The model to use for the embedding * @param {string} model - The model to use for the embedding
* @returns {Promise<number[]>} - The vector for the text * @returns {Promise<number[]>} - The vector for the text
*/ */