Add Cohere as embedding source

2024-04-19 00:07:12 +03:00 · 2024-04-19 00:07:12 +03:00 · 25cb598694
parent b69493d252
commit 25cb598694
9 changed files with 147 additions and 32 deletions
--- a/public/scripts/extensions/vectors/index.js
+++ b/public/scripts/extensions/vectors/index.js
@ -35,6 +35,7 @@ const settings = {
    include_wi: false,
    togetherai_model: 'togethercomputer/m2-bert-80M-32k-retrieval',
    openai_model: 'text-embedding-ada-002',
+    cohere_model: 'embed-english-v3.0',
    summarize: false,
    summarize_sent: false,
    summary_source: 'main',
@ -598,6 +599,9 @@ function getVectorHeaders() {
        case 'openai':
            addOpenAiHeaders(headers);
            break;
+        case 'cohere':
+            addCohereHeaders(headers);
+            break;
        default:
            break;
    }
@ -636,6 +640,16 @@ function addOpenAiHeaders(headers) {
    });
 }

+/**
+ * Adds the 'X-Cohere-Model' header for the Cohere API source, using extension_settings.vectors.cohere_model.
+ * @param {object} headers Header object; mutated in place via Object.assign
+ */
+function addCohereHeaders(headers) {
+    Object.assign(headers, {
+        'X-Cohere-Model': extension_settings.vectors.cohere_model,
+    });
+}
+
 /**
 * Inserts vector items into a collection
 * @param {string} collectionId - The collection to insert into
@ -647,7 +661,8 @@ async function insertVectorItems(collectionId, items) {
        settings.source === 'palm' && !secret_state[SECRET_KEYS.MAKERSUITE] ||
        settings.source === 'mistral' && !secret_state[SECRET_KEYS.MISTRALAI] ||
        settings.source === 'togetherai' && !secret_state[SECRET_KEYS.TOGETHERAI] ||
-        settings.source === 'nomicai' && !secret_state[SECRET_KEYS.NOMICAI]) {
+        settings.source === 'nomicai' && !secret_state[SECRET_KEYS.NOMICAI] ||
+        settings.source === 'cohere' && !secret_state[SECRET_KEYS.COHERE]) {
        throw new Error('Vectors: API key missing', { cause: 'api_key_missing' });
    }

@ -816,6 +831,7 @@ function toggleSettings() {
    $('#vectors_chats_settings').toggle(!!settings.enabled_chats);
    $('#together_vectorsModel').toggle(settings.source === 'togetherai');
    $('#openai_vectorsModel').toggle(settings.source === 'openai');
+    $('#cohere_vectorsModel').toggle(settings.source === 'cohere');
    $('#nomicai_apiKey').toggle(settings.source === 'nomicai');
 }

@ -913,6 +929,12 @@ jQuery(async () => {
        Object.assign(extension_settings.vectors, settings);
        saveSettingsDebounced();
    });
+    $('#vectors_cohere_model').val(settings.cohere_model).on('change', () => {
+        $('#vectors_modelWarning').show();
+        settings.cohere_model = String($('#vectors_cohere_model').val());
+        Object.assign(extension_settings.vectors, settings);
+        saveSettingsDebounced();
+    });
    $('#vectors_template').val(settings.template).on('input', () => {
        settings.template = String($('#vectors_template').val());
        Object.assign(extension_settings.vectors, settings);
--- a/public/scripts/extensions/vectors/settings.html
+++ b/public/scripts/extensions/vectors/settings.html
@ -10,13 +10,14 @@
                    Vectorization Source
                </label>
                <select id="vectors_source" class="text_pole">
-                    <option value="transformers">Local (Transformers)</option>
+                    <option value="cohere">Cohere</option>
                    <option value="extras">Extras</option>
-                    <option value="openai">OpenAI</option>
                    <option value="palm">Google MakerSuite (PaLM)</option>
+                    <option value="transformers">Local (Transformers)</option>
                    <option value="mistral">MistralAI</option>
-                    <option value="togetherai">TogetherAI</option>
                    <option value="nomicai">NomicAI</option>
+                    <option value="openai">OpenAI</option>
+                    <option value="togetherai">TogetherAI</option>
                </select>
            </div>
            <div class="flex-container flexFlowColumn" id="openai_vectorsModel">
@ -29,6 +30,20 @@
                    <option value="text-embedding-3-large">text-embedding-3-large</option>
                </select>
            </div>
+            <div class="flex-container flexFlowColumn" id="cohere_vectorsModel">
+                <label for="vectors_cohere_model">
+                    Vectorization Model
+                </label>
+                <select id="vectors_cohere_model" class="text_pole">
+                    <option value="embed-english-v3.0">embed-english-v3.0</option>
+                    <option value="embed-multilingual-v3.0">embed-multilingual-v3.0</option>
+                    <option value="embed-english-light-v3.0">embed-english-light-v3.0</option>
+                    <option value="embed-multilingual-light-v3.0">embed-multilingual-light-v3.0</option>
+                    <option value="embed-english-v2.0">embed-english-v2.0</option>
+                    <option value="embed-english-light-v2.0">embed-english-light-v2.0</option>
+                    <option value="embed-multilingual-v2.0">embed-multilingual-v2.0</option>
+                </select>
+            </div>
            <div class="flex-container flexFlowColumn" id="together_vectorsModel">
                <label for="vectors_togetherai_model">
                    Vectorization Model
--- a/src/endpoints/vectors.js
+++ b/src/endpoints/vectors.js
@ -12,23 +12,26 @@ const SOURCES = ['transformers', 'mistral', 'openai', 'extras', 'palm', 'togethe
 * @param {string} source - The source of the vector
 * @param {Object} sourceSettings - Settings for the source, if it needs any
 * @param {string} text - The text to get the vector for
+ * @param {boolean} isQuery - If the text is a query for embedding search
 * @param {import('../users').UserDirectoryList} directories - The directories object for the user
 * @returns {Promise<number[]>} - The vector for the text
 */
-async function getVector(source, sourceSettings, text, directories) {
+async function getVector(source, sourceSettings, text, isQuery, directories) {
    switch (source) {
        case 'nomicai':
-            return require('../nomicai-vectors').getNomicAIVector(text, source, directories);
+            return require('../vectors/nomicai-vectors').getNomicAIVector(text, source, directories);
        case 'togetherai':
        case 'mistral':
        case 'openai':
-            return require('../openai-vectors').getOpenAIVector(text, source, directories, sourceSettings.model);
+            return require('../vectors/openai-vectors').getOpenAIVector(text, source, directories, sourceSettings.model);
        case 'transformers':
-            return require('../embedding').getTransformersVector(text);
+            return require('../vectors/embedding').getTransformersVector(text);
        case 'extras':
-            return require('../extras-vectors').getExtrasVector(text, sourceSettings.extrasUrl, sourceSettings.extrasKey);
+            return require('../vectors/extras-vectors').getExtrasVector(text, sourceSettings.extrasUrl, sourceSettings.extrasKey);
        case 'palm':
-            return require('../makersuite-vectors').getMakerSuiteVector(text, directories);
+            return require('../vectors/makersuite-vectors').getMakerSuiteVector(text, directories);
+        case 'cohere':
+            return require('../vectors/cohere-vectors').getCohereVector(text, isQuery, directories, sourceSettings.model);
    }

    throw new Error(`Unknown vector source ${source}`);
@ -39,10 +42,11 @@ async function getVector(source, sourceSettings, text, directories) {
 * @param {string} source - The source of the vector
 * @param {Object} sourceSettings - Settings for the source, if it needs any
 * @param {string[]} texts - The array of texts to get the vector for
+ * @param {boolean} isQuery - If the text is a query for embedding search
 * @param {import('../users').UserDirectoryList} directories - The directories object for the user
 * @returns {Promise<number[][]>} - The array of vectors for the texts
 */
-async function getBatchVector(source, sourceSettings, texts, directories) {
+async function getBatchVector(source, sourceSettings, texts, isQuery, directories) {
    const batchSize = 10;
    const batches = Array(Math.ceil(texts.length / batchSize)).fill(undefined).map((_, i) => texts.slice(i * batchSize, i * batchSize + batchSize));

@ -50,21 +54,24 @@ async function getBatchVector(source, sourceSettings, texts, directories) {
    for (let batch of batches) {
        switch (source) {
            case 'nomicai':
-                results.push(...await require('../nomicai-vectors').getNomicAIBatchVector(batch, source, directories));
+                results.push(...await require('../vectors/nomicai-vectors').getNomicAIBatchVector(batch, source, directories));
                break;
            case 'togetherai':
            case 'mistral':
            case 'openai':
-                results.push(...await require('../openai-vectors').getOpenAIBatchVector(batch, source, directories, sourceSettings.model));
+                results.push(...await require('../vectors/openai-vectors').getOpenAIBatchVector(batch, source, directories, sourceSettings.model));
                break;
            case 'transformers':
-                results.push(...await require('../embedding').getTransformersBatchVector(batch));
+                results.push(...await require('../vectors/embedding').getTransformersBatchVector(batch));
                break;
            case 'extras':
-                results.push(...await require('../extras-vectors').getExtrasBatchVector(batch, sourceSettings.extrasUrl, sourceSettings.extrasKey));
+                results.push(...await require('../vectors/extras-vectors').getExtrasBatchVector(batch, sourceSettings.extrasUrl, sourceSettings.extrasKey));
                break;
            case 'palm':
-                results.push(...await require('../makersuite-vectors').getMakerSuiteBatchVector(batch, directories));
+                results.push(...await require('../vectors/makersuite-vectors').getMakerSuiteBatchVector(batch, directories));
+                break;
+            case 'cohere':
+                results.push(...await require('../vectors/cohere-vectors').getCohereBatchVector(batch, isQuery, directories, sourceSettings.model));
                break;
            default:
                throw new Error(`Unknown vector source ${source}`);
@ -106,7 +113,7 @@ async function insertVectorItems(directories, collectionId, source, sourceSettin

    await store.beginUpdate();

-    const vectors = await getBatchVector(source, sourceSettings, items.map(x => x.text), directories);
+    const vectors = await getBatchVector(source, sourceSettings, items.map(x => x.text), false, directories);

    for (let i = 0; i < items.length; i++) {
        const item = items[i];
@ -165,7 +172,7 @@ async function deleteVectorItems(directories, collectionId, source, hashes) {
 */
 async function queryCollection(directories, collectionId, source, sourceSettings, searchText, topK) {
    const store = await getIndex(directories, collectionId, source);
-    const vector = await getVector(source, sourceSettings, searchText, directories);
+    const vector = await getVector(source, sourceSettings, searchText, true, directories);

    const result = await store.queryItems(vector, topK);
    const metadata = result.map(x => x.item.metadata);
@ -184,7 +191,7 @@ async function queryCollection(directories, collectionId, source, sourceSettings
 * @returns {Promise<Record<string, { hashes: number[], metadata: object[] }>>} - The top K results from each collection
 */
 async function multiQueryCollection(directories, collectionIds, source, sourceSettings, searchText, topK) {
-    const vector = await getVector(source, sourceSettings, searchText, directories);
+    const vector = await getVector(source, sourceSettings, searchText, true, directories);
    const results = [];

    for (const collectionId of collectionIds) {
@ -223,18 +230,24 @@ async function multiQueryCollection(directories, collectionIds, source, sourceSe
 */
 function getSourceSettings(source, request) {
    if (source === 'togetherai') {
-        let model = String(request.headers['x-togetherai-model']);
+        const model = String(request.headers['x-togetherai-model']);

        return {
            model: model,
        };
    } else if (source === 'openai') {
-        let model = String(request.headers['x-openai-model']);
+        const model = String(request.headers['x-openai-model']);

        return {
            model: model,
        };
-    } else {
+    } else if (source === 'cohere') {
+        const model = String(request.headers['x-cohere-model']);
+
+        return {
+            model: model,
+        };
+    }else {
        // Extras API settings to connect to the Extras embeddings provider
        let extrasUrl = '';
        let extrasKey = '';
--- a/src/vectors/cohere-vectors.js
+++ b/src/vectors/cohere-vectors.js
@ -0,0 +1,65 @@
+const fetch = require('node-fetch').default;
+const { SECRET_KEYS, readSecret } = require('../endpoints/secrets');
+
+/**
+ * Gets the vectors for the given text batch from the Cohere embed endpoint (POST https://api.cohere.ai/v1/embed).
+ * @param {string[]} texts - The array of texts to get the vector for
+ * @param {boolean} isQuery - If true, input_type is sent as 'search_query'; otherwise as 'search_document'
+ * @param {import('../users').UserDirectoryList} directories - The directories object for the user (used to read the Cohere API key)
+ * @param {string} model - The model to use for the embedding
+ * @returns {Promise<number[][]>} - The `embeddings` array from the response; throws if the key is missing, the request fails, or `embeddings` is not an array
+ */
+async function getCohereBatchVector(texts, isQuery, directories, model) {
+    const key = readSecret(directories, SECRET_KEYS.COHERE);
+
+    if (!key) {
+        console.log('No API key found');
+        throw new Error('No API key found');
+    }
+
+    const response = await fetch('https://api.cohere.ai/v1/embed', {
+        method: 'POST',
+        headers: {
+            'Content-Type': 'application/json',
+            Authorization: `Bearer ${key}`,
+        },
+        body: JSON.stringify({
+            texts: texts,
+            model: model,
+            input_type: isQuery ? 'search_query' : 'search_document',
+            truncate: 'END',
+        }),
+    });
+
+    if (!response.ok) {
+        const text = await response.text();
+        console.log('API request failed', response.statusText, text);
+        throw new Error('API request failed');
+    }
+
+    const data = await response.json();
+    if (!Array.isArray(data?.embeddings)) {
+        console.log('API response was not an array');
+        throw new Error('API response was not an array');
+    }
+
+    return data.embeddings;
+}
+
+/**
+ * Gets the vector for the given text from the Cohere embed endpoint by sending it to getCohereBatchVector as a one-element batch.
+ * @param {string} text - The text to get the vector for
+ * @param {boolean} isQuery - If the text is a query for embedding search
+ * @param {import('../users').UserDirectoryList} directories - The directories object for the user
+ * @param {string} model - The model to use for the embedding
+ * @returns {Promise<number[]>} - The first vector of the batch result
+ */
+async function getCohereVector(text, isQuery, directories, model) {
+    const vectors = await getCohereBatchVector([text], isQuery, directories, model);
+    return vectors[0];
+}
+
+module.exports = {
+    getCohereBatchVector,
+    getCohereVector,
+};
--- a/src/vectors/embedding.js
+++ b/src/vectors/embedding.js
@ -6,7 +6,7 @@ const TASK = 'feature-extraction';
 * @returns {Promise<number[]>} - The vectorized text in form of an array of numbers
 */
 async function getTransformersVector(text) {
-    const module = await import('./transformers.mjs');
+    const module = await import('../transformers.mjs');
    const pipe = await module.default.getPipeline(TASK);
    const result = await pipe(text, { pooling: 'mean', normalize: true });
    const vector = Array.from(result.data);
--- a/src/vectors/extras-vectors.js
+++ b/src/vectors/extras-vectors.js
--- a/src/vectors/makersuite-vectors.js
+++ b/src/vectors/makersuite-vectors.js
@ -1,10 +1,10 @@
 const fetch = require('node-fetch').default;
-const { SECRET_KEYS, readSecret } = require('./endpoints/secrets');
+const { SECRET_KEYS, readSecret } = require('../endpoints/secrets');

 /**
 * Gets the vector for the given text from gecko model
 * @param {string[]} texts - The array of texts to get the vector for
- * @param {import('./users').UserDirectoryList} directories - The directories object for the user
+ * @param {import('../users').UserDirectoryList} directories - The directories object for the user
 * @returns {Promise<number[][]>} - The array of vectors for the texts
 */
 async function getMakerSuiteBatchVector(texts, directories) {
@ -16,7 +16,7 @@ async function getMakerSuiteBatchVector(texts, directories) {
 /**
 * Gets the vector for the given text from PaLM gecko model
 * @param {string} text - The text to get the vector for
- * @param {import('./users').UserDirectoryList} directories - The directories object for the user
+ * @param {import('../users').UserDirectoryList} directories - The directories object for the user
 * @returns {Promise<number[]>} - The vector for the text
 */
 async function getMakerSuiteVector(text, directories) {
--- a/src/vectors/nomicai-vectors.js
+++ b/src/vectors/nomicai-vectors.js
@ -1,5 +1,5 @@
 const fetch = require('node-fetch').default;
-const { SECRET_KEYS, readSecret } = require('./endpoints/secrets');
+const { SECRET_KEYS, readSecret } = require('../endpoints/secrets');

 const SOURCES = {
    'nomicai': {
@ -13,7 +13,7 @@ const SOURCES = {
 * Gets the vector for the given text batch from an OpenAI compatible endpoint.
 * @param {string[]} texts - The array of texts to get the vector for
 * @param {string} source - The source of the vector
- * @param {import('./users').UserDirectoryList} directories - The directories object for the user
+ * @param {import('../users').UserDirectoryList} directories - The directories object for the user
 * @returns {Promise<number[][]>} - The array of vectors for the texts
 */
 async function getNomicAIBatchVector(texts, source, directories) {
@ -64,7 +64,7 @@ async function getNomicAIBatchVector(texts, source, directories) {
 * Gets the vector for the given text from an OpenAI compatible endpoint.
 * @param {string} text - The text to get the vector for
 * @param {string} source - The source of the vector
- * @param {import('./users').UserDirectoryList} directories - The directories object for the user
+ * @param {import('../users').UserDirectoryList} directories - The directories object for the user
 * @returns {Promise<number[]>} - The vector for the text
 */
 async function getNomicAIVector(text, source, directories) {
--- a/src/vectors/openai-vectors.js
+++ b/src/vectors/openai-vectors.js
@ -1,5 +1,5 @@
 const fetch = require('node-fetch').default;
-const { SECRET_KEYS, readSecret } = require('./endpoints/secrets');
+const { SECRET_KEYS, readSecret } = require('../endpoints/secrets');

 const SOURCES = {
    'togetherai': {
@ -23,7 +23,7 @@ const SOURCES = {
 * Gets the vector for the given text batch from an OpenAI compatible endpoint.
 * @param {string[]} texts - The array of texts to get the vector for
 * @param {string} source - The source of the vector
- * @param {import('./users').UserDirectoryList} directories - The directories object for the user
+ * @param {import('../users').UserDirectoryList} directories - The directories object for the user
 * @param {string} model - The model to use for the embedding
 * @returns {Promise<number[][]>} - The array of vectors for the texts
 */
@ -79,7 +79,7 @@ async function getOpenAIBatchVector(texts, source, directories, model = '') {
 * Gets the vector for the given text from an OpenAI compatible endpoint.
 * @param {string} text - The text to get the vector for
 * @param {string} source - The source of the vector
- * @param {import('./users').UserDirectoryList} directories - The directories object for the user
+ * @param {import('../users').UserDirectoryList} directories - The directories object for the user
 * @param {string} model - The model to use for the embedding
 * @returns {Promise<number[]>} - The vector for the text
 */