diff --git a/public/scripts/extensions/caption/index.js b/public/scripts/extensions/caption/index.js index 97cc4cf37..ec126fcda 100644 --- a/public/scripts/extensions/caption/index.js +++ b/public/scripts/extensions/caption/index.js @@ -4,6 +4,7 @@ import { callPopup, getRequestHeaders, saveSettingsDebounced, substituteParams } import { getMessageTimeStamp } from '../../RossAscends-mods.js'; import { SECRET_KEYS, secret_state } from '../../secrets.js'; import { getMultimodalCaption } from '../shared.js'; +import { textgen_types, textgenerationwebui_settings } from '../../textgen-settings.js'; export { MODULE_NAME }; const MODULE_NAME = 'caption'; @@ -134,7 +135,7 @@ async function doCaptionRequest(base64Img, fileData) { case 'horde': return await captionHorde(base64Img); case 'multimodal': - return await captionMultimodal(extension_settings.caption.multimodal_api === 'google' ? base64Img : fileData); + return await captionMultimodal(fileData); default: throw new Error('Unknown caption source.'); } @@ -271,9 +272,11 @@ jQuery(function () { $(sendButton).on('click', () => { const hasCaptionModule = (modules.includes('caption') && extension_settings.caption.source === 'extras') || - (extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'openai' && secret_state[SECRET_KEYS.OPENAI]) || + (extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'openai' && (secret_state[SECRET_KEYS.OPENAI] || extension_settings.caption.allow_reverse_proxy)) || (extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'openrouter' && secret_state[SECRET_KEYS.OPENROUTER]) || (extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'google' && secret_state[SECRET_KEYS.MAKERSUITE]) || + (extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'ollama' && textgenerationwebui_settings.server_urls[textgen_types.OLLAMA]) || + (extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'llamacpp' && textgenerationwebui_settings.server_urls[textgen_types.LLAMACPP]) || extension_settings.caption.source === 'local' || extension_settings.caption.source === 'horde'; @@ -329,7 +332,7 @@ jQuery(function () { @@ -337,9 +340,11 @@ jQuery(function () {
@@ -349,12 +354,19 @@ jQuery(function () { + + + +
+
+ Hint: Set your API keys and endpoints in the 'API Connections' tab first. +
diff --git a/public/scripts/extensions/shared.js b/public/scripts/extensions/shared.js index 7d4e16720..4947f9b32 100644 --- a/public/scripts/extensions/shared.js +++ b/public/scripts/extensions/shared.js @@ -2,6 +2,7 @@ import { getRequestHeaders } from '../../script.js'; import { extension_settings } from '../extensions.js'; import { oai_settings } from '../openai.js'; import { SECRET_KEYS, secret_state } from '../secrets.js'; +import { textgen_types, textgenerationwebui_settings } from '../textgen-settings.js'; import { createThumbnail, isValidUrl } from '../utils.js'; /** @@ -11,20 +12,18 @@ import { createThumbnail, isValidUrl } from '../utils.js'; * @returns {Promise} Generated caption */ export async function getMultimodalCaption(base64Img, prompt) { - if (extension_settings.caption.multimodal_api === 'openai' && !secret_state[SECRET_KEYS.OPENAI]) { - throw new Error('OpenAI API key is not set.'); - } + throwIfInvalidModel(); - if (extension_settings.caption.multimodal_api === 'openrouter' && !secret_state[SECRET_KEYS.OPENROUTER]) { - throw new Error('OpenRouter API key is not set.'); - } + const noPrefix = ['google', 'ollama', 'llamacpp'].includes(extension_settings.caption.multimodal_api); - if (extension_settings.caption.multimodal_api === 'google' && !secret_state[SECRET_KEYS.MAKERSUITE]) { - throw new Error('MakerSuite API key is not set.'); + if (noPrefix && base64Img.startsWith('data:image/')) { + base64Img = base64Img.split(',')[1]; } // OpenRouter has a payload limit of ~2MB. Google is 4MB, but we love democracy. const isGoogle = extension_settings.caption.multimodal_api === 'google'; + const isOllama = extension_settings.caption.multimodal_api === 'ollama'; + const isLlamaCpp = extension_settings.caption.multimodal_api === 'llamacpp'; const base64Bytes = base64Img.length * 0.75; const compressionLimit = 2 * 1024 * 1024; if (['google', 'openrouter'].includes(extension_settings.caption.multimodal_api) && base64Bytes > compressionLimit) { @@ -45,27 +44,79 @@ export async function getMultimodalCaption(base64Img, prompt) { const proxyUrl = useReverseProxy ? oai_settings.reverse_proxy : ''; const proxyPassword = useReverseProxy ? oai_settings.proxy_password : ''; - const apiResult = await fetch(`/api/${isGoogle ? 'google' : 'openai'}/caption-image`, { + const requestBody = { + image: base64Img, + prompt: prompt, + }; + + if (!isGoogle) { + requestBody.api = extension_settings.caption.multimodal_api || 'openai'; + requestBody.model = extension_settings.caption.multimodal_model || 'gpt-4-vision-preview'; + requestBody.reverse_proxy = proxyUrl; + requestBody.proxy_password = proxyPassword; + } + + if (isOllama) { + if (extension_settings.caption.multimodal_model === 'ollama_current') { + requestBody.model = textgenerationwebui_settings.ollama_model; + } + + requestBody.server_url = textgenerationwebui_settings.server_urls[textgen_types.OLLAMA]; + } + + if (isLlamaCpp) { + requestBody.server_url = textgenerationwebui_settings.server_urls[textgen_types.LLAMACPP]; + } + + function getEndpointUrl() { + switch (extension_settings.caption.multimodal_api) { + case 'google': + return '/api/google/caption-image'; + case 'llamacpp': + return '/api/backends/text-completions/llamacpp/caption-image'; + case 'ollama': + return '/api/backends/text-completions/ollama/caption-image'; + default: + return '/api/openai/caption-image'; + } + } + + const apiResult = await fetch(getEndpointUrl(), { method: 'POST', headers: getRequestHeaders(), - body: JSON.stringify({ - image: base64Img, - prompt: prompt, - ...(isGoogle - ? {} - : { - api: extension_settings.caption.multimodal_api || 'openai', - model: extension_settings.caption.multimodal_model || 'gpt-4-vision-preview', - reverse_proxy: proxyUrl, - proxy_password: proxyPassword, - }), - }), + body: JSON.stringify(requestBody), }); if (!apiResult.ok) { - throw new Error('Failed to caption image via OpenAI.'); + throw new Error('Failed to caption image via Multimodal API.'); } const { caption } = await apiResult.json(); - return caption; + return String(caption).trim(); +} + +function throwIfInvalidModel() { + if (extension_settings.caption.multimodal_api === 'openai' && !secret_state[SECRET_KEYS.OPENAI]) { + throw new Error('OpenAI API key is not set.'); + } + + if (extension_settings.caption.multimodal_api === 'openrouter' && !secret_state[SECRET_KEYS.OPENROUTER]) { + throw new Error('OpenRouter API key is not set.'); + } + + if (extension_settings.caption.multimodal_api === 'google' && !secret_state[SECRET_KEYS.MAKERSUITE]) { + throw new Error('MakerSuite API key is not set.'); + } + + if (extension_settings.caption.multimodal_api === 'ollama' && !textgenerationwebui_settings.server_urls[textgen_types.OLLAMA]) { + throw new Error('Ollama server URL is not set.'); + } + + if (extension_settings.caption.multimodal_api === 'ollama' && extension_settings.caption.multimodal_model === 'ollama_current' && !textgenerationwebui_settings.ollama_model) { + throw new Error('Ollama model is not set.'); + } + + if (extension_settings.caption.multimodal_api === 'llamacpp' && !textgenerationwebui_settings.server_urls[textgen_types.LLAMACPP]) { + throw new Error('LlamaCPP server URL is not set.'); + } } diff --git a/src/endpoints/backends/text-completions.js b/src/endpoints/backends/text-completions.js index 0924e43ad..614b41557 100644 --- a/src/endpoints/backends/text-completions.js +++ b/src/endpoints/backends/text-completions.js @@ -310,11 +310,12 @@ ollama.post('/download', jsonParser, async function (request, response) { const fetchResponse = await fetch(`${url}/api/pull`, { method: 'POST', + headers: { 'Content-Type': 'application/json' }, body: JSON.stringify({ name: name, stream: false, }), - headers: { 'Content-Type': 'application/json' }, + timeout: 0, }); if (!fetchResponse.ok) { @@ -329,6 +330,99 @@ ollama.post('/download', jsonParser, async function (request, response) { } }); +ollama.post('/caption-image', jsonParser, async function (request, response) { + try { + if (!request.body.server_url || !request.body.model) { + return response.sendStatus(400); + } + + console.log('Ollama caption request:', request.body); + // Convert to string + remove trailing slash + /v1 suffix + const baseUrl = String(request.body.server_url).replace(/\/$/, '').replace(/\/v1$/, ''); + + const fetchResponse = await fetch(`${baseUrl}/api/generate`, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + model: request.body.model, + prompt: request.body.prompt, + images: [request.body.image], + stream: false, + }), + timeout: 0, + }); + + if (!fetchResponse.ok) { + console.log('Ollama caption error:', fetchResponse.status, fetchResponse.statusText); + return response.status(500).send({ error: true }); + } + + const data = await fetchResponse.json(); + console.log('Ollama caption response:', data); + + const caption = data?.response || ''; + + if (!caption) { + console.log('Ollama caption is empty.'); + return response.status(500).send({ error: true }); + } + + return response.send({ caption }); + } catch (error) { + console.error(error); + return response.status(500); + } +}); + +const llamacpp = express.Router(); + +llamacpp.post('/caption-image', jsonParser, async function (request, response) { + try { + if (!request.body.server_url) { + return response.sendStatus(400); + } + + console.log('LlamaCpp caption request:', request.body); + // Convert to string + remove trailing slash + /v1 suffix + const baseUrl = String(request.body.server_url).replace(/\/$/, '').replace(/\/v1$/, ''); + + const fetchResponse = await fetch(`${baseUrl}/completion`, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + timeout: 0, + body: JSON.stringify({ + prompt: `USER:[img-1]${String(request.body.prompt).trim()}\nASSISTANT:`, + image_data: [{ data: request.body.image, id: 1 }], + temperature: 0.1, + stream: false, + stop: ['USER:', ''], + }), + }); + + if (!fetchResponse.ok) { + console.log('LlamaCpp caption error:', fetchResponse.status, fetchResponse.statusText); + return response.status(500).send({ error: true }); + } + + const data = await fetchResponse.json(); + console.log('LlamaCpp caption response:', data); + + const caption = data?.content || ''; + + if (!caption) { + console.log('LlamaCpp caption is empty.'); + return response.status(500).send({ error: true }); + } + + return response.send({ caption }); + + } catch (error) { + console.error(error); + return response.status(500); + } +}); + router.use('/ollama', ollama); +router.use('/llamacpp', llamacpp); module.exports = { router };