diff --git a/public/scripts/extensions/caption/index.js b/public/scripts/extensions/caption/index.js
index 97cc4cf37..ec126fcda 100644
--- a/public/scripts/extensions/caption/index.js
+++ b/public/scripts/extensions/caption/index.js
@@ -4,6 +4,7 @@ import { callPopup, getRequestHeaders, saveSettingsDebounced, substituteParams }
import { getMessageTimeStamp } from '../../RossAscends-mods.js';
import { SECRET_KEYS, secret_state } from '../../secrets.js';
import { getMultimodalCaption } from '../shared.js';
+import { textgen_types, textgenerationwebui_settings } from '../../textgen-settings.js';
export { MODULE_NAME };
const MODULE_NAME = 'caption';
@@ -134,7 +135,7 @@ async function doCaptionRequest(base64Img, fileData) {
case 'horde':
return await captionHorde(base64Img);
case 'multimodal':
- return await captionMultimodal(extension_settings.caption.multimodal_api === 'google' ? base64Img : fileData);
+ return await captionMultimodal(fileData);
default:
throw new Error('Unknown caption source.');
}
@@ -271,9 +272,11 @@ jQuery(function () {
$(sendButton).on('click', () => {
const hasCaptionModule =
(modules.includes('caption') && extension_settings.caption.source === 'extras') ||
- (extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'openai' && secret_state[SECRET_KEYS.OPENAI]) ||
+ (extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'openai' && (secret_state[SECRET_KEYS.OPENAI] || extension_settings.caption.allow_reverse_proxy)) ||
(extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'openrouter' && secret_state[SECRET_KEYS.OPENROUTER]) ||
(extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'google' && secret_state[SECRET_KEYS.MAKERSUITE]) ||
+ (extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'ollama' && textgenerationwebui_settings.server_urls[textgen_types.OLLAMA]) ||
+ (extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'llamacpp' && textgenerationwebui_settings.server_urls[textgen_types.LLAMACPP]) ||
extension_settings.caption.source === 'local' ||
extension_settings.caption.source === 'horde';
@@ -329,7 +332,7 @@ jQuery(function () {
@@ -337,9 +340,11 @@ jQuery(function () {
@@ -349,12 +354,19 @@ jQuery(function () {
+
+
+
+
+
+ Hint: Set your API keys and endpoints in the 'API Connections' tab first.
+
diff --git a/public/scripts/extensions/shared.js b/public/scripts/extensions/shared.js
index 7d4e16720..4947f9b32 100644
--- a/public/scripts/extensions/shared.js
+++ b/public/scripts/extensions/shared.js
@@ -2,6 +2,7 @@ import { getRequestHeaders } from '../../script.js';
import { extension_settings } from '../extensions.js';
import { oai_settings } from '../openai.js';
import { SECRET_KEYS, secret_state } from '../secrets.js';
+import { textgen_types, textgenerationwebui_settings } from '../textgen-settings.js';
import { createThumbnail, isValidUrl } from '../utils.js';
/**
@@ -11,20 +12,18 @@ import { createThumbnail, isValidUrl } from '../utils.js';
* @returns {Promise} Generated caption
*/
export async function getMultimodalCaption(base64Img, prompt) {
- if (extension_settings.caption.multimodal_api === 'openai' && !secret_state[SECRET_KEYS.OPENAI]) {
- throw new Error('OpenAI API key is not set.');
- }
+ throwIfInvalidModel();
- if (extension_settings.caption.multimodal_api === 'openrouter' && !secret_state[SECRET_KEYS.OPENROUTER]) {
- throw new Error('OpenRouter API key is not set.');
- }
+ const noPrefix = ['google', 'ollama', 'llamacpp'].includes(extension_settings.caption.multimodal_api);
- if (extension_settings.caption.multimodal_api === 'google' && !secret_state[SECRET_KEYS.MAKERSUITE]) {
- throw new Error('MakerSuite API key is not set.');
+ if (noPrefix && base64Img.startsWith('data:image/')) {
+ base64Img = base64Img.split(',')[1];
}
// OpenRouter has a payload limit of ~2MB. Google is 4MB, but we love democracy.
const isGoogle = extension_settings.caption.multimodal_api === 'google';
+ const isOllama = extension_settings.caption.multimodal_api === 'ollama';
+ const isLlamaCpp = extension_settings.caption.multimodal_api === 'llamacpp';
const base64Bytes = base64Img.length * 0.75;
const compressionLimit = 2 * 1024 * 1024;
if (['google', 'openrouter'].includes(extension_settings.caption.multimodal_api) && base64Bytes > compressionLimit) {
@@ -45,27 +44,79 @@ export async function getMultimodalCaption(base64Img, prompt) {
const proxyUrl = useReverseProxy ? oai_settings.reverse_proxy : '';
const proxyPassword = useReverseProxy ? oai_settings.proxy_password : '';
- const apiResult = await fetch(`/api/${isGoogle ? 'google' : 'openai'}/caption-image`, {
+ const requestBody = {
+ image: base64Img,
+ prompt: prompt,
+ };
+
+ if (!isGoogle) {
+ requestBody.api = extension_settings.caption.multimodal_api || 'openai';
+ requestBody.model = extension_settings.caption.multimodal_model || 'gpt-4-vision-preview';
+ requestBody.reverse_proxy = proxyUrl;
+ requestBody.proxy_password = proxyPassword;
+ }
+
+ if (isOllama) {
+ if (extension_settings.caption.multimodal_model === 'ollama_current') {
+ requestBody.model = textgenerationwebui_settings.ollama_model;
+ }
+
+ requestBody.server_url = textgenerationwebui_settings.server_urls[textgen_types.OLLAMA];
+ }
+
+ if (isLlamaCpp) {
+ requestBody.server_url = textgenerationwebui_settings.server_urls[textgen_types.LLAMACPP];
+ }
+
+    /**
+     * Picks the server route that handles captioning for the currently selected multimodal API.
+     * @returns {string} Route path; any API other than google, llamacpp or ollama uses the OpenAI route.
+     */
+    function getEndpointUrl() {
+        const routesByApi = new Map([
+            ['google', '/api/google/caption-image'],
+            ['llamacpp', '/api/backends/text-completions/llamacpp/caption-image'],
+            ['ollama', '/api/backends/text-completions/ollama/caption-image'],
+        ]);
+        const selectedApi = extension_settings.caption.multimodal_api;
+
+        // Unlisted (or unset) APIs fall through to the OpenAI-compatible route.
+        return routesByApi.get(selectedApi) || '/api/openai/caption-image';
+    }
+
+ const apiResult = await fetch(getEndpointUrl(), {
method: 'POST',
headers: getRequestHeaders(),
- body: JSON.stringify({
- image: base64Img,
- prompt: prompt,
- ...(isGoogle
- ? {}
- : {
- api: extension_settings.caption.multimodal_api || 'openai',
- model: extension_settings.caption.multimodal_model || 'gpt-4-vision-preview',
- reverse_proxy: proxyUrl,
- proxy_password: proxyPassword,
- }),
- }),
+ body: JSON.stringify(requestBody),
});
if (!apiResult.ok) {
- throw new Error('Failed to caption image via OpenAI.');
+ throw new Error('Failed to caption image via Multimodal API.');
}
const { caption } = await apiResult.json();
- return caption;
+ return String(caption).trim();
+}
+
+/**
+ * Verifies that the selected multimodal captioning API has the configuration it needs
+ * (an API key for hosted services, a server URL and/or model for local backends).
+ * Does nothing when the configuration is sufficient.
+ * @throws {Error} With a message naming the missing key, server URL or model.
+ */
+function throwIfInvalidModel() {
+    const {
+        multimodal_api: api,
+        multimodal_model: model,
+        allow_reverse_proxy: allowReverseProxy,
+    } = extension_settings.caption;
+
+    // The caption button accepts OpenAI without a key when a reverse proxy is allowed,
+    // so a missing key is only an error when no reverse proxy is going to be used.
+    const usesReverseProxy = Boolean(allowReverseProxy && oai_settings.reverse_proxy);
+
+    if (api === 'openai' && !secret_state[SECRET_KEYS.OPENAI] && !usesReverseProxy) {
+        throw new Error('OpenAI API key is not set.');
+    }
+
+    if (api === 'openrouter' && !secret_state[SECRET_KEYS.OPENROUTER]) {
+        throw new Error('OpenRouter API key is not set.');
+    }
+
+    if (api === 'google' && !secret_state[SECRET_KEYS.MAKERSUITE]) {
+        throw new Error('MakerSuite API key is not set.');
+    }
+
+    if (api === 'ollama' && !textgenerationwebui_settings.server_urls[textgen_types.OLLAMA]) {
+        throw new Error('Ollama server URL is not set.');
+    }
+
+    // 'ollama_current' defers to the model chosen in the text generation settings, so that one must exist.
+    if (api === 'ollama' && model === 'ollama_current' && !textgenerationwebui_settings.ollama_model) {
+        throw new Error('Ollama model is not set.');
+    }
+
+    if (api === 'llamacpp' && !textgenerationwebui_settings.server_urls[textgen_types.LLAMACPP]) {
+        throw new Error('LlamaCPP server URL is not set.');
+    }
}
diff --git a/src/endpoints/backends/text-completions.js b/src/endpoints/backends/text-completions.js
index 0924e43ad..614b41557 100644
--- a/src/endpoints/backends/text-completions.js
+++ b/src/endpoints/backends/text-completions.js
@@ -310,11 +310,12 @@ ollama.post('/download', jsonParser, async function (request, response) {
const fetchResponse = await fetch(`${url}/api/pull`, {
method: 'POST',
+ headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
name: name,
stream: false,
}),
- headers: { 'Content-Type': 'application/json' },
+ timeout: 0,
});
if (!fetchResponse.ok) {
@@ -329,6 +330,99 @@ ollama.post('/download', jsonParser, async function (request, response) {
}
});
+/**
+ * Captions an image by forwarding it to an Ollama server's `/api/generate` endpoint.
+ * Reads `server_url`, `model`, `prompt` and `image` from the JSON request body.
+ * Responds with `{ caption }` on success, 400 when `server_url` or `model` is missing,
+ * and 500 with `{ error: true }` on upstream failure, empty caption or unexpected error.
+ */
+ollama.post('/caption-image', jsonParser, async function (request, response) {
+    try {
+        if (!request.body.server_url || !request.body.model) {
+            return response.sendStatus(400);
+        }
+
+        // Keep the encoded image out of the log; it can be very large and drowns everything else.
+        console.log('Ollama caption request:', { ...request.body, image: '[omitted]' });
+        // Convert to string + remove trailing slash + /v1 suffix
+        const baseUrl = String(request.body.server_url).replace(/\/$/, '').replace(/\/v1$/, '');
+
+        const fetchResponse = await fetch(`${baseUrl}/api/generate`, {
+            method: 'POST',
+            headers: { 'Content-Type': 'application/json' },
+            body: JSON.stringify({
+                model: request.body.model,
+                prompt: request.body.prompt,
+                images: [request.body.image],
+                stream: false,
+            }),
+            timeout: 0,
+        });
+
+        if (!fetchResponse.ok) {
+            console.log('Ollama caption error:', fetchResponse.status, fetchResponse.statusText);
+            return response.status(500).send({ error: true });
+        }
+
+        const data = await fetchResponse.json();
+        console.log('Ollama caption response:', data);
+
+        const caption = data?.response || '';
+
+        if (!caption) {
+            console.log('Ollama caption is empty.');
+            return response.status(500).send({ error: true });
+        }
+
+        return response.send({ caption });
+    } catch (error) {
+        console.error(error);
+        // status() only sets the code; a body must be sent or the client request never completes.
+        return response.status(500).send({ error: true });
+    }
+});
+
+// Router for llama.cpp-specific endpoints.
+const llamacpp = express.Router();
+
+/**
+ * Captions an image by forwarding it to a llama.cpp server's `/completion` endpoint.
+ * Reads `server_url`, `prompt` and `image` from the JSON request body.
+ * Responds with `{ caption }` on success, 400 when `server_url` is missing,
+ * and 500 with `{ error: true }` on upstream failure, empty caption or unexpected error.
+ */
+llamacpp.post('/caption-image', jsonParser, async function (request, response) {
+    try {
+        if (!request.body.server_url) {
+            return response.sendStatus(400);
+        }
+
+        // Keep the encoded image out of the log; it can be very large and drowns everything else.
+        console.log('LlamaCpp caption request:', { ...request.body, image: '[omitted]' });
+        // Convert to string + remove trailing slash + /v1 suffix
+        const baseUrl = String(request.body.server_url).replace(/\/$/, '').replace(/\/v1$/, '');
+
+        const fetchResponse = await fetch(`${baseUrl}/completion`, {
+            method: 'POST',
+            headers: { 'Content-Type': 'application/json' },
+            timeout: 0,
+            body: JSON.stringify({
+                // Presumably `[img-1]` refers to the image_data entry with id 1 below — confirm against the server docs.
+                prompt: `USER:[img-1]${String(request.body.prompt).trim()}\nASSISTANT:`,
+                image_data: [{ data: request.body.image, id: 1 }],
+                temperature: 0.1,
+                stream: false,
+                // NOTE(review): the empty stop string looks like a token that was lost (possibly an end-of-sequence marker) — confirm.
+                stop: ['USER:', ''],
+            }),
+        });
+
+        if (!fetchResponse.ok) {
+            console.log('LlamaCpp caption error:', fetchResponse.status, fetchResponse.statusText);
+            return response.status(500).send({ error: true });
+        }
+
+        const data = await fetchResponse.json();
+        console.log('LlamaCpp caption response:', data);
+
+        const caption = data?.content || '';
+
+        if (!caption) {
+            console.log('LlamaCpp caption is empty.');
+            return response.status(500).send({ error: true });
+        }
+
+        return response.send({ caption });
+
+    } catch (error) {
+        console.error(error);
+        // status() only sets the code; a body must be sent or the client request never completes.
+        return response.status(500).send({ error: true });
+    }
+});
+
router.use('/ollama', ollama);
+router.use('/llamacpp', llamacpp);
module.exports = { router };