diff --git a/default/config.yaml b/default/config.yaml index fd0be655c..edee81009 100644 --- a/default/config.yaml +++ b/default/config.yaml @@ -54,6 +54,10 @@ extras: openai: # Will send a random user ID to OpenAI completion API randomizeUserId: false + # If not empty, will add this as a system message to the start of every caption completion prompt + # Example: "Perform the instructions to the best of your ability.\n\n" (for LLaVA) + # Not used in image inlining mode + captionSystemPrompt: "" # -- DEEPL TRANSLATION CONFIGURATION -- deepl: # Available options: default, more, less, prefer_more, prefer_less diff --git a/public/scripts/extensions/caption/index.js b/public/scripts/extensions/caption/index.js index aa666a232..97cc4cf37 100644 --- a/public/scripts/extensions/caption/index.js +++ b/public/scripts/extensions/caption/index.js @@ -300,7 +300,7 @@ jQuery(function () { $('#caption_prompt_block').toggle(isMultimodal); $('#caption_multimodal_api').val(extension_settings.caption.multimodal_api); $('#caption_multimodal_model').val(extension_settings.caption.multimodal_model); - $('#caption_multimodal_model option').each(function () { + $('#caption_multimodal_block [data-type]').each(function () { const type = $(this).data('type'); $(this).toggle(type === extension_settings.caption.multimodal_api); }); @@ -351,6 +351,10 @@ jQuery(function () { +
@@ -377,6 +381,7 @@ jQuery(function () { switchMultimodalBlocks(); $('#caption_refine_mode').prop('checked', !!(extension_settings.caption.refine_mode)); + $('#caption_allow_reverse_proxy').prop('checked', !!(extension_settings.caption.allow_reverse_proxy)); $('#caption_source').val(extension_settings.caption.source); $('#caption_prompt').val(extension_settings.caption.prompt); $('#caption_template').val(extension_settings.caption.template); @@ -394,4 +399,8 @@ jQuery(function () { extension_settings.caption.template = String($('#caption_template').val()); saveSettingsDebounced(); }); + $('#caption_allow_reverse_proxy').on('input', () => { + extension_settings.caption.allow_reverse_proxy = $('#caption_allow_reverse_proxy').prop('checked'); + saveSettingsDebounced(); + }); }); diff --git a/public/scripts/extensions/shared.js b/public/scripts/extensions/shared.js index 9058204ec..7d4e16720 100644 --- a/public/scripts/extensions/shared.js +++ b/public/scripts/extensions/shared.js @@ -1,7 +1,8 @@ import { getRequestHeaders } from '../../script.js'; import { extension_settings } from '../extensions.js'; +import { oai_settings } from '../openai.js'; import { SECRET_KEYS, secret_state } from '../secrets.js'; -import { createThumbnail } from '../utils.js'; +import { createThumbnail, isValidUrl } from '../utils.js'; /** * Generates a caption for an image using a multimodal model. @@ -35,6 +36,15 @@ export async function getMultimodalCaption(base64Img, prompt) { } } + const useReverseProxy = + extension_settings.caption.multimodal_api === 'openai' + && extension_settings.caption.allow_reverse_proxy + && oai_settings.reverse_proxy + && isValidUrl(oai_settings.reverse_proxy); + + const proxyUrl = useReverseProxy ? oai_settings.reverse_proxy : ''; + const proxyPassword = useReverseProxy ? oai_settings.proxy_password : ''; + const apiResult = await fetch(`/api/${isGoogle ? 'google' : 'openai'}/caption-image`, { method: 'POST', headers: getRequestHeaders(), @@ -46,6 +56,8 @@ export async function getMultimodalCaption(base64Img, prompt) { : { api: extension_settings.caption.multimodal_api || 'openai', model: extension_settings.caption.multimodal_model || 'gpt-4-vision-preview', + reverse_proxy: proxyUrl, + proxy_password: proxyPassword, }), }), }); diff --git a/src/endpoints/openai.js b/src/endpoints/openai.js index 23a19f943..cb98cf274 100644 --- a/src/endpoints/openai.js +++ b/src/endpoints/openai.js @@ -4,6 +4,7 @@ const express = require('express'); const FormData = require('form-data'); const fs = require('fs'); const { jsonParser, urlencodedParser } = require('../express-common'); +const { getConfigValue } = require('../util'); const router = express.Router(); @@ -11,15 +12,19 @@ router.post('/caption-image', jsonParser, async (request, response) => { try { let key = ''; - if (request.body.api === 'openai') { + if (request.body.api === 'openai' && !request.body.reverse_proxy) { key = readSecret(SECRET_KEYS.OPENAI); } - if (request.body.api === 'openrouter') { + if (request.body.api === 'openrouter' && !request.body.reverse_proxy) { key = readSecret(SECRET_KEYS.OPENROUTER); } - if (!key) { + if (request.body.reverse_proxy && request.body.proxy_password) { + key = request.body.proxy_password; + } + + if (!key && !request.body.reverse_proxy) { console.log('No key found for API', request.body.api); return response.sendStatus(400); } @@ -38,6 +43,14 @@ router.post('/caption-image', jsonParser, async (request, response) => { max_tokens: 500, }; + const captionSystemPrompt = getConfigValue('openai.captionSystemPrompt'); + if (captionSystemPrompt) { + body.messages.unshift({ + role: 'system', + content: captionSystemPrompt, + }); + } + console.log('Multimodal captioning request', body); let apiUrl = ''; @@ -52,6 +65,10 @@ router.post('/caption-image', jsonParser, async (request, response) => { apiUrl = 'https://api.openai.com/v1/chat/completions'; } + if (request.body.reverse_proxy) { + apiUrl = `${request.body.reverse_proxy}/chat/completions`; + } + const result = await fetch(apiUrl, { method: 'POST', headers: {