Add reverse proxy support for multimodal captions. Add caption system prompt.

2025-06-05 21:59:27 +02:00 · 2023-12-17 19:41:20 +02:00
parent a88cf1552a
commit c7c1513e91
4 changed files with 47 additions and 5 deletions
--- a/default/config.yaml
+++ b/default/config.yaml
@@ -54,6 +54,10 @@ extras:
 openai:
  # Will send a random user ID to OpenAI completion API
  randomizeUserId: false
+  # If not empty, this text is added as a system message at the start of every caption completion request
+  # Example: "Perform the instructions to the best of your ability.\n\n" (for LLaVA)
+  # Not used in image inlining mode
+  captionSystemPrompt: ""
 # -- DEEPL TRANSLATION CONFIGURATION --
 deepl:
  # Available options: default, more, less, prefer_more, prefer_less
--- a/public/scripts/extensions/caption/index.js
+++ b/public/scripts/extensions/caption/index.js
@@ -300,7 +300,7 @@ jQuery(function () {
        $('#caption_prompt_block').toggle(isMultimodal);
        $('#caption_multimodal_api').val(extension_settings.caption.multimodal_api);
        $('#caption_multimodal_model').val(extension_settings.caption.multimodal_model);
-        $('#caption_multimodal_model option').each(function () {
+        $('#caption_multimodal_block [data-type]').each(function () {
            const type = $(this).data('type');
            $(this).toggle(type === extension_settings.caption.multimodal_api);
        });
@@ -351,6 +351,10 @@ jQuery(function () {
                                <option data-type="openrouter" value="haotian-liu/llava-13b">haotian-liu/llava-13b</option>
                            </select>
                        </div>
+                        <label data-type="openai" class="checkbox_label flexBasis100p" for="caption_allow_reverse_proxy" title="Allow using reverse proxy if defined and valid.">
+                            <input id="caption_allow_reverse_proxy" type="checkbox" class="checkbox">
+                            Allow reverse proxy
+                        </label>
                    </div>
                    <div id="caption_prompt_block">
                        <label for="caption_prompt">Caption Prompt</label>
@@ -377,6 +381,7 @@ jQuery(function () {
    switchMultimodalBlocks();

    $('#caption_refine_mode').prop('checked', !!(extension_settings.caption.refine_mode));
+    $('#caption_allow_reverse_proxy').prop('checked', !!(extension_settings.caption.allow_reverse_proxy));
    $('#caption_source').val(extension_settings.caption.source);
    $('#caption_prompt').val(extension_settings.caption.prompt);
    $('#caption_template').val(extension_settings.caption.template);
@@ -394,4 +399,8 @@ jQuery(function () {
        extension_settings.caption.template = String($('#caption_template').val());
        saveSettingsDebounced();
    });
+    $('#caption_allow_reverse_proxy').on('input', () => {
+        extension_settings.caption.allow_reverse_proxy = $('#caption_allow_reverse_proxy').prop('checked');
+        saveSettingsDebounced();
+    });
 });
--- a/public/scripts/extensions/shared.js
+++ b/public/scripts/extensions/shared.js
@@ -1,7 +1,8 @@
 import { getRequestHeaders } from '../../script.js';
 import { extension_settings } from '../extensions.js';
+import { oai_settings } from '../openai.js';
 import { SECRET_KEYS, secret_state } from '../secrets.js';
-import { createThumbnail } from '../utils.js';
+import { createThumbnail, isValidUrl } from '../utils.js';

 /**
 * Generates a caption for an image using a multimodal model.
@@ -35,6 +36,15 @@ export async function getMultimodalCaption(base64Img, prompt) {
        }
    }

+    const useReverseProxy =
+        extension_settings.caption.multimodal_api === 'openai'
+        && extension_settings.caption.allow_reverse_proxy
+        && oai_settings.reverse_proxy
+        && isValidUrl(oai_settings.reverse_proxy);
+
+    const proxyUrl = useReverseProxy ? oai_settings.reverse_proxy : '';
+    const proxyPassword = useReverseProxy ? oai_settings.proxy_password : '';
+
    const apiResult = await fetch(`/api/${isGoogle ? 'google' : 'openai'}/caption-image`, {
        method: 'POST',
        headers: getRequestHeaders(),
@@ -46,6 +56,8 @@ export async function getMultimodalCaption(base64Img, prompt) {
                : {
                    api: extension_settings.caption.multimodal_api || 'openai',
                    model: extension_settings.caption.multimodal_model || 'gpt-4-vision-preview',
+                    reverse_proxy: proxyUrl,
+                    proxy_password: proxyPassword,
                }),
        }),
    });
--- a/src/endpoints/openai.js
+++ b/src/endpoints/openai.js
@@ -4,6 +4,7 @@ const express = require('express');
 const FormData = require('form-data');
 const fs = require('fs');
 const { jsonParser, urlencodedParser } = require('../express-common');
+const { getConfigValue } = require('../util');

 const router = express.Router();

@@ -11,15 +12,19 @@ router.post('/caption-image', jsonParser, async (request, response) => {
    try {
        let key = '';

-        if (request.body.api === 'openai') {
+        if (request.body.api === 'openai' && !request.body.reverse_proxy) {
            key = readSecret(SECRET_KEYS.OPENAI);
        }

-        if (request.body.api === 'openrouter') {
+        if (request.body.api === 'openrouter' && !request.body.reverse_proxy) {
            key = readSecret(SECRET_KEYS.OPENROUTER);
        }

-        if (!key) {
+        if (request.body.reverse_proxy && request.body.proxy_password) {
+            key = request.body.proxy_password;
+        }
+
+        if (!key && !request.body.reverse_proxy) {
            console.log('No key found for API', request.body.api);
            return response.sendStatus(400);
        }
@@ -38,6 +43,14 @@ router.post('/caption-image', jsonParser, async (request, response) => {
            max_tokens: 500,
        };

+        const captionSystemPrompt = getConfigValue('openai.captionSystemPrompt');
+        if (captionSystemPrompt) {
+            body.messages.unshift({
+                role: 'system',
+                content: captionSystemPrompt,
+            });
+        }
+
        console.log('Multimodal captioning request', body);

        let apiUrl = '';
@@ -52,6 +65,10 @@ router.post('/caption-image', jsonParser, async (request, response) => {
            apiUrl = 'https://api.openai.com/v1/chat/completions';
        }

+        if (request.body.reverse_proxy) {
+            apiUrl = `${request.body.reverse_proxy}/chat/completions`;
+        }
+
        const result = await fetch(apiUrl, {
            method: 'POST',
            headers: {