Add multimodal captioning for SD prompt generation

2025-06-05 21:59:27 +02:00 · 2023-11-19 15:24:43 +02:00
parent c3e5d0f6f2
commit b0b19edf31
4 changed files with 137 additions and 10 deletions
--- a/public/scripts/extensions/shared.js
+++ b/public/scripts/extensions/shared.js
@@ -1,6 +1,7 @@
 import { getRequestHeaders } from "../../script.js";
 import { extension_settings } from "../extensions.js";
 import { SECRET_KEYS, secret_state } from "../secrets.js";
+import { createThumbnail } from "../utils.js";

 /**
 * Generates a caption for an image using a multimodal model.
@@ -17,6 +18,14 @@ export async function getMultimodalCaption(base64Img, prompt) {
        throw new Error('OpenRouter API key is not set.');
    }

+    // OpenRouter has a payload limit of ~2MB, so images whose estimated decoded size exceeds it are downscaled first
+    const base64Bytes = base64Img.length * 0.75;
+    const compressionLimit = 2 * 1024 * 1024;
+    if (extension_settings.caption.multimodal_api === 'openrouter' && base64Bytes > compressionLimit) {
+        const maxSide = 1024;
+        base64Img = await createThumbnail(base64Img, maxSide, maxSide);
+    }
+
    const apiResult = await fetch('/api/openai/caption-image', {
        method: 'POST',
        headers: getRequestHeaders(),