Add multimodal captioning for SD prompt generation

This commit is contained in:
Cohee
2023-11-19 15:24:43 +02:00
parent c3e5d0f6f2
commit b0b19edf31
4 changed files with 137 additions and 10 deletions

View File

@@ -1,6 +1,7 @@
import { getRequestHeaders } from "../../script.js";
import { extension_settings } from "../extensions.js";
import { SECRET_KEYS, secret_state } from "../secrets.js";
import { createThumbnail } from "../utils.js";
/**
* Generates a caption for an image using a multimodal model.
@@ -17,6 +18,14 @@ export async function getMultimodalCaption(base64Img, prompt) {
throw new Error('OpenRouter API key is not set.');
}
// OpenRouter has a payload limit of ~2MB
const base64Bytes = base64Img.length * 0.75;
const compressionLimit = 2 * 1024 * 1024;
if (extension_settings.caption.multimodal_api === 'openrouter' && base64Bytes > compressionLimit) {
const maxSide = 1024;
base64Img = await createThumbnail(base64Img, maxSide, maxSide);
}
const apiResult = await fetch('/api/openai/caption-image', {
method: 'POST',
headers: getRequestHeaders(),