Add multimodal captioning for SD prompt generation

2025-06-05 21:59:27 +02:00 · 2023-11-19 15:24:43 +02:00
parent c3e5d0f6f2
commit b0b19edf31
4 changed files with 137 additions and 10 deletions
--- a/public/script.js
+++ b/public/script.js
@ -1867,10 +1867,35 @@ function addOneMessage(mes, { type = "normal", insertAfter = null, scroll = true
    }
 }
-function getUserAvatar(avatarImg) {
+/**
 * Returns the URL of the avatar for the given user avatar Id.
 * @param {string} avatarImg User avatar Id
 * @returns {string} User avatar URL
 */
 export function getUserAvatar(avatarImg) {
    // Every user avatar URL is the avatar Id appended to this fixed prefix.
    const userAvatarPrefix = 'User Avatars/';
    return userAvatarPrefix.concat(avatarImg);
 }
 /**
 * Resolves the avatar URL of the character stored under the given Id.
 * Falls back to the default avatar when the character has no avatar value
 * or when that value is the 'none' placeholder.
 * @param {number} characterId Character Id (key into `characters`)
 * @returns {string} Avatar URL
 */
 export function getCharacterAvatar(characterId) {
    const avatarImg = characters[characterId]?.avatar;
    const hasOwnAvatar = Boolean(avatarImg) && avatarImg !== 'none';
    return hasOwnAvatar ? formatCharacterAvatar(avatarImg) : default_avatar;
 }
 /**
 * Builds the URL of a character avatar from the character's avatar value.
 * @param {string} characterAvatar Character avatar value (e.g. `character.avatar`)
 * @returns {string} Avatar URL
 */
 export function formatCharacterAvatar(characterAvatar) {
    const characterAvatarPrefix = 'characters/';
    return characterAvatarPrefix.concat(characterAvatar);
 }
 /**
 * Formats the title for the generation timer.
 * @param {Date} gen_started Date when generation was started
--- a/public/scripts/extensions/shared.js
+++ b/public/scripts/extensions/shared.js
@ -1,6 +1,7 @@
 import { getRequestHeaders } from "../../script.js";
 import { extension_settings } from "../extensions.js";
 import { SECRET_KEYS, secret_state } from "../secrets.js";
 import { createThumbnail } from "../utils.js";
 /**
 * Generates a caption for an image using a multimodal model.
@ -17,6 +18,14 @@ export async function getMultimodalCaption(base64Img, prompt) {
        throw new Error('OpenRouter API key is not set.');
    }
    // OpenRouter has a payload limit of ~2MB
    const base64Bytes = base64Img.length * 0.75;
    const compressionLimit = 2 * 1024 * 1024;
    if (extension_settings.caption.multimodal_api === 'openrouter' && base64Bytes > compressionLimit) {
        const maxSide = 1024;
        base64Img = await createThumbnail(base64Img, maxSide, maxSide);
    }
    const apiResult = await fetch('/api/openai/caption-image', {
        method: 'POST',
        headers: getRequestHeaders(),
--- a/public/scripts/extensions/stable-diffusion/index.js
+++ b/public/scripts/extensions/stable-diffusion/index.js
@ -12,13 +12,18 @@ import {
    getCurrentChatId,
    animation_duration,
    appendMediaToMessage,
    getUserAvatar,
    user_avatar,
    getCharacterAvatar,
    formatCharacterAvatar,
 } from "../../../script.js";
 import { getApiUrl, getContext, extension_settings, doExtrasFetch, modules, renderExtensionTemplate } from "../../extensions.js";
 import { selected_group } from "../../group-chats.js";
-import { stringFormat, initScrollHeight, resetScrollHeight, getCharaFilename, saveBase64AsFile } from "../../utils.js";
+import { stringFormat, initScrollHeight, resetScrollHeight, getCharaFilename, saveBase64AsFile, getBase64Async } from "../../utils.js";
 import { getMessageTimeStamp, humanizedDateTime } from "../../RossAscends-mods.js";
 import { SECRET_KEYS, secret_state } from "../../secrets.js";
 import { getNovelUnlimitedImageGeneration, getNovelAnlas, loadNovelSubscriptionData } from "../../nai-settings.js";
 import { getMultimodalCaption } from "../shared.js";
 export { MODULE_NAME };
 // Wraps a string into monospace font-face span
@ -49,6 +54,15 @@ const generationMode = {
    FACE: 5,
    FREE: 6,
    BACKGROUND: 7,
    CHARACTER_MULTIMODAL: 8,
    USER_MULTIMODAL: 9,
    FACE_MULTIMODAL: 10,
 }
 // Maps a generation mode to the multimodal mode that replaces it when
 // multimodal captioning is enabled (looked up in getGenerationType).
 // Modes without an entry here keep their original mode.
 const multimodalMap = {
    [generationMode.CHARACTER]: generationMode.CHARACTER_MULTIMODAL,
    [generationMode.USER]: generationMode.USER_MULTIMODAL,
    [generationMode.FACE]: generationMode.FACE_MULTIMODAL,
 }
 const modeLabels = {
@ -59,6 +73,9 @@ const modeLabels = {
    [generationMode.NOW]: 'Last Message',
    [generationMode.RAW_LAST]: 'Raw Last Message',
    [generationMode.BACKGROUND]: 'Background',
    [generationMode.CHARACTER_MULTIMODAL]: 'Character (Multimodal Mode)',
    [generationMode.FACE_MULTIMODAL]: 'Portrait (Multimodal Mode)',
    [generationMode.USER_MULTIMODAL]: 'User (Multimodal Mode)',
 }
 const triggerWords = {
@ -118,6 +135,9 @@ const promptTemplates = {
    [generationMode.RAW_LAST]: "[Pause your roleplay and provide ONLY the last chat message string back to me verbatim. Do not write anything after the string. Do not roleplay at all in your response. Do not continue the roleplay story.]",
    [generationMode.BACKGROUND]: "[Pause your roleplay and provide a detailed description of {{char}}'s surroundings in the form of a comma-delimited list of keywords and phrases. The list must include all of the following items in this order: location, time of day, weather, lighting, and any other relevant details. Do not include descriptions of characters and non-visual qualities such as names, personality, movements, scents, mental traits, or anything which could not be seen in a still photograph. Do not write in full sentences. Prefix your description with the phrase 'background,'. Ignore the rest of the story when crafting this description. Do not roleplay as {{user}} when writing this description, and do not attempt to continue the story.]",
    [generationMode.FACE_MULTIMODAL]: `Provide an exhaustive comma-separated list of tags describing the appearance of the character on this image in great detail. Start with "close-up portrait".`,
    [generationMode.CHARACTER_MULTIMODAL]: `Provide an exhaustive comma-separated list of tags describing the appearance of the character on this image in great detail. Start with "full body portrait".`,
    [generationMode.USER_MULTIMODAL]: `Provide an exhaustive comma-separated list of tags describing the appearance of the character on this image in great detail. Start with "full body portrait".`,
 }
 const helpString = [
@ -177,6 +197,7 @@ const defaultSettings = {
    refine_mode: false,
    expand: false,
    interactive_mode: false,
    multimodal_captioning: false,
    prompts: promptTemplates,
@ -342,6 +363,7 @@ async function loadSettings() {
    $('#sd_enable_hr').prop('checked', extension_settings.sd.enable_hr);
    $('#sd_refine_mode').prop('checked', extension_settings.sd.refine_mode);
    $('#sd_expand').prop('checked', extension_settings.sd.expand);
    $('#sd_multimodal_captioning').prop('checked', extension_settings.sd.multimodal_captioning);
    $('#sd_auto_url').val(extension_settings.sd.auto_url);
    $('#sd_auto_auth').val(extension_settings.sd.auto_auth);
    $('#sd_vlad_url').val(extension_settings.sd.vlad_url);
@ -401,6 +423,11 @@ function onInteractiveModeInput() {
    saveSettingsDebounced();
 }
 /**
 * Input handler for the multimodal captioning checkbox.
 * Stores the checkbox state (coerced to a boolean) in the SD extension settings
 * and calls saveSettingsDebounced.
 */
 function onMultimodalCaptioningInput() {
    const isChecked = Boolean($(this).prop('checked'));
    extension_settings.sd.multimodal_captioning = isChecked;
    saveSettingsDebounced();
 }
 function onStyleSelect() {
    const selectedStyle = String($('#sd_style').find(':selected').val());
    const styleObject = extension_settings.sd.styles.find(x => x.name === selectedStyle);
@ -1205,15 +1232,22 @@ async function loadNovelModels() {
 }
 /**
 * Resolves the generation mode for a trigger string.
 * The trimmed, lower-cased prompt is compared against every trigger word; the first
 * mode (in `triggerWords` iteration order) that lists it wins. Prompts matching no
 * trigger word use the FREE mode. When multimodal captioning is enabled, modes that
 * have an entry in `multimodalMap` are swapped for their multimodal counterpart.
 * @param {string} prompt - Trigger text entered by the user.
 * @returns {number} One of the `generationMode` values.
 */
 function getGenerationType(prompt) {
    // Normalize once instead of on every comparison.
    const normalizedPrompt = prompt.toLowerCase().trim();

    // Stop at the first matching mode: a plain `break` inside nested loops would only
    // leave the inner loop and let a later mode override the match.
    const matchedEntry = Object.entries(triggerWords)
        .find(([, words]) => words.some(word => word.toLowerCase() === normalizedPrompt));
    const mode = matchedEntry ? Number(matchedEntry[0]) : generationMode.FREE;

    const multimodalMode = multimodalMap[mode];
    if (extension_settings.sd.multimodal_captioning && multimodalMode !== undefined) {
        return multimodalMode;
    }

    return mode;
 }
 function getQuietPrompt(mode, trigger) {
@ -1284,7 +1318,7 @@ async function generatePicture(_, trigger, message, callback) {
    trigger = trigger.trim();
    const generationType = getGenerationType(trigger);
    console.log('Generation mode', generationType, 'triggered with', trigger);
-    const quiet_prompt = getQuietPrompt(generationType, trigger);
+    const quietPrompt = getQuietPrompt(generationType, trigger);
    const context = getContext();
    // if context.characterId is not null, then we get context.characters[context.characterId].avatar, else we get groupId and context.groups[groupId].id
@ -1308,7 +1342,7 @@ async function generatePicture(_, trigger, message, callback) {
    const dimensions = setTypeSpecificDimensions(generationType);
    try {
-        const prompt = await getPrompt(generationType, message, trigger, quiet_prompt);
+        const prompt = await getPrompt(generationType, message, trigger, quietPrompt);
        console.log('Processed image prompt:', prompt);
        context.deactivateSendButtons();
@ -1353,7 +1387,7 @@ function restoreOriginalDimensions(savedParams) {
    extension_settings.sd.width = savedParams.width;
 }
-async function getPrompt(generationType, message, trigger, quiet_prompt) {
+async function getPrompt(generationType, message, trigger, quietPrompt) {
    let prompt;
    switch (generationType) {
@ -1363,8 +1397,13 @@ async function getPrompt(generationType, message, trigger, quiet_prompt) {
        case generationMode.FREE:
            prompt = trigger.trim();
            break;
        case generationMode.FACE_MULTIMODAL:
        case generationMode.CHARACTER_MULTIMODAL:
        case generationMode.USER_MULTIMODAL:
            prompt = await generateMultimodalPrompt(generationType, quietPrompt);
            break;
        default:
-            prompt = await generatePrompt(quiet_prompt);
+            prompt = await generatePrompt(quietPrompt);
            break;
    }
@ -1375,8 +1414,57 @@ async function getPrompt(generationType, message, trigger, quiet_prompt) {
    return prompt;
 }
-async function generatePrompt(quiet_prompt) {
+/**
-    const reply = await generateQuietPrompt(quiet_prompt, false, false);
+ * Generates a prompt using multimodal captioning.
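 * @param {number} generationType - The multimodal generation mode; it decides which avatar (user or character) is captioned.
 * @param {string} quietPrompt - The instruction passed to the multimodal model together with the avatar image.
 * @returns {Promise<string>} - A promise that resolves to the caption generated for the avatar image.
 * @throws {Error} If the avatar image cannot be fetched or the captioning returns an empty result.
 */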
 async function generateMultimodalPrompt(generationType, quietPrompt) {
    const avatarUrl = getMultimodalAvatarUrl(generationType);

    // Fail early with a clear message instead of requesting a bogus URL
    // such as "undefined" or "characters/undefined".
    if (!avatarUrl) {
        throw new Error('Could not determine the avatar image to caption.');
    }

    const response = await fetch(avatarUrl);

    if (!response.ok) {
        throw new Error('Could not fetch avatar image.');
    }

    // The captioning helper takes the image as base64, so convert the fetched blob first.
    const avatarBlob = await response.blob();
    const avatarBase64 = await getBase64Async(avatarBlob);

    const caption = await getMultimodalCaption(avatarBase64, quietPrompt);

    if (!caption) {
        throw new Error('Multimodal captioning failed.');
    }

    return caption;
 }

 /**
 * Picks the avatar URL to caption for a multimodal generation mode.
 * User mode uses the current user avatar. Character and face modes use the current
 * character's avatar, or in a group chat the avatar of the last non-user, non-system
 * message, falling back to a randomly picked group member.
 * @param {number} generationType - The multimodal generation mode.
 * @returns {string|undefined} Avatar URL, or undefined when no avatar could be resolved.
 */
 function getMultimodalAvatarUrl(generationType) {
    if (generationType === generationMode.USER_MULTIMODAL) {
        return getUserAvatar(user_avatar);
    }

    const isCharacterMode = generationType === generationMode.CHARACTER_MULTIMODAL
        || generationType === generationMode.FACE_MULTIMODAL;

    if (!isCharacterMode) {
        return undefined;
    }

    const context = getContext();

    if (!context.groupId) {
        return getCharacterAvatar(context.characterId);
    }

    const groupMembers = context.groups.find(x => x.id === context.groupId)?.members;
    const lastMessageAvatar = context.chat?.filter(x => !x.is_system && !x.is_user)?.slice(-1)[0]?.original_avatar;
    const randomMember = Array.isArray(groupMembers)
        ? groupMembers[Math.floor(Math.random() * groupMembers.length)]
        : null;
    // NOTE(review): members were read only as objects with an `avatar` field; plain
    // avatar-Id strings are accepted as well - confirm the actual shape of group.members.
    const randomMemberAvatar = typeof randomMember === 'string' ? randomMember : randomMember?.avatar;
    const avatarToUse = lastMessageAvatar || randomMemberAvatar;

    return avatarToUse ? formatCharacterAvatar(avatarToUse) : undefined;
 }
 /**
 * Requests an image prompt from the main LLM API and post-processes the reply.
 * @param {string} quietPrompt - The prompt to use for the image generation.
 * @returns {Promise<string>} - A promise that resolves with the processed reply.
 */
 async function generatePrompt(quietPrompt) {
    return processReply(await generateQuietPrompt(quietPrompt, false, false));
 }
@ -1932,6 +2020,7 @@ jQuery(async () => {
    $('#sd_interactive_mode').on('input', onInteractiveModeInput);
    $('#sd_openai_style').on('change', onOpenAiStyleSelect);
    $('#sd_openai_quality').on('change', onOpenAiQualitySelect);
    $('#sd_multimodal_captioning').on('input', onMultimodalCaptioningInput);
    $('.sd_settings .inline-drawer-toggle').on('click', function () {
        initScrollHeight($("#sd_prompt_prefix"));
--- a/public/scripts/extensions/stable-diffusion/settings.html
+++ b/public/scripts/extensions/stable-diffusion/settings.html
@ -18,6 +18,10 @@
                <input id="sd_interactive_mode" type="checkbox" />
                Interactive mode
            </label>
            <label for="sd_multimodal_captioning" class="checkbox_label" title="Use multimodal captioning to generate prompts for user and character portraits based on their avatars.">
                <input id="sd_multimodal_captioning" type="checkbox" />
                Use multimodal captioning for portraits
            </label>
            <label for="sd_expand" class="checkbox_label" title="Automatically extend prompts using text generation model">
                <input id="sd_expand" type="checkbox" />
                Auto-enhance prompts