mirror of
https://github.com/SillyTavern/SillyTavern.git
synced 2025-06-05 21:59:27 +02:00
Add multimodal captioning for SD prompt generation
This commit is contained in:
@ -1867,10 +1867,35 @@ function addOneMessage(mes, { type = "normal", insertAfter = null, scroll = true
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Returns the URL of the avatar for the given user avatar Id.
 * The Id is appended verbatim to the "User Avatars/" path prefix.
 * @param {string} avatarImg User avatar Id
 * @returns {string} User avatar URL
 */
export function getUserAvatar(avatarImg) {
    return `User Avatars/${avatarImg}`;
}
|
||||||
|
|
||||||
|
/**
 * Returns the URL of the avatar for the given character Id.
 * Falls back to the default avatar when the character does not exist,
 * has no avatar set, or its avatar is the literal string 'none'.
 * @param {number} characterId Character Id (index into the characters list)
 * @returns {string} Avatar URL
 */
export function getCharacterAvatar(characterId) {
    const character = characters[characterId];
    const avatarImg = character?.avatar;

    if (!avatarImg || avatarImg === 'none') {
        return default_avatar;
    }

    return formatCharacterAvatar(avatarImg);
}
|
||||||
|
|
||||||
|
/**
 * Builds the URL of a character avatar from its avatar file name.
 * The name is appended verbatim to the "characters/" path prefix.
 * @param {string} characterAvatar Character avatar file name
 * @returns {string} Avatar URL
 */
export function formatCharacterAvatar(characterAvatar) {
    return `characters/${characterAvatar}`;
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Formats the title for the generation timer.
|
* Formats the title for the generation timer.
|
||||||
* @param {Date} gen_started Date when generation was started
|
* @param {Date} gen_started Date when generation was started
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
import { getRequestHeaders } from "../../script.js";
|
import { getRequestHeaders } from "../../script.js";
|
||||||
import { extension_settings } from "../extensions.js";
|
import { extension_settings } from "../extensions.js";
|
||||||
import { SECRET_KEYS, secret_state } from "../secrets.js";
|
import { SECRET_KEYS, secret_state } from "../secrets.js";
|
||||||
|
import { createThumbnail } from "../utils.js";
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Generates a caption for an image using a multimodal model.
|
* Generates a caption for an image using a multimodal model.
|
||||||
@ -17,6 +18,14 @@ export async function getMultimodalCaption(base64Img, prompt) {
|
|||||||
throw new Error('OpenRouter API key is not set.');
|
throw new Error('OpenRouter API key is not set.');
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// OpenRouter has a payload limit of ~2MB
|
||||||
|
const base64Bytes = base64Img.length * 0.75;
|
||||||
|
const compressionLimit = 2 * 1024 * 1024;
|
||||||
|
if (extension_settings.caption.multimodal_api === 'openrouter' && base64Bytes > compressionLimit) {
|
||||||
|
const maxSide = 1024;
|
||||||
|
base64Img = await createThumbnail(base64Img, maxSide, maxSide);
|
||||||
|
}
|
||||||
|
|
||||||
const apiResult = await fetch('/api/openai/caption-image', {
|
const apiResult = await fetch('/api/openai/caption-image', {
|
||||||
method: 'POST',
|
method: 'POST',
|
||||||
headers: getRequestHeaders(),
|
headers: getRequestHeaders(),
|
||||||
|
@ -12,13 +12,18 @@ import {
|
|||||||
getCurrentChatId,
|
getCurrentChatId,
|
||||||
animation_duration,
|
animation_duration,
|
||||||
appendMediaToMessage,
|
appendMediaToMessage,
|
||||||
|
getUserAvatar,
|
||||||
|
user_avatar,
|
||||||
|
getCharacterAvatar,
|
||||||
|
formatCharacterAvatar,
|
||||||
} from "../../../script.js";
|
} from "../../../script.js";
|
||||||
import { getApiUrl, getContext, extension_settings, doExtrasFetch, modules, renderExtensionTemplate } from "../../extensions.js";
|
import { getApiUrl, getContext, extension_settings, doExtrasFetch, modules, renderExtensionTemplate } from "../../extensions.js";
|
||||||
import { selected_group } from "../../group-chats.js";
|
import { selected_group } from "../../group-chats.js";
|
||||||
import { stringFormat, initScrollHeight, resetScrollHeight, getCharaFilename, saveBase64AsFile } from "../../utils.js";
|
import { stringFormat, initScrollHeight, resetScrollHeight, getCharaFilename, saveBase64AsFile, getBase64Async } from "../../utils.js";
|
||||||
import { getMessageTimeStamp, humanizedDateTime } from "../../RossAscends-mods.js";
|
import { getMessageTimeStamp, humanizedDateTime } from "../../RossAscends-mods.js";
|
||||||
import { SECRET_KEYS, secret_state } from "../../secrets.js";
|
import { SECRET_KEYS, secret_state } from "../../secrets.js";
|
||||||
import { getNovelUnlimitedImageGeneration, getNovelAnlas, loadNovelSubscriptionData } from "../../nai-settings.js";
|
import { getNovelUnlimitedImageGeneration, getNovelAnlas, loadNovelSubscriptionData } from "../../nai-settings.js";
|
||||||
|
import { getMultimodalCaption } from "../shared.js";
|
||||||
export { MODULE_NAME };
|
export { MODULE_NAME };
|
||||||
|
|
||||||
// Wraps a string into monospace font-face span
|
// Wraps a string into monospace font-face span
|
||||||
@ -49,6 +54,15 @@ const generationMode = {
|
|||||||
FACE: 5,
|
FACE: 5,
|
||||||
FREE: 6,
|
FREE: 6,
|
||||||
BACKGROUND: 7,
|
BACKGROUND: 7,
|
||||||
|
CHARACTER_MULTIMODAL: 8,
|
||||||
|
USER_MULTIMODAL: 9,
|
||||||
|
FACE_MULTIMODAL: 10,
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Maps a regular generation mode to its multimodal captioning counterpart.
 * Modes without an entry here have no multimodal variant.
 */
const multimodalMap = {
    [generationMode.CHARACTER]: generationMode.CHARACTER_MULTIMODAL,
    [generationMode.USER]: generationMode.USER_MULTIMODAL,
    [generationMode.FACE]: generationMode.FACE_MULTIMODAL,
};
|
||||||
|
|
||||||
const modeLabels = {
|
const modeLabels = {
|
||||||
@ -59,6 +73,9 @@ const modeLabels = {
|
|||||||
[generationMode.NOW]: 'Last Message',
|
[generationMode.NOW]: 'Last Message',
|
||||||
[generationMode.RAW_LAST]: 'Raw Last Message',
|
[generationMode.RAW_LAST]: 'Raw Last Message',
|
||||||
[generationMode.BACKGROUND]: 'Background',
|
[generationMode.BACKGROUND]: 'Background',
|
||||||
|
[generationMode.CHARACTER_MULTIMODAL]: 'Character (Multimodal Mode)',
|
||||||
|
[generationMode.FACE_MULTIMODAL]: 'Portrait (Multimodal Mode)',
|
||||||
|
[generationMode.USER_MULTIMODAL]: 'User (Multimodal Mode)',
|
||||||
}
|
}
|
||||||
|
|
||||||
const triggerWords = {
|
const triggerWords = {
|
||||||
@ -118,6 +135,9 @@ const promptTemplates = {
|
|||||||
|
|
||||||
[generationMode.RAW_LAST]: "[Pause your roleplay and provide ONLY the last chat message string back to me verbatim. Do not write anything after the string. Do not roleplay at all in your response. Do not continue the roleplay story.]",
|
[generationMode.RAW_LAST]: "[Pause your roleplay and provide ONLY the last chat message string back to me verbatim. Do not write anything after the string. Do not roleplay at all in your response. Do not continue the roleplay story.]",
|
||||||
[generationMode.BACKGROUND]: "[Pause your roleplay and provide a detailed description of {{char}}'s surroundings in the form of a comma-delimited list of keywords and phrases. The list must include all of the following items in this order: location, time of day, weather, lighting, and any other relevant details. Do not include descriptions of characters and non-visual qualities such as names, personality, movements, scents, mental traits, or anything which could not be seen in a still photograph. Do not write in full sentences. Prefix your description with the phrase 'background,'. Ignore the rest of the story when crafting this description. Do not roleplay as {{user}} when writing this description, and do not attempt to continue the story.]",
|
[generationMode.BACKGROUND]: "[Pause your roleplay and provide a detailed description of {{char}}'s surroundings in the form of a comma-delimited list of keywords and phrases. The list must include all of the following items in this order: location, time of day, weather, lighting, and any other relevant details. Do not include descriptions of characters and non-visual qualities such as names, personality, movements, scents, mental traits, or anything which could not be seen in a still photograph. Do not write in full sentences. Prefix your description with the phrase 'background,'. Ignore the rest of the story when crafting this description. Do not roleplay as {{user}} when writing this description, and do not attempt to continue the story.]",
|
||||||
|
[generationMode.FACE_MULTIMODAL]: `Provide an exhaustive comma-separated list of tags describing the appearance of the character on this image in great detail. Start with "close-up portrait".`,
|
||||||
|
[generationMode.CHARACTER_MULTIMODAL]: `Provide an exhaustive comma-separated list of tags describing the appearance of the character on this image in great detail. Start with "full body portrait".`,
|
||||||
|
[generationMode.USER_MULTIMODAL]: `Provide an exhaustive comma-separated list of tags describing the appearance of the character on this image in great detail. Start with "full body portrait".`,
|
||||||
}
|
}
|
||||||
|
|
||||||
const helpString = [
|
const helpString = [
|
||||||
@ -177,6 +197,7 @@ const defaultSettings = {
|
|||||||
refine_mode: false,
|
refine_mode: false,
|
||||||
expand: false,
|
expand: false,
|
||||||
interactive_mode: false,
|
interactive_mode: false,
|
||||||
|
multimodal_captioning: false,
|
||||||
|
|
||||||
prompts: promptTemplates,
|
prompts: promptTemplates,
|
||||||
|
|
||||||
@ -342,6 +363,7 @@ async function loadSettings() {
|
|||||||
$('#sd_enable_hr').prop('checked', extension_settings.sd.enable_hr);
|
$('#sd_enable_hr').prop('checked', extension_settings.sd.enable_hr);
|
||||||
$('#sd_refine_mode').prop('checked', extension_settings.sd.refine_mode);
|
$('#sd_refine_mode').prop('checked', extension_settings.sd.refine_mode);
|
||||||
$('#sd_expand').prop('checked', extension_settings.sd.expand);
|
$('#sd_expand').prop('checked', extension_settings.sd.expand);
|
||||||
|
$('#sd_multimodal_captioning').prop('checked', extension_settings.sd.multimodal_captioning);
|
||||||
$('#sd_auto_url').val(extension_settings.sd.auto_url);
|
$('#sd_auto_url').val(extension_settings.sd.auto_url);
|
||||||
$('#sd_auto_auth').val(extension_settings.sd.auto_auth);
|
$('#sd_auto_auth').val(extension_settings.sd.auto_auth);
|
||||||
$('#sd_vlad_url').val(extension_settings.sd.vlad_url);
|
$('#sd_vlad_url').val(extension_settings.sd.vlad_url);
|
||||||
@ -401,6 +423,11 @@ function onInteractiveModeInput() {
|
|||||||
saveSettingsDebounced();
|
saveSettingsDebounced();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Input handler for the "multimodal captioning" checkbox.
 * Stores the checkbox state in the extension settings and schedules a settings save.
 * Must be invoked with `this` bound to the checkbox element (as jQuery does for event handlers).
 */
function onMultimodalCaptioningInput() {
    extension_settings.sd.multimodal_captioning = Boolean($(this).prop('checked'));
    saveSettingsDebounced();
}
|
||||||
|
|
||||||
function onStyleSelect() {
|
function onStyleSelect() {
|
||||||
const selectedStyle = String($('#sd_style').find(':selected').val());
|
const selectedStyle = String($('#sd_style').find(':selected').val());
|
||||||
const styleObject = extension_settings.sd.styles.find(x => x.name === selectedStyle);
|
const styleObject = extension_settings.sd.styles.find(x => x.name === selectedStyle);
|
||||||
@ -1205,15 +1232,22 @@ async function loadNovelModels() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Resolves the generation mode requested by a trigger string.
 * The trimmed, lower-cased prompt is compared against every trigger word; the first
 * mode (in `Object.entries(triggerWords)` order) that lists a matching word wins.
 * Prompts that match no trigger word are treated as free-form prompts.
 * When multimodal captioning is enabled, a mode that has a multimodal counterpart
 * is swapped for that counterpart.
 * @param {string} prompt Trigger text entered by the user
 * @returns {number} One of the `generationMode` values
 */
function getGenerationType(prompt) {
    // Normalize once instead of on every comparison.
    const normalizedPrompt = prompt.toLowerCase().trim();
    let mode = generationMode.FREE;

    for (const [key, values] of Object.entries(triggerWords)) {
        if (values.some(value => value.toLowerCase() === normalizedPrompt)) {
            mode = Number(key);
            // Stop at the first matching mode so later modes cannot override it.
            break;
        }
    }

    if (extension_settings.sd.multimodal_captioning && multimodalMap[mode] !== undefined) {
        mode = multimodalMap[mode];
    }

    return mode;
}
|
||||||
|
|
||||||
function getQuietPrompt(mode, trigger) {
|
function getQuietPrompt(mode, trigger) {
|
||||||
@ -1284,7 +1318,7 @@ async function generatePicture(_, trigger, message, callback) {
|
|||||||
trigger = trigger.trim();
|
trigger = trigger.trim();
|
||||||
const generationType = getGenerationType(trigger);
|
const generationType = getGenerationType(trigger);
|
||||||
console.log('Generation mode', generationType, 'triggered with', trigger);
|
console.log('Generation mode', generationType, 'triggered with', trigger);
|
||||||
const quiet_prompt = getQuietPrompt(generationType, trigger);
|
const quietPrompt = getQuietPrompt(generationType, trigger);
|
||||||
const context = getContext();
|
const context = getContext();
|
||||||
|
|
||||||
// if context.characterId is not null, then we get context.characters[context.characterId].avatar, else we get groupId and context.groups[groupId].id
|
// if context.characterId is not null, then we get context.characters[context.characterId].avatar, else we get groupId and context.groups[groupId].id
|
||||||
@ -1308,7 +1342,7 @@ async function generatePicture(_, trigger, message, callback) {
|
|||||||
const dimensions = setTypeSpecificDimensions(generationType);
|
const dimensions = setTypeSpecificDimensions(generationType);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const prompt = await getPrompt(generationType, message, trigger, quiet_prompt);
|
const prompt = await getPrompt(generationType, message, trigger, quietPrompt);
|
||||||
console.log('Processed image prompt:', prompt);
|
console.log('Processed image prompt:', prompt);
|
||||||
|
|
||||||
context.deactivateSendButtons();
|
context.deactivateSendButtons();
|
||||||
@ -1353,7 +1387,7 @@ function restoreOriginalDimensions(savedParams) {
|
|||||||
extension_settings.sd.width = savedParams.width;
|
extension_settings.sd.width = savedParams.width;
|
||||||
}
|
}
|
||||||
|
|
||||||
async function getPrompt(generationType, message, trigger, quiet_prompt) {
|
async function getPrompt(generationType, message, trigger, quietPrompt) {
|
||||||
let prompt;
|
let prompt;
|
||||||
|
|
||||||
switch (generationType) {
|
switch (generationType) {
|
||||||
@ -1363,8 +1397,13 @@ async function getPrompt(generationType, message, trigger, quiet_prompt) {
|
|||||||
case generationMode.FREE:
|
case generationMode.FREE:
|
||||||
prompt = trigger.trim();
|
prompt = trigger.trim();
|
||||||
break;
|
break;
|
||||||
|
case generationMode.FACE_MULTIMODAL:
|
||||||
|
case generationMode.CHARACTER_MULTIMODAL:
|
||||||
|
case generationMode.USER_MULTIMODAL:
|
||||||
|
prompt = await generateMultimodalPrompt(generationType, quietPrompt);
|
||||||
|
break;
|
||||||
default:
|
default:
|
||||||
prompt = await generatePrompt(quiet_prompt);
|
prompt = await generatePrompt(quietPrompt);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1375,8 +1414,57 @@ async function getPrompt(generationType, message, trigger, quiet_prompt) {
|
|||||||
return prompt;
|
return prompt;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Resolves the URL of the avatar image that should be captioned for a multimodal generation mode.
 * - USER_MULTIMODAL: the current user's avatar.
 * - CHARACTER_MULTIMODAL / FACE_MULTIMODAL: the current character's avatar, or, in a group chat,
 *   the avatar of the last non-system, non-user message, falling back to a random group member.
 * @param {number} generationType - The type of image generation to perform.
 * @returns {string|null} Avatar URL, or null when no avatar could be determined.
 */
function resolveMultimodalAvatarUrl(generationType) {
    if (generationType === generationMode.USER_MULTIMODAL) {
        return getUserAvatar(user_avatar);
    }

    const isCharacterMode = generationType === generationMode.CHARACTER_MULTIMODAL
        || generationType === generationMode.FACE_MULTIMODAL;

    if (!isCharacterMode) {
        return null;
    }

    const context = getContext();

    if (!context.groupId) {
        return getCharacterAvatar(context.characterId);
    }

    const groupMembers = context.groups?.find(x => x.id === context.groupId)?.members;
    const lastMessageAvatar = context.chat?.filter(x => !x.is_system && !x.is_user)?.slice(-1)[0]?.original_avatar;
    const randomMember = Array.isArray(groupMembers)
        ? groupMembers[Math.floor(Math.random() * groupMembers.length)]
        : null;
    // NOTE(review): member entries are accepted both as plain avatar strings and as objects
    // with an `avatar` property — confirm which shape the group data actually uses.
    const randomMemberAvatar = typeof randomMember === 'string' ? randomMember : randomMember?.avatar;
    const avatarToUse = lastMessageAvatar || randomMemberAvatar;

    // Avoid building a bogus "characters/undefined" URL when nothing could be resolved.
    return avatarToUse ? formatCharacterAvatar(avatarToUse) : null;
}

/**
 * Generates a prompt using multimodal captioning.
 * Fetches the relevant avatar image, converts it to base64 and asks the multimodal
 * captioning API to describe it using the given prompt.
 * @param {number} generationType - The type of image generation to perform.
 * @param {string} quietPrompt - The prompt to use for the image generation.
 * @returns {Promise<string>} - A promise that resolves with the generated caption.
 * @throws {Error} If the avatar cannot be resolved or fetched, or if captioning returns nothing.
 */
async function generateMultimodalPrompt(generationType, quietPrompt) {
    const avatarUrl = resolveMultimodalAvatarUrl(generationType);

    if (!avatarUrl) {
        throw new Error('Could not fetch avatar image.');
    }

    const response = await fetch(avatarUrl);

    if (!response.ok) {
        throw new Error('Could not fetch avatar image.');
    }

    const avatarBlob = await response.blob();
    const avatarBase64 = await getBase64Async(avatarBlob);

    const caption = await getMultimodalCaption(avatarBase64, quietPrompt);

    if (!caption) {
        throw new Error('Multimodal captioning failed.');
    }

    return caption;
}

/**
 * Generates a prompt using the main LLM API.
 * @param {string} quietPrompt - The prompt to use for the image generation.
 * @returns {Promise<string>} - A promise that resolves with the processed reply of the quiet generation.
 */
async function generatePrompt(quietPrompt) {
    const reply = await generateQuietPrompt(quietPrompt, false, false);
    return processReply(reply);
}
|
||||||
|
|
||||||
@ -1932,6 +2020,7 @@ jQuery(async () => {
|
|||||||
$('#sd_interactive_mode').on('input', onInteractiveModeInput);
|
$('#sd_interactive_mode').on('input', onInteractiveModeInput);
|
||||||
$('#sd_openai_style').on('change', onOpenAiStyleSelect);
|
$('#sd_openai_style').on('change', onOpenAiStyleSelect);
|
||||||
$('#sd_openai_quality').on('change', onOpenAiQualitySelect);
|
$('#sd_openai_quality').on('change', onOpenAiQualitySelect);
|
||||||
|
$('#sd_multimodal_captioning').on('input', onMultimodalCaptioningInput);
|
||||||
|
|
||||||
$('.sd_settings .inline-drawer-toggle').on('click', function () {
|
$('.sd_settings .inline-drawer-toggle').on('click', function () {
|
||||||
initScrollHeight($("#sd_prompt_prefix"));
|
initScrollHeight($("#sd_prompt_prefix"));
|
||||||
|
@ -18,6 +18,10 @@
|
|||||||
<input id="sd_interactive_mode" type="checkbox" />
|
<input id="sd_interactive_mode" type="checkbox" />
|
||||||
Interactive mode
|
Interactive mode
|
||||||
</label>
|
</label>
|
||||||
|
<label for="sd_multimodal_captioning" class="checkbox_label" title="Use multimodal captioning to generate prompts for user and character portraits based on their avatars.">
|
||||||
|
<input id="sd_multimodal_captioning" type="checkbox" />
|
||||||
|
Use multimodal captioning for portraits
|
||||||
|
</label>
|
||||||
<label for="sd_expand" class="checkbox_label" title="Automatically extend prompts using text generation model">
|
<label for="sd_expand" class="checkbox_label" title="Automatically extend prompts using text generation model">
|
||||||
<input id="sd_expand" type="checkbox" />
|
<input id="sd_expand" type="checkbox" />
|
||||||
Auto-enhance prompts
|
Auto-enhance prompts
|
||||||
|
Reference in New Issue
Block a user