SillyTavern/public/scripts/extensions/caption/index.js

497 lines
23 KiB
JavaScript
Raw Normal View History

2024-04-18 15:22:33 +02:00
import { getBase64Async, isTrueBoolean, saveBase64AsFile } from '../../utils.js';
2023-12-02 19:04:51 +01:00
import { getContext, getApiUrl, doExtrasFetch, extension_settings, modules } from '../../extensions.js';
import { callPopup, getRequestHeaders, saveSettingsDebounced, substituteParams } from '../../../script.js';
import { getMessageTimeStamp } from '../../RossAscends-mods.js';
import { SECRET_KEYS, secret_state } from '../../secrets.js';
import { getMultimodalCaption } from '../shared.js';
2023-12-19 23:45:45 +01:00
import { textgen_types, textgenerationwebui_settings } from '../../textgen-settings.js';
2024-04-18 15:22:33 +02:00
import { registerSlashCommand } from '../../slash-commands.js';
2023-07-20 19:32:15 +02:00
// Name of this extension module, exported for use by other scripts.
export const MODULE_NAME = 'caption';
2023-11-07 00:58:34 +01:00
// Fallback caption prompt: used by migrateSettings() and captionMultimodal() when no prompt is configured.
const PROMPT_DEFAULT = 'Whats in this image?';
// Fallback message template: used when no template is configured. The {{caption}} placeholder
// is replaced with the generated caption in sendCaptionedMessage().
const TEMPLATE_DEFAULT = '[{{user}} sends {{char}} a picture that contains: {{caption}}]';
/**
 * Upgrades caption settings stored by older versions to the current layout
 * and fills in any missing values with their defaults.
 * Must be kept for compatibility with old settings.
 */
function migrateSettings() {
    const settings = extension_settings.caption;

    // Legacy boolean "local" flag -> string "source".
    if (settings.local !== undefined) {
        settings.source = settings.local ? 'local' : 'extras';
    }
    delete settings.local;

    if (!settings.source) {
        settings.source = 'extras';
    }

    // The former standalone "openai" source is now one API of the "multimodal" source.
    if (settings.source === 'openai') {
        settings.source = 'multimodal';
        settings.multimodal_api = 'openai';
        settings.multimodal_model = 'gpt-4-turbo';
    }

    // Anything still empty gets its default value.
    const fallbacks = [
        ['multimodal_api', 'openai'],
        ['multimodal_model', 'gpt-4-turbo'],
        ['prompt', PROMPT_DEFAULT],
        ['template', TEMPLATE_DEFAULT],
    ];
    for (const [key, fallback] of fallbacks) {
        if (!settings[key]) {
            settings[key] = fallback;
        }
    }
}
/**
* Sets an image icon for the send button.
*/
2023-07-20 19:32:15 +02:00
async function setImageIcon() {
    try {
        // Show the picture glyph and drop the hourglass on the menu button icon.
        $('#send_picture .extensionsMenuExtensionButton')
            .addClass('fa-image')
            .removeClass('fa-hourglass-half');
    } catch (err) {
        // The icon swap is cosmetic: log the failure and carry on.
        console.log(err);
    }
}
/**
* Sets a spinner icon for the send button.
*/
2023-07-20 19:32:15 +02:00
async function setSpinnerIcon() {
    try {
        // Drop the picture glyph and show the hourglass on the menu button icon.
        $('#send_picture .extensionsMenuExtensionButton')
            .removeClass('fa-image')
            .addClass('fa-hourglass-half');
    } catch (err) {
        // The icon swap is cosmetic: log the failure and carry on.
        console.log(err);
    }
}
/**
 * Sends a captioned message to the chat.
 *
 * The message text is built from the configured template (or the default one).
 * If the template lacks a {{caption}} placeholder, one is appended. When refine
 * mode is enabled the user can edit the text before it is sent.
 * @param {string} caption Caption text
 * @param {string} image Image URL
 * @returns {Promise<void>}
 * @throws {Error} If the user cancels the refine-mode popup or leaves it empty
 */
async function sendCaptionedMessage(caption, image) {
    const context = getContext();
    let template = extension_settings.caption.template || TEMPLATE_DEFAULT;

    if (!/{{caption}}/i.test(template)) {
        console.warn('Poka-yoke: Caption template does not contain {{caption}}. Appending it.');
        template += ' {{caption}}';
    }

    // A replacer function inserts the caption verbatim: with a plain string argument,
    // sequences such as "$&", "$'" or "$1" inside the caption would be interpreted as
    // replacement patterns. The "g" flag fills every placeholder, not just the first.
    let messageText = substituteParams(template).replace(/{{caption}}/gi, () => caption);

    if (extension_settings.caption.refine_mode) {
        messageText = await callPopup(
            '<h3>Review and edit the generated message:</h3>Press "Cancel" to abort the caption sending.',
            'input',
            messageText,
            { rows: 5, okButton: 'Send' });

        if (!messageText) {
            throw new Error('User aborted the caption sending.');
        }
    }

    const message = {
        name: context.name1,
        is_user: true,
        send_date: getMessageTimeStamp(),
        mes: messageText,
        extra: {
            image: image,
            title: messageText,
        },
    };

    context.chat.push(message);
    context.addOneMessage(message);
}
/**
 * Routes a caption request to the backend selected in the extension settings.
 * @param {string} base64Img Base64 encoded image without the data:image/...;base64, prefix
 * @param {string} fileData Base64 encoded image with the data:image/...;base64, prefix
 * @param {string} externalPrompt Caption prompt
 * @returns {Promise<{caption: string}>} Generated caption
 * @throws {Error} If the configured source is not one of the known sources
 */
async function doCaptionRequest(base64Img, fileData, externalPrompt) {
    const source = extension_settings.caption.source;

    if (source === 'local') {
        return await captionLocal(base64Img);
    }
    if (source === 'extras') {
        return await captionExtras(base64Img);
    }
    if (source === 'horde') {
        return await captionHorde(base64Img);
    }
    if (source === 'multimodal') {
        // Only the multimodal backend takes the prefixed data URL and a prompt.
        return await captionMultimodal(fileData, externalPrompt);
    }

    throw new Error('Unknown caption source.');
}
2023-07-20 19:32:15 +02:00
/**
 * Requests a caption for an image from the Extras API.
 * @param {string} base64Img Base64 encoded image without the data:image/...;base64, prefix
 * @returns {Promise<{caption: string}>} Generated caption
 * @throws {Error} If the caption module is not loaded or the request fails
 */
async function captionExtras(base64Img) {
    const hasCaptionModule = modules.includes('caption');
    if (!hasCaptionModule) {
        throw new Error('No captioning module is available.');
    }

    const endpoint = new URL(getApiUrl());
    endpoint.pathname = '/api/caption';

    const requestOptions = {
        method: 'POST',
        headers: {
            'Content-Type': 'application/json',
            'Bypass-Tunnel-Reminder': 'bypass',
        },
        body: JSON.stringify({ image: base64Img }),
    };

    const response = await doExtrasFetch(endpoint, requestOptions);
    if (!response.ok) {
        throw new Error('Failed to caption image via Extras.');
    }

    return await response.json();
}
/**
 * Requests a caption for an image from the local captioning endpoint.
 * @param {string} base64Img Base64 encoded image without the data:image/...;base64, prefix
 * @returns {Promise<{caption: string}>} Generated caption
 * @throws {Error} If the request does not succeed
 */
async function captionLocal(base64Img) {
    const requestOptions = {
        method: 'POST',
        headers: getRequestHeaders(),
        body: JSON.stringify({ image: base64Img }),
    };

    const response = await fetch('/api/extra/caption', requestOptions);
    if (!response.ok) {
        throw new Error('Failed to caption image via local pipeline.');
    }

    return await response.json();
}
/**
 * Requests a caption for an image from the Horde captioning endpoint.
 * @param {string} base64Img Base64 encoded image without the data:image/...;base64, prefix
 * @returns {Promise<{caption: string}>} Generated caption
 * @throws {Error} If the request does not succeed
 */
async function captionHorde(base64Img) {
    const requestOptions = {
        method: 'POST',
        headers: getRequestHeaders(),
        body: JSON.stringify({ image: base64Img }),
    };

    const response = await fetch('/api/horde/caption-image', requestOptions);
    if (!response.ok) {
        throw new Error('Failed to caption image via Horde.');
    }

    return await response.json();
}
/**
 * Requests a caption for an image from a multimodal model.
 *
 * The prompt is taken from, in order: the external prompt, the configured prompt,
 * the default prompt. When no external prompt is given and "ask every time" is
 * enabled, the user is asked to confirm or edit the prompt first.
 * @param {string} base64Img Base64 encoded image with the data:image/...;base64, prefix
 * @param {string} externalPrompt Caption prompt
 * @returns {Promise<{caption: string}>} Generated caption
 * @throws {Error} If the user dismisses the prompt popup or leaves it empty
 */
async function captionMultimodal(base64Img, externalPrompt) {
    const initialPrompt = externalPrompt || extension_settings.caption.prompt || PROMPT_DEFAULT;
    const shouldAskUser = !externalPrompt && extension_settings.caption.prompt_ask;

    let finalPrompt = initialPrompt;
    if (shouldAskUser) {
        const answer = await callPopup('<h3>Enter a comment or question:</h3>', 'input', initialPrompt, { rows: 2 });
        if (!answer) {
            throw new Error('User aborted the caption sending.');
        }
        finalPrompt = String(answer).trim();
    }

    return { caption: await getMultimodalCaption(base64Img, finalPrompt) };
}
2024-04-18 15:22:33 +02:00
/**
 * Handles the image selection event: reads the chosen file, requests a caption
 * and, unless quiet, saves the image and posts the captioned message.
 * Failures are reported with a toast and logged; they are not rethrown.
 * @param {Event} e Input event
 * @param {string} prompt Caption prompt
 * @param {boolean} quiet Suppresses sending a message
 * @returns {Promise<string>} Generated caption, or an empty string if nothing was captioned
 */
async function onSelectImage(e, prompt, quiet) {
    const input = e.target;
    if (!(input instanceof HTMLInputElement)) {
        return '';
    }

    const selectedFile = input.files[0];
    const parentForm = input.form;

    if (!(selectedFile instanceof File)) {
        parentForm?.reset();
        return '';
    }

    try {
        setSpinnerIcon();
        const context = getContext();
        const dataUrl = await getBase64Async(selectedFile);

        // "data:image/<format>;base64,<payload>" -> <format> and <payload>
        const [header, payload] = dataUrl.split(',');
        const imageFormat = header.split(';')[0].split('/')[1];

        const { caption } = await doCaptionRequest(payload, dataUrl, prompt);

        if (!quiet) {
            const imagePath = await saveBase64AsFile(payload, context.name2, '', imageFormat);
            await sendCaptionedMessage(caption, imagePath);
        }

        return caption;
    } catch (err) {
        toastr.error('Failed to caption image.');
        console.log(err);
        return '';
    } finally {
        // Always clear the file input and restore the button icon.
        parentForm?.reset();
        setImageIcon();
    }
}
function onRefineModeInput() {
    // Mirror the checkbox state into the settings and schedule a save.
    const isChecked = $('#caption_refine_mode').prop('checked');
    extension_settings.caption.refine_mode = isChecked;
    saveSettingsDebounced();
}
2024-04-18 15:22:33 +02:00
/**
 * Callback for the /caption command. Opens a file picker for an image and
 * resolves once the image has been processed or the picker was cancelled.
 * @param {object} args Named parameters
 * @param {string} prompt Caption prompt
 * @returns {Promise<string>} Generated caption, or an empty string on cancel
 */
function captionCommandCallback(args, prompt) {
    return new Promise((resolve) => {
        const quiet = isTrueBoolean(args?.quiet);

        // A detached file input serves as the picker.
        const picker = Object.assign(document.createElement('input'), {
            type: 'file',
            accept: 'image/*',
            onchange: async (event) => resolve(await onSelectImage(event, prompt, quiet)),
            oncancel: () => resolve(''),
        });

        picker.click();
    });
}
2023-07-20 19:32:15 +02:00
jQuery(function () {
/**
 * Adds the "Generate Caption" entry to the extensions menu. Clicking it opens
 * the hidden image file input, provided the selected caption source is usable;
 * otherwise an error toast is shown.
 */
function addSendPictureButton() {
    /**
     * Tells whether the selected multimodal API has its key or server URL configured.
     * @returns {boolean} True if the API can be used
     */
    const isMultimodalApiAvailable = () => {
        const settings = extension_settings.caption;
        const serverUrls = () => textgenerationwebui_settings.server_urls;

        switch (settings.multimodal_api) {
            case 'openai':
                return Boolean(secret_state[SECRET_KEYS.OPENAI] || settings.allow_reverse_proxy);
            case 'anthropic':
                // The settings UI offers "Allow reverse proxy" for Anthropic as well,
                // so honour it here exactly like for OpenAI.
                return Boolean(secret_state[SECRET_KEYS.CLAUDE] || settings.allow_reverse_proxy);
            case 'openrouter':
                return Boolean(secret_state[SECRET_KEYS.OPENROUTER]);
            case 'google':
                return Boolean(secret_state[SECRET_KEYS.MAKERSUITE]);
            case 'ollama':
                return Boolean(serverUrls()[textgen_types.OLLAMA]);
            case 'llamacpp':
                return Boolean(serverUrls()[textgen_types.LLAMACPP]);
            case 'ooba':
                return Boolean(serverUrls()[textgen_types.OOBA]);
            case 'koboldcpp':
                return Boolean(serverUrls()[textgen_types.KOBOLDCPP]);
            case 'custom':
                return true;
            default:
                return false;
        }
    };

    /**
     * Tells whether the selected caption source can currently be used.
     * @returns {boolean} True if captioning is available
     */
    const isCaptioningAvailable = () => {
        switch (extension_settings.caption.source) {
            case 'extras':
                return modules.includes('caption');
            case 'multimodal':
                return isMultimodalApiAvailable();
            case 'local':
            case 'horde':
                return true;
            default:
                return false;
        }
    };

    const sendButton = $(`
        <div id="send_picture" class="list-group-item flex-container flexGap5">
            <div class="fa-solid fa-image extensionsMenuExtensionButton"></div>
            Generate Caption
        </div>`);

    $('#extensionsMenu').prepend(sendButton);
    $(sendButton).on('click', () => {
        if (!isCaptioningAvailable()) {
            toastr.error('Choose other captioning source in the extension settings.', 'Captioning is not available');
            return;
        }

        $('#img_file').trigger('click');
    });
}
/**
 * Creates the hidden form with the image file input and starts captioning
 * whenever a file is chosen.
 */
function addPictureSendForm() {
    const form = document.createElement('form');
    form.id = 'img_form';

    $(form)
        .append('<input id="img_file" type="file" hidden accept="image/*">')
        .hide();
    $('#form_sheld').append(form);

    // Pass on the native event; no custom prompt, and the message is sent (not quiet).
    $('#img_file').on('change', (event) => onSelectImage(event.originalEvent, '', false));
}
/**
 * Synchronizes the multimodal part of the settings UI with the stored settings:
 * shows or hides the multimodal and prompt blocks, selects the stored API and
 * model, shows only the elements whose data-type lists the selected API, and
 * (re)binds the change handlers of the API and model dropdowns.
 */
function switchMultimodalBlocks() {
    const isMultimodal = extension_settings.caption.source === 'multimodal';
    $('#caption_multimodal_block').toggle(isMultimodal);
    $('#caption_prompt_block').toggle(isMultimodal);
    $('#caption_multimodal_api').val(extension_settings.caption.multimodal_api);
    $('#caption_multimodal_model').val(extension_settings.caption.multimodal_model);

    // data-type holds a comma-separated list of APIs the element applies to.
    $('#caption_multimodal_block [data-type]').each(function () {
        const supportedApis = String($(this).data('type')).split(',');
        $(this).toggle(supportedApis.includes(extension_settings.caption.multimodal_api));
    });

    // This function runs on every source/API change, including from its own API
    // handler below. Remove the previous namespaced binding before attaching a new
    // one, otherwise handlers accumulate and each change fires many times over.
    $('#caption_multimodal_api')
        .off('change.caption_multimodal')
        .on('change.caption_multimodal', () => {
            const api = String($('#caption_multimodal_api').val());
            // Switching the API selects the first model offered for it.
            const model = String($(`#caption_multimodal_model option[data-type="${api}"]`).first().val());
            extension_settings.caption.multimodal_api = api;
            extension_settings.caption.multimodal_model = model;
            saveSettingsDebounced();
            switchMultimodalBlocks();
        });

    $('#caption_multimodal_model')
        .off('change.caption_multimodal')
        .on('change.caption_multimodal', () => {
            extension_settings.caption.multimodal_model = String($('#caption_multimodal_model').val());
            saveSettingsDebounced();
        });
}
2023-07-20 19:32:15 +02:00
/**
 * Appends the "Image Captioning" settings drawer to the extensions settings panel.
 * The prompt and template text areas are pre-filled with the default values.
 */
function addSettings() {
    const html = `
    <div class="caption_settings">
        <div class="inline-drawer">
            <div class="inline-drawer-toggle inline-drawer-header">
                <b>Image Captioning</b>
                <div class="inline-drawer-icon fa-solid fa-circle-chevron-down down"></div>
            </div>
            <div class="inline-drawer-content">
                <label for="caption_source">Source</label>
                <select id="caption_source" class="text_pole">
                    <option value="local">Local</option>
                    <option value="multimodal">Multimodal (OpenAI / Anthropic / llama / Google)</option>
                    <option value="extras">Extras</option>
                    <option value="horde">Horde</option>
                </select>
                <div id="caption_multimodal_block" class="flex-container wide100p">
                    <div class="flex1 flex-container flexFlowColumn flexNoGap">
                        <label for="caption_multimodal_api">API</label>
                        <select id="caption_multimodal_api" class="flex1 text_pole">
                            <option value="anthropic">Anthropic</option>
                            <option value="custom">Custom (OpenAI-compatible)</option>
                            <option value="google">Google MakerSuite</option>
                            <option value="koboldcpp">KoboldCpp</option>
                            <option value="llamacpp">llama.cpp</option>
                            <option value="ollama">Ollama</option>
                            <option value="openai">OpenAI</option>
                            <option value="openrouter">OpenRouter</option>
                            <option value="ooba">Text Generation WebUI (oobabooga)</option>
                        </select>
                    </div>
                    <div class="flex1 flex-container flexFlowColumn flexNoGap">
                        <label for="caption_multimodal_model">Model</label>
                        <select id="caption_multimodal_model" class="flex1 text_pole">
                            <option data-type="openai" value="gpt-4-vision-preview">gpt-4-vision-preview</option>
                            <option data-type="openai" value="gpt-4-turbo">gpt-4-turbo</option>
                            <option data-type="anthropic" value="claude-3-opus-20240229">claude-3-opus-20240229</option>
                            <option data-type="anthropic" value="claude-3-sonnet-20240229">claude-3-sonnet-20240229</option>
                            <option data-type="anthropic" value="claude-3-haiku-20240307">claude-3-haiku-20240307</option>
                            <option data-type="google" value="gemini-pro-vision">gemini-pro-vision</option>
                            <option data-type="openrouter" value="openai/gpt-4-vision-preview">openai/gpt-4-vision-preview</option>
                            <option data-type="openrouter" value="haotian-liu/llava-13b">haotian-liu/llava-13b</option>
                            <option data-type="openrouter" value="anthropic/claude-3-haiku">anthropic/claude-3-haiku</option>
                            <option data-type="openrouter" value="anthropic/claude-3-sonnet">anthropic/claude-3-sonnet</option>
                            <option data-type="openrouter" value="anthropic/claude-3-opus">anthropic/claude-3-opus</option>
                            <option data-type="openrouter" value="anthropic/claude-3-haiku:beta">anthropic/claude-3-haiku:beta</option>
                            <option data-type="openrouter" value="anthropic/claude-3-sonnet:beta">anthropic/claude-3-sonnet:beta</option>
                            <option data-type="openrouter" value="anthropic/claude-3-opus:beta">anthropic/claude-3-opus:beta</option>
                            <option data-type="openrouter" value="nousresearch/nous-hermes-2-vision-7b">nousresearch/nous-hermes-2-vision-7b</option>
                            <option data-type="openrouter" value="google/gemini-pro-vision">google/gemini-pro-vision</option>
                            <option data-type="ollama" value="ollama_current">[Currently selected]</option>
                            <option data-type="ollama" value="bakllava:latest">bakllava:latest</option>
                            <option data-type="ollama" value="llava:latest">llava:latest</option>
                            <option data-type="llamacpp" value="llamacpp_current">[Currently loaded]</option>
                            <option data-type="ooba" value="ooba_current">[Currently loaded]</option>
                            <option data-type="koboldcpp" value="koboldcpp_current">[Currently loaded]</option>
                            <option data-type="custom" value="custom_current">[Currently selected]</option>
                        </select>
                    </div>
                    <label data-type="openai,anthropic" class="checkbox_label flexBasis100p" for="caption_allow_reverse_proxy" title="Allow using reverse proxy if defined and valid.">
                        <input id="caption_allow_reverse_proxy" type="checkbox" class="checkbox">
                        Allow reverse proxy
                    </label>
                    <div class="flexBasis100p m-b-1">
                        <small><b>Hint:</b> Set your API keys and endpoints in the 'API Connections' tab first.</small>
                    </div>
                </div>
                <div id="caption_prompt_block">
                    <label for="caption_prompt">Caption Prompt</label>
                    <textarea id="caption_prompt" class="text_pole" rows="1" placeholder="&lt; Use default &gt;">${PROMPT_DEFAULT}</textarea>
                    <label class="checkbox_label margin-bot-10px" for="caption_prompt_ask" title="Ask for a custom prompt every time an image is captioned.">
                        <input id="caption_prompt_ask" type="checkbox" class="checkbox">
                        Ask every time
                    </label>
                </div>
                <label for="caption_template">Message Template <small>(use <code>{{caption}}</code> macro)</small></label>
                <textarea id="caption_template" class="text_pole" rows="2" placeholder="&lt; Use default &gt;">${TEMPLATE_DEFAULT}</textarea>
                <label class="checkbox_label margin-bot-10px" for="caption_refine_mode">
                    <input id="caption_refine_mode" type="checkbox" class="checkbox">
                    Edit captions before saving
                </label>
            </div>
        </div>
    </div>
    `;
    $('#extensions_settings2').append(html);
}
// Build the UI pieces before touching any of their controls.
addSettings();
addPictureSendForm();
addSendPictureButton();
setImageIcon();

// Bring stored settings up to date, then sync the multimodal part of the UI.
migrateSettings();
switchMultimodalBlocks();

// Reflect the stored settings in the controls (checkboxes first, then value fields).
for (const key of ['refine_mode', 'allow_reverse_proxy', 'prompt_ask']) {
    $(`#caption_${key}`).prop('checked', !!(extension_settings.caption[key]));
}
for (const key of ['source', 'prompt', 'template']) {
    $(`#caption_${key}`).val(extension_settings.caption[key]);
}

// Handler factories: copy a control's current state into the settings and schedule a save.
const storeText = (key) => () => {
    extension_settings.caption[key] = String($(`#caption_${key}`).val());
    saveSettingsDebounced();
};
const storeFlag = (key) => () => {
    extension_settings.caption[key] = $(`#caption_${key}`).prop('checked');
    saveSettingsDebounced();
};

$('#caption_refine_mode').on('input', onRefineModeInput);
$('#caption_source').on('change', () => {
    extension_settings.caption.source = String($('#caption_source').val());
    switchMultimodalBlocks();
    saveSettingsDebounced();
});
$('#caption_prompt').on('input', storeText('prompt'));
$('#caption_template').on('input', storeText('template'));
$('#caption_allow_reverse_proxy').on('input', storeFlag('allow_reverse_proxy'));
$('#caption_prompt_ask').on('input', storeFlag('prompt_ask'));

registerSlashCommand('caption', captionCommandCallback, [], '<span class="monospace">quiet=true/false [prompt]</span> - caption an image with an optional prompt and passes the caption down the pipe. Only multimodal sources support custom prompts. Set the "quiet" argument to true to suppress sending a captioned message, default: false.', true, true);
});