import { ensureImageFormatSupported, getBase64Async, isTrueBoolean, saveBase64AsFile } from '../../utils.js'; import { getContext, getApiUrl, doExtrasFetch, extension_settings, modules, renderExtensionTemplateAsync } from '../../extensions.js'; import { appendMediaToMessage, callPopup, eventSource, event_types, getRequestHeaders, saveChatConditional, saveSettingsDebounced, substituteParamsExtended } from '../../../script.js'; import { getMessageTimeStamp } from '../../RossAscends-mods.js'; import { SECRET_KEYS, secret_state } from '../../secrets.js'; import { getMultimodalCaption } from '../shared.js'; import { textgen_types, textgenerationwebui_settings } from '../../textgen-settings.js'; import { SlashCommandParser } from '../../slash-commands/SlashCommandParser.js'; import { SlashCommand } from '../../slash-commands/SlashCommand.js'; import { ARGUMENT_TYPE, SlashCommandArgument, SlashCommandNamedArgument } from '../../slash-commands/SlashCommandArgument.js'; import { SlashCommandEnumValue } from '../../slash-commands/SlashCommandEnumValue.js'; import { commonEnumProviders } from '../../slash-commands/SlashCommandCommonEnumsProvider.js'; export { MODULE_NAME }; const MODULE_NAME = 'caption'; const PROMPT_DEFAULT = 'What’s in this image?'; const TEMPLATE_DEFAULT = '[{{user}} sends {{char}} a picture that contains: {{caption}}]'; /** * Migrates old extension settings to the new format. * Must keep this function for compatibility with old settings. */ function migrateSettings() { if (extension_settings.caption.local !== undefined) { extension_settings.caption.source = extension_settings.caption.local ? 'local' : 'extras'; } delete extension_settings.caption.local; if (!extension_settings.caption.source) { extension_settings.caption.source = 'extras'; } if (extension_settings.caption.source === 'openai') { extension_settings.caption.source = 'multimodal'; extension_settings.caption.multimodal_api = 'openai'; extension_settings.caption.multimodal_model = 'gpt-4-turbo'; } if (!extension_settings.caption.multimodal_api) { extension_settings.caption.multimodal_api = 'openai'; } if (!extension_settings.caption.multimodal_model) { extension_settings.caption.multimodal_model = 'gpt-4-turbo'; } if (!extension_settings.caption.prompt) { extension_settings.caption.prompt = PROMPT_DEFAULT; } if (!extension_settings.caption.template) { extension_settings.caption.template = TEMPLATE_DEFAULT; } } /** * Sets an image icon for the send button. */ async function setImageIcon() { try { const sendButton = $('#send_picture .extensionsMenuExtensionButton'); sendButton.addClass('fa-image'); sendButton.removeClass('fa-hourglass-half'); } catch (error) { console.log(error); } } /** * Sets a spinner icon for the send button. */ async function setSpinnerIcon() { try { const sendButton = $('#send_picture .extensionsMenuExtensionButton'); sendButton.removeClass('fa-image'); sendButton.addClass('fa-hourglass-half'); } catch (error) { console.log(error); } } /** * Wraps a caption with a message template. * @param {string} caption Raw caption * @returns {Promise} Wrapped caption */ async function wrapCaptionTemplate(caption) { let template = extension_settings.caption.template || TEMPLATE_DEFAULT; if (!/{{caption}}/i.test(template)) { console.warn('Poka-yoke: Caption template does not contain {{caption}}. Appending it.'); template += ' {{caption}}'; } let messageText = substituteParamsExtended(template, { caption: caption }); if (extension_settings.caption.refine_mode) { messageText = await callPopup( '

Review and edit the generated caption:

Press "Cancel" to abort the caption sending.', 'input', messageText, { rows: 5, okButton: 'Send' }); if (!messageText) { throw new Error('User aborted the caption sending.'); } } return messageText; } /** * Appends caption to an existing message. * @param {Object} data Message data * @returns {Promise} */ async function captionExistingMessage(data) { if (!(data?.extra?.image)) { return; } const imageData = await fetch(data.extra.image); const blob = await imageData.blob(); const type = imageData.headers.get('Content-Type'); const file = new File([blob], 'image.png', { type }); const caption = await getCaptionForFile(file, null, true); if (!caption) { console.warn('Failed to generate a caption for the image.'); return; } const wrappedCaption = await wrapCaptionTemplate(caption); const messageText = String(data.mes).trim(); if (!messageText) { data.extra.inline_image = false; data.mes = wrappedCaption; data.extra.title = wrappedCaption; } else { data.extra.inline_image = true; data.extra.append_title = true; data.extra.title = wrappedCaption; } } /** * Sends a captioned message to the chat. * @param {string} caption Caption text * @param {string} image Image URL */ async function sendCaptionedMessage(caption, image) { const messageText = await wrapCaptionTemplate(caption); const context = getContext(); const message = { name: context.name1, is_user: true, send_date: getMessageTimeStamp(), mes: messageText, extra: { image: image, title: messageText, }, }; context.chat.push(message); context.addOneMessage(message); } /** * Generates a caption for an image using a selected source. * @param {string} base64Img Base64 encoded image without the data:image/...;base64, prefix * @param {string} fileData Base64 encoded image with the data:image/...;base64, prefix * @param {string} externalPrompt Caption prompt * @returns {Promise<{caption: string}>} Generated caption */ async function doCaptionRequest(base64Img, fileData, externalPrompt) { switch (extension_settings.caption.source) { case 'local': return await captionLocal(base64Img); case 'extras': return await captionExtras(base64Img); case 'horde': return await captionHorde(base64Img); case 'multimodal': return await captionMultimodal(fileData, externalPrompt); default: throw new Error('Unknown caption source.'); } } /** * Generates a caption for an image using Extras API. * @param {string} base64Img Base64 encoded image without the data:image/...;base64, prefix * @returns {Promise<{caption: string}>} Generated caption */ async function captionExtras(base64Img) { if (!modules.includes('caption')) { throw new Error('No captioning module is available.'); } const url = new URL(getApiUrl()); url.pathname = '/api/caption'; const apiResult = await doExtrasFetch(url, { method: 'POST', headers: { 'Content-Type': 'application/json', 'Bypass-Tunnel-Reminder': 'bypass', }, body: JSON.stringify({ image: base64Img }), }); if (!apiResult.ok) { throw new Error('Failed to caption image via Extras.'); } const data = await apiResult.json(); return data; } /** * Generates a caption for an image using a local model. * @param {string} base64Img Base64 encoded image without the data:image/...;base64, prefix * @returns {Promise<{caption: string}>} Generated caption */ async function captionLocal(base64Img) { const apiResult = await fetch('/api/extra/caption', { method: 'POST', headers: getRequestHeaders(), body: JSON.stringify({ image: base64Img }), }); if (!apiResult.ok) { throw new Error('Failed to caption image via local pipeline.'); } const data = await apiResult.json(); return data; } /** * Generates a caption for an image using a Horde model. * @param {string} base64Img Base64 encoded image without the data:image/...;base64, prefix * @returns {Promise<{caption: string}>} Generated caption */ async function captionHorde(base64Img) { const apiResult = await fetch('/api/horde/caption-image', { method: 'POST', headers: getRequestHeaders(), body: JSON.stringify({ image: base64Img }), }); if (!apiResult.ok) { throw new Error('Failed to caption image via Horde.'); } const data = await apiResult.json(); return data; } /** * Generates a caption for an image using a multimodal model. * @param {string} base64Img Base64 encoded image with the data:image/...;base64, prefix * @param {string} externalPrompt Caption prompt * @returns {Promise<{caption: string}>} Generated caption */ async function captionMultimodal(base64Img, externalPrompt) { let prompt = externalPrompt || extension_settings.caption.prompt || PROMPT_DEFAULT; if (!externalPrompt && extension_settings.caption.prompt_ask) { const customPrompt = await callPopup('

Enter a comment or question:

', 'input', prompt, { rows: 2 }); if (!customPrompt) { throw new Error('User aborted the caption sending.'); } prompt = String(customPrompt).trim(); } const caption = await getMultimodalCaption(base64Img, prompt); return { caption }; } /** * Handles the image selection event. * @param {Event} e Input event * @param {string} prompt Caption prompt * @param {boolean} quiet Suppresses sending a message * @returns {Promise} Generated caption */ async function onSelectImage(e, prompt, quiet) { if (!(e.target instanceof HTMLInputElement)) { return ''; } const file = e.target.files[0]; const form = e.target.form; if (!file || !(file instanceof File)) { form && form.reset(); return ''; } const caption = await getCaptionForFile(file, prompt, quiet); form && form.reset(); return caption; } /** * Gets a caption for an image file. * @param {File} file Input file * @param {string} prompt Caption prompt * @param {boolean} quiet Suppresses sending a message * @returns {Promise} Generated caption */ async function getCaptionForFile(file, prompt, quiet) { try { setSpinnerIcon(); const context = getContext(); const fileData = await getBase64Async(await ensureImageFormatSupported(file)); const base64Format = fileData.split(',')[0].split(';')[0].split('/')[1]; const base64Data = fileData.split(',')[1]; const { caption } = await doCaptionRequest(base64Data, fileData, prompt); if (!quiet) { const imagePath = await saveBase64AsFile(base64Data, context.name2, '', base64Format); await sendCaptionedMessage(caption, imagePath); } return caption; } catch (error) { const errorMessage = error.message || 'Unknown error'; toastr.error(errorMessage, "Failed to caption image."); console.error(error); return ''; } finally { setImageIcon(); } } function onRefineModeInput() { extension_settings.caption.refine_mode = $('#caption_refine_mode').prop('checked'); saveSettingsDebounced(); } /** * Callback for the /caption command. * @param {object} args Named parameters * @param {string} prompt Caption prompt */ async function captionCommandCallback(args, prompt) { const quiet = isTrueBoolean(args?.quiet); const id = args?.id; if (!isNaN(Number(id))) { const message = getContext().chat[id]; if (message?.extra?.image) { try { const fetchResult = await fetch(message.extra.image); const blob = await fetchResult.blob(); const file = new File([blob], 'image.jpg', { type: blob.type }); return await getCaptionForFile(file, prompt, quiet); } catch (error) { toastr.error('Failed to get image from the message. Make sure the image is accessible.'); return ''; } } } return new Promise(resolve => { const input = document.createElement('input'); input.type = 'file'; input.accept = 'image/*'; input.onchange = async (e) => { const caption = await onSelectImage(e, prompt, quiet); resolve(caption); }; input.oncancel = () => resolve(''); input.click(); }); } jQuery(async function () { function addSendPictureButton() { const sendButton = $(`
Generate Caption
`); $('#caption_wand_container').append(sendButton); $(sendButton).on('click', () => { const hasCaptionModule = (modules.includes('caption') && extension_settings.caption.source === 'extras') || (extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'openai' && (secret_state[SECRET_KEYS.OPENAI] || extension_settings.caption.allow_reverse_proxy)) || (extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'openrouter' && secret_state[SECRET_KEYS.OPENROUTER]) || (extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'google' && (secret_state[SECRET_KEYS.MAKERSUITE] || extension_settings.caption.allow_reverse_proxy)) || (extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'anthropic' && (secret_state[SECRET_KEYS.CLAUDE] || extension_settings.caption.allow_reverse_proxy)) || (extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'ollama' && textgenerationwebui_settings.server_urls[textgen_types.OLLAMA]) || (extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'llamacpp' && textgenerationwebui_settings.server_urls[textgen_types.LLAMACPP]) || (extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'ooba' && textgenerationwebui_settings.server_urls[textgen_types.OOBA]) || (extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'koboldcpp' && textgenerationwebui_settings.server_urls[textgen_types.KOBOLDCPP]) || (extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'vllm' && textgenerationwebui_settings.server_urls[textgen_types.VLLM]) || (extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'custom') || extension_settings.caption.source === 'local' || extension_settings.caption.source === 'horde'; if (!hasCaptionModule) { toastr.error('Choose other captioning source in the extension settings.', 'Captioning is not available'); return; } $('#img_file').trigger('click'); }); } function addPictureSendForm() { const inputHtml = ''; const imgForm = document.createElement('form'); imgForm.id = 'img_form'; $(imgForm).append(inputHtml); $(imgForm).hide(); $('#form_sheld').append(imgForm); $('#img_file').on('change', (e) => onSelectImage(e.originalEvent, '', false)); } function switchMultimodalBlocks() { const isMultimodal = extension_settings.caption.source === 'multimodal'; $('#caption_ollama_pull').on('click', (e) => { const presetModel = extension_settings.caption.multimodal_model !== 'ollama_current' ? extension_settings.caption.multimodal_model : ''; e.preventDefault(); $('#ollama_download_model').trigger('click'); $('#dialogue_popup_input').val(presetModel); }); $('#caption_multimodal_block').toggle(isMultimodal); $('#caption_prompt_block').toggle(isMultimodal); $('#caption_multimodal_api').val(extension_settings.caption.multimodal_api); $('#caption_multimodal_model').val(extension_settings.caption.multimodal_model); $('#caption_multimodal_block [data-type]').each(function () { const type = $(this).data('type'); const types = type.split(','); $(this).toggle(types.includes(extension_settings.caption.multimodal_api)); }); $('#caption_multimodal_api').on('change', () => { const api = String($('#caption_multimodal_api').val()); const model = String($(`#caption_multimodal_model option[data-type="${api}"]`).first().val()); extension_settings.caption.multimodal_api = api; extension_settings.caption.multimodal_model = model; saveSettingsDebounced(); switchMultimodalBlocks(); }); $('#caption_multimodal_model').on('change', () => { extension_settings.caption.multimodal_model = String($('#caption_multimodal_model').val()); saveSettingsDebounced(); }); } async function addSettings() { const html = await renderExtensionTemplateAsync('caption', 'settings', { TEMPLATE_DEFAULT, PROMPT_DEFAULT }); $('#caption_container').append(html); } await addSettings(); addPictureSendForm(); addSendPictureButton(); setImageIcon(); migrateSettings(); switchMultimodalBlocks(); $('#caption_refine_mode').prop('checked', !!(extension_settings.caption.refine_mode)); $('#caption_allow_reverse_proxy').prop('checked', !!(extension_settings.caption.allow_reverse_proxy)); $('#caption_prompt_ask').prop('checked', !!(extension_settings.caption.prompt_ask)); $('#caption_auto_mode').prop('checked', !!(extension_settings.caption.auto_mode)); $('#caption_source').val(extension_settings.caption.source); $('#caption_prompt').val(extension_settings.caption.prompt); $('#caption_template').val(extension_settings.caption.template); $('#caption_refine_mode').on('input', onRefineModeInput); $('#caption_source').on('change', () => { extension_settings.caption.source = String($('#caption_source').val()); switchMultimodalBlocks(); saveSettingsDebounced(); }); $('#caption_prompt').on('input', () => { extension_settings.caption.prompt = String($('#caption_prompt').val()); saveSettingsDebounced(); }); $('#caption_template').on('input', () => { extension_settings.caption.template = String($('#caption_template').val()); saveSettingsDebounced(); }); $('#caption_allow_reverse_proxy').on('input', () => { extension_settings.caption.allow_reverse_proxy = $('#caption_allow_reverse_proxy').prop('checked'); saveSettingsDebounced(); }); $('#caption_prompt_ask').on('input', () => { extension_settings.caption.prompt_ask = $('#caption_prompt_ask').prop('checked'); saveSettingsDebounced(); }); $('#caption_auto_mode').on('input', () => { extension_settings.caption.auto_mode = !!$('#caption_auto_mode').prop('checked'); saveSettingsDebounced(); }); const onMessageEvent = async (index) => { if (!extension_settings.caption.auto_mode) { return; } const data = getContext().chat[index]; await captionExistingMessage(data); }; eventSource.on(event_types.MESSAGE_SENT, onMessageEvent); eventSource.on(event_types.MESSAGE_FILE_EMBEDDED, onMessageEvent); $(document).on('click', '.mes_img_caption', async function () { const animationClass = 'fa-fade'; const messageBlock = $(this).closest('.mes'); const messageImg = messageBlock.find('.mes_img'); if (messageImg.hasClass(animationClass)) return; messageImg.addClass(animationClass); try { const index = Number(messageBlock.attr('mesid')); const data = getContext().chat[index]; await captionExistingMessage(data); appendMediaToMessage(data, messageBlock, false); await saveChatConditional(); } catch(e) { console.error('Message image recaption failed', e); } finally { messageImg.removeClass(animationClass); } }); SlashCommandParser.addCommandObject(SlashCommand.fromProps({ name: 'caption', callback: captionCommandCallback, returns: 'caption', namedArgumentList: [ new SlashCommandNamedArgument( 'quiet', 'suppress sending a captioned message', [ARGUMENT_TYPE.BOOLEAN], false, false, 'false', ), SlashCommandNamedArgument.fromProps({ name: 'id', description: 'get image from a message with this ID', typeList: [ARGUMENT_TYPE.NUMBER], enumProvider: commonEnumProviders.messages(), }), ], unnamedArgumentList: [ new SlashCommandArgument( 'prompt', [ARGUMENT_TYPE.STRING], false, ), ], helpString: `
Caption an image with an optional prompt and passes the caption down the pipe.
Only multimodal sources support custom prompts.
Provide a message ID to get an image from a message instead of uploading one.
Set the "quiet" argument to true to suppress sending a captioned message, default: false.
`, })); document.body.classList.add('caption'); });