mirror of
				https://github.com/SillyTavern/SillyTavern.git
				synced 2025-06-05 21:59:27 +02:00 
			
		
		
		
	
		
			
				
	
	
		
			443 lines
		
	
	
		
			19 KiB
		
	
	
	
		
			JavaScript
		
	
	
	
	
	
			
		
		
	
	
			443 lines
		
	
	
		
			19 KiB
		
	
	
	
		
			JavaScript
		
	
	
	
	
	
| import { getBase64Async, saveBase64AsFile } from '../../utils.js';
 | ||
| import { getContext, getApiUrl, doExtrasFetch, extension_settings, modules } from '../../extensions.js';
 | ||
| import { callPopup, getRequestHeaders, saveSettingsDebounced, substituteParams } from '../../../script.js';
 | ||
| import { getMessageTimeStamp } from '../../RossAscends-mods.js';
 | ||
| import { SECRET_KEYS, secret_state } from '../../secrets.js';
 | ||
| import { getMultimodalCaption } from '../shared.js';
 | ||
| import { textgen_types, textgenerationwebui_settings } from '../../textgen-settings.js';
 | ||
| export { MODULE_NAME };
 | ||
| 
 | ||
| const MODULE_NAME = 'caption';
 | ||
| 
 | ||
| const PROMPT_DEFAULT = 'What’s in this image?';
 | ||
| const TEMPLATE_DEFAULT = '[{{user}} sends {{char}} a picture that contains: {{caption}}]';
 | ||
| 
 | ||
| /**
 | ||
|  * Migrates old extension settings to the new format.
 | ||
|  * Must keep this function for compatibility with old settings.
 | ||
|  */
 | ||
| function migrateSettings() {
 | ||
|     if (extension_settings.caption.local !== undefined) {
 | ||
|         extension_settings.caption.source = extension_settings.caption.local ? 'local' : 'extras';
 | ||
|     }
 | ||
| 
 | ||
|     delete extension_settings.caption.local;
 | ||
| 
 | ||
|     if (!extension_settings.caption.source) {
 | ||
|         extension_settings.caption.source = 'extras';
 | ||
|     }
 | ||
| 
 | ||
|     if (extension_settings.caption.source === 'openai') {
 | ||
|         extension_settings.caption.source = 'multimodal';
 | ||
|         extension_settings.caption.multimodal_api = 'openai';
 | ||
|         extension_settings.caption.multimodal_model = 'gpt-4-vision-preview';
 | ||
|     }
 | ||
| 
 | ||
|     if (!extension_settings.caption.multimodal_api) {
 | ||
|         extension_settings.caption.multimodal_api = 'openai';
 | ||
|     }
 | ||
| 
 | ||
|     if (!extension_settings.caption.multimodal_model) {
 | ||
|         extension_settings.caption.multimodal_model = 'gpt-4-vision-preview';
 | ||
|     }
 | ||
| 
 | ||
|     if (!extension_settings.caption.prompt) {
 | ||
|         extension_settings.caption.prompt = PROMPT_DEFAULT;
 | ||
|     }
 | ||
| 
 | ||
|     if (!extension_settings.caption.template) {
 | ||
|         extension_settings.caption.template = TEMPLATE_DEFAULT;
 | ||
|     }
 | ||
| }
 | ||
| 
 | ||
| /**
 | ||
|  * Sets an image icon for the send button.
 | ||
|  */
 | ||
| async function setImageIcon() {
 | ||
|     try {
 | ||
|         const sendButton = $('#send_picture .extensionsMenuExtensionButton');
 | ||
|         sendButton.addClass('fa-image');
 | ||
|         sendButton.removeClass('fa-hourglass-half');
 | ||
|     }
 | ||
|     catch (error) {
 | ||
|         console.log(error);
 | ||
|     }
 | ||
| }
 | ||
| 
 | ||
| /**
 | ||
|  * Sets a spinner icon for the send button.
 | ||
|  */
 | ||
| async function setSpinnerIcon() {
 | ||
|     try {
 | ||
|         const sendButton = $('#send_picture .extensionsMenuExtensionButton');
 | ||
|         sendButton.removeClass('fa-image');
 | ||
|         sendButton.addClass('fa-hourglass-half');
 | ||
|     }
 | ||
|     catch (error) {
 | ||
|         console.log(error);
 | ||
|     }
 | ||
| }
 | ||
| 
 | ||
| /**
 | ||
|  * Sends a captioned message to the chat.
 | ||
|  * @param {string} caption Caption text
 | ||
|  * @param {string} image Image URL
 | ||
|  */
 | ||
| async function sendCaptionedMessage(caption, image) {
 | ||
|     const context = getContext();
 | ||
|     let template = extension_settings.caption.template || TEMPLATE_DEFAULT;
 | ||
| 
 | ||
|     if (!/{{caption}}/i.test(template)) {
 | ||
|         console.warn('Poka-yoke: Caption template does not contain {{caption}}. Appending it.');
 | ||
|         template += ' {{caption}}';
 | ||
|     }
 | ||
| 
 | ||
|     let messageText = substituteParams(template).replace(/{{caption}}/i, caption);
 | ||
| 
 | ||
|     if (extension_settings.caption.refine_mode) {
 | ||
|         messageText = await callPopup(
 | ||
|             '<h3>Review and edit the generated message:</h3>Press "Cancel" to abort the caption sending.',
 | ||
|             'input',
 | ||
|             messageText,
 | ||
|             { rows: 5, okButton: 'Send' });
 | ||
| 
 | ||
|         if (!messageText) {
 | ||
|             throw new Error('User aborted the caption sending.');
 | ||
|         }
 | ||
|     }
 | ||
| 
 | ||
|     const message = {
 | ||
|         name: context.name1,
 | ||
|         is_user: true,
 | ||
|         send_date: getMessageTimeStamp(),
 | ||
|         mes: messageText,
 | ||
|         extra: {
 | ||
|             image: image,
 | ||
|             title: messageText,
 | ||
|         },
 | ||
|     };
 | ||
|     context.chat.push(message);
 | ||
|     context.addOneMessage(message);
 | ||
| }
 | ||
| 
 | ||
| /**
 | ||
|  * Generates a caption for an image using a selected source.
 | ||
|  * @param {string} base64Img Base64 encoded image without the data:image/...;base64, prefix
 | ||
|  * @param {string} fileData Base64 encoded image with the data:image/...;base64, prefix
 | ||
|  * @returns {Promise<{caption: string}>} Generated caption
 | ||
|  */
 | ||
| async function doCaptionRequest(base64Img, fileData) {
 | ||
|     switch (extension_settings.caption.source) {
 | ||
|         case 'local':
 | ||
|             return await captionLocal(base64Img);
 | ||
|         case 'extras':
 | ||
|             return await captionExtras(base64Img);
 | ||
|         case 'horde':
 | ||
|             return await captionHorde(base64Img);
 | ||
|         case 'multimodal':
 | ||
|             return await captionMultimodal(fileData);
 | ||
|         default:
 | ||
|             throw new Error('Unknown caption source.');
 | ||
|     }
 | ||
| }
 | ||
| 
 | ||
| /**
 | ||
|  * Generates a caption for an image using Extras API.
 | ||
|  * @param {string} base64Img Base64 encoded image without the data:image/...;base64, prefix
 | ||
|  * @returns {Promise<{caption: string}>} Generated caption
 | ||
|  */
 | ||
| async function captionExtras(base64Img) {
 | ||
|     if (!modules.includes('caption')) {
 | ||
|         throw new Error('No captioning module is available.');
 | ||
|     }
 | ||
| 
 | ||
|     const url = new URL(getApiUrl());
 | ||
|     url.pathname = '/api/caption';
 | ||
| 
 | ||
|     const apiResult = await doExtrasFetch(url, {
 | ||
|         method: 'POST',
 | ||
|         headers: {
 | ||
|             'Content-Type': 'application/json',
 | ||
|             'Bypass-Tunnel-Reminder': 'bypass',
 | ||
|         },
 | ||
|         body: JSON.stringify({ image: base64Img }),
 | ||
|     });
 | ||
| 
 | ||
|     if (!apiResult.ok) {
 | ||
|         throw new Error('Failed to caption image via Extras.');
 | ||
|     }
 | ||
| 
 | ||
|     const data = await apiResult.json();
 | ||
|     return data;
 | ||
| }
 | ||
| 
 | ||
| /**
 | ||
|  * Generates a caption for an image using a local model.
 | ||
|  * @param {string} base64Img Base64 encoded image without the data:image/...;base64, prefix
 | ||
|  * @returns {Promise<{caption: string}>} Generated caption
 | ||
|  */
 | ||
| async function captionLocal(base64Img) {
 | ||
|     const apiResult = await fetch('/api/extra/caption', {
 | ||
|         method: 'POST',
 | ||
|         headers: getRequestHeaders(),
 | ||
|         body: JSON.stringify({ image: base64Img }),
 | ||
|     });
 | ||
| 
 | ||
|     if (!apiResult.ok) {
 | ||
|         throw new Error('Failed to caption image via local pipeline.');
 | ||
|     }
 | ||
| 
 | ||
|     const data = await apiResult.json();
 | ||
|     return data;
 | ||
| }
 | ||
| 
 | ||
| /**
 | ||
|  * Generates a caption for an image using a Horde model.
 | ||
|  * @param {string} base64Img Base64 encoded image without the data:image/...;base64, prefix
 | ||
|  * @returns {Promise<{caption: string}>} Generated caption
 | ||
|  */
 | ||
| async function captionHorde(base64Img) {
 | ||
|     const apiResult = await fetch('/api/horde/caption-image', {
 | ||
|         method: 'POST',
 | ||
|         headers: getRequestHeaders(),
 | ||
|         body: JSON.stringify({ image: base64Img }),
 | ||
|     });
 | ||
| 
 | ||
|     if (!apiResult.ok) {
 | ||
|         throw new Error('Failed to caption image via Horde.');
 | ||
|     }
 | ||
| 
 | ||
|     const data = await apiResult.json();
 | ||
|     return data;
 | ||
| }
 | ||
| 
 | ||
| /**
 | ||
|  * Generates a caption for an image using a multimodal model.
 | ||
|  * @param {string} base64Img Base64 encoded image with the data:image/...;base64, prefix
 | ||
|  * @returns {Promise<{caption: string}>} Generated caption
 | ||
|  */
 | ||
| async function captionMultimodal(base64Img) {
 | ||
|     let prompt = extension_settings.caption.prompt || PROMPT_DEFAULT;
 | ||
| 
 | ||
|     if (extension_settings.caption.prompt_ask) {
 | ||
|         const customPrompt = await callPopup('<h3>Enter a comment or question:</h3>', 'input', prompt, { rows: 2 });
 | ||
|         if (!customPrompt) {
 | ||
|             throw new Error('User aborted the caption sending.');
 | ||
|         }
 | ||
|         prompt = String(customPrompt).trim();
 | ||
|     }
 | ||
| 
 | ||
|     const caption = await getMultimodalCaption(base64Img, prompt);
 | ||
|     return { caption };
 | ||
| }
 | ||
| 
 | ||
| async function onSelectImage(e) {
 | ||
|     setSpinnerIcon();
 | ||
|     const file = e.target.files[0];
 | ||
| 
 | ||
|     if (!file || !(file instanceof File)) {
 | ||
|         return;
 | ||
|     }
 | ||
| 
 | ||
|     try {
 | ||
|         const context = getContext();
 | ||
|         const fileData = await getBase64Async(file);
 | ||
|         const base64Format = fileData.split(',')[0].split(';')[0].split('/')[1];
 | ||
|         const base64Data = fileData.split(',')[1];
 | ||
|         const { caption } = await doCaptionRequest(base64Data, fileData);
 | ||
|         const imagePath = await saveBase64AsFile(base64Data, context.name2, '', base64Format);
 | ||
|         await sendCaptionedMessage(caption, imagePath);
 | ||
|     }
 | ||
|     catch (error) {
 | ||
|         toastr.error('Failed to caption image.');
 | ||
|         console.log(error);
 | ||
|     }
 | ||
|     finally {
 | ||
|         e.target.form.reset();
 | ||
|         setImageIcon();
 | ||
|     }
 | ||
| }
 | ||
| 
 | ||
| function onRefineModeInput() {
 | ||
|     extension_settings.caption.refine_mode = $('#caption_refine_mode').prop('checked');
 | ||
|     saveSettingsDebounced();
 | ||
| }
 | ||
| 
 | ||
| jQuery(function () {
 | ||
|     function addSendPictureButton() {
 | ||
|         const sendButton = $(`
 | ||
|         <div id="send_picture" class="list-group-item flex-container flexGap5">
 | ||
|             <div class="fa-solid fa-image extensionsMenuExtensionButton"></div>
 | ||
|             Generate Caption
 | ||
|         </div>`);
 | ||
|         const attachFileButton = $(`
 | ||
|         <div id="attachFile" class="list-group-item flex-container flexGap5">
 | ||
|             <div class="fa-solid fa-paperclip extensionsMenuExtensionButton"></div>
 | ||
|             Attach a File
 | ||
|         </div>`);
 | ||
| 
 | ||
|         $('#extensionsMenu').prepend(sendButton);
 | ||
|         $('#extensionsMenu').prepend(attachFileButton);
 | ||
|         $(sendButton).on('click', () => {
 | ||
|             const hasCaptionModule =
 | ||
|                 (modules.includes('caption') && extension_settings.caption.source === 'extras') ||
 | ||
|                 (extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'openai' && (secret_state[SECRET_KEYS.OPENAI] || extension_settings.caption.allow_reverse_proxy)) ||
 | ||
|                 (extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'openrouter' && secret_state[SECRET_KEYS.OPENROUTER]) ||
 | ||
|                 (extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'google' && secret_state[SECRET_KEYS.MAKERSUITE]) ||
 | ||
|                 (extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'ollama' && textgenerationwebui_settings.server_urls[textgen_types.OLLAMA]) ||
 | ||
|                 (extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'llamacpp' && textgenerationwebui_settings.server_urls[textgen_types.LLAMACPP]) ||
 | ||
|                 (extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'ooba' && textgenerationwebui_settings.server_urls[textgen_types.OOBA]) ||
 | ||
|                 (extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'custom') ||
 | ||
|                 extension_settings.caption.source === 'local' ||
 | ||
|                 extension_settings.caption.source === 'horde';
 | ||
| 
 | ||
|             if (!hasCaptionModule) {
 | ||
|                 toastr.error('Choose other captioning source in the extension settings.', 'Captioning is not available');
 | ||
|                 return;
 | ||
|             }
 | ||
| 
 | ||
|             $('#img_file').trigger('click');
 | ||
|         });
 | ||
|     }
 | ||
|     function addPictureSendForm() {
 | ||
|         const inputHtml = '<input id="img_file" type="file" hidden accept="image/*">';
 | ||
|         const imgForm = document.createElement('form');
 | ||
|         imgForm.id = 'img_form';
 | ||
|         $(imgForm).append(inputHtml);
 | ||
|         $(imgForm).hide();
 | ||
|         $('#form_sheld').append(imgForm);
 | ||
|         $('#img_file').on('change', onSelectImage);
 | ||
|     }
 | ||
|     function switchMultimodalBlocks() {
 | ||
|         const isMultimodal = extension_settings.caption.source === 'multimodal';
 | ||
|         $('#caption_multimodal_block').toggle(isMultimodal);
 | ||
|         $('#caption_prompt_block').toggle(isMultimodal);
 | ||
|         $('#caption_multimodal_api').val(extension_settings.caption.multimodal_api);
 | ||
|         $('#caption_multimodal_model').val(extension_settings.caption.multimodal_model);
 | ||
|         $('#caption_multimodal_block [data-type]').each(function () {
 | ||
|             const type = $(this).data('type');
 | ||
|             $(this).toggle(type === extension_settings.caption.multimodal_api);
 | ||
|         });
 | ||
|         $('#caption_multimodal_api').on('change', () => {
 | ||
|             const api = String($('#caption_multimodal_api').val());
 | ||
|             const model = String($(`#caption_multimodal_model option[data-type="${api}"]`).first().val());
 | ||
|             extension_settings.caption.multimodal_api = api;
 | ||
|             extension_settings.caption.multimodal_model = model;
 | ||
|             saveSettingsDebounced();
 | ||
|             switchMultimodalBlocks();
 | ||
|         });
 | ||
|         $('#caption_multimodal_model').on('change', () => {
 | ||
|             extension_settings.caption.multimodal_model = String($('#caption_multimodal_model').val());
 | ||
|             saveSettingsDebounced();
 | ||
|         });
 | ||
|     }
 | ||
|     function addSettings() {
 | ||
|         const html = `
 | ||
|         <div class="caption_settings">
 | ||
|             <div class="inline-drawer">
 | ||
|                 <div class="inline-drawer-toggle inline-drawer-header">
 | ||
|                     <b>Image Captioning</b>
 | ||
|                     <div class="inline-drawer-icon fa-solid fa-circle-chevron-down down"></div>
 | ||
|                 </div>
 | ||
|                 <div class="inline-drawer-content">
 | ||
|                     <label for="caption_source">Source</label>
 | ||
|                     <select id="caption_source" class="text_pole">
 | ||
|                         <option value="local">Local</option>
 | ||
|                         <option value="multimodal">Multimodal (OpenAI / llama / Google)</option>
 | ||
|                         <option value="extras">Extras</option>
 | ||
|                         <option value="horde">Horde</option>
 | ||
|                     </select>
 | ||
|                     <div id="caption_multimodal_block" class="flex-container wide100p">
 | ||
|                         <div class="flex1 flex-container flexFlowColumn flexNoGap">
 | ||
|                             <label for="caption_multimodal_api">API</label>
 | ||
|                             <select id="caption_multimodal_api" class="flex1 text_pole">
 | ||
|                                 <option value="llamacpp">llama.cpp</option>
 | ||
|                                 <option value="ooba">Text Generation WebUI (oobabooga)</option>
 | ||
|                                 <option value="ollama">Ollama</option>
 | ||
|                                 <option value="openai">OpenAI</option>
 | ||
|                                 <option value="openrouter">OpenRouter</option>
 | ||
|                                 <option value="google">Google MakerSuite</option>
 | ||
|                                 <option value="custom">Custom (OpenAI-compatible)</option>
 | ||
|                             </select>
 | ||
|                         </div>
 | ||
|                         <div class="flex1 flex-container flexFlowColumn flexNoGap">
 | ||
|                             <label for="caption_multimodal_model">Model</label>
 | ||
|                             <select id="caption_multimodal_model" class="flex1 text_pole">
 | ||
|                                 <option data-type="openai" value="gpt-4-vision-preview">gpt-4-vision-preview</option>
 | ||
|                                 <option data-type="google" value="gemini-pro-vision">gemini-pro-vision</option>
 | ||
|                                 <option data-type="openrouter" value="openai/gpt-4-vision-preview">openai/gpt-4-vision-preview</option>
 | ||
|                                 <option data-type="openrouter" value="haotian-liu/llava-13b">haotian-liu/llava-13b</option>
 | ||
|                                 <option data-type="ollama" value="ollama_current">[Currently selected]</option>
 | ||
|                                 <option data-type="ollama" value="bakllava:latest">bakllava:latest</option>
 | ||
|                                 <option data-type="ollama" value="llava:latest">llava:latest</option>
 | ||
|                                 <option data-type="llamacpp" value="llamacpp_current">[Currently loaded]</option>
 | ||
|                                 <option data-type="ooba" value="ooba_current">[Currently loaded]</option>
 | ||
|                                 <option data-type="custom" value="custom_current">[Currently selected]</option>
 | ||
|                             </select>
 | ||
|                         </div>
 | ||
|                         <label data-type="openai" class="checkbox_label flexBasis100p" for="caption_allow_reverse_proxy" title="Allow using reverse proxy if defined and valid.">
 | ||
|                             <input id="caption_allow_reverse_proxy" type="checkbox" class="checkbox">
 | ||
|                             Allow reverse proxy
 | ||
|                         </label>
 | ||
|                         <div class="flexBasis100p m-b-1">
 | ||
|                             <small><b>Hint:</b> Set your API keys and endpoints in the 'API Connections' tab first.</small>
 | ||
|                         </div>
 | ||
|                     </div>
 | ||
|                     <div id="caption_prompt_block">
 | ||
|                         <label for="caption_prompt">Caption Prompt</label>
 | ||
|                         <textarea id="caption_prompt" class="text_pole" rows="1" placeholder="< Use default >">${PROMPT_DEFAULT}</textarea>
 | ||
|                         <label class="checkbox_label margin-bot-10px" for="caption_prompt_ask" title="Ask for a custom prompt every time an image is captioned.">
 | ||
|                             <input id="caption_prompt_ask" type="checkbox" class="checkbox">
 | ||
|                             Ask every time
 | ||
|                         </label>
 | ||
|                     </div>
 | ||
|                     <label for="caption_template">Message Template <small>(use <code>{{caption}}</code> macro)</small></label>
 | ||
|                     <textarea id="caption_template" class="text_pole" rows="2" placeholder="< Use default >">${TEMPLATE_DEFAULT}</textarea>
 | ||
|                     <label class="checkbox_label margin-bot-10px" for="caption_refine_mode">
 | ||
|                         <input id="caption_refine_mode" type="checkbox" class="checkbox">
 | ||
|                         Edit captions before saving
 | ||
|                     </label>
 | ||
|                 </div>
 | ||
|             </div>
 | ||
|         </div>
 | ||
|         `;
 | ||
|         $('#extensions_settings2').append(html);
 | ||
|     }
 | ||
| 
 | ||
|     addSettings();
 | ||
|     addPictureSendForm();
 | ||
|     addSendPictureButton();
 | ||
|     setImageIcon();
 | ||
|     migrateSettings();
 | ||
|     switchMultimodalBlocks();
 | ||
| 
 | ||
|     $('#caption_refine_mode').prop('checked', !!(extension_settings.caption.refine_mode));
 | ||
|     $('#caption_allow_reverse_proxy').prop('checked', !!(extension_settings.caption.allow_reverse_proxy));
 | ||
|     $('#caption_prompt_ask').prop('checked', !!(extension_settings.caption.prompt_ask));
 | ||
|     $('#caption_source').val(extension_settings.caption.source);
 | ||
|     $('#caption_prompt').val(extension_settings.caption.prompt);
 | ||
|     $('#caption_template').val(extension_settings.caption.template);
 | ||
|     $('#caption_refine_mode').on('input', onRefineModeInput);
 | ||
|     $('#caption_source').on('change', () => {
 | ||
|         extension_settings.caption.source = String($('#caption_source').val());
 | ||
|         switchMultimodalBlocks();
 | ||
|         saveSettingsDebounced();
 | ||
|     });
 | ||
|     $('#caption_prompt').on('input', () => {
 | ||
|         extension_settings.caption.prompt = String($('#caption_prompt').val());
 | ||
|         saveSettingsDebounced();
 | ||
|     });
 | ||
|     $('#caption_template').on('input', () => {
 | ||
|         extension_settings.caption.template = String($('#caption_template').val());
 | ||
|         saveSettingsDebounced();
 | ||
|     });
 | ||
|     $('#caption_allow_reverse_proxy').on('input', () => {
 | ||
|         extension_settings.caption.allow_reverse_proxy = $('#caption_allow_reverse_proxy').prop('checked');
 | ||
|         saveSettingsDebounced();
 | ||
|     });
 | ||
|     $('#caption_prompt_ask').on('input', () => {
 | ||
|         extension_settings.caption.prompt_ask = $('#caption_prompt_ask').prop('checked');
 | ||
|         saveSettingsDebounced();
 | ||
|     });
 | ||
| });
 |