mirror of
				https://github.com/SillyTavern/SillyTavern.git
				synced 2025-06-05 21:59:27 +02:00 
			
		
		
		
	Add OpenRouter and Llava to captioning plugin.
This commit is contained in:
		| @@ -4,19 +4,18 @@ import { appendImageToMessage, callPopup, getRequestHeaders, saveSettingsDebounc | ||||
| import { getMessageTimeStamp } from "../../RossAscends-mods.js"; | ||||
| import { SECRET_KEYS, secret_state } from "../../secrets.js"; | ||||
| import { isImageInliningSupported } from "../../openai.js"; | ||||
| import { getMultimodalCaption } from "../shared.js"; | ||||
| export { MODULE_NAME }; | ||||
|  | ||||
| const MODULE_NAME = 'caption'; | ||||
| const UPDATE_INTERVAL = 1000; | ||||
|  | ||||
| const PROMPT_DEFAULT = 'What’s in this image?'; | ||||
| const TEMPLATE_DEFAULT = '[{{user}} sends {{char}} a picture that contains: {{caption}}]'; | ||||
|  | ||||
/**
 * Worker that keeps the send-picture button's visibility in sync with the connection state:
 * the button is shown unless the context reports the 'no_connection' online status.
 */
async function moduleWorker() {
    const { onlineStatus } = getContext();
    const isConnected = onlineStatus !== 'no_connection';
    $('#send_picture').toggle(isConnected);
}
|  | ||||
| /** | ||||
|  * Migrates old extension settings to the new format. | ||||
|  * Must keep this function for compatibility with old settings. | ||||
|  */ | ||||
| function migrateSettings() { | ||||
|     if (extension_settings.caption.local !== undefined) { | ||||
|         extension_settings.caption.source = extension_settings.caption.local ? 'local' : 'extras'; | ||||
| @@ -28,6 +27,20 @@ function migrateSettings() { | ||||
|         extension_settings.caption.source = 'extras'; | ||||
|     } | ||||
|  | ||||
|     if (extension_settings.caption.source === 'openai') { | ||||
|         extension_settings.caption.source = 'multimodal'; | ||||
|         extension_settings.caption.multimodal_api = 'openai'; | ||||
|         extension_settings.caption.multimodal_model = 'gpt-4-vision-preview'; | ||||
|     } | ||||
|  | ||||
|     if (!extension_settings.caption.multimodal_api) { | ||||
|         extension_settings.caption.multimodal_api = 'openai'; | ||||
|     } | ||||
|  | ||||
|     if (!extension_settings.caption.multimodal_model) { | ||||
|         extension_settings.caption.multimodal_model = 'gpt-4-vision-preview'; | ||||
|     } | ||||
|  | ||||
|     if (!extension_settings.caption.prompt) { | ||||
|         extension_settings.caption.prompt = PROMPT_DEFAULT; | ||||
|     } | ||||
| @@ -37,6 +50,9 @@ function migrateSettings() { | ||||
|     } | ||||
| } | ||||
|  | ||||
| /** | ||||
|  * Sets an image icon for the send button. | ||||
|  */ | ||||
| async function setImageIcon() { | ||||
|     try { | ||||
|         const sendButton = $('#send_picture .extensionsMenuExtensionButton'); | ||||
| @@ -48,6 +64,9 @@ async function setImageIcon() { | ||||
|     } | ||||
| } | ||||
|  | ||||
| /** | ||||
|  * Sets a spinner icon for the send button. | ||||
|  */ | ||||
| async function setSpinnerIcon() { | ||||
|     try { | ||||
|         const sendButton = $('#send_picture .extensionsMenuExtensionButton'); | ||||
| @@ -59,6 +78,11 @@ async function setSpinnerIcon() { | ||||
|     } | ||||
| } | ||||
|  | ||||
| /** | ||||
|  * Sends a captioned message to the chat. | ||||
|  * @param {string} caption Caption text | ||||
|  * @param {string} image Image URL | ||||
|  */ | ||||
| async function sendCaptionedMessage(caption, image) { | ||||
|     const context = getContext(); | ||||
|     let template = extension_settings.caption.template || TEMPLATE_DEFAULT; | ||||
| @@ -98,10 +122,10 @@ async function sendCaptionedMessage(caption, image) { | ||||
| } | ||||
|  | ||||
| /** | ||||
|  * | ||||
|  * Generates a caption for an image using a selected source. | ||||
|  * @param {string} base64Img Base64 encoded image without the data:image/...;base64, prefix | ||||
|  * @param {string} fileData Base64 encoded image with the data:image/...;base64, prefix | ||||
|  * @returns | ||||
|  * @returns {Promise<{caption: string}>} Generated caption | ||||
|  */ | ||||
| async function doCaptionRequest(base64Img, fileData) { | ||||
|     switch (extension_settings.caption.source) { | ||||
| @@ -111,13 +135,18 @@ async function doCaptionRequest(base64Img, fileData) { | ||||
|             return await captionExtras(base64Img); | ||||
|         case 'horde': | ||||
|             return await captionHorde(base64Img); | ||||
|         case 'openai': | ||||
|             return await captionOpenAI(fileData); | ||||
|         case 'multimodal': | ||||
|             return await captionMultimodal(fileData); | ||||
|         default: | ||||
|             throw new Error('Unknown caption source.'); | ||||
|     } | ||||
| } | ||||
|  | ||||
| /** | ||||
|  * Generates a caption for an image using Extras API. | ||||
|  * @param {string} base64Img Base64 encoded image without the data:image/...;base64, prefix | ||||
|  * @returns {Promise<{caption: string}>} Generated caption | ||||
|  */ | ||||
| async function captionExtras(base64Img) { | ||||
|     if (!modules.includes('caption')) { | ||||
|         throw new Error('No captioning module is available.'); | ||||
| @@ -143,6 +172,11 @@ async function captionExtras(base64Img) { | ||||
|     return data; | ||||
| } | ||||
|  | ||||
| /** | ||||
|  * Generates a caption for an image using a local model. | ||||
|  * @param {string} base64Img Base64 encoded image without the data:image/...;base64, prefix | ||||
|  * @returns {Promise<{caption: string}>} Generated caption | ||||
|  */ | ||||
| async function captionLocal(base64Img) { | ||||
|     const apiResult = await fetch('/api/extra/caption', { | ||||
|         method: 'POST', | ||||
| @@ -158,6 +192,11 @@ async function captionLocal(base64Img) { | ||||
|     return data; | ||||
| } | ||||
|  | ||||
| /** | ||||
|  * Generates a caption for an image using a Horde model. | ||||
|  * @param {string} base64Img Base64 encoded image without the data:image/...;base64, prefix | ||||
|  * @returns {Promise<{caption: string}>} Generated caption | ||||
|  */ | ||||
| async function captionHorde(base64Img) { | ||||
|     const apiResult = await fetch('/api/horde/caption-image', { | ||||
|         method: 'POST', | ||||
| @@ -173,20 +212,15 @@ async function captionHorde(base64Img) { | ||||
|     return data; | ||||
| } | ||||
|  | ||||
/**
 * Generates a caption for an image using the OpenAI captioning endpoint.
 * Kept as a separate, well-formed function because the 'openai' caption source still dispatches to it.
 * @param {string} base64Img Base64 encoded image with the data:image/...;base64, prefix
 * @returns {Promise<object>} Parsed JSON response of the endpoint; callers read its `caption` property
 * @throws {Error} If the endpoint responds with a non-OK status
 */
async function captionOpenAI(base64Img) {
    const prompt = extension_settings.caption.prompt || PROMPT_DEFAULT;
    const apiResult = await fetch('/api/openai/caption-image', {
        method: 'POST',
        headers: getRequestHeaders(),
        body: JSON.stringify({ image: base64Img, prompt }),
    });

    if (!apiResult.ok) {
        throw new Error('Failed to caption image via OpenAI.');
    }

    return await apiResult.json();
}

/**
 * Generates a caption for an image using a multimodal model.
 * Uses the prompt from the extension settings, falling back to the default prompt when it is empty.
 * @param {string} base64Img Base64 encoded image with the data:image/...;base64, prefix
 * @returns {Promise<{caption: string}>} Generated caption
 */
async function captionMultimodal(base64Img) {
    const prompt = extension_settings.caption.prompt || PROMPT_DEFAULT;
    const caption = await getMultimodalCaption(base64Img, prompt);
    return { caption };
}
|  | ||||
| async function onSelectImage(e) { | ||||
| @@ -202,11 +236,8 @@ async function onSelectImage(e) { | ||||
|         const fileData = await getBase64Async(file); | ||||
|         const base64Format = fileData.split(',')[0].split(';')[0].split('/')[1]; | ||||
|         const base64Data = fileData.split(',')[1]; | ||||
|         const data = await doCaptionRequest(base64Data, fileData); | ||||
|         const caption = data.caption; | ||||
|         const imageToSave = data.thumbnail ? data.thumbnail : base64Data; | ||||
|         const format = data.thumbnail ? 'jpeg' : base64Format; | ||||
|         const imagePath = await saveBase64AsFile(imageToSave, context.name2, '', format); | ||||
|         const { caption } = await doCaptionRequest(base64Data, fileData); | ||||
|         const imagePath = await saveBase64AsFile(base64Data, context.name2, '', base64Format); | ||||
|         await sendCaptionedMessage(caption, imagePath); | ||||
|     } | ||||
|     catch (error) { | ||||
| @@ -310,7 +341,6 @@ jQuery(function () { | ||||
|         </div>`); | ||||
|  | ||||
|         $('#extensionsMenu').prepend(sendButton); | ||||
|         $(sendButton).hide(); | ||||
|         $(sendButton).on('click', () => { | ||||
|             if (isImageInliningSupported()) { | ||||
|                 console.log('Native image inlining is supported. Skipping captioning.'); | ||||
| @@ -320,12 +350,13 @@ jQuery(function () { | ||||
|  | ||||
|             const hasCaptionModule = | ||||
|                 (modules.includes('caption') && extension_settings.caption.source === 'extras') || | ||||
|                 (extension_settings.caption.source === 'openai' && secret_state[SECRET_KEYS.OPENAI]) || | ||||
|                 (extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'openai' && secret_state[SECRET_KEYS.OPENAI]) || | ||||
|                 (extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'openrouter' && secret_state[SECRET_KEYS.OPENROUTER]) || | ||||
|                 extension_settings.caption.source === 'local' || | ||||
|                 extension_settings.caption.source === 'horde'; | ||||
|  | ||||
|             if (!hasCaptionModule) { | ||||
|                 toastr.error('No captioning module is available. Choose other captioning source in the extension settings.'); | ||||
|                 toastr.error('Choose other captioning source in the extension settings.', 'Captioning is not available'); | ||||
|                 return; | ||||
|             } | ||||
|  | ||||
| @@ -343,6 +374,29 @@ jQuery(function () { | ||||
|         $('#form_sheld').append(imgForm); | ||||
|         $('#img_file').on('change', onSelectImage); | ||||
|     } | ||||
|     function switchMultimodalBlocks() { | ||||
|         const isMultimodal = extension_settings.caption.source === 'multimodal'; | ||||
|         $('#caption_multimodal_block').toggle(isMultimodal); | ||||
|         $('#caption_prompt_block').toggle(isMultimodal); | ||||
|         $('#caption_multimodal_api').val(extension_settings.caption.multimodal_api); | ||||
|         $('#caption_multimodal_model').val(extension_settings.caption.multimodal_model); | ||||
|         $('#caption_multimodal_model option').each(function () { | ||||
|             const type = $(this).data('type'); | ||||
|             $(this).toggle(type === extension_settings.caption.multimodal_api); | ||||
|         }); | ||||
|         $('#caption_multimodal_api').on('change', () => { | ||||
|             const api = String($('#caption_multimodal_api').val()); | ||||
|             const model = String($(`#caption_multimodal_model option[data-type="${api}"]`).first().val()); | ||||
|             extension_settings.caption.multimodal_api = api; | ||||
|             extension_settings.caption.multimodal_model = model; | ||||
|             saveSettingsDebounced(); | ||||
|             switchMultimodalBlocks(); | ||||
|         }); | ||||
|         $('#caption_multimodal_model').on('change', () => { | ||||
|             extension_settings.caption.multimodal_model = String($('#caption_multimodal_model').val()); | ||||
|             saveSettingsDebounced(); | ||||
|         }); | ||||
|     } | ||||
|     function addSettings() { | ||||
|         const html = ` | ||||
|         <div class="caption_settings"> | ||||
| @@ -355,12 +409,31 @@ jQuery(function () { | ||||
|                     <label for="caption_source">Source:</label> | ||||
|                     <select id="caption_source" class="text_pole"> | ||||
|                         <option value="local">Local</option> | ||||
|                         <option value="multimodal">Multimodal (OpenAI / OpenRouter)</option> | ||||
|                         <option value="extras">Extras</option> | ||||
|                         <option value="horde">Horde</option> | ||||
|                         <option value="openai">OpenAI</option> | ||||
|                     </select> | ||||
|                     <label for="caption_prompt">Caption Prompt (OpenAI):</label> | ||||
|                     <textarea id="caption_prompt" class="text_pole" rows="1" placeholder="< Use default >">${PROMPT_DEFAULT}</textarea> | ||||
|                     <div id="caption_multimodal_block" class="flex-container wide100p"> | ||||
|                         <div class="flex1 flex-container flexFlowColumn flexNoGap"> | ||||
|                             <label for="caption_multimodal_api">API</label> | ||||
|                             <select id="caption_multimodal_api" class="flex1 text_pole"> | ||||
|                                 <option value="openai">OpenAI</option> | ||||
|                                 <option value="openrouter">OpenRouter</option> | ||||
|                             </select> | ||||
|                         </div> | ||||
|                         <div class="flex1 flex-container flexFlowColumn flexNoGap"> | ||||
|                             <label for="caption_multimodal_model">Model</label> | ||||
|                             <select id="caption_multimodal_model" class="flex1 text_pole"> | ||||
|                                 <option data-type="openai" value="gpt-4-vision-preview">gpt-4-vision-preview</option> | ||||
|                                 <option data-type="openrouter" value="openai/gpt-4-vision-preview">openai/gpt-4-vision-preview</option> | ||||
|                                 <option data-type="openrouter" value="haotian-liu/llava-13b">haotian-liu/llava-13b</option> | ||||
|                             </select> | ||||
|                         </div> | ||||
|                     </div> | ||||
|                     <div id="caption_prompt_block"> | ||||
|                         <label for="caption_prompt">Caption Prompt (Multimodal):</label> | ||||
|                         <textarea id="caption_prompt" class="text_pole" rows="1" placeholder="< Use default >">${PROMPT_DEFAULT}</textarea> | ||||
|                     </div> | ||||
|                     <label for="caption_template">Message Template: <small>(use <tt>{{caption}}</tt> macro)</small></label> | ||||
|                     <textarea id="caption_template" class="text_pole" rows="2" placeholder="< Use default >">${TEMPLATE_DEFAULT}</textarea> | ||||
|                     <label class="checkbox_label margin-bot-10px" for="caption_refine_mode"> | ||||
| @@ -379,7 +452,7 @@ jQuery(function () { | ||||
|     addSendPictureButton(); | ||||
|     setImageIcon(); | ||||
|     migrateSettings(); | ||||
|     moduleWorker(); | ||||
|     switchMultimodalBlocks(); | ||||
|  | ||||
|     $('#caption_refine_mode').prop('checked', !!(extension_settings.caption.refine_mode)); | ||||
|     $('#caption_source').val(extension_settings.caption.source); | ||||
| @@ -388,6 +461,7 @@ jQuery(function () { | ||||
|     $('#caption_refine_mode').on('input', onRefineModeInput); | ||||
|     $('#caption_source').on('change', () => { | ||||
|         extension_settings.caption.source = String($('#caption_source').val()); | ||||
|         switchMultimodalBlocks(); | ||||
|         saveSettingsDebounced(); | ||||
|     }); | ||||
|     $('#caption_prompt').on('input', () => { | ||||
| @@ -399,5 +473,4 @@ jQuery(function () { | ||||
|         saveSettingsDebounced(); | ||||
|     }); | ||||
|     $(document).on('click', '.mes_embed', onImageEmbedClicked); | ||||
|     setInterval(moduleWorker, UPDATE_INTERVAL); | ||||
| }); | ||||
|   | ||||
		Reference in New Issue
	
	Block a user