From d2817678672d0c3f68c0a7c9f34cfc6ed600e7db Mon Sep 17 00:00:00 2001
From: Cohee <18619528+Cohee1207@users.noreply.github.com>
Date: Thu, 18 Apr 2024 16:22:33 +0300
Subject: [PATCH] Add /caption command

---
 public/scripts/extensions/caption/index.js | 70 +++++++++++++++++-----
 1 file changed, 56 insertions(+), 14 deletions(-)

diff --git a/public/scripts/extensions/caption/index.js b/public/scripts/extensions/caption/index.js
index 8534fd0f6..5f49d0a31 100644
--- a/public/scripts/extensions/caption/index.js
+++ b/public/scripts/extensions/caption/index.js
@@ -1,10 +1,11 @@
-import { getBase64Async, saveBase64AsFile } from '../../utils.js';
+import { getBase64Async, isTrueBoolean, saveBase64AsFile } from '../../utils.js';
 import { getContext, getApiUrl, doExtrasFetch, extension_settings, modules } from '../../extensions.js';
 import { callPopup, getRequestHeaders, saveSettingsDebounced, substituteParams } from '../../../script.js';
 import { getMessageTimeStamp } from '../../RossAscends-mods.js';
 import { SECRET_KEYS, secret_state } from '../../secrets.js';
 import { getMultimodalCaption } from '../shared.js';
 import { textgen_types, textgenerationwebui_settings } from '../../textgen-settings.js';
+import { registerSlashCommand } from '../../slash-commands.js';
 export { MODULE_NAME };
 
 const MODULE_NAME = 'caption';
@@ -124,9 +125,10 @@ async function sendCaptionedMessage(caption, image) {
  * Generates a caption for an image using a selected source.
  * @param {string} base64Img Base64 encoded image without the data:image/...;base64, prefix
  * @param {string} fileData Base64 encoded image with the data:image/...;base64, prefix
+ * @param {string} externalPrompt Caption prompt; forwarded only to the multimodal source
  * @returns {Promise<{caption: string}>} Generated caption
  */
-async function doCaptionRequest(base64Img, fileData) {
+async function doCaptionRequest(base64Img, fileData, externalPrompt) {
     switch (extension_settings.caption.source) {
         case 'local':
             return await captionLocal(base64Img);
@@ -135,7 +137,7 @@ async function doCaptionRequest(base64Img, fileData) {
         case 'horde':
             return await captionHorde(base64Img);
         case 'multimodal':
-            return await captionMultimodal(fileData);
+            return await captionMultimodal(fileData, externalPrompt);
         default:
             throw new Error('Unknown caption source.');
     }
@@ -214,12 +216,13 @@ async function captionHorde(base64Img) {
 /**
  * Generates a caption for an image using a multimodal model.
  * @param {string} base64Img Base64 encoded image with the data:image/...;base64, prefix
+ * @param {string} externalPrompt Caption prompt; when non-empty it overrides the configured prompt and skips the prompt popup
  * @returns {Promise<{caption: string}>} Generated caption
  */
-async function captionMultimodal(base64Img) {
-    let prompt = extension_settings.caption.prompt || PROMPT_DEFAULT;
+async function captionMultimodal(base64Img, externalPrompt) {
+    let prompt = externalPrompt || extension_settings.caption.prompt || PROMPT_DEFAULT;
 
-    if (extension_settings.caption.prompt_ask) {
+    if (!externalPrompt && extension_settings.caption.prompt_ask) {
         const customPrompt = await callPopup('<h3>Enter a comment or question:</h3>', 'input', prompt, { rows: 2 });
         if (!customPrompt) {
             throw new Error('User aborted the caption sending.');
@@ -231,29 +234,46 @@ async function captionMultimodal(base64Img) {
     return { caption };
 }
 
-async function onSelectImage(e) {
-    setSpinnerIcon();
+/**
+ * Handles the image selection event: captions the selected file and, unless quiet, saves it and calls sendCaptionedMessage.
+ * @param {Event} e Input event; ignored unless its target is an HTMLInputElement
+ * @param {string} prompt Caption prompt, forwarded to doCaptionRequest
+ * @param {boolean} quiet Suppresses saving the image and sending a captioned message
+ * @returns {Promise<string>} Generated caption, or an empty string if no file was selected or captioning failed
+ */
+async function onSelectImage(e, prompt, quiet) {
+    if (!(e.target instanceof HTMLInputElement)) {
+        return '';
+    }
+
     const file = e.target.files[0];
+    const form = e.target.form; // may be absent (e.g. the detached input built by the /caption command), hence the guards below
 
     if (!file || !(file instanceof File)) {
-        return;
+        form && form.reset();
+        return '';
     }
 
     try {
+        setSpinnerIcon();
         const context = getContext();
         const fileData = await getBase64Async(file);
         const base64Format = fileData.split(',')[0].split(';')[0].split('/')[1];
         const base64Data = fileData.split(',')[1];
-        const { caption } = await doCaptionRequest(base64Data, fileData);
-        const imagePath = await saveBase64AsFile(base64Data, context.name2, '', base64Format);
-        await sendCaptionedMessage(caption, imagePath);
+        const { caption } = await doCaptionRequest(base64Data, fileData, prompt);
+        if (!quiet) {
+            const imagePath = await saveBase64AsFile(base64Data, context.name2, '', base64Format);
+            await sendCaptionedMessage(caption, imagePath);
+        }
+        return caption;
     }
     catch (error) {
         toastr.error('Failed to caption image.');
         console.log(error);
+        return '';
     }
     finally {
-        e.target.form.reset();
+        form && form.reset();
         setImageIcon();
     }
 }
@@ -263,6 +283,26 @@ function onRefineModeInput() {
     saveSettingsDebounced();
 }
 
+/**
+ * Callback for the /caption command. Opens an image file picker and returns a Promise that resolves with the caption string.
+ * @param {object} args Named parameters; `quiet` is parsed with isTrueBoolean and forwarded to onSelectImage
+ * @param {string} prompt Caption prompt, forwarded to onSelectImage
+ */
+function captionCommandCallback(args, prompt) {
+    return new Promise(resolve => {
+        const quiet = isTrueBoolean(args?.quiet);
+        const input = document.createElement('input'); // detached picker; this function never adds it to the document
+        input.type = 'file';
+        input.accept = 'image/*';
+        input.onchange = async (e) => {
+            const caption = await onSelectImage(e, prompt, quiet);
+            resolve(caption);
+        };
+        input.oncancel = () => resolve(''); // picker dismissed without a selection: resolve with an empty caption
+        input.click();
+    });
+}
+
 jQuery(function () {
     function addSendPictureButton() {
         const sendButton = $(`
@@ -308,7 +348,7 @@ jQuery(function () {
         $(imgForm).append(inputHtml);
         $(imgForm).hide();
         $('#form_sheld').append(imgForm);
-        $('#img_file').on('change', onSelectImage);
+        $('#img_file').on('change', (e) => onSelectImage(e.originalEvent, '', false));
     }
     function switchMultimodalBlocks() {
         const isMultimodal = extension_settings.caption.source === 'multimodal';
@@ -457,4 +497,6 @@ jQuery(function () {
         extension_settings.caption.prompt_ask = $('#caption_prompt_ask').prop('checked');
         saveSettingsDebounced();
     });
+
+    registerSlashCommand('caption', captionCommandCallback, [], '<span class="monospace">quiet=true/false [prompt]</span> - caption an image with an optional prompt and passes the caption down the pipe. Only multimodal sources support custom prompts. Set the "quiet" argument to true to suppress sending a captioned message, default: false.', true, true);
 });