From 2c4f53e7b5a79c824f49a26e99ebab331419e1a3 Mon Sep 17 00:00:00 2001
From: Cohee <18619528+Cohee1207@users.noreply.github.com>
Date: Sun, 12 Nov 2023 00:09:48 +0200
Subject: [PATCH] Add native GPT-4V image inlining

---
 public/css/st-tailwind.css                 |   8 ++
 public/css/toggle-dependent.css            |   8 ++
 public/index.html                          |  13 +++
 public/script.js                           |   2 +-
 public/scripts/extensions/caption/index.js |  91 ++++++++++++++-
 public/scripts/openai.js                   | 124 ++++++++++++++++++---
 6 files changed, 229 insertions(+), 17 deletions(-)
diff --git a/public/css/st-tailwind.css b/public/css/st-tailwind.css
index 701d03890..d86d53f49 100644
--- a/public/css/st-tailwind.css
+++ b/public/css/st-tailwind.css
@@ -229,6 +229,10 @@
     display: flex;
 }
 
+.flexBasis100p {
+    flex-basis: 100%;
+}
+
 .flexBasis50p {
     flex-basis: 50%
 }
@@ -263,6 +267,10 @@
     flex-shrink: 1
 }
 
+.flexWrap {
+    flex-wrap: wrap;
+}
+
 .flexnowrap {
     flex-wrap: nowrap;
 }
diff --git a/public/css/toggle-dependent.css b/public/css/toggle-dependent.css
index 4bf01922e..692ebd233 100644
--- a/public/css/toggle-dependent.css
+++ b/public/css/toggle-dependent.css
@@ -358,3 +358,11 @@ body.expandMessageActions .mes .mes_buttons .extraMesButtons {
 body.expandMessageActions .mes .mes_buttons .extraMesButtonsHint {
     display: none !important;
 }
+
+#openai_image_inlining:not(:checked) ~ #image_inlining_hint {
+    display: none;
+}
+
+#openai_image_inlining:checked ~ #image_inlining_hint {
+    display: block;
+}
diff --git a/public/index.html b/public/index.html
index 25fec85c5..3188f7e12 100644
--- a/public/index.html
+++ b/public/index.html
@@ -1723,6 +1723,7 @@
                                     </optgroup>
                                     <optgroup label="GPT-4">
                                         <option value="gpt-4">gpt-4</option>
+                                        <option value="gpt-4-vision-preview">gpt-4-vision-preview</option>
                                         <option value="gpt-4-1106-preview">gpt-4-1106-preview</option>
                                         <option value="gpt-4-0613">gpt-4-0613</option>
                                         <option value="gpt-4-0314">gpt-4-0314</option>
@@ -1745,6 +1746,17 @@
                                     <input id="openai_show_external_models" type="checkbox" />
                                     <span data-i18n="Show External models (provided by API)">Show "External" models (provided by API)</span>
                                 </label>
+                                <label for="openai_image_inlining" class="checkbox_label flexWrap">
+                                    <input id="openai_image_inlining" type="checkbox" />
+                                    <span data-i18n="Send inline images">Send inline images (only GPT-4V model)</span>
+                                    <div id="image_inlining_hint" class="flexBasis100p">
+                                        <small>
+                                            Natively replaces captioning if the model supports it.<br>
+                                            Use the <code><i class="fa-solid fa-image"></i></code> action on any message or the
+                                            <code><i class="fa-solid fa-wand-magic-sparkles"></i></code> menu to attach an image to the chat.
+                                        </small>
+                                    </div>
+                                </label>
                             </div>
                         </form>
                         <form id="claude_form" data-source="claude" action="javascript:void(null);" method="post" enctype="multipart/form-data">
@@ -4057,6 +4069,7 @@
                             <div title="Prompt" class="mes_prompt fa-solid fa-square-poll-horizontal " data-i18n="[title]Prompt"></div>
                             <div title="Exclude message from prompts" class="mes_hide fa-solid fa-eye" data-i18n="[title]Exclude message from prompts"></div>
                             <div title="Include message in prompts" class="mes_unhide fa-solid fa-eye-slash" data-i18n="[title]Include message in prompts"></div>
+                            <div title="Embed image" class="mes_embed fa-solid fa-image" data-i18n="[title]Embed image"></div>
                             <div title="Create bookmark" class="mes_create_bookmark fa-regular fa-solid fa-book-bookmark" data-i18n="[title]Create Bookmark"></div>
                             <div title="Create branch" class="mes_create_branch fa-regular fa-code-branch" data-i18n="[title]Create Branch"></div>
                             <div title="Copy" class="mes_copy fa-solid fa-copy " data-i18n="[title]Copy"></div>
diff --git a/public/script.js b/public/script.js
index 4d7e5d0d3..914613071 100644
--- a/public/script.js
+++ b/public/script.js
@@ -3409,7 +3409,7 @@ async function Generate(type, { automatic_trigger, force_name2, resolve, reject,
                 generate_data = getNovelGenerationData(finalPrompt, presetSettings, maxLength, isImpersonate, cfgValues);
             }
             else if (main_api == 'openai') {
-                let [prompt, counts] = prepareOpenAIMessages({
+                let [prompt, counts] = await prepareOpenAIMessages({
                     name2: name2,
                     charDescription: description,
                     charPersonality: personality,
diff --git a/public/scripts/extensions/caption/index.js b/public/scripts/extensions/caption/index.js
index 05f70dff4..4d83d1419 100644
--- a/public/scripts/extensions/caption/index.js
+++ b/public/scripts/extensions/caption/index.js
@@ -1,8 +1,9 @@
 import { getBase64Async, saveBase64AsFile } from "../../utils.js";
 import { getContext, getApiUrl, doExtrasFetch, extension_settings, modules } from "../../extensions.js";
-import { callPopup, getRequestHeaders, saveSettingsDebounced, substituteParams } from "../../../script.js";
+import { appendImageToMessage, callPopup, getRequestHeaders, saveSettingsDebounced, substituteParams } from "../../../script.js";
 import { getMessageTimeStamp } from "../../RossAscends-mods.js";
 import { SECRET_KEYS, secret_state } from "../../secrets.js";
+import { isImageInliningSupported } from "../../openai.js";
 export { MODULE_NAME };
 
 const MODULE_NAME = 'caption';
@@ -223,6 +224,83 @@ function onRefineModeInput() {
     saveSettingsDebounced();
 }
 
+/**
+ * Posts the picked image as a user message (image stored in message.extra) and starts a generation.
+ * @param {Event} e Change event of the file input the image was picked with
+ */
+async function sendEmbeddedImage(e) {
+    const file = e.target.files[0];
+    if (!(file instanceof File)) {
+        return;
+    }
+    try {
+        const context = getContext();
+        const fileData = await getBase64Async(file);
+        const [header, base64Data] = fileData.split(','); // assumes a "data:image/<format>;base64,<payload>" URL
+        const base64Format = header.split(';')[0].split('/')[1];
+        const caption = await callPopup('<h3>Enter a comment or question (optional)</h3>', 'input', 'What is this?', { okButton: 'Send', rows: 2 });
+        const imagePath = await saveBase64AsFile(base64Data, context.name2, '', base64Format);
+        const message = {
+            name: context.name1,
+            is_user: true,
+            send_date: getMessageTimeStamp(),
+            mes: caption || `[${context.name1} sends ${context.name2} a picture]`,
+            extra: {
+                image: imagePath,
+                inline_image: !!caption,
+                title: caption || '',
+            },
+        };
+        context.chat.push(message);
+        context.addOneMessage(message);
+        await context.generate('caption');
+    } catch (error) {
+        console.error('Failed to send embedded image', error);
+    } finally {
+        e.target.form.reset();
+        setImageIcon();
+    }
+}
+
+/** Click handler of ".mes_embed": attaches a picked image to the clicked message and saves the chat. */
+function onImageEmbedClicked() {
+    const context = getContext();
+    const messageElement = $(this).closest('.mes');
+    const messageId = messageElement.attr('mesid');
+    const message = context.chat[messageId];
+    if (!message) {
+        console.warn('Failed to find message with id', messageId);
+        return;
+    }
+
+    // Rebind on every click so the handler closes over the message that was clicked
+    $('#embed_img_file').off('change').on('change', parseAndUploadEmbed).trigger('click');
+
+    async function parseAndUploadEmbed(e) {
+        const file = e.target.files[0];
+        if (!(file instanceof File)) {
+            return;
+        }
+        try {
+            const fileData = await getBase64Async(file);
+            const [header, base64Data] = fileData.split(',');
+            const base64Format = header.split(';')[0].split('/')[1];
+            const imagePath = await saveBase64AsFile(base64Data, context.name2, '', base64Format);
+            message.extra = message.extra || {};
+            message.extra.image = imagePath;
+            message.extra.inline_image = true;
+            message.extra.title = '';
+            appendImageToMessage(message, messageElement);
+            await context.saveChat();
+        } catch (error) {
+            console.error('Failed to embed image', error);
+        } finally {
+            // Clear the input, otherwise picking the same file again fires no change event
+            e.target.form.reset();
+        }
+    }
+}
+
 jQuery(function () {
     function addSendPictureButton() {
         const sendButton = $(`
@@ -234,6 +312,12 @@ jQuery(function () {
         $('#extensionsMenu').prepend(sendButton);
         $(sendButton).hide();
         $(sendButton).on('click', () => {
+            if (isImageInliningSupported()) {
+                console.log('Native image inlining is supported. Skipping captioning.');
+                $('#embed_img_file').off('change').on('change', sendEmbeddedImage).trigger('click');
+                return;
+            }
+
             const hasCaptionModule =
                 (modules.includes('caption') && extension_settings.caption.source === 'extras') ||
                 (extension_settings.caption.source === 'openai' && secret_state[SECRET_KEYS.OPENAI]) ||
@@ -249,10 +333,12 @@ jQuery(function () {
         });
     }
     function addPictureSendForm() {
-        const inputHtml = `<input id="img_file" type="file" accept="image/*">`;
+        const inputHtml = `<input id="img_file" type="file" hidden accept="image/*">`;
+        const embedInputHtml = `<input id="embed_img_file" type="file" hidden accept="image/*">`;
         const imgForm = document.createElement('form');
         imgForm.id = 'img_form';
         $(imgForm).append(inputHtml);
+        $(imgForm).append(embedInputHtml);
         $(imgForm).hide();
         $('#form_sheld').append(imgForm);
         $('#img_file').on('change', onSelectImage);
@@ -312,5 +398,6 @@ jQuery(function () {
         extension_settings.caption.template = String($('#caption_template').val());
         saveSettingsDebounced();
     });
+    $(document).on('click', '.mes_embed', onImageEmbedClicked);
     setInterval(moduleWorker, UPDATE_INTERVAL);
 });
diff --git a/public/scripts/openai.js b/public/scripts/openai.js
index ec5e712bf..c98860888 100644
--- a/public/scripts/openai.js
+++ b/public/scripts/openai.js
@@ -54,7 +54,9 @@ import {
 import {
     delay,
     download,
+    getBase64Async,
     getFileText, getSortableDelay,
+    isDataURL,
     parseJsonFile,
     resetScrollHeight,
     stringFormat,
@@ -70,7 +72,6 @@ export {
     setOpenAIMessages,
     setOpenAIMessageExamples,
     setupChatCompletionPromptManager,
-    prepareOpenAIMessages,
     sendOpenAIRequest,
     getChatCompletionModel,
     TokenHandler,
@@ -221,6 +222,7 @@ const default_settings = {
     exclude_assistant: false,
     use_alt_scale: false,
     squash_system_messages: false,
+    image_inlining: false,
 };
 
 const oai_settings = {
@@ -267,6 +269,7 @@ const oai_settings = {
     exclude_assistant: false,
     use_alt_scale: false,
     squash_system_messages: false,
+    image_inlining: false,
 };
 
 let openai_setting_names;
@@ -409,7 +412,8 @@ function setOpenAIMessages(chat) {
         // Apply the "wrap in quotes" option
         if (role == 'user' && oai_settings.wrap_in_quotes) content = `"${content}"`;
         const name = chat[j]['name'];
-        openai_msgs[i] = { "role": role, "content": content, name: name };
+        const image = chat[j]?.extra?.image;
+        openai_msgs[i] = { "role": role, "content": content, name: name, "image": image };
         j++;
     }
 }
@@ -592,7 +596,7 @@ export function isOpenRouterWithInstruct() {
  * @param type
  * @param cyclePrompt
  */
-function populateChatHistory(prompts, chatCompletion, type = null, cyclePrompt = null) {
+async function populateChatHistory(prompts, chatCompletion, type = null, cyclePrompt = null) {
     chatCompletion.add(new MessageCollection('chatHistory'), prompts.index('chatHistory'));
 
     let names = (selected_group && groups.find(x => x.id === selected_group)?.members.map(member => characters.find(c => c.avatar === member)?.name).filter(Boolean).join(', ')) || '';
@@ -629,8 +633,13 @@ function populateChatHistory(prompts, chatCompletion, type = null, cyclePrompt =
         chatCompletion.insert(message, 'chatHistory');
     }
 
+    const imageInlining = isImageInliningSupported();
+
     // Insert chat messages as long as there is budget available
-    [...openai_msgs].reverse().every((chatPrompt, index) => {
+    const chatPool = [...openai_msgs].reverse();
+    for (let index = 0; index < chatPool.length; index++) {
+        const chatPrompt = chatPool[index];
+
         // We do not want to mutate the prompt
         const prompt = new Prompt(chatPrompt);
         prompt.identifier = `chatHistory-${openai_msgs.length - index}`;
@@ -641,10 +650,16 @@ function populateChatHistory(prompts, chatCompletion, type = null, cyclePrompt =
             chatMessage.setName(messageName);
         }
 
-        if (chatCompletion.canAfford(chatMessage)) chatCompletion.insertAtStart(chatMessage, 'chatHistory');
-        else return false;
-        return true;
-    });
+        if (imageInlining && chatPrompt.image) {
+            await chatMessage.addImage(chatPrompt.image);
+        }
+
+        if (chatCompletion.canAfford(chatMessage)) {
+            chatCompletion.insertAtStart(chatMessage, 'chatHistory');
+        } else {
+            break;
+        }
+    }
 
     // Insert and free new chat
     chatCompletion.freeBudget(newChatMessage);
@@ -724,7 +739,7 @@ function getPromptPosition(position) {
  * @param {string} options.quietPrompt - Instruction prompt for extras
  * @param {string} options.type - The type of the chat, can be 'impersonate'.
  */
-function populateChatCompletion(prompts, chatCompletion, { bias, quietPrompt, type, cyclePrompt } = {}) {
+async function populateChatCompletion(prompts, chatCompletion, { bias, quietPrompt, type, cyclePrompt } = {}) {
     // Helper function for preparing a prompt, that already exists within the prompt collection, for completion
     const addToChatCompletion = (source, target = null) => {
         // We need the prompts array to determine a position for the source.
@@ -825,9 +840,9 @@ function populateChatCompletion(prompts, chatCompletion, { bias, quietPrompt, ty
     // Decide whether dialogue examples should always be added
     if (power_user.pin_examples) {
         populateDialogueExamples(prompts, chatCompletion);
-        populateChatHistory(prompts, chatCompletion, type, cyclePrompt);
+        await populateChatHistory(prompts, chatCompletion, type, cyclePrompt);
     } else {
-        populateChatHistory(prompts, chatCompletion, type, cyclePrompt);
+        await populateChatHistory(prompts, chatCompletion, type, cyclePrompt);
         populateDialogueExamples(prompts, chatCompletion);
     }
 
@@ -969,7 +984,7 @@ function preparePromptsForChatCompletion({ Scenario, charPersonality, name2, wor
  * @param dryRun - Whether this is a live call or not.
  * @returns {(*[]|boolean)[]} An array where the first element is the prepared chat and the second element is a boolean flag.
  */
-function prepareOpenAIMessages({
+export async function prepareOpenAIMessages({
     name2,
     charDescription,
     charPersonality,
@@ -1012,7 +1027,7 @@ function prepareOpenAIMessages({
         });
 
         // Fill the chat completion with as much context as the budget allows
-        populateChatCompletion(prompts, chatCompletion, { bias, quietPrompt, type, cyclePrompt });
+        await populateChatCompletion(prompts, chatCompletion, { bias, quietPrompt, type, cyclePrompt });
     } catch (error) {
         if (error instanceof TokenBudgetExceededError) {
             toastr.error('An error occurred while counting tokens: Token budget exceeded.')
@@ -1372,6 +1387,16 @@ async function sendOpenAIRequest(type, openai_msgs_tosend, signal) {
         "stop": getCustomStoppingStrings(openai_max_stop_strings),
     };
 
+    // Empty array will produce a validation error
+    if (!Array.isArray(generate_data.stop) || !generate_data.stop.length) {
+        delete generate_data.stop;
+    }
+
+    // Vision models don't support logit bias
+    if (isImageInliningSupported()) {
+        delete generate_data.logit_bias;
+    }
+
     // Proxy is only supported for Claude and OpenAI
     if (oai_settings.reverse_proxy && [chat_completion_sources.CLAUDE, chat_completion_sources.OPENAI].includes(oai_settings.chat_completion_source)) {
         validateReverseProxy();
@@ -1640,7 +1665,18 @@ class InvalidCharacterNameError extends Error {
  * Used for creating, managing, and interacting with a specific message object.
  */
 class Message {
-    tokens; identifier; role; content; name;
+    static tokensPerImage = 85;
+
+    /** @type {number} */
+    tokens;
+    /** @type {string} */
+    identifier;
+    /** @type {string} */
+    role;
+    /** @type {string|any[]} */
+    content;
+    /** @type {string} */
+    name;
 
     /**
      * @constructor
@@ -1665,6 +1701,30 @@ class Message {
         this.tokens = tokenHandler.count({ role: this.role, content: this.content, name: this.name });
     }
 
+    /**
+     * Turns the content into a [text part, image part] list (low detail) and adds the per-image token cost.
+     * A source that is not a data URL is fetched and passed through getBase64Async; on failure nothing changes.
+     * @param {string} image Data URL, or a URL the image can be fetched from
+     */
+    async addImage(image) {
+        const text = this.content;
+        let url = image;
+        if (!isDataURL(url)) {
+            try {
+                const response = await fetch(url, { method: 'GET', cache: 'force-cache' });
+                if (!response.ok) throw new Error('Failed to fetch image');
+                url = await getBase64Async(await response.blob());
+            } catch (error) {
+                console.error('Image adding skipped', error);
+                return;
+            }
+        }
+
+        const imagePart = { type: "image_url", image_url: { "url": url, "detail": "low" } };
+        this.content = [{ type: "text", text: text }, imagePart];
+        this.tokens += Message.tokensPerImage;
+    }
+
     /**
      * Create a new Message instance from a prompt.
      * @static
@@ -2148,6 +2208,7 @@ function loadOpenAISettings(data, settings) {
     oai_settings.show_external_models = settings.show_external_models ?? default_settings.show_external_models;
     oai_settings.proxy_password = settings.proxy_password ?? default_settings.proxy_password;
     oai_settings.assistant_prefill = settings.assistant_prefill ?? default_settings.assistant_prefill;
+    oai_settings.image_inlining = settings.image_inlining ?? default_settings.image_inlining;
 
     oai_settings.prompts = settings.prompts ?? default_settings.prompts;
     oai_settings.prompt_order = settings.prompt_order ?? default_settings.prompt_order;
@@ -2168,6 +2229,7 @@ function loadOpenAISettings(data, settings) {
     $('#api_url_scale').val(oai_settings.api_url_scale);
     $('#openai_proxy_password').val(oai_settings.proxy_password);
     $('#claude_assistant_prefill').val(oai_settings.assistant_prefill);
+    $('#openai_image_inlining').prop('checked', oai_settings.image_inlining);
 
     $('#model_openai_select').val(oai_settings.openai_model);
     $(`#model_openai_select option[value="${oai_settings.openai_model}"`).attr('selected', true);
@@ -2388,6 +2450,7 @@ async function saveOpenAIPreset(name, settings, triggerUi = true) {
         exclude_assistant: settings.exclude_assistant,
         use_alt_scale: settings.use_alt_scale,
         squash_system_messages: settings.squash_system_messages,
+        image_inlining: settings.image_inlining,
     };
 
     const savePresetSettings = await fetch(`/api/presets/save-openai?name=${name}`, {
@@ -2741,6 +2804,7 @@ function onSettingsPresetChange() {
         exclude_assistant: ['#exclude_assistant', 'exclude_assistant', true],
         use_alt_scale: ['#use_alt_scale', 'use_alt_scale', true],
         squash_system_messages: ['#squash_system_messages', 'squash_system_messages', true],
+        image_inlining: ['#openai_image_inlining', 'image_inlining', true],
     };
 
     const presetName = $('#settings_preset_openai').find(":selected").text();
@@ -2785,6 +2849,9 @@ function getMaxContextOpenAI(value) {
     else if (value.includes('gpt-4-1106')) {
         return max_128k;
     }
+    else if (value.includes('gpt-4-vision')) {
+        return max_128k;
+    }
     else if (value.includes('gpt-3.5-turbo-1106')) {
         return max_16k;
     }
@@ -2831,6 +2898,9 @@ function getMaxContextWindowAI(value) {
     else if (value.includes('gpt-4-1106')) {
         return max_128k;
     }
+    else if (value.includes('gpt-4-vision')) {
+        return max_128k;
+    }
     else if (value.includes('gpt-4-32k')) {
         return max_32k;
     }
@@ -3217,6 +3287,27 @@ function updateScaleForm() {
     }
 }
 
+/**
+ * Tells whether images can be sent inline with the prompt: the "image_inlining" setting must be
+ * on and the model id of the active source (OpenAI or OpenRouter) must contain "gpt-4-vision".
+ * @returns {boolean} True if the model supports image inlining
+ */
+export function isImageInliningSupported() {
+    const visionModelId = 'gpt-4-vision';
+    const { image_inlining, chat_completion_source: source } = oai_settings;
+    if (!image_inlining) {
+        return false;
+    }
+    if (source === chat_completion_sources.OPENAI) {
+        return oai_settings.openai_model.includes(visionModelId);
+    }
+    if (source === chat_completion_sources.OPENROUTER) {
+        return oai_settings.openrouter_model.includes(visionModelId);
+    }
+
+    return false;
+}
+
 $(document).ready(async function () {
     $('#test_api_button').on('click', testApiConnection);
 
@@ -3463,6 +3554,11 @@ $(document).ready(async function () {
         saveSettingsDebounced();
     });
 
+    $('#openai_image_inlining').on('input', function () {
+        oai_settings.image_inlining = !!$(this).prop('checked');
+        saveSettingsDebounced();
+    });
+
     $(document).on('input', '#openai_settings .autoSetHeight', function () {
         resetScrollHeight($(this));
     });