From 8bb964515a2daf75c2d93c2e45a5dee909dd29f6 Mon Sep 17 00:00:00 2001
From: Cohee <18619528+Cohee1207@users.noreply.github.com>
Date: Sun, 8 Sep 2024 10:48:28 +0300
Subject: [PATCH 1/2] Fix Gemini multimodal with JPG images Fixes #2763

---
 public/scripts/extensions/shared.js |  7 +------
 public/scripts/openai.js            | 29 ++++++++++++++++++++++++++---
 src/endpoints/google.js             |  4 ++--
 src/prompt-converters.js            |  6 ++++--
 4 files changed, 33 insertions(+), 13 deletions(-)
diff --git a/public/scripts/extensions/shared.js b/public/scripts/extensions/shared.js
index 70dbca631..9fb1f980b 100644
--- a/public/scripts/extensions/shared.js
+++ b/public/scripts/extensions/shared.js
@@ -20,7 +20,7 @@ export async function getMultimodalCaption(base64Img, prompt) {
 
     throwIfInvalidModel(useReverseProxy);
 
-    const noPrefix = ['google', 'ollama', 'llamacpp'].includes(extension_settings.caption.multimodal_api);
+    const noPrefix = ['ollama', 'llamacpp'].includes(extension_settings.caption.multimodal_api);
 
     if (noPrefix && base64Img.startsWith('data:image/')) {
         base64Img = base64Img.split(',')[1];
@@ -28,7 +28,6 @@ export async function getMultimodalCaption(base64Img, prompt) {
 
     // OpenRouter has a payload limit of ~2MB. Google is 4MB, but we love democracy.
     // Ooba requires all images to be JPEGs. Koboldcpp just asked nicely.
-    const isGoogle = extension_settings.caption.multimodal_api === 'google';
     const isOllama = extension_settings.caption.multimodal_api === 'ollama';
     const isLlamaCpp = extension_settings.caption.multimodal_api === 'llamacpp';
     const isCustom = extension_settings.caption.multimodal_api === 'custom';
@@ -40,10 +39,6 @@ export async function getMultimodalCaption(base64Img, prompt) {
     if ((['google', 'openrouter'].includes(extension_settings.caption.multimodal_api) && base64Bytes > compressionLimit) || isOoba || isKoboldCpp) {
         const maxSide = 1024;
         base64Img = await createThumbnail(base64Img, maxSide, maxSide, 'image/jpeg');
-
-        if (isGoogle) {
-            base64Img = base64Img.split(',')[1];
-        }
     }
 
     const proxyUrl = useReverseProxy ? oai_settings.reverse_proxy : '';
diff --git a/public/scripts/openai.js b/public/scripts/openai.js
index e0590c4ab..8d4372e40 100644
--- a/public/scripts/openai.js
+++ b/public/scripts/openai.js
@@ -47,6 +47,7 @@ import { SECRET_KEYS, secret_state, writeSecret } from './secrets.js';
 
 import { getEventSourceStream } from './sse-stream.js';
 import {
+    createThumbnail,
     delay,
     download,
     getBase64Async,
@@ -2440,15 +2441,14 @@ class Message {
                 if (!response.ok) throw new Error('Failed to fetch image');
                 const blob = await response.blob();
                 image = await getBase64Async(blob);
-                if (oai_settings.chat_completion_source === chat_completion_sources.MAKERSUITE) {
-                    image = image.split(',')[1];
-                }
             } catch (error) {
                 console.error('Image adding skipped', error);
                 return;
             }
         }
 
+        image = await this.compressImage(image);
+
         const quality = oai_settings.inline_image_quality || default_settings.inline_image_quality;
         this.content = [
             { type: 'text', text: textContent },
@@ -2464,6 +2464,29 @@ class Message {
         }
     }
 
+    /**
+     * Compress an image if it exceeds the size threshold for the current chat completion source.
+     * @param {string} image Data URL of the image.
+     * @returns {Promise<string>} Compressed image as a Data URL.
+     */
+    async compressImage(image) {
+        if ([chat_completion_sources.OPENROUTER, chat_completion_sources.MAKERSUITE].includes(oai_settings.chat_completion_source)) {
+            const sizeThreshold = 2 * 1024 * 1024;
+            const dataSize = image.length * 0.75;
+            const maxSide = 1024;
+            if (dataSize > sizeThreshold) {
+                image = await createThumbnail(image, maxSide);
+            }
+        }
+        return image;
+    }
+
+    /**
+     * Get the token cost of an image.
+     * @param {string} dataUrl Data URL of the image.
+     * @param {string} quality String representing the quality of the image. Can be 'low', 'auto', or 'high'.
+     * @returns
+     */
     async getImageTokenCost(dataUrl, quality) {
         if (quality === 'low') {
             return Message.tokensPerImage;
diff --git a/src/endpoints/google.js b/src/endpoints/google.js
index 1eb479586..21aa6d82a 100644
--- a/src/endpoints/google.js
+++ b/src/endpoints/google.js
@@ -22,8 +22,8 @@ router.post('/caption-image', jsonParser, async (request, response) => {
                     { text: request.body.prompt },
                     {
                         inlineData: {
-                            mimeType: 'image/png', // It needs to specify a MIME type in data if it's not a PNG
-                            data: mimeType === 'image/png' ? base64Data : request.body.image,
+                            mimeType: mimeType,
+                            data: base64Data,
                         },
                     }],
             }],
diff --git a/src/prompt-converters.js b/src/prompt-converters.js
index 2b87e8b93..fd687b5d1 100644
--- a/src/prompt-converters.js
+++ b/src/prompt-converters.js
@@ -335,10 +335,12 @@ function convertGooglePrompt(messages, model, useSysPrompt = false, charName = '
                 if (part.type === 'text') {
                     parts.push({ text: part.text });
                 } else if (part.type === 'image_url' && isMultimodal) {
+                    const mimeType = part.image_url.url.split(';')[0].split(':')[1];
+                    const base64Data = part.image_url.url.split(',')[1];
                     parts.push({
                         inlineData: {
-                            mimeType: 'image/png',
-                            data: part.image_url.url,
+                            mimeType: mimeType,
+                            data: base64Data,
                         },
                     });
                     hasImage = true;

From be145c0bfff00eb8fd3e2836506265dbc1458863 Mon Sep 17 00:00:00 2001
From: Cohee <18619528+Cohee1207@users.noreply.github.com>
Date: Sun, 8 Sep 2024 17:24:42 +0300
Subject: [PATCH 2/2] Add missing return type

---
 public/scripts/openai.js | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/public/scripts/openai.js b/public/scripts/openai.js
index 8d4372e40..09bb23131 100644
--- a/public/scripts/openai.js
+++ b/public/scripts/openai.js
@@ -2485,7 +2485,7 @@ class Message {
      * Get the token cost of an image.
      * @param {string} dataUrl Data URL of the image.
      * @param {string} quality String representing the quality of the image. Can be 'low', 'auto', or 'high'.
-     * @returns
+     * @returns {Promise<number>} The token cost of the image.
      */
     async getImageTokenCost(dataUrl, quality) {
         if (quality === 'low') {