Added Google vision caption support

This commit is contained in:
based
2023-12-14 22:37:53 +10:00
parent ca87f29771
commit 0b7c1a98cd
5 changed files with 94 additions and 16 deletions

View File

@@ -134,7 +134,7 @@ async function doCaptionRequest(base64Img, fileData) {
case 'horde':
return await captionHorde(base64Img);
case 'multimodal':
return await captionMultimodal(fileData);
return await captionMultimodal(extension_settings.caption.multimodal_api === 'google' ? base64Img : fileData);
default:
throw new Error('Unknown caption source.');
}
@@ -273,6 +273,7 @@ jQuery(function () {
(modules.includes('caption') && extension_settings.caption.source === 'extras') ||
(extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'openai' && secret_state[SECRET_KEYS.OPENAI]) ||
(extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'openrouter' && secret_state[SECRET_KEYS.OPENROUTER]) ||
(extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'google' && secret_state[SECRET_KEYS.MAKERSUITE]) ||
extension_settings.caption.source === 'local' ||
extension_settings.caption.source === 'horde';
@@ -328,7 +329,7 @@ jQuery(function () {
<label for="caption_source">Source</label>
<select id="caption_source" class="text_pole">
<option value="local">Local</option>
<option value="multimodal">Multimodal (OpenAI / OpenRouter)</option>
<option value="multimodal">Multimodal (OpenAI / OpenRouter / Google)</option>
<option value="extras">Extras</option>
<option value="horde">Horde</option>
</select>
@@ -338,12 +339,14 @@ jQuery(function () {
<select id="caption_multimodal_api" class="flex1 text_pole">
<option value="openai">OpenAI</option>
<option value="openrouter">OpenRouter</option>
<option value="google">Google</option>
</select>
</div>
<div class="flex1 flex-container flexFlowColumn flexNoGap">
<label for="caption_multimodal_model">Model</label>
<select id="caption_multimodal_model" class="flex1 text_pole">
<option data-type="openai" value="gpt-4-vision-preview">gpt-4-vision-preview</option>
<option data-type="google" value="gemini-pro-vision">gemini-pro-vision</option>
<option data-type="openrouter" value="openai/gpt-4-vision-preview">openai/gpt-4-vision-preview</option>
<option data-type="openrouter" value="haotian-liu/llava-13b">haotian-liu/llava-13b</option>
</select>

View File

@@ -1,7 +1,7 @@
import { getRequestHeaders } from '../../script.js';
import { extension_settings } from '../extensions.js';
import { SECRET_KEYS, secret_state } from '../secrets.js';
import { createThumbnail } from '../utils.js';
import {getRequestHeaders} from '../../script.js';
import {extension_settings} from '../extensions.js';
import {SECRET_KEYS, secret_state} from '../secrets.js';
import {createThumbnail} from '../utils.js';
/**
* Generates a caption for an image using a multimodal model.
@@ -18,6 +18,10 @@ export async function getMultimodalCaption(base64Img, prompt) {
throw new Error('OpenRouter API key is not set.');
}
if (extension_settings.caption.multimodal_api === 'google' && !secret_state[SECRET_KEYS.MAKERSUITE]) {
throw new Error('MakerSuite API key is not set.');
}
// OpenRouter has a payload limit of ~2MB
const base64Bytes = base64Img.length * 0.75;
const compressionLimit = 2 * 1024 * 1024;
@@ -26,16 +30,25 @@ export async function getMultimodalCaption(base64Img, prompt) {
base64Img = await createThumbnail(base64Img, maxSide, maxSide, 'image/jpeg');
}
const apiResult = await fetch('/api/openai/caption-image', {
method: 'POST',
headers: getRequestHeaders(),
body: JSON.stringify({
image: base64Img,
prompt: prompt,
api: extension_settings.caption.multimodal_api || 'openai',
model: extension_settings.caption.multimodal_model || 'gpt-4-vision-preview',
}),
});
const apiResult = extension_settings.caption.multimodal_api === 'google' ?
await fetch('/api/google/caption-image', {
method: 'POST',
headers: getRequestHeaders(),
body: JSON.stringify({
image: base64Img,
prompt: prompt,
}),
})
: await fetch('/api/openai/caption-image', {
method: 'POST',
headers: getRequestHeaders(),
body: JSON.stringify({
image: base64Img,
prompt: prompt,
api: extension_settings.caption.multimodal_api || 'openai',
model: extension_settings.caption.multimodal_model || 'gpt-4-vision-preview',
}),
});
if (!apiResult.ok) {
throw new Error('Failed to caption image via OpenAI.');