mirror of
https://github.com/SillyTavern/SillyTavern.git
synced 2025-02-05 05:39:36 +01:00
added google vision caption support
This commit is contained in:
parent
ca87f29771
commit
0b7c1a98cd
@ -134,7 +134,7 @@ async function doCaptionRequest(base64Img, fileData) {
|
|||||||
case 'horde':
|
case 'horde':
|
||||||
return await captionHorde(base64Img);
|
return await captionHorde(base64Img);
|
||||||
case 'multimodal':
|
case 'multimodal':
|
||||||
return await captionMultimodal(fileData);
|
return await captionMultimodal(extension_settings.caption.multimodal_api === 'google' ? base64Img : fileData);
|
||||||
default:
|
default:
|
||||||
throw new Error('Unknown caption source.');
|
throw new Error('Unknown caption source.');
|
||||||
}
|
}
|
||||||
@ -273,6 +273,7 @@ jQuery(function () {
|
|||||||
(modules.includes('caption') && extension_settings.caption.source === 'extras') ||
|
(modules.includes('caption') && extension_settings.caption.source === 'extras') ||
|
||||||
(extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'openai' && secret_state[SECRET_KEYS.OPENAI]) ||
|
(extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'openai' && secret_state[SECRET_KEYS.OPENAI]) ||
|
||||||
(extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'openrouter' && secret_state[SECRET_KEYS.OPENROUTER]) ||
|
(extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'openrouter' && secret_state[SECRET_KEYS.OPENROUTER]) ||
|
||||||
|
(extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'google' && secret_state[SECRET_KEYS.MAKERSUITE]) ||
|
||||||
extension_settings.caption.source === 'local' ||
|
extension_settings.caption.source === 'local' ||
|
||||||
extension_settings.caption.source === 'horde';
|
extension_settings.caption.source === 'horde';
|
||||||
|
|
||||||
@ -328,7 +329,7 @@ jQuery(function () {
|
|||||||
<label for="caption_source">Source</label>
|
<label for="caption_source">Source</label>
|
||||||
<select id="caption_source" class="text_pole">
|
<select id="caption_source" class="text_pole">
|
||||||
<option value="local">Local</option>
|
<option value="local">Local</option>
|
||||||
<option value="multimodal">Multimodal (OpenAI / OpenRouter)</option>
|
<option value="multimodal">Multimodal (OpenAI / OpenRouter / Google)</option>
|
||||||
<option value="extras">Extras</option>
|
<option value="extras">Extras</option>
|
||||||
<option value="horde">Horde</option>
|
<option value="horde">Horde</option>
|
||||||
</select>
|
</select>
|
||||||
@ -338,12 +339,14 @@ jQuery(function () {
|
|||||||
<select id="caption_multimodal_api" class="flex1 text_pole">
|
<select id="caption_multimodal_api" class="flex1 text_pole">
|
||||||
<option value="openai">OpenAI</option>
|
<option value="openai">OpenAI</option>
|
||||||
<option value="openrouter">OpenRouter</option>
|
<option value="openrouter">OpenRouter</option>
|
||||||
|
<option value="google">Google</option>
|
||||||
</select>
|
</select>
|
||||||
</div>
|
</div>
|
||||||
<div class="flex1 flex-container flexFlowColumn flexNoGap">
|
<div class="flex1 flex-container flexFlowColumn flexNoGap">
|
||||||
<label for="caption_multimodal_model">Model</label>
|
<label for="caption_multimodal_model">Model</label>
|
||||||
<select id="caption_multimodal_model" class="flex1 text_pole">
|
<select id="caption_multimodal_model" class="flex1 text_pole">
|
||||||
<option data-type="openai" value="gpt-4-vision-preview">gpt-4-vision-preview</option>
|
<option data-type="openai" value="gpt-4-vision-preview">gpt-4-vision-preview</option>
|
||||||
|
<option data-type="google" value="gemini-pro-vision">gemini-pro-vision</option>
|
||||||
<option data-type="openrouter" value="openai/gpt-4-vision-preview">openai/gpt-4-vision-preview</option>
|
<option data-type="openrouter" value="openai/gpt-4-vision-preview">openai/gpt-4-vision-preview</option>
|
||||||
<option data-type="openrouter" value="haotian-liu/llava-13b">haotian-liu/llava-13b</option>
|
<option data-type="openrouter" value="haotian-liu/llava-13b">haotian-liu/llava-13b</option>
|
||||||
</select>
|
</select>
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
import { getRequestHeaders } from '../../script.js';
|
import {getRequestHeaders} from '../../script.js';
|
||||||
import { extension_settings } from '../extensions.js';
|
import {extension_settings} from '../extensions.js';
|
||||||
import { SECRET_KEYS, secret_state } from '../secrets.js';
|
import {SECRET_KEYS, secret_state} from '../secrets.js';
|
||||||
import { createThumbnail } from '../utils.js';
|
import {createThumbnail} from '../utils.js';
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Generates a caption for an image using a multimodal model.
|
* Generates a caption for an image using a multimodal model.
|
||||||
@ -18,6 +18,10 @@ export async function getMultimodalCaption(base64Img, prompt) {
|
|||||||
throw new Error('OpenRouter API key is not set.');
|
throw new Error('OpenRouter API key is not set.');
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (extension_settings.caption.multimodal_api === 'google' && !secret_state[SECRET_KEYS.MAKERSUITE]) {
|
||||||
|
throw new Error('MakerSuite API key is not set.');
|
||||||
|
}
|
||||||
|
|
||||||
// OpenRouter has a payload limit of ~2MB
|
// OpenRouter has a payload limit of ~2MB
|
||||||
const base64Bytes = base64Img.length * 0.75;
|
const base64Bytes = base64Img.length * 0.75;
|
||||||
const compressionLimit = 2 * 1024 * 1024;
|
const compressionLimit = 2 * 1024 * 1024;
|
||||||
@ -26,16 +30,25 @@ export async function getMultimodalCaption(base64Img, prompt) {
|
|||||||
base64Img = await createThumbnail(base64Img, maxSide, maxSide, 'image/jpeg');
|
base64Img = await createThumbnail(base64Img, maxSide, maxSide, 'image/jpeg');
|
||||||
}
|
}
|
||||||
|
|
||||||
const apiResult = await fetch('/api/openai/caption-image', {
|
const apiResult = extension_settings.caption.multimodal_api === 'google' ?
|
||||||
method: 'POST',
|
await fetch('/api/google/caption-image', {
|
||||||
headers: getRequestHeaders(),
|
method: 'POST',
|
||||||
body: JSON.stringify({
|
headers: getRequestHeaders(),
|
||||||
image: base64Img,
|
body: JSON.stringify({
|
||||||
prompt: prompt,
|
image: base64Img,
|
||||||
api: extension_settings.caption.multimodal_api || 'openai',
|
prompt: prompt,
|
||||||
model: extension_settings.caption.multimodal_model || 'gpt-4-vision-preview',
|
}),
|
||||||
}),
|
})
|
||||||
});
|
: await fetch('/api/openai/caption-image', {
|
||||||
|
method: 'POST',
|
||||||
|
headers: getRequestHeaders(),
|
||||||
|
body: JSON.stringify({
|
||||||
|
image: base64Img,
|
||||||
|
prompt: prompt,
|
||||||
|
api: extension_settings.caption.multimodal_api || 'openai',
|
||||||
|
model: extension_settings.caption.multimodal_model || 'gpt-4-vision-preview',
|
||||||
|
}),
|
||||||
|
});
|
||||||
|
|
||||||
if (!apiResult.ok) {
|
if (!apiResult.ok) {
|
||||||
throw new Error('Failed to caption image via OpenAI.');
|
throw new Error('Failed to caption image via OpenAI.');
|
||||||
|
@ -3424,6 +3424,7 @@ export function isImageInliningSupported() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const gpt4v = 'gpt-4-vision';
|
const gpt4v = 'gpt-4-vision';
|
||||||
|
const geminiProV = 'gemini-pro-vision';
|
||||||
const llava13b = 'llava-13b';
|
const llava13b = 'llava-13b';
|
||||||
|
|
||||||
if (!oai_settings.image_inlining) {
|
if (!oai_settings.image_inlining) {
|
||||||
@ -3433,6 +3434,8 @@ export function isImageInliningSupported() {
|
|||||||
switch (oai_settings.chat_completion_source) {
|
switch (oai_settings.chat_completion_source) {
|
||||||
case chat_completion_sources.OPENAI:
|
case chat_completion_sources.OPENAI:
|
||||||
return oai_settings.openai_model.includes(gpt4v);
|
return oai_settings.openai_model.includes(gpt4v);
|
||||||
|
case chat_completion_sources.MAKERSUITE:
|
||||||
|
return oai_settings.openai_model.includes(geminiProV);
|
||||||
case chat_completion_sources.OPENROUTER:
|
case chat_completion_sources.OPENROUTER:
|
||||||
return oai_settings.openrouter_model.includes(gpt4v) || oai_settings.openrouter_model.includes(llava13b);
|
return oai_settings.openrouter_model.includes(gpt4v) || oai_settings.openrouter_model.includes(llava13b);
|
||||||
default:
|
default:
|
||||||
|
@ -1412,6 +1412,9 @@ redirect('/downloadbackground', '/api/backgrounds/upload'); // yes, the download
|
|||||||
// OpenAI API
|
// OpenAI API
|
||||||
app.use('/api/openai', require('./src/endpoints/openai').router);
|
app.use('/api/openai', require('./src/endpoints/openai').router);
|
||||||
|
|
||||||
|
// Google API
|
||||||
|
app.use('/api/google', require('./src/endpoints/google').router);
|
||||||
|
|
||||||
// Tokenizers
|
// Tokenizers
|
||||||
app.use('/api/tokenizers', require('./src/endpoints/tokenizers').router);
|
app.use('/api/tokenizers', require('./src/endpoints/tokenizers').router);
|
||||||
|
|
||||||
|
56
src/endpoints/google.js
Normal file
56
src/endpoints/google.js
Normal file
@ -0,0 +1,56 @@
|
|||||||
|
const { readSecret, SECRET_KEYS } = require('./secrets');
|
||||||
|
const fetch = require('node-fetch').default;
|
||||||
|
const express = require('express');
|
||||||
|
const { jsonParser } = require('../express-common');
|
||||||
|
const { MAKERSUITE_SAFETY } = require('../constants');
|
||||||
|
|
||||||
|
const router = express.Router();
|
||||||
|
|
||||||
|
/**
 * Splits an image payload into its MIME type and raw base64 data.
 * Accepts either a bare base64 string or a `data:<mime>;base64,<data>` URL.
 * @param {string} image Image payload received from the client.
 * @returns {{ mimeType: string, data: string }} MIME type (falls back to 'image/png'
 * when the payload carries no usable type) and the base64 data without any data URL prefix.
 */
function parseImagePayload(image) {
    const fallbackMimeType = 'image/png';
    const scheme = 'data:';
    const marker = ';base64,';

    if (image.startsWith(scheme)) {
        const markerIndex = image.indexOf(marker);
        if (markerIndex !== -1) {
            // Drop optional parameters such as ";charset=..." that may follow the MIME type.
            const mimeType = image.slice(scheme.length, markerIndex).split(';')[0];
            return {
                mimeType: mimeType || fallbackMimeType,
                data: image.slice(markerIndex + marker.length),
            };
        }
    }

    // Bare base64: keep the previous behavior of labelling it as PNG.
    return { mimeType: fallbackMimeType, data: image };
}

/**
 * POST /caption-image
 * Forwards a prompt and a base64-encoded image to the gemini-pro-vision
 * `generateContent` endpoint and replies with `{ caption }`.
 *
 * Request body: `{ prompt: string, image: string }` where `image` is either bare
 * base64 or a base64 data URL.
 * Responses:
 *  - 200 `{ caption }` on success;
 *  - 400 `{ error: true }` when the API key or the image is missing;
 *  - upstream status with `{ error: true }` when the upstream request fails;
 *  - 500 when the upstream response contains no caption text or an unexpected error occurs.
 */
router.post('/caption-image', jsonParser, async (request, response) => {
    try {
        const apiKey = readSecret(SECRET_KEYS.MAKERSUITE);
        if (!apiKey) {
            // Fail fast instead of sending "key=undefined" upstream.
            console.log('No MakerSuite API key found');
            return response.status(400).send({ error: true });
        }

        const { prompt, image } = request.body ?? {};
        if (typeof image !== 'string' || image.length === 0) {
            console.log('No image provided for multimodal captioning');
            return response.status(400).send({ error: true });
        }

        const { mimeType, data: imageData } = parseImagePayload(image);

        const url = `https://generativelanguage.googleapis.com/v1beta/models/gemini-pro-vision:generateContent?key=${encodeURIComponent(apiKey)}`;
        const body = {
            contents: [{
                parts: [
                    { text: prompt },
                    {
                        inlineData: {
                            mimeType: mimeType,
                            data: imageData,
                        },
                    }],
            }],
            safetySettings: MAKERSUITE_SAFETY,
            generationConfig: { maxOutputTokens: 1000 },
        };

        const result = await fetch(url, {
            body: JSON.stringify(body),
            method: 'POST',
            headers: {
                'Content-Type': 'application/json',
            },
            timeout: 0,
        });

        console.log('Multimodal captioning request', body);

        if (!result.ok) {
            console.log(`MakerSuite API returned error: ${result.status} ${result.statusText} ${await result.text()}`);
            return response.status(result.status).send({ error: true });
        }

        const data = await result.json();

        // Every level is optional: a successful response may carry no candidates
        // (NOTE(review): presumably when the prompt is blocked — confirm against the API docs),
        // and that case must report "No caption found" rather than throw a TypeError.
        const caption = data?.candidates?.[0]?.content?.parts?.[0]?.text;
        if (!caption) {
            return response.status(500).send('No caption found');
        }

        return response.json({ caption });
    } catch (error) {
        console.error(error);
        response.status(500).send('Internal server error');
    }
});
|
||||||
|
|
||||||
|
module.exports = { router };
|
Loading…
x
Reference in New Issue
Block a user