2023-12-14 17:01:42 +01:00
|
|
|
import { getRequestHeaders } from '../../script.js';
|
|
|
|
import { extension_settings } from '../extensions.js';
|
2023-12-17 18:41:20 +01:00
|
|
|
import { oai_settings } from '../openai.js';
|
2023-12-14 17:01:42 +01:00
|
|
|
import { SECRET_KEYS, secret_state } from '../secrets.js';
|
2023-12-17 18:41:20 +01:00
|
|
|
import { createThumbnail, isValidUrl } from '../utils.js';
|
2023-11-17 22:19:21 +01:00
|
|
|
|
|
|
|
/**
 * Generates a caption for an image using a multimodal model.
 *
 * The provider is read from `extension_settings.caption.multimodal_api`.
 * Google requests are posted to `/api/google/caption-image`; every other
 * value (including an unset one) is posted to `/api/openai/caption-image`.
 *
 * @param {string} base64Img Base64 encoded image
 * @param {string} prompt Prompt to use for captioning
 * @returns {Promise<string>} Generated caption
 * @throws {Error} If the API key of the selected provider is not set, or if the caption request does not succeed.
 */
export async function getMultimodalCaption(base64Img, prompt) {
    const api = extension_settings.caption.multimodal_api;

    // Human-readable name and secret slot of each provider that needs an API key.
    const providers = new Map([
        ['openai', { label: 'OpenAI', secretKey: SECRET_KEYS.OPENAI }],
        ['openrouter', { label: 'OpenRouter', secretKey: SECRET_KEYS.OPENROUTER }],
        ['google', { label: 'MakerSuite', secretKey: SECRET_KEYS.MAKERSUITE }],
    ]);
    const provider = providers.get(api);

    // Fail early, before any image processing, when the key is missing.
    if (provider && !secret_state[provider.secretKey]) {
        throw new Error(`${provider.label} API key is not set.`);
    }

    const isGoogle = api === 'google';

    // Work on a local copy instead of reassigning the caller-visible parameter.
    let image = base64Img;

    // OpenRouter has a payload limit of ~2MB. Google is 4MB, but we love democracy.
    // Base64 encodes 3 bytes in 4 characters, hence the 0.75 factor.
    const approxBytes = image.length * 0.75;
    const compressionLimit = 2 * 1024 * 1024;
    const needsCompression = (isGoogle || api === 'openrouter') && approxBytes > compressionLimit;

    if (needsCompression) {
        const maxSide = 1024;
        image = await createThumbnail(image, maxSide, maxSide, 'image/jpeg');

        // NOTE(review): the part before the first comma is dropped only on this
        // compression path, so an uncompressed Google image is sent exactly as
        // received. Presumably createThumbnail returns a data URL and the Google
        // endpoint wants bare base64 - confirm against the server handler.
        if (isGoogle) {
            image = image.split(',')[1];
        }
    }

    // The reverse proxy is honoured only for OpenAI, only when the caption
    // settings allow it, and only when the configured value is a valid URL.
    const useReverseProxy =
        api === 'openai'
        && extension_settings.caption.allow_reverse_proxy
        && oai_settings.reverse_proxy
        && isValidUrl(oai_settings.reverse_proxy);

    const proxyUrl = useReverseProxy ? oai_settings.reverse_proxy : '';
    const proxyPassword = useReverseProxy ? oai_settings.proxy_password : '';

    // The Google endpoint receives only the image and the prompt; the OpenAI
    // endpoint additionally receives the provider, model and proxy settings.
    const payload = { image, prompt };
    if (!isGoogle) {
        payload.api = api || 'openai';
        payload.model = extension_settings.caption.multimodal_model || 'gpt-4-vision-preview';
        payload.reverse_proxy = proxyUrl;
        payload.proxy_password = proxyPassword;
    }

    const endpoint = `/api/${isGoogle ? 'google' : 'openai'}/caption-image`;
    const apiResult = await fetch(endpoint, {
        method: 'POST',
        headers: getRequestHeaders(),
        body: JSON.stringify(payload),
    });

    if (!apiResult.ok) {
        // Name the provider that was actually used; an unset or unknown provider
        // is routed through the OpenAI endpoint, so it is reported as OpenAI.
        const providerLabel = provider ? provider.label : 'OpenAI';
        throw new Error(`Failed to caption image via ${providerLabel}.`);
    }

    const { caption } = await apiResult.json();
    return caption;
}
|