Add local multimodal caption sources

This commit is contained in:
Cohee
2023-12-20 00:45:45 +02:00
parent 029cf598ce
commit 4b131067e4
3 changed files with 185 additions and 28 deletions

View File

@@ -4,6 +4,7 @@ import { callPopup, getRequestHeaders, saveSettingsDebounced, substituteParams }
import { getMessageTimeStamp } from '../../RossAscends-mods.js';
import { SECRET_KEYS, secret_state } from '../../secrets.js';
import { getMultimodalCaption } from '../shared.js';
import { textgen_types, textgenerationwebui_settings } from '../../textgen-settings.js';
export { MODULE_NAME };
const MODULE_NAME = 'caption';
@@ -134,7 +135,7 @@ async function doCaptionRequest(base64Img, fileData) {
case 'horde':
return await captionHorde(base64Img);
case 'multimodal':
return await captionMultimodal(extension_settings.caption.multimodal_api === 'google' ? base64Img : fileData);
return await captionMultimodal(fileData);
default:
throw new Error('Unknown caption source.');
}
@@ -271,9 +272,11 @@ jQuery(function () {
$(sendButton).on('click', () => {
const hasCaptionModule =
(modules.includes('caption') && extension_settings.caption.source === 'extras') ||
(extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'openai' && secret_state[SECRET_KEYS.OPENAI]) ||
(extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'openai' && (secret_state[SECRET_KEYS.OPENAI] || extension_settings.caption.allow_reverse_proxy)) ||
(extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'openrouter' && secret_state[SECRET_KEYS.OPENROUTER]) ||
(extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'google' && secret_state[SECRET_KEYS.MAKERSUITE]) ||
(extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'ollama' && textgenerationwebui_settings.server_urls[textgen_types.OLLAMA]) ||
(extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'llamacpp' && textgenerationwebui_settings.server_urls[textgen_types.LLAMACPP]) ||
extension_settings.caption.source === 'local' ||
extension_settings.caption.source === 'horde';
@@ -329,7 +332,7 @@ jQuery(function () {
<label for="caption_source">Source</label>
<select id="caption_source" class="text_pole">
<option value="local">Local</option>
<option value="multimodal">Multimodal (OpenAI / OpenRouter / Google)</option>
<option value="multimodal">Multimodal (OpenAI / llama / Google)</option>
<option value="extras">Extras</option>
<option value="horde">Horde</option>
</select>
@@ -337,9 +340,11 @@ jQuery(function () {
<div class="flex1 flex-container flexFlowColumn flexNoGap">
<label for="caption_multimodal_api">API</label>
<select id="caption_multimodal_api" class="flex1 text_pole">
<option value="llamacpp">llama.cpp</option>
<option value="ollama">Ollama</option>
<option value="openai">OpenAI</option>
<option value="openrouter">OpenRouter</option>
<option value="google">Google</option>
<option value="google">Google MakerSuite</option>
</select>
</div>
<div class="flex1 flex-container flexFlowColumn flexNoGap">
@@ -349,12 +354,19 @@ jQuery(function () {
<option data-type="google" value="gemini-pro-vision">gemini-pro-vision</option>
<option data-type="openrouter" value="openai/gpt-4-vision-preview">openai/gpt-4-vision-preview</option>
<option data-type="openrouter" value="haotian-liu/llava-13b">haotian-liu/llava-13b</option>
<option data-type="ollama" value="ollama_current">[Currently selected]</option>
<option data-type="ollama" value="bakllava:latest">bakllava:latest</option>
<option data-type="ollama" value="llava:latest">llava:latest</option>
<option data-type="llamacpp" value="llamacpp_current">[Currently loaded]</option>
</select>
</div>
<label data-type="openai" class="checkbox_label flexBasis100p" for="caption_allow_reverse_proxy" title="Allow using reverse proxy if defined and valid.">
<input id="caption_allow_reverse_proxy" type="checkbox" class="checkbox">
Allow reverse proxy
</label>
<div class="flexBasis100p m-b-1">
<small><b>Hint:</b> Set your API keys and endpoints in the 'API Connections' tab first.</small>
</div>
</div>
<div id="caption_prompt_block">
<label for="caption_prompt">Caption Prompt</label>

View File

@@ -2,6 +2,7 @@ import { getRequestHeaders } from '../../script.js';
import { extension_settings } from '../extensions.js';
import { oai_settings } from '../openai.js';
import { SECRET_KEYS, secret_state } from '../secrets.js';
import { textgen_types, textgenerationwebui_settings } from '../textgen-settings.js';
import { createThumbnail, isValidUrl } from '../utils.js';
/**
@@ -11,20 +12,18 @@ import { createThumbnail, isValidUrl } from '../utils.js';
* @returns {Promise<string>} Generated caption
*/
export async function getMultimodalCaption(base64Img, prompt) {
if (extension_settings.caption.multimodal_api === 'openai' && !secret_state[SECRET_KEYS.OPENAI]) {
throw new Error('OpenAI API key is not set.');
}
throwIfInvalidModel();
if (extension_settings.caption.multimodal_api === 'openrouter' && !secret_state[SECRET_KEYS.OPENROUTER]) {
throw new Error('OpenRouter API key is not set.');
}
const noPrefix = ['google', 'ollama', 'llamacpp'].includes(extension_settings.caption.multimodal_api);
if (extension_settings.caption.multimodal_api === 'google' && !secret_state[SECRET_KEYS.MAKERSUITE]) {
throw new Error('MakerSuite API key is not set.');
if (noPrefix && base64Img.startsWith('data:image/')) {
base64Img = base64Img.split(',')[1];
}
// OpenRouter has a payload limit of ~2MB. Google is 4MB, but we love democracy.
const isGoogle = extension_settings.caption.multimodal_api === 'google';
const isOllama = extension_settings.caption.multimodal_api === 'ollama';
const isLlamaCpp = extension_settings.caption.multimodal_api === 'llamacpp';
const base64Bytes = base64Img.length * 0.75;
const compressionLimit = 2 * 1024 * 1024;
if (['google', 'openrouter'].includes(extension_settings.caption.multimodal_api) && base64Bytes > compressionLimit) {
@@ -45,27 +44,79 @@ export async function getMultimodalCaption(base64Img, prompt) {
const proxyUrl = useReverseProxy ? oai_settings.reverse_proxy : '';
const proxyPassword = useReverseProxy ? oai_settings.proxy_password : '';
const apiResult = await fetch(`/api/${isGoogle ? 'google' : 'openai'}/caption-image`, {
const requestBody = {
image: base64Img,
prompt: prompt,
};
if (!isGoogle) {
requestBody.api = extension_settings.caption.multimodal_api || 'openai';
requestBody.model = extension_settings.caption.multimodal_model || 'gpt-4-vision-preview';
requestBody.reverse_proxy = proxyUrl;
requestBody.proxy_password = proxyPassword;
}
if (isOllama) {
if (extension_settings.caption.multimodal_model === 'ollama_current') {
requestBody.model = textgenerationwebui_settings.ollama_model;
}
requestBody.server_url = textgenerationwebui_settings.server_urls[textgen_types.OLLAMA];
}
if (isLlamaCpp) {
requestBody.server_url = textgenerationwebui_settings.server_urls[textgen_types.LLAMACPP];
}
function getEndpointUrl() {
switch (extension_settings.caption.multimodal_api) {
case 'google':
return '/api/google/caption-image';
case 'llamacpp':
return '/api/backends/text-completions/llamacpp/caption-image';
case 'ollama':
return '/api/backends/text-completions/ollama/caption-image';
default:
return '/api/openai/caption-image';
}
}
const apiResult = await fetch(getEndpointUrl(), {
method: 'POST',
headers: getRequestHeaders(),
body: JSON.stringify({
image: base64Img,
prompt: prompt,
...(isGoogle
? {}
: {
api: extension_settings.caption.multimodal_api || 'openai',
model: extension_settings.caption.multimodal_model || 'gpt-4-vision-preview',
reverse_proxy: proxyUrl,
proxy_password: proxyPassword,
}),
}),
body: JSON.stringify(requestBody),
});
if (!apiResult.ok) {
throw new Error('Failed to caption image via OpenAI.');
throw new Error('Failed to caption image via Multimodal API.');
}
const { caption } = await apiResult.json();
return caption;
return String(caption).trim();
}
function throwIfInvalidModel() {
if (extension_settings.caption.multimodal_api === 'openai' && !secret_state[SECRET_KEYS.OPENAI]) {
throw new Error('OpenAI API key is not set.');
}
if (extension_settings.caption.multimodal_api === 'openrouter' && !secret_state[SECRET_KEYS.OPENROUTER]) {
throw new Error('OpenRouter API key is not set.');
}
if (extension_settings.caption.multimodal_api === 'google' && !secret_state[SECRET_KEYS.MAKERSUITE]) {
throw new Error('MakerSuite API key is not set.');
}
if (extension_settings.caption.multimodal_api === 'ollama' && !textgenerationwebui_settings.server_urls[textgen_types.OLLAMA]) {
throw new Error('Ollama server URL is not set.');
}
if (extension_settings.caption.multimodal_api === 'ollama' && extension_settings.caption.multimodal_model === 'ollama_current' && !textgenerationwebui_settings.ollama_model) {
throw new Error('Ollama model is not set.');
}
if (extension_settings.caption.multimodal_api === 'llamacpp' && !textgenerationwebui_settings.server_urls[textgen_types.LLAMACPP]) {
throw new Error('LlamaCPP server URL is not set.');
}
}