2023-12-14 17:01:42 +01:00
|
|
|
import { getRequestHeaders } from '../../script.js';
|
2024-08-13 00:09:14 +02:00
|
|
|
import { extension_settings, openThirdPartyExtensionMenu } from '../extensions.js';
|
2023-12-17 18:41:20 +01:00
|
|
|
import { oai_settings } from '../openai.js';
|
2023-12-14 17:01:42 +01:00
|
|
|
import { SECRET_KEYS, secret_state } from '../secrets.js';
|
2023-12-19 23:45:45 +01:00
|
|
|
import { textgen_types, textgenerationwebui_settings } from '../textgen-settings.js';
|
2023-12-17 18:41:20 +01:00
|
|
|
import { createThumbnail, isValidUrl } from '../utils.js';
|
2023-11-17 22:19:21 +01:00
|
|
|
|
|
|
|
/**
 * Asks the configured multimodal backend to describe an image.
 * @param {string} base64Img Base64 encoded image
 * @param {string} prompt Prompt to use for captioning
 * @returns {Promise<string>} Generated caption (stringified and trimmed)
 * @throws {Error} When the settings validation fails or the server responds with a non-OK status
 */
export async function getMultimodalCaption(base64Img, prompt) {
    const api = extension_settings.caption.multimodal_api;

    // Truthy only for sources listed here, and only when a valid proxy URL is configured and allowed.
    const useReverseProxy =
        ['openai', 'anthropic', 'google', 'mistral'].includes(api)
        && extension_settings.caption.allow_reverse_proxy
        && oai_settings.reverse_proxy
        && isValidUrl(oai_settings.reverse_proxy);

    throwIfInvalidModel(useReverseProxy);

    let image = base64Img;

    // Ollama and llama.cpp are sent only the part after the comma of a data URL.
    if ((api === 'ollama' || api === 'llamacpp') && image.startsWith('data:image/')) {
        image = image.split(',')[1];
    }

    // OpenRouter has a payload limit of ~2MB. Google allows 4MB, but the same limit is applied to all.
    // Ooba requires all images to be JPEGs. KoboldCpp gets the same treatment.
    const compressionLimit = 2 * 1024 * 1024;
    const base64Bytes = image.length * 0.75;
    const isOverLimit = ['google', 'openrouter', 'mistral'].includes(api) && base64Bytes > compressionLimit;
    const alwaysJpeg = api === 'ooba' || api === 'koboldcpp';

    if (isOverLimit || alwaysJpeg) {
        const maxSide = 1024;
        image = await createThumbnail(image, maxSide, maxSide, 'image/jpeg');
    }

    const requestBody = {
        image: image,
        prompt: prompt,
        reverse_proxy: useReverseProxy ? oai_settings.reverse_proxy : '',
        proxy_password: useReverseProxy ? oai_settings.proxy_password : '',
        api: extension_settings.caption.multimodal_api || 'openai',
        model: extension_settings.caption.multimodal_model || 'gpt-4-turbo',
    };

    // Backend-specific additions to the payload.
    switch (api) {
        case 'ollama':
            if (extension_settings.caption.multimodal_model === 'ollama_current') {
                requestBody.model = textgenerationwebui_settings.ollama_model;
            }
            requestBody.server_url = textgenerationwebui_settings.server_urls[textgen_types.OLLAMA];
            break;
        case 'vllm':
            if (extension_settings.caption.multimodal_model === 'vllm_current') {
                requestBody.model = textgenerationwebui_settings.vllm_model;
            }
            requestBody.server_url = textgenerationwebui_settings.server_urls[textgen_types.VLLM];
            break;
        case 'llamacpp':
            requestBody.server_url = textgenerationwebui_settings.server_urls[textgen_types.LLAMACPP];
            break;
        case 'ooba':
            requestBody.server_url = textgenerationwebui_settings.server_urls[textgen_types.OOBA];
            break;
        case 'koboldcpp':
            requestBody.server_url = textgenerationwebui_settings.server_urls[textgen_types.KOBOLDCPP];
            break;
        case 'custom':
            requestBody.server_url = oai_settings.custom_url;
            requestBody.model = oai_settings.custom_model || 'gpt-4-turbo';
            requestBody.custom_include_headers = oai_settings.custom_include_headers;
            requestBody.custom_include_body = oai_settings.custom_include_body;
            requestBody.custom_exclude_body = oai_settings.custom_exclude_body;
            break;
        default:
            break;
    }

    // Sources without a dedicated route go through the OpenAI caption endpoint.
    const dedicatedEndpoints = new Map([
        ['google', '/api/google/caption-image'],
        ['anthropic', '/api/anthropic/caption-image'],
        ['llamacpp', '/api/backends/text-completions/llamacpp/caption-image'],
        ['ollama', '/api/backends/text-completions/ollama/caption-image'],
    ]);
    const endpointUrl = dedicatedEndpoints.get(extension_settings.caption.multimodal_api) ?? '/api/openai/caption-image';

    const apiResult = await fetch(endpointUrl, {
        method: 'POST',
        headers: getRequestHeaders(),
        body: JSON.stringify(requestBody),
    });

    if (!apiResult.ok) {
        throw new Error('Failed to caption image via Multimodal API.');
    }

    const payload = await apiResult.json();
    return String(payload.caption).trim();
}
|
|
|
|
|
2024-05-24 20:38:29 +02:00
|
|
|
/**
 * Verifies that the selected multimodal captioning source has the configuration it needs
 * (an API key, a server URL, or a selected model) and throws on the first missing item.
 * @param {boolean} useReverseProxy When truthy, the missing-key checks for OpenAI, Anthropic,
 * Google and Mistral are skipped.
 * @returns {void}
 * @throws {Error} With a message naming the missing key, URL or model
 */
function throwIfInvalidModel(useReverseProxy) {
    const api = extension_settings.caption.multimodal_api;
    const model = extension_settings.caption.multimodal_model;

    if (api === 'openai' && !secret_state[SECRET_KEYS.OPENAI] && !useReverseProxy) {
        throw new Error('OpenAI API key is not set.');
    }

    if (api === 'openrouter' && !secret_state[SECRET_KEYS.OPENROUTER]) {
        throw new Error('OpenRouter API key is not set.');
    }

    if (api === 'anthropic' && !secret_state[SECRET_KEYS.CLAUDE] && !useReverseProxy) {
        throw new Error('Anthropic (Claude) API key is not set.');
    }

    if (api === 'zerooneai' && !secret_state[SECRET_KEYS.ZEROONEAI]) {
        throw new Error('01.AI API key is not set.');
    }

    if (api === 'google' && !secret_state[SECRET_KEYS.MAKERSUITE] && !useReverseProxy) {
        throw new Error('Google AI Studio API key is not set.');
    }

    // Reads the same `multimodal_api` setting as every other check; the previous
    // `multi_modal_api` spelling was always undefined, so this check never fired.
    if (api === 'mistral' && !secret_state[SECRET_KEYS.MISTRALAI] && !useReverseProxy) {
        throw new Error('Mistral AI API key is not set.');
    }

    if (api === 'ollama' && !textgenerationwebui_settings.server_urls[textgen_types.OLLAMA]) {
        throw new Error('Ollama server URL is not set.');
    }

    if (api === 'ollama' && model === 'ollama_current' && !textgenerationwebui_settings.ollama_model) {
        throw new Error('Ollama model is not set.');
    }

    if (api === 'llamacpp' && !textgenerationwebui_settings.server_urls[textgen_types.LLAMACPP]) {
        throw new Error('LlamaCPP server URL is not set.');
    }

    if (api === 'ooba' && !textgenerationwebui_settings.server_urls[textgen_types.OOBA]) {
        throw new Error('Text Generation WebUI server URL is not set.');
    }

    if (api === 'koboldcpp' && !textgenerationwebui_settings.server_urls[textgen_types.KOBOLDCPP]) {
        throw new Error('KoboldCpp server URL is not set.');
    }

    if (api === 'vllm' && !textgenerationwebui_settings.server_urls[textgen_types.VLLM]) {
        throw new Error('vLLM server URL is not set.');
    }

    if (api === 'vllm' && model === 'vllm_current' && !textgenerationwebui_settings.vllm_model) {
        throw new Error('vLLM model is not set.');
    }

    if (api === 'custom' && !oai_settings.custom_url) {
        throw new Error('Custom API URL is not set.');
    }
}
|
2024-08-12 21:07:44 +02:00
|
|
|
|
|
|
|
/**
 * Reports whether WebLLM can be used: the browser must expose `navigator.gpu`
 * and the `SillyTavern` global must expose `llm`.
 * Each failure reason shows an error toast at most once per `sessionStorage` lifetime.
 * @returns {boolean} Whether the extension is installed and supported
 */
export function isWebLlmSupported() {
    // Shows the toast unless the given sessionStorage flag is already set, then sets the flag.
    const warnOnce = (flagName, text, toastOptions) => {
        if (sessionStorage.getItem(flagName)) {
            return;
        }
        toastr.error(text, 'WebLLM', toastOptions);
        sessionStorage.setItem(flagName, '1');
    };

    const hasWebGpu = 'gpu' in navigator;
    if (!hasWebGpu) {
        warnOnce(
            'webllm_browser_warning_shown',
            'Your browser does not support the WebGPU API. Please use a different browser.',
            {
                preventDuplicates: true,
                timeOut: 0,
                extendedTimeOut: 0,
            },
        );
        return false;
    }

    const hasExtension = 'llm' in SillyTavern;
    if (!hasExtension) {
        warnOnce(
            'webllm_extension_warning_shown',
            'WebLLM extension is not installed. Click here to install it.',
            {
                timeOut: 0,
                extendedTimeOut: 0,
                preventDuplicates: true,
                onclick: () => openThirdPartyExtensionMenu('https://github.com/SillyTavern/Extension-WebLLM'),
            },
        );
        return false;
    }

    return true;
}
|
|
|
|
|
|
|
|
/**
 * Runs a chat completion through the WebLLM engine exposed as `SillyTavern.llm`.
 * The request and the response are both written to the debug console.
 * @param {any[]} messages Messages to use for generating
 * @param {object} params Additional parameters
 * @returns {Promise<string>} Generated response
 * @throws {Error} When `isWebLlmSupported()` reports that WebLLM is unavailable
 */
export async function generateWebLlmChatPrompt(messages, params = {}) {
    const isSupported = isWebLlmSupported();
    if (!isSupported) {
        throw new Error('WebLLM extension is not installed.');
    }

    console.debug('WebLLM chat completion request:', messages, params);
    const { llm } = SillyTavern;
    const completion = await llm.generateChatPrompt(messages, params);
    console.debug('WebLLM chat completion response:', completion);

    return completion;
}
|
|
|
|
|
|
|
|
/**
 * Asks the WebLLM engine exposed as `SillyTavern.llm` to count the tokens in a text.
 * @param {string} text Text to count tokens in
 * @returns {Promise<number>} Number of tokens in the text
 * @throws {Error} When `isWebLlmSupported()` reports that WebLLM is unavailable
 */
export async function countWebLlmTokens(text) {
    const isSupported = isWebLlmSupported();
    if (!isSupported) {
        throw new Error('WebLLM extension is not installed.');
    }

    const { llm } = SillyTavern;
    const tokenCount = await llm.countTokens(text);
    return tokenCount;
}
|
|
|
|
|
|
|
|
/**
 * Loads the model in the WebLLM engine exposed as `SillyTavern.llm` and reads
 * `context_size` from the current model info.
 * @returns {Promise<number>} Size of the context in the WebLLM model (undefined when no model info is returned)
 * @throws {Error} When `isWebLlmSupported()` reports that WebLLM is unavailable
 */
export async function getWebLlmContextSize() {
    const isSupported = isWebLlmSupported();
    if (!isSupported) {
        throw new Error('WebLLM extension is not installed.');
    }

    const { llm } = SillyTavern;
    // The model is loaded first, then its info is queried.
    await llm.loadModel();
    const modelInfo = await llm.getCurrentModelInfo();

    return modelInfo?.context_size;
}
|