Add multimodal captioning for 01.ai

This commit is contained in:
Cohee 2024-08-01 01:34:49 +03:00
parent 7b9eb97c7f
commit 40ee236ca8
4 changed files with 17 additions and 3 deletions

View File

@ -8,13 +8,12 @@ import { textgen_types, textgenerationwebui_settings } from '../../textgen-setti
import { SlashCommandParser } from '../../slash-commands/SlashCommandParser.js';
import { SlashCommand } from '../../slash-commands/SlashCommand.js';
import { ARGUMENT_TYPE, SlashCommandArgument, SlashCommandNamedArgument } from '../../slash-commands/SlashCommandArgument.js';
import { SlashCommandEnumValue } from '../../slash-commands/SlashCommandEnumValue.js';
import { commonEnumProviders } from '../../slash-commands/SlashCommandCommonEnumsProvider.js';
export { MODULE_NAME };
const MODULE_NAME = 'caption';
const PROMPT_DEFAULT = 'Whats in this image?';
const PROMPT_DEFAULT = 'What\'s in this image?';
const TEMPLATE_DEFAULT = '[{{user}} sends {{char}} a picture that contains: {{caption}}]';
/**
@ -334,7 +333,7 @@ async function getCaptionForFile(file, prompt, quiet) {
}
catch (error) {
const errorMessage = error.message || 'Unknown error';
toastr.error(errorMessage, "Failed to caption image.");
toastr.error(errorMessage, 'Failed to caption image.');
console.error(error);
return '';
}
@ -399,6 +398,7 @@ jQuery(async function () {
(modules.includes('caption') && extension_settings.caption.source === 'extras') ||
(extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'openai' && (secret_state[SECRET_KEYS.OPENAI] || extension_settings.caption.allow_reverse_proxy)) ||
(extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'openrouter' && secret_state[SECRET_KEYS.OPENROUTER]) ||
(extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'zerooneai' && secret_state[SECRET_KEYS.ZEROONEAI]) ||
(extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'google' && (secret_state[SECRET_KEYS.MAKERSUITE] || extension_settings.caption.allow_reverse_proxy)) ||
(extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'anthropic' && (secret_state[SECRET_KEYS.CLAUDE] || extension_settings.caption.allow_reverse_proxy)) ||
(extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'ollama' && textgenerationwebui_settings.server_urls[textgen_types.OLLAMA]) ||

View File

@ -17,6 +17,7 @@
<div class="flex1 flex-container flexFlowColumn flexNoGap">
<label for="caption_multimodal_api" data-i18n="API">API</label>
<select id="caption_multimodal_api" class="flex1 text_pole">
<option value="zerooneai">01.AI (Yi)</option>
<option value="anthropic">Anthropic</option>
<option value="custom" data-i18n="Custom (OpenAI-compatible)">Custom (OpenAI-compatible)</option>
<option value="google">Google MakerSuite</option>
@ -32,6 +33,7 @@
<div class="flex1 flex-container flexFlowColumn flexNoGap">
<label for="caption_multimodal_model" data-i18n="Model">Model</label>
<select id="caption_multimodal_model" class="flex1 text_pole">
<option data-type="zerooneai" value="yi-vision">yi-vision</option>
<option data-type="openai" value="gpt-4-vision-preview">gpt-4-vision-preview</option>
<option data-type="openai" value="gpt-4-turbo">gpt-4-turbo</option>
<option data-type="openai" value="gpt-4o">gpt-4o</option>

View File

@ -136,6 +136,10 @@ function throwIfInvalidModel(useReverseProxy) {
throw new Error('Anthropic (Claude) API key is not set.');
}
if (extension_settings.caption.multimodal_api === 'zerooneai' && !secret_state[SECRET_KEYS.ZEROONEAI]) {
throw new Error('01.AI API key is not set.');
}
if (extension_settings.caption.multimodal_api === 'google' && !secret_state[SECRET_KEYS.MAKERSUITE] && !useReverseProxy) {
throw new Error('MakerSuite API key is not set.');
}

View File

@ -47,6 +47,10 @@ router.post('/caption-image', jsonParser, async (request, response) => {
key = readSecret(request.user.directories, SECRET_KEYS.VLLM);
}
if (request.body.api === 'zerooneai') {
key = readSecret(request.user.directories, SECRET_KEYS.ZEROONEAI);
}
if (!key && !request.body.reverse_proxy && ['custom', 'ooba', 'koboldcpp', 'vllm'].includes(request.body.api) === false) {
console.log('No key found for API', request.body.api);
return response.sendStatus(400);
@ -100,6 +104,10 @@ router.post('/caption-image', jsonParser, async (request, response) => {
apiUrl = `${request.body.server_url}/chat/completions`;
}
if (request.body.api === 'zerooneai') {
apiUrl = 'https://api.01.ai/v1/chat/completions';
}
if (request.body.api === 'ooba') {
apiUrl = `${trimV1(request.body.server_url)}/v1/chat/completions`;
const imgMessage = body.messages.pop();