Add multimodal captioning for Cohere

This commit is contained in:
Cohee
2025-03-05 21:36:43 +02:00
parent a09b9fa746
commit c167890d26
4 changed files with 73 additions and 19 deletions

View File

@ -398,23 +398,62 @@ jQuery(async function () {
$('#caption_wand_container').append(sendButton); $('#caption_wand_container').append(sendButton);
$(sendButton).on('click', () => { $(sendButton).on('click', () => {
const hasCaptionModule = const hasCaptionModule = (() => {
(modules.includes('caption') && extension_settings.caption.source === 'extras') || const settings = extension_settings.caption;
(extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'openai' && (secret_state[SECRET_KEYS.OPENAI] || extension_settings.caption.allow_reverse_proxy)) ||
(extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'openrouter' && secret_state[SECRET_KEYS.OPENROUTER]) || // Handle non-multimodal sources
(extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'zerooneai' && secret_state[SECRET_KEYS.ZEROONEAI]) || if (settings.source === 'extras' && modules.includes('caption')) return true;
(extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'groq' && secret_state[SECRET_KEYS.GROQ]) || if (settings.source === 'local' || settings.source === 'horde') return true;
(extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'mistral' && (secret_state[SECRET_KEYS.MISTRALAI] || extension_settings.caption.allow_reverse_proxy)) ||
(extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'google' && (secret_state[SECRET_KEYS.MAKERSUITE] || extension_settings.caption.allow_reverse_proxy)) || // Handle multimodal sources
(extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'anthropic' && (secret_state[SECRET_KEYS.CLAUDE] || extension_settings.caption.allow_reverse_proxy)) || if (settings.source === 'multimodal') {
(extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'ollama' && textgenerationwebui_settings.server_urls[textgen_types.OLLAMA]) || const api = settings.multimodal_api;
(extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'llamacpp' && textgenerationwebui_settings.server_urls[textgen_types.LLAMACPP]) ||
(extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'ooba' && textgenerationwebui_settings.server_urls[textgen_types.OOBA]) || // APIs that support reverse proxy
(extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'koboldcpp' && textgenerationwebui_settings.server_urls[textgen_types.KOBOLDCPP]) || const reverseProxyApis = {
(extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'vllm' && textgenerationwebui_settings.server_urls[textgen_types.VLLM]) || 'openai': SECRET_KEYS.OPENAI,
(extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'custom') || 'mistral': SECRET_KEYS.MISTRALAI,
extension_settings.caption.source === 'local' || 'google': SECRET_KEYS.MAKERSUITE,
extension_settings.caption.source === 'horde'; 'anthropic': SECRET_KEYS.CLAUDE,
};
if (reverseProxyApis[api]) {
if (secret_state[reverseProxyApis[api]] || settings.allow_reverse_proxy) {
return true;
}
}
const chatCompletionApis = {
'openrouter': SECRET_KEYS.OPENROUTER,
'zerooneai': SECRET_KEYS.ZEROONEAI,
'groq': SECRET_KEYS.GROQ,
'cohere': SECRET_KEYS.COHERE,
};
if (chatCompletionApis[api] && secret_state[chatCompletionApis[api]]) {
return true;
}
const textCompletionApis = {
'ollama': textgen_types.OLLAMA,
'llamacpp': textgen_types.LLAMACPP,
'ooba': textgen_types.OOBA,
'koboldcpp': textgen_types.KOBOLDCPP,
'vllm': textgen_types.VLLM,
};
if (textCompletionApis[api] && textgenerationwebui_settings.server_urls[textCompletionApis[api]]) {
return true;
}
// Custom API doesn't need additional checks
if (api === 'custom') {
return true;
}
}
return false;
})();
if (!hasCaptionModule) { if (!hasCaptionModule) {
toastr.error('Choose other captioning source in the extension settings.', 'Captioning is not available'); toastr.error('Choose other captioning source in the extension settings.', 'Captioning is not available');

View File

@ -19,6 +19,7 @@
<select id="caption_multimodal_api" class="flex1 text_pole"> <select id="caption_multimodal_api" class="flex1 text_pole">
<option value="zerooneai">01.AI (Yi)</option> <option value="zerooneai">01.AI (Yi)</option>
<option value="anthropic">Anthropic</option> <option value="anthropic">Anthropic</option>
<option value="cohere">Cohere</option>
<option value="custom" data-i18n="Custom (OpenAI-compatible)">Custom (OpenAI-compatible)</option> <option value="custom" data-i18n="Custom (OpenAI-compatible)">Custom (OpenAI-compatible)</option>
<option value="google">Google AI Studio</option> <option value="google">Google AI Studio</option>
<option value="groq">Groq</option> <option value="groq">Groq</option>
@ -35,6 +36,8 @@
<div class="flex1 flex-container flexFlowColumn flexNoGap"> <div class="flex1 flex-container flexFlowColumn flexNoGap">
<label for="caption_multimodal_model" data-i18n="Model">Model</label> <label for="caption_multimodal_model" data-i18n="Model">Model</label>
<select id="caption_multimodal_model" class="flex1 text_pole"> <select id="caption_multimodal_model" class="flex1 text_pole">
<option data-type="cohere" value="c4ai-aya-vision-8b">c4ai-aya-vision-8b</option>
<option data-type="cohere" value="c4ai-aya-vision-32b">c4ai-aya-vision-32b</option>
<option data-type="mistral" value="pixtral-12b-latest">pixtral-12b-latest</option> <option data-type="mistral" value="pixtral-12b-latest">pixtral-12b-latest</option>
<option data-type="mistral" value="pixtral-12b-2409">pixtral-12b-2409</option> <option data-type="mistral" value="pixtral-12b-2409">pixtral-12b-2409</option>
<option data-type="mistral" value="pixtral-large-latest">pixtral-large-latest</option> <option data-type="mistral" value="pixtral-large-latest">pixtral-large-latest</option>

View File

@ -144,10 +144,14 @@ function throwIfInvalidModel(useReverseProxy) {
throw new Error('Google AI Studio API key is not set.'); throw new Error('Google AI Studio API key is not set.');
} }
if (extension_settings.caption.multi_modal_api === 'mistral' && !secret_state[SECRET_KEYS.MISTRALAI] && !useReverseProxy) { if (extension_settings.caption.multimodal_api === 'mistral' && !secret_state[SECRET_KEYS.MISTRALAI] && !useReverseProxy) {
throw new Error('Mistral AI API key is not set.'); throw new Error('Mistral AI API key is not set.');
} }
if (extension_settings.caption.multimodal_api === 'cohere' && !secret_state[SECRET_KEYS.COHERE]) {
throw new Error('Cohere API key is not set.');
}
if (extension_settings.caption.multimodal_api === 'ollama' && !textgenerationwebui_settings.server_urls[textgen_types.OLLAMA]) { if (extension_settings.caption.multimodal_api === 'ollama' && !textgenerationwebui_settings.server_urls[textgen_types.OLLAMA]) {
throw new Error('Ollama server URL is not set.'); throw new Error('Ollama server URL is not set.');
} }

View File

@ -62,6 +62,10 @@ router.post('/caption-image', jsonParser, async (request, response) => {
key = readSecret(request.user.directories, SECRET_KEYS.GROQ); key = readSecret(request.user.directories, SECRET_KEYS.GROQ);
} }
if (request.body.api === 'cohere') {
key = readSecret(request.user.directories, SECRET_KEYS.COHERE);
}
if (!key && !request.body.reverse_proxy && ['custom', 'ooba', 'koboldcpp', 'vllm'].includes(request.body.api) === false) { if (!key && !request.body.reverse_proxy && ['custom', 'ooba', 'koboldcpp', 'vllm'].includes(request.body.api) === false) {
console.warn('No key found for API', request.body.api); console.warn('No key found for API', request.body.api);
return response.sendStatus(400); return response.sendStatus(400);
@ -126,6 +130,10 @@ router.post('/caption-image', jsonParser, async (request, response) => {
apiUrl = 'https://api.mistral.ai/v1/chat/completions'; apiUrl = 'https://api.mistral.ai/v1/chat/completions';
} }
if (request.body.api === 'cohere') {
apiUrl = 'https://api.cohere.ai/v2/chat';
}
if (request.body.api === 'ooba') { if (request.body.api === 'ooba') {
apiUrl = `${trimV1(request.body.server_url)}/v1/chat/completions`; apiUrl = `${trimV1(request.body.server_url)}/v1/chat/completions`;
const imgMessage = body.messages.pop(); const imgMessage = body.messages.pop();
@ -165,7 +173,7 @@ router.post('/caption-image', jsonParser, async (request, response) => {
/** @type {any} */ /** @type {any} */
const data = await result.json(); const data = await result.json();
console.info('Multimodal captioning response', data); console.info('Multimodal captioning response', data);
const caption = data?.choices[0]?.message?.content; const caption = data?.choices?.[0]?.message?.content ?? data?.message?.content?.[0]?.text;
if (!caption) { if (!caption) {
return response.status(500).send('No caption found'); return response.status(500).send('No caption found');