mirror of
https://github.com/SillyTavern/SillyTavern.git
synced 2025-06-05 21:59:27 +02:00
Gemini inline video (#4078)
* Add inline video attachment support for Gemini 2.5 Pro * file formatting * removed redundant function for saving video to message * removed other redundant function for saving video to message * Separate inlining check for video * Edit video token cost to be a conservative estimate of 10000 tokens * fixed missing semicolon * Adds separate ui toggle for video inlining. * Move mes_video out of img_container * Remove title from video element for now * Better visibility of video with controls --------- Co-authored-by: Cohee <18619528+Cohee1207@users.noreply.github.com>
This commit is contained in:
@@ -2019,6 +2019,20 @@
|
|||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
<div class="range-block" data-source="makersuite,vertexai">
|
||||||
|
<label for="openai_video_inlining" class="checkbox_label flexWrap widthFreeExpand">
|
||||||
|
<input id="openai_video_inlining" type="checkbox" />
|
||||||
|
<span data-i18n="Send inline videos">Send inline videos</span>
|
||||||
|
</label>
|
||||||
|
<div id="video_inlining_hint" class="flexBasis100p toggle-description justifyLeft">
|
||||||
|
<span data-i18n="video_inlining_hint_1">Sends videos in prompts if the model supports it. Use the</span>
|
||||||
|
<code><i class="fa-solid fa-paperclip"></i></code>
|
||||||
|
<span data-i18n="video_inlining_hint_2">action on any message or the</span>
|
||||||
|
<code><i class="fa-solid fa-wand-magic-sparkles"></i></code>
|
||||||
|
<span data-i18n="video_inlining_hint_3">menu to attach a video file to the chat.</span>
|
||||||
|
<strong data-i18n="video_inlining_hint_4">Videos must be less than 20 MB and under 1 minute long</strong>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
<div class="range-block" data-source="makersuite,vertexai">
|
<div class="range-block" data-source="makersuite,vertexai">
|
||||||
<label for="openai_request_images" class="checkbox_label widthFreeExpand">
|
<label for="openai_request_images" class="checkbox_label widthFreeExpand">
|
||||||
<input id="openai_request_images" type="checkbox" />
|
<input id="openai_request_images" type="checkbox" />
|
||||||
|
@@ -2473,6 +2473,31 @@ export function appendMediaToMessage(mes, messageElement, adjustScroll = true) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Add video to message
|
||||||
|
if (mes.extra?.video) {
|
||||||
|
const container = messageElement.find('.mes_block');
|
||||||
|
const chatHeight = $('#chat').prop('scrollHeight');
|
||||||
|
|
||||||
|
// Create video element if it doesn't exist
|
||||||
|
let video = messageElement.find('.mes_video');
|
||||||
|
if (video.length === 0) {
|
||||||
|
video = $('<video class="mes_video" controls preload="metadata"></video>');
|
||||||
|
container.append(video);
|
||||||
|
}
|
||||||
|
|
||||||
|
video.off('loadedmetadata').on('loadedmetadata', function () {
|
||||||
|
if (!adjustScroll) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
const scrollPosition = $('#chat').scrollTop();
|
||||||
|
const newChatHeight = $('#chat').prop('scrollHeight');
|
||||||
|
const diff = newChatHeight - chatHeight;
|
||||||
|
$('#chat').scrollTop(scrollPosition + diff);
|
||||||
|
});
|
||||||
|
|
||||||
|
video.attr('src', mes.extra?.video);
|
||||||
|
}
|
||||||
|
|
||||||
// Add file to message
|
// Add file to message
|
||||||
if (mes.extra?.file) {
|
if (mes.extra?.file) {
|
||||||
messageElement.find('.mes_file_container').remove();
|
messageElement.find('.mes_file_container').remove();
|
||||||
|
@@ -211,6 +211,12 @@ export async function populateFileAttachment(message, inputId = 'file_form_input
|
|||||||
const imageUrl = await saveBase64AsFile(base64Data, name2, fileNamePrefix, extension);
|
const imageUrl = await saveBase64AsFile(base64Data, name2, fileNamePrefix, extension);
|
||||||
message.extra.image = imageUrl;
|
message.extra.image = imageUrl;
|
||||||
message.extra.inline_image = true;
|
message.extra.inline_image = true;
|
||||||
|
}
|
||||||
|
// If file is video
|
||||||
|
else if (file.type.startsWith('video/')) {
|
||||||
|
const extension = file.type.split('/')[1];
|
||||||
|
const videoUrl = await saveBase64AsFile(base64Data, name2, fileNamePrefix, extension);
|
||||||
|
message.extra.video = videoUrl;
|
||||||
} else {
|
} else {
|
||||||
const uniqueFileName = `${fileNamePrefix}.txt`;
|
const uniqueFileName = `${fileNamePrefix}.txt`;
|
||||||
|
|
||||||
@@ -1240,7 +1246,7 @@ async function openAttachmentManager() {
|
|||||||
|
|
||||||
return () => {
|
return () => {
|
||||||
modalButtonData.forEach(p => {
|
modalButtonData.forEach(p => {
|
||||||
const { popper,bodyListener } = p;
|
const { popper, bodyListener } = p;
|
||||||
popper.destroy();
|
popper.destroy();
|
||||||
document.body.removeEventListener('click', bodyListener);
|
document.body.removeEventListener('click', bodyListener);
|
||||||
});
|
});
|
||||||
|
@@ -313,6 +313,7 @@ export const settingsToUpdate = {
|
|||||||
squash_system_messages: ['#squash_system_messages', 'squash_system_messages', true, false],
|
squash_system_messages: ['#squash_system_messages', 'squash_system_messages', true, false],
|
||||||
image_inlining: ['#openai_image_inlining', 'image_inlining', true, false],
|
image_inlining: ['#openai_image_inlining', 'image_inlining', true, false],
|
||||||
inline_image_quality: ['#openai_inline_image_quality', 'inline_image_quality', false, false],
|
inline_image_quality: ['#openai_inline_image_quality', 'inline_image_quality', false, false],
|
||||||
|
video_inlining: ['#openai_video_inlining', 'video_inlining', true, false],
|
||||||
continue_prefill: ['#continue_prefill', 'continue_prefill', true, false],
|
continue_prefill: ['#continue_prefill', 'continue_prefill', true, false],
|
||||||
continue_postfix: ['#continue_postfix', 'continue_postfix', false, false],
|
continue_postfix: ['#continue_postfix', 'continue_postfix', false, false],
|
||||||
function_calling: ['#openai_function_calling', 'function_calling', true, false],
|
function_calling: ['#openai_function_calling', 'function_calling', true, false],
|
||||||
@@ -396,6 +397,7 @@ const default_settings = {
|
|||||||
squash_system_messages: false,
|
squash_system_messages: false,
|
||||||
image_inlining: false,
|
image_inlining: false,
|
||||||
inline_image_quality: 'low',
|
inline_image_quality: 'low',
|
||||||
|
video_inlining: false,
|
||||||
bypass_status_check: false,
|
bypass_status_check: false,
|
||||||
continue_prefill: false,
|
continue_prefill: false,
|
||||||
function_calling: false,
|
function_calling: false,
|
||||||
@@ -482,6 +484,7 @@ const oai_settings = {
|
|||||||
squash_system_messages: false,
|
squash_system_messages: false,
|
||||||
image_inlining: false,
|
image_inlining: false,
|
||||||
inline_image_quality: 'low',
|
inline_image_quality: 'low',
|
||||||
|
video_inlining: false,
|
||||||
bypass_status_check: false,
|
bypass_status_check: false,
|
||||||
continue_prefill: false,
|
continue_prefill: false,
|
||||||
function_calling: false,
|
function_calling: false,
|
||||||
@@ -593,8 +596,9 @@ function setOpenAIMessages(chat) {
|
|||||||
if (role == 'user' && oai_settings.wrap_in_quotes) content = `"${content}"`;
|
if (role == 'user' && oai_settings.wrap_in_quotes) content = `"${content}"`;
|
||||||
const name = chat[j]['name'];
|
const name = chat[j]['name'];
|
||||||
const image = chat[j]?.extra?.image;
|
const image = chat[j]?.extra?.image;
|
||||||
|
const video = chat[j]?.extra?.video;
|
||||||
const invocations = chat[j]?.extra?.tool_invocations;
|
const invocations = chat[j]?.extra?.tool_invocations;
|
||||||
messages[i] = { 'role': role, 'content': content, name: name, 'image': image, 'invocations': invocations };
|
messages[i] = { 'role': role, 'content': content, name: name, 'image': image, 'video': video, 'invocations': invocations };
|
||||||
j++;
|
j++;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -886,6 +890,7 @@ async function populateChatHistory(messages, prompts, chatCompletion, type = nul
|
|||||||
}
|
}
|
||||||
|
|
||||||
const imageInlining = isImageInliningSupported();
|
const imageInlining = isImageInliningSupported();
|
||||||
|
const videoInlining = isVideoInliningSupported();
|
||||||
const canUseTools = ToolManager.isToolCallingSupported();
|
const canUseTools = ToolManager.isToolCallingSupported();
|
||||||
|
|
||||||
// Insert chat messages as long as there is budget available
|
// Insert chat messages as long as there is budget available
|
||||||
@@ -908,6 +913,10 @@ async function populateChatHistory(messages, prompts, chatCompletion, type = nul
|
|||||||
await chatMessage.addImage(chatPrompt.image);
|
await chatMessage.addImage(chatPrompt.image);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (videoInlining && chatPrompt.video) {
|
||||||
|
await chatMessage.addVideo(chatPrompt.video);
|
||||||
|
}
|
||||||
|
|
||||||
if (canUseTools && Array.isArray(chatPrompt.invocations)) {
|
if (canUseTools && Array.isArray(chatPrompt.invocations)) {
|
||||||
/** @type {import('./tool-calling.js').ToolInvocation[]} */
|
/** @type {import('./tool-calling.js').ToolInvocation[]} */
|
||||||
const invocations = chatPrompt.invocations;
|
const invocations = chatPrompt.invocations;
|
||||||
@@ -2781,6 +2790,38 @@ class Message {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async addVideo(video) {
|
||||||
|
const textContent = this.content;
|
||||||
|
const isDataUrl = isDataURL(video);
|
||||||
|
if (!isDataUrl) {
|
||||||
|
try {
|
||||||
|
const response = await fetch(video, { method: 'GET', cache: 'force-cache' });
|
||||||
|
if (!response.ok) throw new Error('Failed to fetch video');
|
||||||
|
const blob = await response.blob();
|
||||||
|
video = await getBase64Async(blob);
|
||||||
|
} catch (error) {
|
||||||
|
console.error('Video adding skipped', error);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Note: No compression for videos (unlike images)
|
||||||
|
this.content = [
|
||||||
|
{ type: 'text', text: textContent },
|
||||||
|
{ type: 'video_url', video_url: { 'url': video } },
|
||||||
|
];
|
||||||
|
|
||||||
|
try {
|
||||||
|
// Convservative estimate for video token cost without knowing duration
|
||||||
|
// Using Gemini calculation (263 tokens per second)
|
||||||
|
const tokens = 10000; // ~40 second video (60 seconds max)
|
||||||
|
this.tokens += tokens;
|
||||||
|
} catch (error) {
|
||||||
|
this.tokens += 10000;
|
||||||
|
console.error('Failed to get video token cost', error);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Compress an image if it exceeds the size threshold for the current chat completion source.
|
* Compress an image if it exceeds the size threshold for the current chat completion source.
|
||||||
* @param {string} image Data URL of the image.
|
* @param {string} image Data URL of the image.
|
||||||
@@ -3398,6 +3439,7 @@ function loadOpenAISettings(data, settings) {
|
|||||||
oai_settings.assistant_impersonation = settings.assistant_impersonation ?? default_settings.assistant_impersonation;
|
oai_settings.assistant_impersonation = settings.assistant_impersonation ?? default_settings.assistant_impersonation;
|
||||||
oai_settings.image_inlining = settings.image_inlining ?? default_settings.image_inlining;
|
oai_settings.image_inlining = settings.image_inlining ?? default_settings.image_inlining;
|
||||||
oai_settings.inline_image_quality = settings.inline_image_quality ?? default_settings.inline_image_quality;
|
oai_settings.inline_image_quality = settings.inline_image_quality ?? default_settings.inline_image_quality;
|
||||||
|
oai_settings.video_inlining = settings.video_inlining ?? default_settings.video_inlining;
|
||||||
oai_settings.bypass_status_check = settings.bypass_status_check ?? default_settings.bypass_status_check;
|
oai_settings.bypass_status_check = settings.bypass_status_check ?? default_settings.bypass_status_check;
|
||||||
oai_settings.show_thoughts = settings.show_thoughts ?? default_settings.show_thoughts;
|
oai_settings.show_thoughts = settings.show_thoughts ?? default_settings.show_thoughts;
|
||||||
oai_settings.reasoning_effort = settings.reasoning_effort ?? default_settings.reasoning_effort;
|
oai_settings.reasoning_effort = settings.reasoning_effort ?? default_settings.reasoning_effort;
|
||||||
@@ -3448,6 +3490,8 @@ function loadOpenAISettings(data, settings) {
|
|||||||
$('#openai_inline_image_quality').val(oai_settings.inline_image_quality);
|
$('#openai_inline_image_quality').val(oai_settings.inline_image_quality);
|
||||||
$(`#openai_inline_image_quality option[value="${oai_settings.inline_image_quality}"]`).prop('selected', true);
|
$(`#openai_inline_image_quality option[value="${oai_settings.inline_image_quality}"]`).prop('selected', true);
|
||||||
|
|
||||||
|
$('#openai_video_inlining').prop('checked', oai_settings.video_inlining);
|
||||||
|
|
||||||
$('#model_openai_select').val(oai_settings.openai_model);
|
$('#model_openai_select').val(oai_settings.openai_model);
|
||||||
$(`#model_openai_select option[value="${oai_settings.openai_model}"`).prop('selected', true);
|
$(`#model_openai_select option[value="${oai_settings.openai_model}"`).prop('selected', true);
|
||||||
$('#model_claude_select').val(oai_settings.claude_model);
|
$('#model_claude_select').val(oai_settings.claude_model);
|
||||||
@@ -3824,6 +3868,7 @@ async function saveOpenAIPreset(name, settings, triggerUi = true) {
|
|||||||
squash_system_messages: settings.squash_system_messages,
|
squash_system_messages: settings.squash_system_messages,
|
||||||
image_inlining: settings.image_inlining,
|
image_inlining: settings.image_inlining,
|
||||||
inline_image_quality: settings.inline_image_quality,
|
inline_image_quality: settings.inline_image_quality,
|
||||||
|
video_inlining: settings.video_inlining,
|
||||||
bypass_status_check: settings.bypass_status_check,
|
bypass_status_check: settings.bypass_status_check,
|
||||||
continue_prefill: settings.continue_prefill,
|
continue_prefill: settings.continue_prefill,
|
||||||
continue_postfix: settings.continue_postfix,
|
continue_postfix: settings.continue_postfix,
|
||||||
@@ -5387,6 +5432,36 @@ export function isImageInliningSupported() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Check if the currently selected model supports video inlining.
 *
 * Video inlining requires all of the following:
 * - the main API is 'openai' (Chat Completion),
 * - the `video_inlining` setting is enabled,
 * - the chat completion source is MakerSuite or Vertex AI,
 * - the selected model id contains one of the known video-capable names.
 *
 * A missing or non-string model id is treated as "not supported" rather
 * than throwing.
 *
 * @returns {boolean} True if the model supports video inlining
 */
export function isVideoInliningSupported() {
    if (main_api !== 'openai' || !oai_settings.video_inlining) {
        return false;
    }

    // Only Gemini models support video for now
    const videoSupportedModels = [
        'gemini-2.0',
        'gemini-2.5',
        'gemini-exp-1206',
    ];

    // Substring match so that versioned/preview model ids are covered too.
    const supportsVideo = (modelId) =>
        typeof modelId === 'string' && videoSupportedModels.some(name => modelId.includes(name));

    switch (oai_settings.chat_completion_source) {
        case chat_completion_sources.MAKERSUITE:
            return supportsVideo(oai_settings.google_model);
        case chat_completion_sources.VERTEXAI:
            return supportsVideo(oai_settings.vertexai_model);
        default:
            return false;
    }
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Proxy stuff
|
* Proxy stuff
|
||||||
*/
|
*/
|
||||||
@@ -5945,6 +6020,11 @@ export function initOpenAI() {
|
|||||||
saveSettingsDebounced();
|
saveSettingsDebounced();
|
||||||
});
|
});
|
||||||
|
|
||||||
|
$('#openai_video_inlining').on('input', function () {
|
||||||
|
oai_settings.video_inlining = !!$(this).prop('checked');
|
||||||
|
saveSettingsDebounced();
|
||||||
|
});
|
||||||
|
|
||||||
$('#continue_prefill').on('input', function () {
|
$('#continue_prefill').on('input', function () {
|
||||||
oai_settings.continue_prefill = !!$(this).prop('checked');
|
oai_settings.continue_prefill = !!$(this).prop('checked');
|
||||||
saveSettingsDebounced();
|
saveSettingsDebounced();
|
||||||
|
@@ -5198,6 +5198,24 @@ body:not(.sd) .mes_img_swipes {
|
|||||||
max-width: 100% !important;
|
max-width: 100% !important;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Video message styling */
|
||||||
|
.mes_video {
|
||||||
|
max-width: 100%;
|
||||||
|
max-height: 400px;
|
||||||
|
border-radius: 8px;
|
||||||
|
background: #000;
|
||||||
|
margin: 0.5rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Ensure video controls are visible */
|
||||||
|
.mes_video::-webkit-media-controls {
|
||||||
|
display: flex !important;
|
||||||
|
}
|
||||||
|
|
||||||
|
.mes_video::-webkit-media-controls-panel {
|
||||||
|
background-color: rgba(0, 0, 0, 0.2);
|
||||||
|
}
|
||||||
|
|
||||||
/* Align the content of this span to the right */
|
/* Align the content of this span to the right */
|
||||||
.delete-button {
|
.delete-button {
|
||||||
margin-right: 10px;
|
margin-right: 10px;
|
||||||
|
@@ -46,7 +46,7 @@ router.post('/upload', async (request, response) => {
|
|||||||
const splitParts = request.body.image.split(',');
|
const splitParts = request.body.image.split(',');
|
||||||
const format = splitParts[0].split(';')[0].split('/')[1];
|
const format = splitParts[0].split(';')[0].split('/')[1];
|
||||||
const base64Data = splitParts[1];
|
const base64Data = splitParts[1];
|
||||||
const validFormat = ['png', 'jpg', 'webp', 'jpeg', 'gif'].includes(format);
|
const validFormat = ['png', 'jpg', 'webp', 'jpeg', 'gif', 'mp4', 'avi', 'mov', 'wmv', 'flv', 'webm', '3gp', 'mkv'].includes(format);
|
||||||
if (!validFormat) {
|
if (!validFormat) {
|
||||||
return response.status(400).send({ error: 'Invalid image format' });
|
return response.status(400).send({ error: 'Invalid image format' });
|
||||||
}
|
}
|
||||||
|
@@ -471,6 +471,19 @@ export function convertGooglePrompt(messages, _model, useSysPrompt, names) {
|
|||||||
data: base64Data,
|
data: base64Data,
|
||||||
},
|
},
|
||||||
});
|
});
|
||||||
|
} else if (part.type === 'video_url') {
|
||||||
|
const videoUrl = part.video_url?.url;
|
||||||
|
if (videoUrl && videoUrl.startsWith('data:')) {
|
||||||
|
const [header, data] = videoUrl.split(',');
|
||||||
|
const mimeType = header.match(/data:([^;]+)/)?.[1] || 'video/mp4';
|
||||||
|
|
||||||
|
parts.push({
|
||||||
|
inlineData: {
|
||||||
|
mimeType: mimeType,
|
||||||
|
data: data,
|
||||||
|
},
|
||||||
|
});
|
||||||
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user