Gemini inline video (#4078)

* Add inline video attachment support for Gemini 2.5 Pro

* file formatting

* removed redundant function for saving video to message

* removed other redundant function for saving video to message

* Separate inlining check for video

* Edit video token cost to be a conservative estimate of 10000 tokens

* fixed missing semicolon

* Adds separate UI toggle for video inlining.

* Move mes_video out of img_container

* Remove title from video element for now

* Better visibility of video with controls

---------

Co-authored-by: Cohee <18619528+Cohee1207@users.noreply.github.com>
This commit is contained in:
Nikolas Brown
2025-06-01 08:04:16 -04:00
committed by GitHub
parent 3ec9b1a099
commit c4d89b2067
7 changed files with 159 additions and 3 deletions

View File

@@ -2019,6 +2019,20 @@
</div> </div>
</div> </div>
</div> </div>
<!--
  "Send inline videos" toggle (#openai_video_inlining) followed by its hint text.
  The data-source attribute lists makersuite and vertexai; presumably the block is
  only shown for those chat completion sources (confirm against the code that reads data-source).
-->
<div class="range-block" data-source="makersuite,vertexai">
<label for="openai_video_inlining" class="checkbox_label flexWrap widthFreeExpand">
<input id="openai_video_inlining" type="checkbox" />
<span data-i18n="Send inline videos">Send inline videos</span>
</label>
<!-- The hint is split into several spans so each text fragment has its own data-i18n key around the inline icons -->
<div id="video_inlining_hint" class="flexBasis100p toggle-description justifyLeft">
<span data-i18n="video_inlining_hint_1">Sends videos in prompts if the model supports it. Use the</span>
<code><i class="fa-solid fa-paperclip"></i></code>
<span data-i18n="video_inlining_hint_2">action on any message or the</span>
<code><i class="fa-solid fa-wand-magic-sparkles"></i></code>
<span data-i18n="video_inlining_hint_3">menu to attach a video file to the chat.</span>
<strong data-i18n="video_inlining_hint_4">Videos must be less than 20 MB and under 1 minute long</strong>
</div>
</div>
<div class="range-block" data-source="makersuite,vertexai"> <div class="range-block" data-source="makersuite,vertexai">
<label for="openai_request_images" class="checkbox_label widthFreeExpand"> <label for="openai_request_images" class="checkbox_label widthFreeExpand">
<input id="openai_request_images" type="checkbox" /> <input id="openai_request_images" type="checkbox" />

View File

@@ -2473,6 +2473,31 @@ export function appendMediaToMessage(mes, messageElement, adjustScroll = true) {
} }
} }
// Render the attached video, creating the player element on first use
if (mes.extra?.video) {
    const videoSource = mes.extra.video;
    const messageBlock = messageElement.find('.mes_block');
    // Chat height captured before the video's metadata (and thus its size) is known
    const heightBeforeLoad = $('#chat').prop('scrollHeight');

    // Reuse the existing player when the message is re-rendered
    let player = messageElement.find('.mes_video');
    if (!player.length) {
        player = $('<video class="mes_video" controls preload="metadata"></video>');
        messageBlock.append(player);
    }

    // Drop any previously bound handler first so repeated calls do not stack listeners
    player.off('loadedmetadata');
    player.on('loadedmetadata', () => {
        if (adjustScroll) {
            // Shift the scroll position by however much the chat grew once the video was laid out
            const chat = $('#chat');
            const currentTop = chat.scrollTop();
            const grownBy = chat.prop('scrollHeight') - heightBeforeLoad;
            chat.scrollTop(currentTop + grownBy);
        }
    });

    player.attr('src', videoSource);
}
// Add file to message // Add file to message
if (mes.extra?.file) { if (mes.extra?.file) {
messageElement.find('.mes_file_container').remove(); messageElement.find('.mes_file_container').remove();

View File

@@ -211,6 +211,12 @@ export async function populateFileAttachment(message, inputId = 'file_form_input
const imageUrl = await saveBase64AsFile(base64Data, name2, fileNamePrefix, extension); const imageUrl = await saveBase64AsFile(base64Data, name2, fileNamePrefix, extension);
message.extra.image = imageUrl; message.extra.image = imageUrl;
message.extra.inline_image = true; message.extra.inline_image = true;
}
// If file is video
else if (file.type.startsWith('video/')) {
const extension = file.type.split('/')[1];
const videoUrl = await saveBase64AsFile(base64Data, name2, fileNamePrefix, extension);
message.extra.video = videoUrl;
} else { } else {
const uniqueFileName = `${fileNamePrefix}.txt`; const uniqueFileName = `${fileNamePrefix}.txt`;
@@ -1240,7 +1246,7 @@ async function openAttachmentManager() {
return () => { return () => {
modalButtonData.forEach(p => { modalButtonData.forEach(p => {
const { popper,bodyListener } = p; const { popper, bodyListener } = p;
popper.destroy(); popper.destroy();
document.body.removeEventListener('click', bodyListener); document.body.removeEventListener('click', bodyListener);
}); });

View File

@@ -313,6 +313,7 @@ export const settingsToUpdate = {
squash_system_messages: ['#squash_system_messages', 'squash_system_messages', true, false], squash_system_messages: ['#squash_system_messages', 'squash_system_messages', true, false],
image_inlining: ['#openai_image_inlining', 'image_inlining', true, false], image_inlining: ['#openai_image_inlining', 'image_inlining', true, false],
inline_image_quality: ['#openai_inline_image_quality', 'inline_image_quality', false, false], inline_image_quality: ['#openai_inline_image_quality', 'inline_image_quality', false, false],
video_inlining: ['#openai_video_inlining', 'video_inlining', true, false],
continue_prefill: ['#continue_prefill', 'continue_prefill', true, false], continue_prefill: ['#continue_prefill', 'continue_prefill', true, false],
continue_postfix: ['#continue_postfix', 'continue_postfix', false, false], continue_postfix: ['#continue_postfix', 'continue_postfix', false, false],
function_calling: ['#openai_function_calling', 'function_calling', true, false], function_calling: ['#openai_function_calling', 'function_calling', true, false],
@@ -396,6 +397,7 @@ const default_settings = {
squash_system_messages: false, squash_system_messages: false,
image_inlining: false, image_inlining: false,
inline_image_quality: 'low', inline_image_quality: 'low',
video_inlining: false,
bypass_status_check: false, bypass_status_check: false,
continue_prefill: false, continue_prefill: false,
function_calling: false, function_calling: false,
@@ -482,6 +484,7 @@ const oai_settings = {
squash_system_messages: false, squash_system_messages: false,
image_inlining: false, image_inlining: false,
inline_image_quality: 'low', inline_image_quality: 'low',
video_inlining: false,
bypass_status_check: false, bypass_status_check: false,
continue_prefill: false, continue_prefill: false,
function_calling: false, function_calling: false,
@@ -593,8 +596,9 @@ function setOpenAIMessages(chat) {
if (role == 'user' && oai_settings.wrap_in_quotes) content = `"${content}"`; if (role == 'user' && oai_settings.wrap_in_quotes) content = `"${content}"`;
const name = chat[j]['name']; const name = chat[j]['name'];
const image = chat[j]?.extra?.image; const image = chat[j]?.extra?.image;
const video = chat[j]?.extra?.video;
const invocations = chat[j]?.extra?.tool_invocations; const invocations = chat[j]?.extra?.tool_invocations;
messages[i] = { 'role': role, 'content': content, name: name, 'image': image, 'invocations': invocations }; messages[i] = { 'role': role, 'content': content, name: name, 'image': image, 'video': video, 'invocations': invocations };
j++; j++;
} }
@@ -886,6 +890,7 @@ async function populateChatHistory(messages, prompts, chatCompletion, type = nul
} }
const imageInlining = isImageInliningSupported(); const imageInlining = isImageInliningSupported();
const videoInlining = isVideoInliningSupported();
const canUseTools = ToolManager.isToolCallingSupported(); const canUseTools = ToolManager.isToolCallingSupported();
// Insert chat messages as long as there is budget available // Insert chat messages as long as there is budget available
@@ -908,6 +913,10 @@ async function populateChatHistory(messages, prompts, chatCompletion, type = nul
await chatMessage.addImage(chatPrompt.image); await chatMessage.addImage(chatPrompt.image);
} }
if (videoInlining && chatPrompt.video) {
await chatMessage.addVideo(chatPrompt.video);
}
if (canUseTools && Array.isArray(chatPrompt.invocations)) { if (canUseTools && Array.isArray(chatPrompt.invocations)) {
/** @type {import('./tool-calling.js').ToolInvocation[]} */ /** @type {import('./tool-calling.js').ToolInvocation[]} */
const invocations = chatPrompt.invocations; const invocations = chatPrompt.invocations;
@@ -2781,6 +2790,38 @@ class Message {
} }
} }
async addVideo(video) {
const textContent = this.content;
const isDataUrl = isDataURL(video);
if (!isDataUrl) {
try {
const response = await fetch(video, { method: 'GET', cache: 'force-cache' });
if (!response.ok) throw new Error('Failed to fetch video');
const blob = await response.blob();
video = await getBase64Async(blob);
} catch (error) {
console.error('Video adding skipped', error);
return;
}
}
// Note: No compression for videos (unlike images)
this.content = [
{ type: 'text', text: textContent },
{ type: 'video_url', video_url: { 'url': video } },
];
try {
// Convservative estimate for video token cost without knowing duration
// Using Gemini calculation (263 tokens per second)
const tokens = 10000; // ~40 second video (60 seconds max)
this.tokens += tokens;
} catch (error) {
this.tokens += 10000;
console.error('Failed to get video token cost', error);
}
}
/** /**
* Compress an image if it exceeds the size threshold for the current chat completion source. * Compress an image if it exceeds the size threshold for the current chat completion source.
* @param {string} image Data URL of the image. * @param {string} image Data URL of the image.
@@ -3398,6 +3439,7 @@ function loadOpenAISettings(data, settings) {
oai_settings.assistant_impersonation = settings.assistant_impersonation ?? default_settings.assistant_impersonation; oai_settings.assistant_impersonation = settings.assistant_impersonation ?? default_settings.assistant_impersonation;
oai_settings.image_inlining = settings.image_inlining ?? default_settings.image_inlining; oai_settings.image_inlining = settings.image_inlining ?? default_settings.image_inlining;
oai_settings.inline_image_quality = settings.inline_image_quality ?? default_settings.inline_image_quality; oai_settings.inline_image_quality = settings.inline_image_quality ?? default_settings.inline_image_quality;
oai_settings.video_inlining = settings.video_inlining ?? default_settings.video_inlining;
oai_settings.bypass_status_check = settings.bypass_status_check ?? default_settings.bypass_status_check; oai_settings.bypass_status_check = settings.bypass_status_check ?? default_settings.bypass_status_check;
oai_settings.show_thoughts = settings.show_thoughts ?? default_settings.show_thoughts; oai_settings.show_thoughts = settings.show_thoughts ?? default_settings.show_thoughts;
oai_settings.reasoning_effort = settings.reasoning_effort ?? default_settings.reasoning_effort; oai_settings.reasoning_effort = settings.reasoning_effort ?? default_settings.reasoning_effort;
@@ -3448,6 +3490,8 @@ function loadOpenAISettings(data, settings) {
$('#openai_inline_image_quality').val(oai_settings.inline_image_quality); $('#openai_inline_image_quality').val(oai_settings.inline_image_quality);
$(`#openai_inline_image_quality option[value="${oai_settings.inline_image_quality}"]`).prop('selected', true); $(`#openai_inline_image_quality option[value="${oai_settings.inline_image_quality}"]`).prop('selected', true);
$('#openai_video_inlining').prop('checked', oai_settings.video_inlining);
$('#model_openai_select').val(oai_settings.openai_model); $('#model_openai_select').val(oai_settings.openai_model);
$(`#model_openai_select option[value="${oai_settings.openai_model}"`).prop('selected', true); $(`#model_openai_select option[value="${oai_settings.openai_model}"`).prop('selected', true);
$('#model_claude_select').val(oai_settings.claude_model); $('#model_claude_select').val(oai_settings.claude_model);
@@ -3824,6 +3868,7 @@ async function saveOpenAIPreset(name, settings, triggerUi = true) {
squash_system_messages: settings.squash_system_messages, squash_system_messages: settings.squash_system_messages,
image_inlining: settings.image_inlining, image_inlining: settings.image_inlining,
inline_image_quality: settings.inline_image_quality, inline_image_quality: settings.inline_image_quality,
video_inlining: settings.video_inlining,
bypass_status_check: settings.bypass_status_check, bypass_status_check: settings.bypass_status_check,
continue_prefill: settings.continue_prefill, continue_prefill: settings.continue_prefill,
continue_postfix: settings.continue_postfix, continue_postfix: settings.continue_postfix,
@@ -5387,6 +5432,36 @@ export function isImageInliningSupported() {
} }
} }
/**
 * Tells whether videos attached to chat messages can be sent inline with the prompt.
 * Requires the 'openai' main API, the video inlining setting to be on, and a Google
 * (MakerSuite or Vertex AI) source with a model whose name contains a supported family.
 * @returns {boolean} True if the model supports video inlining
 */
export function isVideoInliningSupported() {
    const isEnabled = main_api === 'openai' && Boolean(oai_settings.video_inlining);
    if (!isEnabled) {
        return false;
    }

    // Pick the model name that belongs to the active source; any other source has no video support
    const source = oai_settings.chat_completion_source;
    let activeModel;
    if (source === chat_completion_sources.MAKERSUITE) {
        activeModel = oai_settings.google_model;
    } else if (source === chat_completion_sources.VERTEXAI) {
        activeModel = oai_settings.vertexai_model;
    } else {
        return false;
    }

    // Only Gemini models support video for now
    for (const family of ['gemini-2.0', 'gemini-2.5', 'gemini-exp-1206']) {
        if (activeModel.includes(family)) {
            return true;
        }
    }

    return false;
}
/** /**
* Proxy stuff * Proxy stuff
*/ */
@@ -5945,6 +6020,11 @@ export function initOpenAI() {
saveSettingsDebounced(); saveSettingsDebounced();
}); });
// Mirror the "Send inline videos" checkbox into the settings and schedule a save
$('#openai_video_inlining').on('input', function () {
    const isChecked = Boolean($(this).prop('checked'));
    oai_settings.video_inlining = isChecked;
    saveSettingsDebounced();
});
$('#continue_prefill').on('input', function () { $('#continue_prefill').on('input', function () {
oai_settings.continue_prefill = !!$(this).prop('checked'); oai_settings.continue_prefill = !!$(this).prop('checked');
saveSettingsDebounced(); saveSettingsDebounced();

View File

@@ -5198,6 +5198,24 @@ body:not(.sd) .mes_img_swipes {
max-width: 100% !important; max-width: 100% !important;
} }
/* Video message styling: caps the player to the message width and 400px of height,
   with rounded corners and a black backdrop behind the video frame */
.mes_video {
max-width: 100%;
max-height: 400px;
border-radius: 8px;
background: #000;
margin: 0.5rem;
}
/* Ensure video controls are visible.
   These are -webkit- prefixed pseudo-elements, so the two rules below only apply
   in engines that recognize them. */
.mes_video::-webkit-media-controls {
display: flex !important;
}
/* Light translucent black tint behind the controls panel */
.mes_video::-webkit-media-controls-panel {
background-color: rgba(0, 0, 0, 0.2);
}
/* Align the content of this span to the right */ /* Align the content of this span to the right */
.delete-button { .delete-button {
margin-right: 10px; margin-right: 10px;

View File

@@ -46,7 +46,7 @@ router.post('/upload', async (request, response) => {
const splitParts = request.body.image.split(','); const splitParts = request.body.image.split(',');
const format = splitParts[0].split(';')[0].split('/')[1]; const format = splitParts[0].split(';')[0].split('/')[1];
const base64Data = splitParts[1]; const base64Data = splitParts[1];
const validFormat = ['png', 'jpg', 'webp', 'jpeg', 'gif'].includes(format); const validFormat = ['png', 'jpg', 'webp', 'jpeg', 'gif', 'mp4', 'avi', 'mov', 'wmv', 'flv', 'webm', '3gp', 'mkv'].includes(format);
if (!validFormat) { if (!validFormat) {
return response.status(400).send({ error: 'Invalid image format' }); return response.status(400).send({ error: 'Invalid image format' });
} }

View File

@@ -471,6 +471,19 @@ export function convertGooglePrompt(messages, _model, useSysPrompt, names) {
data: base64Data, data: base64Data,
}, },
}); });
} else if (part.type === 'video_url') {
const videoUrl = part.video_url?.url;
if (videoUrl && videoUrl.startsWith('data:')) {
const [header, data] = videoUrl.split(',');
const mimeType = header.match(/data:([^;]+)/)?.[1] || 'video/mp4';
parts.push({
inlineData: {
mimeType: mimeType,
data: data,
},
});
}
} }
}); });