diff --git a/public/scripts/custom-request.js b/public/scripts/custom-request.js
index d6fccf1a9..d19a62563 100644
--- a/public/scripts/custom-request.js
+++ b/public/scripts/custom-request.js
@@ -3,10 +3,13 @@ import { extractMessageFromData, getGenerateUrl, getRequestHeaders } from '../sc
 import { getTextGenServer } from './textgen-settings.js';
 import { extractReasoningFromData } from './reasoning.js';
 import { formatInstructModeChat, formatInstructModePrompt, names_behavior_types } from './instruct-mode.js';
+import { getStreamingReply, tryParseStreamingError } from './openai.js';
+import EventSourceStream from './sse-stream.js';
 
 // #region Type Definitions
 /**
  * @typedef {Object} TextCompletionRequestBase
+ * @property {boolean?} [stream=false] - Whether to stream the response
  * @property {number} max_tokens - Maximum number of tokens to generate
  * @property {string} [model] - Optional model name
  * @property {string} api_type - Type of API to use
@@ -17,6 +20,7 @@ import { formatInstructModeChat, formatInstructModePrompt, names_behavior_types
 
 /**
  * @typedef {Object} TextCompletionPayloadBase
+ * @property {boolean?} [stream=false] - Whether to stream the response
  * @property {string} prompt - The text prompt for completion
  * @property {number} max_tokens - Maximum number of tokens to generate
  * @property {number} max_new_tokens - Alias for max_tokens
@@ -36,6 +40,7 @@ import { formatInstructModeChat, formatInstructModePrompt, names_behavior_types
 
 /**
  * @typedef {Object} ChatCompletionPayloadBase
+ * @property {boolean?} [stream=false] - Whether to stream the response
  * @property {ChatCompletionMessage[]} messages - Array of chat messages
  * @property {string} [model] - Optional model name to use for completion
  * @property {string} chat_completion_source - Source provider for chat completion
@@ -52,10 +57,20 @@ import { formatInstructModeChat, formatInstructModePrompt, names_behavior_types
  * @property {string} reasoning - Extracted reasoning.
  */
 
+/**
+ * @typedef {Object} StreamResponse
+ * @property {string} text - Generated text.
+ * @property {string[]} swipes - Generated swipes
+ * @property {Object} state - Generated state
+ * @property {string?} [state.reasoning] - Generated reasoning
+ * @property {string?} [state.image] - Generated image
+ * @returns {StreamResponse}
+ */
+
 // #endregion
 
 /**
- * Creates & sends a text completion request. Streaming is not supported.
+ * Creates & sends a text completion request.
  */
 export class TextCompletionService {
     static TYPE = 'textgenerationwebui';
@@ -64,9 +79,10 @@ export class TextCompletionService {
      * @param {Record<string, any> & TextCompletionRequestBase & {prompt: string}} custom
      * @returns {TextCompletionPayload}
      */
-    static createRequestData({ prompt, max_tokens, model, api_type, api_server, temperature, min_p, ...props }) {
+    static createRequestData({ stream = false, prompt, max_tokens, model, api_type, api_server, temperature, min_p, ...props }) {
         const payload = {
             ...props,
+            stream,
             prompt,
             max_tokens,
             max_new_tokens: max_tokens,
@@ -75,7 +91,6 @@ export class TextCompletionService {
             api_server: api_server ?? getTextGenServer(api_type),
             temperature,
             min_p,
-            stream: false,
         };
 
         // Remove undefined values to avoid API errors
@@ -92,34 +107,81 @@ export class TextCompletionService {
      * Sends a text completion request to the specified server
      * @param {TextCompletionPayload} data Request data
      * @param {boolean?} extractData Extract message from the response. Default true
-     * @returns {Promise<ExtractedData | any>} Extracted data or the raw response
+     * @param {AbortSignal?} signal
+     * @returns {Promise<ExtractedData | (() => AsyncGenerator<StreamResponse>)>} If not streaming, returns extracted data; if streaming, returns a function that creates an AsyncGenerator
      * @throws {Error}
      */
-    static async sendRequest(data, extractData = true) {
-        const response = await fetch(getGenerateUrl(this.TYPE), {
+    static async sendRequest(data, extractData = true, signal = null) {
+        if (!data.stream) {
+            const response = await fetch(getGenerateUrl(this.TYPE), {
+                method: 'POST',
+                headers: getRequestHeaders(),
+                cache: 'no-cache',
+                body: JSON.stringify(data),
+                signal: signal ?? new AbortController().signal,
+            });
+
+            const json = await response.json();
+            if (!response.ok || json.error) {
+                throw json;
+            }
+
+            if (!extractData) {
+                return json;
+            }
+
+            return {
+                content: extractMessageFromData(json, this.TYPE),
+                reasoning: extractReasoningFromData(json, {
+                    mainApi: this.TYPE,
+                    textGenType: data.api_type,
+                    ignoreShowThoughts: true,
+                }),
+            };
+        }
+
+        const response = await fetch('/api/backends/text-completions/generate', {
             method: 'POST',
             headers: getRequestHeaders(),
             cache: 'no-cache',
             body: JSON.stringify(data),
-            signal: new AbortController().signal,
+            signal: signal ?? new AbortController().signal,
         });
 
-        const json = await response.json();
-        if (!response.ok || json.error) {
-            throw json;
+        if (!response.ok) {
+            const text = await response.text();
+            tryParseStreamingError(response, text, true);
+
+            throw new Error(`Got response status ${response.status}`);
         }
 
-        if (!extractData) {
-            return json;
-        }
+        const eventStream = new EventSourceStream();
+        response.body.pipeThrough(eventStream);
+        const reader = eventStream.readable.getReader();
+        return async function* streamData() {
+            let text = '';
+            const swipes = [];
+            const state = { reasoning: '' };
+            while (true) {
+                const { done, value } = await reader.read();
+                if (done) return;
+                if (value.data === '[DONE]') return;
 
-        return {
-            content: extractMessageFromData(json, this.TYPE),
-            reasoning: extractReasoningFromData(json, {
-                mainApi: this.TYPE,
-                textGenType: data.api_type,
-                ignoreShowThoughts: true,
-            }),
+                tryParseStreamingError(response, value.data, true);
+
+                let data = JSON.parse(value.data);
+
+                if (data?.choices?.[0]?.index > 0) {
+                    const swipeIndex = data.choices[0].index - 1;
+                    swipes[swipeIndex] = (swipes[swipeIndex] || '') + data.choices[0].text;
+                } else {
+                    const newText = data?.choices?.[0]?.text || data?.content || '';
+                    text += newText;
+                    state.reasoning += data?.choices?.[0]?.reasoning ?? '';
+                }
+
+                yield { text, swipes, state };
+            }
         };
     }
 
@@ -130,13 +192,15 @@
      * @param {string?} [options.presetName] - Name of the preset to use for generation settings
      * @param {string?} [options.instructName] - Name of instruct preset for message formatting
      * @param {boolean} extractData - Whether to extract structured data from response
-     * @returns {Promise<ExtractedData | any>} Extracted data or the raw response
+     * @param {AbortSignal?} [signal]
+     * @returns {Promise<ExtractedData | (() => AsyncGenerator<StreamResponse>)>} If not streaming, returns extracted data; if streaming, returns a function that creates an AsyncGenerator
      * @throws {Error}
      */
     static async processRequest(
         custom,
         options = {},
         extractData = true,
+        signal = null,
     ) {
         const { presetName, instructName } = options;
         let requestData = { ...custom };
@@ -220,7 +284,7 @@
         // @ts-ignore
         const data = this.createRequestData(requestData);
 
-        return await this.sendRequest(data, extractData);
+        return await this.sendRequest(data, extractData, signal);
     }
 
     /**
@@ -256,7 +320,7 @@
 }
 
 /**
- * Creates & sends a chat completion request. Streaming is not supported.
+ * Creates & sends a chat completion request.
  */
 export class ChatCompletionService {
     static TYPE = 'openai';
@@ -265,16 +329,16 @@
      * @param {ChatCompletionPayload} custom
      * @returns {ChatCompletionPayload}
      */
-    static createRequestData({ messages, model, chat_completion_source, max_tokens, temperature, custom_url, ...props }) {
+    static createRequestData({ stream = false, messages, model, chat_completion_source, max_tokens, temperature, custom_url, ...props }) {
         const payload = {
             ...props,
+            stream,
             messages,
             model,
             chat_completion_source,
             max_tokens,
             temperature,
             custom_url,
-            stream: false,
         };
 
         // Remove undefined values to avoid API errors
@@ -291,34 +355,74 @@
      * Sends a chat completion request
      * @param {ChatCompletionPayload} data Request data
      * @param {boolean?} extractData Extract message from the response. Default true
-     * @returns {Promise<ExtractedData | any>} Extracted data or the raw response
+     * @param {AbortSignal?} signal Abort signal
+     * @returns {Promise<ExtractedData | (() => AsyncGenerator<StreamResponse>)>} If not streaming, returns extracted data; if streaming, returns a function that creates an AsyncGenerator
      * @throws {Error}
      */
-    static async sendRequest(data, extractData = true) {
+    static async sendRequest(data, extractData = true, signal = null) {
         const response = await fetch('/api/backends/chat-completions/generate', {
             method: 'POST',
             headers: getRequestHeaders(),
             cache: 'no-cache',
             body: JSON.stringify(data),
-            signal: new AbortController().signal,
+            signal: signal ?? new AbortController().signal,
         });
 
-        const json = await response.json();
-        if (!response.ok || json.error) {
-            throw json;
+        if (!data.stream) {
+            const json = await response.json();
+            if (!response.ok || json.error) {
+                throw json;
+            }
+
+            if (!extractData) {
+                return json;
+            }
+
+            return {
+                content: extractMessageFromData(json, this.TYPE),
+                reasoning: extractReasoningFromData(json, {
+                    mainApi: this.TYPE,
+                    textGenType: data.chat_completion_source,
+                    ignoreShowThoughts: true,
+                }),
+            };
         }
 
-        if (!extractData) {
-            return json;
+        if (!response.ok) {
+            const text = await response.text();
+            tryParseStreamingError(response, text, true);
+
+            throw new Error(`Got response status ${response.status}`);
         }
 
-        return {
-            content: extractMessageFromData(json, this.TYPE),
-            reasoning: extractReasoningFromData(json, {
-                mainApi: this.TYPE,
-                textGenType: data.chat_completion_source,
-                ignoreShowThoughts: true,
-            }),
+        const eventStream = new EventSourceStream();
+        response.body.pipeThrough(eventStream);
+        const reader = eventStream.readable.getReader();
+        return async function* streamData() {
+            let text = '';
+            const swipes = [];
+            const state = { reasoning: '', image: '' };
+            while (true) {
+                const { done, value } = await reader.read();
+                if (done) return;
+                const rawData = value.data;
+                if (rawData === '[DONE]') return;
+                tryParseStreamingError(response, rawData, true);
+                const parsed = JSON.parse(rawData);
+
+                const reply = getStreamingReply(parsed, state, {
+                    chatCompletionSource: data.chat_completion_source,
+                    ignoreShowThoughts: true,
+                });
+                if (Array.isArray(parsed?.choices) && parsed?.choices?.[0]?.index > 0) {
+                    const swipeIndex = parsed.choices[0].index - 1;
+                    swipes[swipeIndex] = (swipes[swipeIndex] || '') + reply;
+                } else {
+                    text += reply;
+                }
+
+                yield { text, swipes: swipes, state };
+            }
         };
     }
 
@@ -327,11 +431,12 @@
      * @param {ChatCompletionPayload} custom
      * @param {Object} options - Configuration options
      * @param {string?} [options.presetName] - Name of the preset to use for generation settings
-     * @param {boolean} extractData - Whether to extract structured data from response
-     * @returns {Promise<ExtractedData | any>} Extracted data or the raw response
+     * @param {boolean} [extractData=true] - Whether to extract structured data from response
+     * @param {AbortSignal?} [signal] - Abort signal
+     * @returns {Promise<ExtractedData | (() => AsyncGenerator<StreamResponse>)>} If not streaming, returns extracted data; if streaming, returns a function that creates an AsyncGenerator
      * @throws {Error}
      */
-    static async processRequest(custom, options, extractData = true) {
+    static async processRequest(custom, options, extractData = true, signal = null) {
         const { presetName } = options;
         let requestData = { ...custom };
 
@@ -354,7 +459,7 @@
 
         const data = this.createRequestData(requestData);
 
-        return await this.sendRequest(data, extractData);
+        return await this.sendRequest(data, extractData, signal);
     }
 
     /**
diff --git a/public/scripts/extensions/shared.js b/public/scripts/extensions/shared.js
index d5549c40a..144ac3d6b 100644
--- a/public/scripts/extensions/shared.js
+++ b/public/scripts/extensions/shared.js
@@ -276,10 +276,12 @@ export async function getWebLlmContextSize() {
 }
 
 /**
- * It uses the profiles to send a generate request to the API. Doesn't support streaming.
+ * It uses the profiles to send a generate request to the API.
 */
 export class ConnectionManagerRequestService {
     static defaultSendRequestParams = {
+        stream: false,
+        signal: null,
         extractData: true,
         includePreset: true,
         includeInstruct: true,
@@ -296,11 +298,11 @@ export class ConnectionManagerRequestService {
      * @param {string} profileId
      * @param {string | (import('../custom-request.js').ChatCompletionMessage & {ignoreInstruct?: boolean})[]} prompt
      * @param {number} maxTokens
-     * @param {{extractData?: boolean, includePreset?: boolean, includeInstruct?: boolean}} custom - default values are true
-     * @returns {Promise<import('../custom-request.js').ExtractedData | any>} Extracted data or the raw response
+     * @param {{stream?: boolean, signal?: AbortSignal, extractData?: boolean, includePreset?: boolean, includeInstruct?: boolean}} custom - defaults: stream=false, signal=null, extractData/includePreset/includeInstruct=true
+     * @returns {Promise<import('../custom-request.js').ExtractedData | (() => AsyncGenerator<import('../custom-request.js').StreamResponse>)>} If not streaming, returns extracted data; if streaming, returns a function that creates an AsyncGenerator
      */
     static async sendRequest(profileId, prompt, maxTokens, custom = this.defaultSendRequestParams) {
-        const { extractData, includePreset, includeInstruct } = { ...this.defaultSendRequestParams, ...custom };
+        const { stream, signal, extractData, includePreset, includeInstruct } = { ...this.defaultSendRequestParams, ...custom };
 
         const context = SillyTavern.getContext();
         if (context.extensionSettings.disabledExtensions.includes('connection-manager')) {
@@ -319,6 +321,7 @@
             const messages = Array.isArray(prompt) ? prompt : [{ role: 'user', content: prompt }];
 
             return await context.ChatCompletionService.processRequest({
+                stream,
                 messages,
                 max_tokens: maxTokens,
                 model: profile.model,
@@ -326,7 +329,7 @@
                 custom_url: profile['api-url'],
             }, {
                 presetName: includePreset ? profile.preset : undefined,
-            }, extractData);
+            }, extractData, signal);
         }
         case 'textgenerationwebui': {
             if (!selectedApiMap.type) {
@@ -334,6 +337,7 @@
             }
 
             return await context.TextCompletionService.processRequest({
+                stream,
                 prompt,
                 max_tokens: maxTokens,
                 model: profile.model,
@@ -342,7 +346,7 @@
             }, {
                 instructName: includeInstruct ? profile.instruct : undefined,
                 presetName: includePreset ? profile.preset : undefined,
-            }, extractData);
+            }, extractData, signal);
         }
         default: {
             throw new Error(`Unknown API type ${selectedApiMap.selected}`);
diff --git a/public/scripts/openai.js b/public/scripts/openai.js
index 36e224bf9..6626ac26e 100644
--- a/public/scripts/openai.js
+++ b/public/scripts/openai.js
@@ -1444,8 +1444,9 @@ export async function prepareOpenAIMessages({
  * Handles errors during streaming requests.
  * @param {Response} response
  * @param {string} decoded - response text or decoded stream data
+ * @param {boolean?} [suppressToastr=false]
  */
-function tryParseStreamingError(response, decoded) {
+export function tryParseStreamingError(response, decoded, suppressToastr = false) {
     try {
         const data = JSON.parse(decoded);
 
@@ -1453,19 +1454,19 @@ function tryParseStreamingError(response, decoded) {
             return;
         }
 
-        checkQuotaError(data);
-        checkModerationError(data);
+        checkQuotaError(data, suppressToastr);
+        checkModerationError(data, suppressToastr);
 
         // these do not throw correctly (equiv to Error("[object Object]"))
         // if trying to fix "[object Object]" displayed to users, start here
         if (data.error) {
-            toastr.error(data.error.message || response.statusText, 'Chat Completion API');
+            !suppressToastr && toastr.error(data.error.message || response.statusText, 'Chat Completion API');
             throw new Error(data);
         }
 
         if (data.message) {
-            toastr.error(data.message, 'Chat Completion API');
+            !suppressToastr && toastr.error(data.message, 'Chat Completion API');
             throw new Error(data);
         }
     }
@@ -1477,16 +1478,17 @@ function tryParseStreamingError(response, decoded) {
 /**
  * Checks if the response contains a quota error and displays a popup if it does.
  * @param data
+ * @param {boolean?} [suppressToastr=false]
  * @returns {void}
 * @throws {object} - response JSON
 */
-function checkQuotaError(data) {
+function checkQuotaError(data, suppressToastr = false) {
     if (!data) {
         return;
     }
 
     if (data.quota_error) {
-        renderTemplateAsync('quotaError').then((html) => Popup.show.text('Quota Error', html));
+        !suppressToastr && renderTemplateAsync('quotaError').then((html) => Popup.show.text('Quota Error', html));
 
         // this does not throw correctly (equiv to Error("[object Object]"))
         // if trying to fix "[object Object]" displayed to users, start here
@@ -1494,9 +1496,13 @@ function checkQuotaError(data) {
     }
 }
 
-function checkModerationError(data) {
+/**
+ * @param {any} data
+ * @param {boolean?} [suppressToastr=false]
+ */
+function checkModerationError(data, suppressToastr = false) {
     const moderationError = data?.error?.message?.includes('requires moderation');
-    if (moderationError) {
+    if (moderationError && !suppressToastr) {
         const moderationReason = `Reasons: ${data?.error?.metadata?.reasons?.join(', ') ?? '(N/A)'}`;
         const flaggedText = data?.error?.metadata?.flagged_input ?? '(N/A)';
         toastr.info(flaggedText, moderationReason, { timeOut: 10000 });
@@ -2255,37 +2261,43 @@ async function sendOpenAIRequest(type, messages, signal) {
  * Extracts the reply from the response data from a chat completions-like source
  * @param {object} data Response data from the chat completions-like source
  * @param {object} state Additional state to keep track of
+ * @param {object} options Additional options
+ * @param {string?} [options.chatCompletionSource] Chat completion source
+ * @param {boolean?} [options.ignoreShowThoughts] Ignore the show_thoughts setting and always extract reasoning
  * @returns {string} The reply extracted from the response data
  */
-function getStreamingReply(data, state) {
-    if (oai_settings.chat_completion_source === chat_completion_sources.CLAUDE) {
-        if (oai_settings.show_thoughts) {
+export function getStreamingReply(data, state, { chatCompletionSource = null, ignoreShowThoughts = false } = {}) {
+    const chat_completion_source = chatCompletionSource ?? oai_settings.chat_completion_source;
+    const show_thoughts = ignoreShowThoughts ? true : oai_settings.show_thoughts;
+
+    if (chat_completion_source === chat_completion_sources.CLAUDE) {
+        if (show_thoughts) {
             state.reasoning += data?.delta?.thinking || '';
         }
         return data?.delta?.text || '';
-    } else if (oai_settings.chat_completion_source === chat_completion_sources.MAKERSUITE) {
+    } else if (chat_completion_source === chat_completion_sources.MAKERSUITE) {
         const inlineData = data?.candidates?.[0]?.content?.parts?.find(x => x.inlineData)?.inlineData;
         if (inlineData) {
             state.image = `data:${inlineData.mimeType};base64,${inlineData.data}`;
         }
-        if (oai_settings.show_thoughts) {
+        if (show_thoughts) {
             state.reasoning += (data?.candidates?.[0]?.content?.parts?.filter(x => x.thought)?.map(x => x.text)?.[0] || '');
         }
         return data?.candidates?.[0]?.content?.parts?.filter(x => !x.thought)?.map(x => x.text)?.[0] || '';
-    } else if (oai_settings.chat_completion_source === chat_completion_sources.COHERE) {
+    } else if (chat_completion_source === chat_completion_sources.COHERE) {
         return data?.delta?.message?.content?.text || data?.delta?.message?.tool_plan || '';
-    } else if (oai_settings.chat_completion_source === chat_completion_sources.DEEPSEEK) {
-        if (oai_settings.show_thoughts) {
+    } else if (chat_completion_source === chat_completion_sources.DEEPSEEK) {
+        if (show_thoughts) {
             state.reasoning += (data.choices?.filter(x => x?.delta?.reasoning_content)?.[0]?.delta?.reasoning_content || '');
         }
         return data.choices?.[0]?.delta?.content || '';
-    } else if (oai_settings.chat_completion_source === chat_completion_sources.OPENROUTER) {
-        if (oai_settings.show_thoughts) {
+    } else if (chat_completion_source === chat_completion_sources.OPENROUTER) {
+        if (show_thoughts) {
             state.reasoning += (data.choices?.filter(x => x?.delta?.reasoning)?.[0]?.delta?.reasoning || '');
         }
         return data.choices?.[0]?.delta?.content ?? data.choices?.[0]?.message?.content ?? data.choices?.[0]?.text ?? '';
-    } else if (oai_settings.chat_completion_source === chat_completion_sources.CUSTOM) {
-        if (oai_settings.show_thoughts) {
+    } else if (chat_completion_source === chat_completion_sources.CUSTOM) {
+        if (show_thoughts) {
             state.reasoning += data.choices?.filter(x => x?.delta?.reasoning_content)?.[0]?.delta?.reasoning_content ?? data.choices?.filter(x => x?.delta?.reasoning)?.[0]?.delta?.reasoning ??
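
For reference, a minimal consumer of the new return contract might look like the sketch below. It is an editor's illustration, not part of the diff: `streamExample` and `profileId` are hypothetical names, and the import path assumes the caller lives next to `public/scripts/extensions/shared.js`. The discrimination on `typeof result === 'function'` and the yielded `{ text, swipes, state }` shape follow directly from the `sendRequest` JSDoc above.

// Usage sketch (assumptions noted above): stream a completion via a connection profile.
import { ConnectionManagerRequestService } from './shared.js';

async function streamExample(profileId) {
    const controller = new AbortController();
    const result = await ConnectionManagerRequestService.sendRequest(
        profileId,
        'Hello!',
        256,
        { stream: true, signal: controller.signal },
    );

    if (typeof result === 'function') {
        // Streaming: the generator yields cumulative { text, swipes, state } chunks.
        for await (const { text, state } of result()) {
            console.log(text, state.reasoning);
        }
    } else {
        // Non-streaming (stream: false): extracted { content, reasoning }.
        console.log(result.content, result.reasoning);
    }
}

Calling `controller.abort()` cancels the underlying fetch in both branches, since the signal is now threaded through `processRequest` into `sendRequest`.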